In [None]:
##importing a few general use case libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedShuffleSplit

import warnings
# Silences every warning category for the rest of the session (no category or
# module filter is given), so deprecation notices from pandas/sklearn are hidden too.
warnings.filterwarnings('ignore')


In [None]:
# Load the auto-mpg .data file into a DataFrame, supplying the column names
# ourselves because they are passed explicitly via `names`.

cols = [
    'MPG', 'Cylinders', 'Displacement', 'Horsepower',
    'Weight', 'Acceleration', 'Model Year', 'Origin',
]

df = pd.read_csv(
    '/content/drive/MyDrive/Project/auto-mpg.data',
    names=cols,
    sep=" ",
    skipinitialspace=True,  # skip the extra spaces that follow each delimiter
    comment='\t',           # ignore the rest of a line from the first tab onward
                            # (presumably the trailing car-name field -- confirm against the file)
    na_values="?",          # "?" is read as a missing value
)

# Work on a copy so `df` keeps the data exactly as it was loaded.
data = df.copy()

# Single stratified 80/20 train/test split, stratified on the Cylinders column
# and seeded so the split is reproducible.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(data, data["Cylinders"]):
    # split() yields positional row numbers, so rows are selected with .iloc.
    # (.loc gave the same rows only because `data` has a default 0..n-1 index.)
    strat_train_set = data.iloc[train_index]
    strat_test_set = data.iloc[test_index]

In [None]:
# Separate the training set into the MPG target and the remaining feature columns.
data_labels = strat_train_set['MPG'].copy()
data = strat_train_set.drop(columns="MPG")
data

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
145,4,83.0,61.0,2003.0,19.0,74,3
151,4,79.0,67.0,2000.0,16.0,74,2
388,4,156.0,92.0,2585.0,14.5,82,1
48,6,250.0,88.0,3139.0,14.5,71,1
114,4,98.0,90.0,2265.0,15.5,73,2
...,...,...,...,...,...,...,...
147,4,90.0,75.0,2108.0,15.5,74,2
156,8,400.0,170.0,4668.0,11.5,75,1
395,4,135.0,84.0,2295.0,11.6,82,1
14,4,113.0,95.0,2372.0,15.0,70,3


In [None]:
def preprocess_origin_cols(df):
  """Replace the numeric Origin codes with country-name labels.

  The 'Origin' column of ``df`` is overwritten in place and the same
  DataFrame object is returned, so the caller's frame is modified.

  The function is safe to call more than once on the same frame: values
  that are already one of the labels are kept as they are, so re-running
  the cell does not turn the column into NaN.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame with an 'Origin' column holding the codes 1, 2, 3 (or the
      labels produced by an earlier call).

  Returns
  -------
  pandas.DataFrame
      The same object as ``df``, with 'Origin' holding "India", "USA" or
      "Germany". Any other value becomes NaN.
  """
  # NOTE(review): these labels are the notebook's own choice; the auto-mpg
  # codes are commonly described as 1=USA, 2=Europe, 3=Japan -- confirm
  # before interpreting results by country.
  origin_names = {1: "India", 2: "USA", 3: "Germany"}
  # Map each label to itself as well, so an already-relabelled column passes through.
  lookup = dict(origin_names)
  lookup.update({name: name for name in origin_names.values()})
  df['Origin'] = df['Origin'].map(lookup)
  return df

# preprocess_origin_cols assigns into the frame it receives and returns it,
# so data_tr refers to the same (now relabelled) object as `data`.
data_tr=preprocess_origin_cols(data)
data_tr.head()

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
145,4,83.0,61.0,2003.0,19.0,74,Germany
151,4,79.0,67.0,2000.0,16.0,74,USA
388,4,156.0,92.0,2585.0,14.5,82,India
48,6,250.0,88.0,3139.0,14.5,71,India
114,4,98.0,90.0,2265.0,15.5,73,USA


In [None]:
# Show each column's dtype and non-null count to spot missing values.
data_tr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 318 entries, 145 to 362
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Cylinders     318 non-null    int64  
 1   Displacement  318 non-null    float64
 2   Horsepower    314 non-null    float64
 3   Weight        318 non-null    float64
 4   Acceleration  318 non-null    float64
 5   Model Year    318 non-null    int64  
 6   Origin        318 non-null    object 
dtypes: float64(4), int64(2), object(1)
memory usage: 19.9+ KB


In [None]:
# Isolate the Origin column (the categorical feature) as a one-column DataFrame.
data_cat = data_tr.loc[:, ["Origin"]]
data_cat.head()

Unnamed: 0,Origin
145,Germany
151,USA
388,India
48,India
114,USA


In [None]:
#encode the categorical value using onehotencoder
from sklearn.preprocessing import OneHotEncoder

# Learn the Origin categories, then encode them as one indicator column each.
cat_encoder = OneHotEncoder()
cat_encoder.fit(data_cat)
data_cat_1hot = cat_encoder.transform(data_cat)
data_cat_1hot  # a sparse matrix (only the non-zero entries are stored)

<318x3 sparse matrix of type '<class 'numpy.float64'>'
	with 318 stored elements in Compressed Sparse Row format>

In [None]:
data_cat_1hot[:5].toarray()  # dense view of the first five encoded rows

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [None]:
# Category labels learned by the encoder; the one-hot columns follow this order.
cat_encoder.categories_

[array(['Germany', 'India', 'USA'], dtype=object)]

In [None]:
# Segregate the numerical features: every column except the categorical Origin.
# Origin is dropped by name rather than by slicing off the last column, so the
# selection stays correct even if the column order changes.
num_data = data_tr.drop(columns=["Origin"])
num_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 318 entries, 145 to 362
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Cylinders     318 non-null    int64  
 1   Displacement  318 non-null    float64
 2   Horsepower    314 non-null    float64
 3   Weight        318 non-null    float64
 4   Acceleration  318 non-null    float64
 5   Model Year    318 non-null    int64  
dtypes: float64(4), int64(2)
memory usage: 17.4 KB


In [None]:
#handling missing values
from sklearn.impute import SimpleImputer

# Learn the per-column medians that will replace missing values.
# fit() returns the imputer itself, so it can be chained onto the constructor.
imputer = SimpleImputer(strategy="median").fit(num_data)
imputer

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='median', verbose=0)

In [None]:
# Medians learned by the imputer, one per column of num_data
imputer.statistics_

array([   4. ,  146. ,   92. , 2844. ,   15.5,   76. ])

In [None]:
# Cross-check: the same per-column medians computed directly with pandas
num_data.median().values

array([   4. ,  146. ,   92. , 2844. ,   15.5,   76. ])

In [None]:
# Fill the missing values with the learned medians; transform() returns a 2-D NumPy array
x=imputer.transform(num_data)
x

array([[   4. ,   83. ,   61. , 2003. ,   19. ,   74. ],
       [   4. ,   79. ,   67. , 2000. ,   16. ,   74. ],
       [   4. ,  156. ,   92. , 2585. ,   14.5,   82. ],
       ...,
       [   4. ,  135. ,   84. , 2295. ,   11.6,   82. ],
       [   4. ,  113. ,   95. , 2372. ,   15. ,   70. ],
       [   6. ,  146. ,  120. , 2930. ,   13.8,   81. ]])

In [None]:
# Wrap the imputed 2-D array in a DataFrame again, reusing num_data's row index
# and column names. This rebinds data_tr to a numeric-only frame (no Origin).
data_tr = pd.DataFrame(
    x,
    index=num_data.index,
    columns=num_data.columns,
)
data_tr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 318 entries, 145 to 362
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Cylinders     318 non-null    float64
 1   Displacement  318 non-null    float64
 2   Horsepower    318 non-null    float64
 3   Weight        318 non-null    float64
 4   Acceleration  318 non-null    float64
 5   Model Year    318 non-null    float64
dtypes: float64(6)
memory usage: 27.4 KB


In [None]:
# Preview the imputed numeric frame.
data_tr.head()

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year
145,4.0,83.0,61.0,2003.0,19.0,74.0
151,4.0,79.0,67.0,2000.0,16.0,74.0
388,4.0,156.0,92.0,2585.0,14.5,82.0
48,6.0,250.0,88.0,3139.0,14.5,71.0
114,4.0,98.0,90.0,2265.0,15.5,73.0


In [None]:
# To derive new variables from the dataset, scikit-learn provides the BaseEstimator and
# TransformerMixin base classes, which let us build new features by defining our own transformer class.
# The class below adds two new features identified in the EDA step above:
# acc_on_power — Acceleration divided by Horsepower
# acc_on_cyl — Acceleration divided by the number of Cylinders

from sklearn.base import BaseEstimator, TransformerMixin

# Column positions of Acceleration, Horsepower and Cylinders in the numeric
# array passed to CustomAttrAdder (same order as the columns of num_data).
acc_ix, hpower_ix, cyl_ix = 4, 2, 0

class CustomAttrAdder(BaseEstimator, TransformerMixin):
    """Append ratio features to a 2-D numeric array.

    Always appends ``acc_on_cyl`` (Acceleration / Cylinders). When
    ``acc_on_power`` is true, ``acc_on_power`` (Acceleration / Horsepower)
    is appended as well, placed before ``acc_on_cyl``.

    The source columns are located through the module-level positions
    ``acc_ix``, ``hpower_ix`` and ``cyl_ix``.

    Parameters
    ----------
    acc_on_power : bool, default True
        Whether to add the Acceleration / Horsepower column.
    """

    def __init__(self, acc_on_power=True):
        self.acc_on_power = acc_on_power  # optional extra feature switch

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio column(s) appended on the right.

        ``X`` may be a NumPy array or anything ``np.asarray`` can convert
        (for example a DataFrame); the result is always a 2-D NumPy array.
        """
        # Coerce first so positional slicing also works for DataFrame input;
        # an ndarray passes through unchanged.
        X = np.asarray(X)
        acc_on_cyl = X[:, acc_ix] / X[:, cyl_ix]  # always added
        if self.acc_on_power:
            acc_on_power = X[:, acc_ix] / X[:, hpower_ix]
            return np.c_[X, acc_on_power, acc_on_cyl]

        return np.c_[X, acc_on_cyl]
    
# Try the transformer on its own, feeding it the imputed numeric frame as an array.
attr_adder = CustomAttrAdder(acc_on_power=True)
data_tr_extra_attrs = attr_adder.transform(data_tr.to_numpy())
data_tr_extra_attrs[0]  # first row: the original columns, then acc_on_power and acc_on_cyl


array([4.0000000e+00, 8.3000000e+01, 6.1000000e+01, 2.0030000e+03,
       1.9000000e+01, 7.4000000e+01, 3.1147541e-01, 4.7500000e+00])

In [None]:
# Use the Pipeline class to automate all the preprocessing steps for the numerical features
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Keep only the int64/float64 columns of num_data.
numerics = {'int64', 'float64'}
num_data = num_data.select_dtypes(include=numerics)

# Pipeline for the numerical data:
# impute missing values (median) -> add the custom ratio columns -> standardise
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('atters_adder', CustomAttrAdder()),
    ('std-scaler', StandardScaler()),
])

num_data_tr = num_pipeline.fit_transform(num_data)
num_data_tr[0]

array([-0.85657842, -1.07804475, -1.15192977, -1.17220298,  1.21586943,
       -0.54436373,  1.70952741,  1.29565517])

In [None]:
##Transform different columns or subsets using ColumnTransformer
from sklearn.compose import ColumnTransformer

# Column names routed to each branch of the transformer.
num_attrs = num_data.columns.tolist()
cat_attrs = ["Origin"]

# Complete preprocessing: the numeric pipeline for the numerical attributes
# and one-hot encoding for the categorical attribute.
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attrs),
    ("cat", OneHotEncoder(), cat_attrs),
])

# `data` still holds the Origin column (relabelled in place earlier),
# whereas data_tr was rebuilt from the numeric columns only.
prepared_data = full_pipeline.fit_transform(data)
prepared_data[0]

array([-0.85657842, -1.07804475, -1.15192977, -1.17220298,  1.21586943,
       -0.54436373,  1.70952741,  1.29565517,  1.        ,  0.        ,
        0.        ])