In [328]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedShuffleSplit

In [329]:
# Column names for the header-less data file.
cols = [
    'MPG', 'Cylinders', 'Displacement', 'Horsepower',
    'Weight', 'Acceleration', 'Model Year', 'Origin',
]
# Read the space-separated file: "?" is parsed as NaN, and everything after a
# tab on a line is treated as a comment and ignored.
df = pd.read_csv(
    './auto-mpg.data',
    names=cols,
    na_values="?",
    comment='\t',
    sep=" ",
    skipinitialspace=True,
)
# Keep the raw frame intact and work on a copy.
data = df.copy()
data.head()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1


In [330]:
# One stratified split with 20% of the rows held out for testing, stratified on
# the Cylinders column; the fixed random_state makes the split reproducible.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# split() yields *positional* row indices, so rows are selected with .iloc
# rather than the label-based .loc (the two only coincide while the frame
# still has its default 0..n-1 index).
for train_index, test_index in split.split(data, data["Cylinders"]):
    strat_train_set = data.iloc[train_index]
    strat_test_set = data.iloc[test_index]

In [331]:
# Re-display the first rows; the split above only reads `data`, it does not modify it.
data.head()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1


In [332]:
# Target vector: the MPG column of the training set, copied so it is an
# independent Series.
data_labels = strat_train_set["MPG"].copy()
# Feature frame: the training set with the MPG target column removed.
data = strat_train_set.drop(columns="MPG")
data

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
145,4,83.0,61.0,2003.0,19.0,74,3
151,4,79.0,67.0,2000.0,16.0,74,2
388,4,156.0,92.0,2585.0,14.5,82,1
48,6,250.0,88.0,3139.0,14.5,71,1
114,4,98.0,90.0,2265.0,15.5,73,2
...,...,...,...,...,...,...,...
147,4,90.0,75.0,2108.0,15.5,74,2
156,8,400.0,170.0,4668.0,11.5,75,1
395,4,135.0,84.0,2295.0,11.6,82,1
14,4,113.0,95.0,2372.0,15.0,70,3


# Preprocessing the Origin Column

In [333]:
def preprocess_origin_cols(df):
    """Replace the numeric codes in the "Origin" column with country names.

    The frame is modified in place and the same object is returned, so the
    caller's DataFrame is updated as well.

    Values that are not one of the known codes (1, 2, 3) are left unchanged.
    This makes the function idempotent: running it again on an
    already-converted frame is a no-op instead of turning every entry into NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an "Origin" column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with "Origin" converted.
    """
    origin_names = {1: "India", 2: "USA", 3: "Germany"}
    # dict.get with the value itself as fallback keeps non-code entries as they are.
    df["Origin"] = df["Origin"].map(lambda code: origin_names.get(code, code))
    return df
# preprocess_origin_cols modifies its argument and returns it, so after this
# call data_tr and data refer to the same DataFrame.
data_tr = preprocess_origin_cols(data)
data_tr.head()

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
145,4,83.0,61.0,2003.0,19.0,74,Germany
151,4,79.0,67.0,2000.0,16.0,74,USA
388,4,156.0,92.0,2585.0,14.5,82,India
48,6,250.0,88.0,3139.0,14.5,71,India
114,4,98.0,90.0,2265.0,15.5,73,USA


# One Hot Encoding the Origin Column

In [334]:
# Horsepower has only 314 non-null values out of 318 rows, so its missing
# entries have to be handled before the data can be used.
data_tr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 318 entries, 145 to 362
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Cylinders     318 non-null    int64  
 1   Displacement  318 non-null    float64
 2   Horsepower    314 non-null    float64
 3   Weight        318 non-null    float64
 4   Acceleration  318 non-null    float64
 5   Model Year    318 non-null    int64  
 6   Origin        318 non-null    object 
dtypes: float64(4), int64(2), object(1)
memory usage: 19.9+ KB


In [335]:
# Isolate the Origin column. The double brackets return a one-column DataFrame
# (2-D) rather than a Series.
data_cat = data_tr[["Origin"]]
data_cat.head()

Unnamed: 0,Origin
145,Germany
151,USA
388,India
48,India
114,USA


In [336]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the Origin column; fit_transform returns a sparse matrix
# with one column per category.
cat_encoder = OneHotEncoder()
data_cat_1hot = cat_encoder.fit_transform(data_cat)
data_cat_1hot

<318x3 sparse matrix of type '<class 'numpy.float64'>'
	with 318 stored elements in Compressed Sparse Row format>

In [337]:
# Show the first five encoded rows as a dense array (slice first, then densify).
data_cat_1hot[:5].toarray()

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [338]:
# Category labels learned by the encoder, in the order of the one-hot columns.
cat_encoder.categories_

[array(['Germany', 'India', 'USA'], dtype=object)]

# Handling Missing Values Using SimpleImputer

In [339]:
# Numerical features only: drop the categorical Origin column by name, so the
# selection does not depend on Origin happening to be the last column.
num_data = data.drop(columns=["Origin"])
num_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 318 entries, 145 to 362
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Cylinders     318 non-null    int64  
 1   Displacement  318 non-null    float64
 2   Horsepower    314 non-null    float64
 3   Weight        318 non-null    float64
 4   Acceleration  318 non-null    float64
 5   Model Year    318 non-null    int64  
dtypes: float64(4), int64(2)
memory usage: 17.4 KB


In [340]:
from sklearn.impute import SimpleImputer
# Median strategy: each missing value is replaced by the median of its column,
# so rows with gaps can still be used.
# fit() only learns the per-column medians; the values are filled in later by transform().
imputer = SimpleImputer(strategy="median")
imputer.fit(num_data)

SimpleImputer(strategy='median')

In [341]:
# Per-column medians learned by the imputer.
imputer.statistics_

array([   4. ,  146. ,   92. , 2844. ,   15.5,   76. ])

In [342]:
# Cross-check against pandas: take the medians of the same numeric columns the
# imputer was fitted on. (`data` also holds the string-valued Origin column,
# which DataFrame.median() can no longer silently skip on pandas >= 2.0.)
num_data.median().values

array([   4. ,  146. ,   92. , 2844. ,   15.5,   76. ])

In [343]:
# Fill the missing values in num_data using the medians learned by the imputer.
X = imputer.transform(num_data)
# The result is a 2-D NumPy array, not a DataFrame.
X

array([[   4. ,   83. ,   61. , 2003. ,   19. ,   74. ],
       [   4. ,   79. ,   67. , 2000. ,   16. ,   74. ],
       [   4. ,  156. ,   92. , 2585. ,   14.5,   82. ],
       ...,
       [   4. ,  135. ,   84. , 2295. ,   11.6,   82. ],
       [   4. ,  113. ,   95. , 2372. ,   15. ,   70. ],
       [   6. ,  146. ,  120. , 2930. ,   13.8,   81. ]])

In [344]:
# Wrap the imputed 2-D array back into a DataFrame, reusing the column labels
# and the row index of num_data so rows stay aligned with the original data.
data_tr = pd.DataFrame(data=X, index=num_data.index, columns=num_data.columns)
data_tr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 318 entries, 145 to 362
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Cylinders     318 non-null    float64
 1   Displacement  318 non-null    float64
 2   Horsepower    318 non-null    float64
 3   Weight        318 non-null    float64
 4   Acceleration  318 non-null    float64
 5   Model Year    318 non-null    float64
dtypes: float64(6)
memory usage: 17.4 KB


# Adding Attributes using BaseEstimator and TransformerMixin

In [356]:
# Check the data we are working with.
# NOTE(review): the saved output shows Cylinders and Model Year as floats, whereas
# the earlier num_data.info() reports them as int64 — this cell was presumably
# executed after num_data was reassigned further down; confirm with Restart & Run All.
num_data.head()

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year
145,4.0,83.0,61.0,2003.0,19.0,74.0
151,4.0,79.0,67.0,2000.0,16.0,74.0
388,4.0,156.0,92.0,2585.0,14.5,82.0
48,6.0,250.0,88.0,3139.0,14.5,71.0
114,4.0,98.0,90.0,2265.0,15.5,73.0


In [359]:
from sklearn.base import BaseEstimator, TransformerMixin

# Column positions of Acceleration, Horsepower and Cylinders in the numeric array.
acc_ix, hpower_ix, cyl_ix = 4, 2, 0

# The derived attributes below were identified in previous steps as contributing
# the most to the target variable; this custom transformer appends them to the data.

class CustomAttrAdder(BaseEstimator, TransformerMixin):
    """Append ratio attributes to a 2-D numeric array.

    Always appends acceleration / cylinders. When ``acc_on_power`` is true,
    acceleration / horsepower is appended as well, placed before the
    acceleration / cylinders column.

    Parameters
    ----------
    acc_on_power : bool, default True
        Whether to also add the acceleration / horsepower column.
    """

    def __init__(self, acc_on_power=True):
        self.acc_on_power = acc_on_power

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` with the extra ratio column(s) appended on the right.

        ``X`` is indexed as ``X[:, i]``, so it must be a 2-D array whose columns
        sit at the positions given by ``acc_ix``, ``hpower_ix`` and ``cyl_ix``.
        """
        # Acceleration (column 4) divided by Cylinders (column 0).
        acc_on_cyl = X[:, acc_ix] / X[:, cyl_ix]
        if self.acc_on_power:
            # Acceleration (column 4) divided by Horsepower (column 2).
            acc_on_power = X[:, acc_ix] / X[:, hpower_ix]
            return np.c_[X, acc_on_power, acc_on_cyl]
        # Previously this path fell through and returned None.
        return np.c_[X, acc_on_cyl]

# Instantiate the transformer with the acceleration / horsepower ratio enabled.
# (The variable was previously created as `attr_addr` but used as `attr_adder`,
# which raises NameError on a fresh kernel.)
attr_adder = CustomAttrAdder(acc_on_power=True)
# transform() indexes a NumPy array, hence .values; the new attribute columns
# are appended on the right.
data_tr_extra_attrs = attr_adder.transform(data_tr.values)
data_tr_extra_attrs[0]

array([4.0000000e+00, 8.3000000e+01, 6.1000000e+01, 2.0030000e+03,
       1.9000000e+01, 7.4000000e+01, 3.1147541e-01, 4.7500000e+00])

# Create Pipeline of tasks

In [360]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
numerics = ['float64', 'int64']

# Keep only the float64 / int64 columns of data_tr.
num_data = data_tr.select_dtypes(include=numerics)

# Pipeline for the numerical attributes, applied in order:
#   1. imputer     - fill missing values with the column median
#   2. attrs_adder - append the derived ratio attributes (CustomAttrAdder)
#   3. std_scaler  - standardise every column
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('attrs_adder', CustomAttrAdder()),
    ('std_scaler', StandardScaler()),
])

num_data_tr = num_pipeline.fit_transform(num_data)
num_data_tr[0]

array([-0.85657842, -1.07804475, -1.15192977, -1.17220298,  1.21586943,
       -0.54436373,  1.70952741,  1.29565517])

# Transforming Numerical and Categorical Attributes

In [362]:
# Apply different transformers to different column subsets with ColumnTransformer.
from sklearn.compose import ColumnTransformer

cat_attrs = ["Origin"]
num_attrs = num_data.columns.tolist()

print(num_attrs)

# Complete pipeline: the numerical columns go through num_pipeline, the
# categorical Origin column is one-hot encoded, and the results are
# concatenated column-wise.
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attrs),
    ("cat", OneHotEncoder(), cat_attrs),
])

prepared_data = full_pipeline.fit_transform(data)
prepared_data[0]

['Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration', 'Model Year']


array([-0.85657842, -1.07804475, -1.15192977, -1.17220298,  1.21586943,
       -0.54436373,  1.70952741,  1.29565517,  1.        ,  0.        ,
        0.        ])