# Selection and Training
## Training Linear Regression, Decision Tree, RandomForest, SVM Regressor

In [100]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [101]:
#reading the data file

cols = ["MPG", "Cylinders", "Displacement", "Horsepower", "Weight", "Acceleration",
           "Model Year", "Origin"]

df = pd.read_csv('./auto-mpg.data', names=cols, na_values="?",
                comment='\t',
                sep=" ",
                skipinitialspace=True)
data = df.copy()

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(data, data["Cylinders"]):
    strat_train_set = data.loc[train_index]
    strat_test_set = data.loc[test_index]

In [102]:
strat_train_set.head()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
145,32.0,4,83.0,61.0,2003.0,19.0,74,3
151,31.0,4,79.0,67.0,2000.0,16.0,74,2
388,26.0,4,156.0,92.0,2585.0,14.5,82,1
48,18.0,6,250.0,88.0,3139.0,14.5,71,1
114,26.0,4,98.0,90.0,2265.0,15.5,73,2


In [103]:
#data is set equal to the training while dropping the mpg column
#setting axis=1 drops the column
data = strat_train_set.drop("MPG", axis=1)
#data_labels is the MPG column data
data_labels = strat_train_set["MPG"].copy()
data

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
145,4,83.0,61.0,2003.0,19.0,74,3
151,4,79.0,67.0,2000.0,16.0,74,2
388,4,156.0,92.0,2585.0,14.5,82,1
48,6,250.0,88.0,3139.0,14.5,71,1
114,4,98.0,90.0,2265.0,15.5,73,2
...,...,...,...,...,...,...,...
147,4,90.0,75.0,2108.0,15.5,74,2
156,8,400.0,170.0,4668.0,11.5,75,1
395,4,135.0,84.0,2295.0,11.6,82,1
14,4,113.0,95.0,2372.0,15.0,70,3


In [104]:
#preprocess Origin column data
def preprocess_origin_cols(df):
    df["Origin"] = df["Origin"].map({1: "India", 2: "USA", 3: "Germany"})
    return df

In [105]:
#custom class to add attributes to data
from sklearn.base import BaseEstimator, TransformerMixin

acc_ix, hpower_ix, cyl_ix = 4, 2, 0

#attributes were found in previous steps to see which new attributes contributed to our target variable the most
#create custom class to add these custom attributes to our data set

class CustomAttrAdder(BaseEstimator, TransformerMixin):
    #first function, init
    def __init__(self, acc_on_power=True):
        self.acc_on_power = acc_on_power
    #second function, just returns self?
    def fit(self, X, y=None):
        return self
    #transforms data
    def transform(self, X):
        #acceleration/cylinders, takes in Data(aka X) and takes all rows and accerlation column(4)
        #does the same for cylinder column(2)
        acc_on_cyl = X[:, acc_ix] / X[:, cyl_ix]
        #if it is initiated then put data(X) and append acc_on_power and acc_on_cyl column
        if self.acc_on_power:
            acc_on_power = X[:, acc_ix] / X[:, hpower_ix]
            return np.c_[X, acc_on_power, acc_on_cyl]
        return np.c_[X, acc_on_cyl]

In [106]:
from sklearn.compose import ColumnTransformer

#class for num pipeline: imputes null data, adds attributes, and scales data
def num_pipeline_transformer(data):
    numerics = ['float64', 'int64']

    num_attrs = data.select_dtypes(include=numerics)

    num_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy="median")),
            ('attrs_adder', CustomAttrAdder()),
            ('std_scaler', StandardScaler()),
        ])
    return num_attrs, num_pipeline

#full pipeline for numerical and categorical data: concats above datafram with categorical data fram
def pipeline_transformer(data):
    cat_attrs = ["Origin"]
    num_attrs, num_pipeline = num_pipeline_transformer(data)
    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, list(num_attrs)),
        ("cat", OneHotEncoder(), cat_attrs),
        ])
    prepared_data = full_pipeline.fit_transform(data)
    return prepared_data

In [107]:
#returning raw processed data
preprocessed_df = preprocess_origin_cols(data)
preprocessed_df
prepared_data = pipeline_transformer(preprocessed_df)

In [108]:
prepared_data[0]

array([-0.85657842, -1.07804475, -1.15192977, -1.17220298,  1.21586943,
       -0.54436373,  1.70952741,  1.29565517,  1.        ,  0.        ,
        0.        ])

# Linear Regression Model

In [109]:
from sklearn.linear_model import LinearRegression

#used linear regression to plot onto prepared training data (prepared_data)
lin_reg = LinearRegression()
lin_reg.fit(prepared_data, data_labels)

#testing with sample data
#sample data gets 5 rows
sample_data = data.iloc[:5]
#data_labels is MPG strat_train_set data and uses the same rows, data is all data while data_labels is just MPG
sample_labels = data_labels.iloc[:5]

#gave pipeline some of data which is training set of strat_traininga_set, to prepare data
sample_data_prepared = pipeline_transformer(sample_data)

#prints out prediction based on our model and giving it sample training data
print("Prediction of samples: ",
     lin_reg.predict(sample_data_prepared))

Prediction of samples:  [29.08069379 27.78336755 26.08031176 12.70419279 22.23454159]


In [110]:
print("Actual Labels of samples: ", list(sample_labels))

Actual Labels of samples:  [32.0, 31.0, 26.0, 18.0, 26.0]


In [111]:
#Mean squared error of above to evavluate errors
from sklearn.metrics import mean_squared_error

mpg_predictions = lin_reg.predict(prepared_data)
#takes in 2 parameters, data_labels which is true values and mpg_predictions which is predicted values
lin_mse = mean_squared_error(data_labels, mpg_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

2.9590402225760872

# Decision Tree

In [112]:
from sklearn.tree import DecisionTreeRegressor

regr = DecisionTreeRegressor()

#training model with decision tree regressor
regr.fit(prepared_data,data_labels)

DecisionTreeRegressor()

In [113]:
#evaluate mean squared error

#use the trained model to make a prediction
mpg_regr_predictions = regr.predict(prepared_data)
regr_mse = mean_squared_error(data_labels, mpg_regr_predictions)
regr_rmse = np.sqrt(regr_mse)
regr_rmse

0.0

# Cross Validation

In [114]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(regr,
                        prepared_data,
                        data_labels,
                        scoring="neg_mean_squared_error",
                        cv=10)
tree_reg_rmse_scores = np.sqrt(-scores)

In [115]:
#scores will change everytime ran because it's using different subsets
tree_reg_rmse_scores

array([3.81547015, 3.04030221, 2.9734765 , 3.41453694, 2.48621198,
       3.30648038, 3.6604303 , 3.71874475, 4.11107866, 2.48985036])

In [116]:
tree_reg_rmse_scores.mean()

3.301658223036749

In [117]:
#checking linear regression model
lin_scores = cross_val_score(lin_reg, prepared_data, data_labels, scoring="neg_mean_squared_error", cv=10)
lin_reg_rmse_scores = np.sqrt(-lin_scores)
lin_reg_rmse_scores

array([3.43254597, 3.45157629, 3.6621715 , 2.59652976, 2.48023405,
       2.74798115, 3.32524647, 2.42208917, 3.78133275, 2.8573747 ])

In [118]:
lin_reg_rmse_scores.mean()

3.0757081793709324

# Random Forest Model

In [119]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor()
forest_reg.fit(prepared_data, data_labels)
forest_reg_cv_scores = cross_val_score(forest_reg,
                                      prepared_data,
                                      data_labels,
                                      scoring="neg_mean_squared_error",
                                      cv=10)
forest_reg_rmse_scores = np.sqrt(-forest_reg_cv_scores)
forest_reg_rmse_scores

array([2.12599115, 2.56258536, 2.68837237, 2.46716942, 1.97558972,
       2.47393344, 2.60080969, 2.62631363, 4.13851285, 1.93319894])

In [120]:
forest_reg_rmse_scores.mean()

2.5592476574703746

# Hyperparamter Tuning using GridSearchCV

In [136]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid,
                           scoring='neg_mean_squared_error',
                           return_train_score=True,
                           cv=10,
                          )

grid_search.fit(prepared_data, data_labels)

GridSearchCV(cv=10, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [137]:
grid_search.best_params_

{'max_features': 8, 'n_estimators': 30}

In [138]:
cv_scores = grid_search.cv_results_

#printing parameters and their scores
for mean_score, params in zip(cv_scores['mean_test_score'], cv_scores
                             ["params"]):
        print(np.sqrt(-mean_score), params)

3.26375532022892 {'max_features': 2, 'n_estimators': 3}
3.002171193486474 {'max_features': 2, 'n_estimators': 10}
2.953439005465557 {'max_features': 2, 'n_estimators': 30}
3.3060839160878923 {'max_features': 4, 'n_estimators': 3}
2.8467404482030814 {'max_features': 4, 'n_estimators': 10}
2.8278287796609702 {'max_features': 4, 'n_estimators': 30}
3.4277066425032543 {'max_features': 6, 'n_estimators': 3}
2.782139433714463 {'max_features': 6, 'n_estimators': 10}
2.7225762987488453 {'max_features': 6, 'n_estimators': 30}
3.1539725050678573 {'max_features': 8, 'n_estimators': 3}
2.795697154672723 {'max_features': 8, 'n_estimators': 10}
2.6747444351673644 {'max_features': 8, 'n_estimators': 30}
3.3106487861602987 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
2.780600422311222 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
3.28615003218815 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
2.867992780800852 {'bootstrap': False, 'max_features': 3, 'n_estimator

In [139]:
#feature importances

feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([0.17627315, 0.33052271, 0.13872785, 0.14230135, 0.01522654,
       0.1284148 , 0.02764167, 0.03474223, 0.00190369, 0.00276995,
       0.00147606])

In [140]:
extra_attrs = ["acc_on_power", "acc_on_cyl"]
numerics = ['float64', 'int64']
num_attrs = list(data.select_dtypes(include=numerics))

attrs = num_attrs + extra_attrs
sorted(zip(attrs, feature_importances), reverse=True)

[('acc_on_power', 0.02764166850943347),
 ('acc_on_cyl', 0.03474222532403147),
 ('Weight', 0.14230135333912927),
 ('Model Year', 0.12841480174734227),
 ('Horsepower', 0.13872785114188374),
 ('Displacement', 0.3305227057202297),
 ('Cylinders', 0.17627315425716772),
 ('Acceleration', 0.015226540751290062)]

# Evaluating the entire system

In [141]:
#capturing the best configuration that we've found
final_model = grid_search.best_estimator_

#segregating target variable
X_test = strat_test_set.drop("MPG", axis=1)
y_test = strat_test_set["MPG"].copy()

#preprocessing the test data origin column
X_test_preprocessed = preprocess_origin_cols(X_test)

#prepraing the data with final transformation
X_test_prepared = pipeline_transformer(X_test_preprocessed)

#final predictions
final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

In [142]:
final_rmse

3.0505447373288024

# Function for entire flow

In [143]:
def predict_mpg(config, model):
    if type(config) == dict:
        df = pd.DataFrame(config)
    else:
        df = config
    
    preproc_df = preprocess_origin_cols(df)
    prepared_df = pipeline_transformer(preproc_df)
    y_pred = model.predict(prepared_df)
    return y_pred

In [144]:
#random sample to check flow
vehicle_config = {
    'Cylinders': [4, 6, 8],
    'Displacement': [155.0, 160.0, 165.5],
    'Horsepower': [93.0, 130.0, 98.0],
    'Weight': [2500.0, 3150.0, 2600.0],
    'Acceleration': [15.0, 14.0, 16.0],
    'Model Year': [81, 80, 78],
    'Origin': [3, 2, 1]
}

predict_mpg(vehicle_config, final_model)

array([34.37666667, 16.93333333, 19.33666667])

# Save the Model

In [145]:
import pickle

In [146]:
#saving model
with open("model.bin", 'wb') as f_out:
    pickle.dump(final_model, f_out)
    f_out.close()

In [147]:
#loading model from file to ensure saved
with open('model.bin', 'rb') as f_in:
    model = pickle.load(f_in)
    
predict_mpg(vehicle_config, model)

array([34.37666667, 16.93333333, 19.33666667])