#### Aim:

To predict the MPG value for a vehicle, given that we have other attributes of that vehicle.

In [101]:
##importing a few general use case libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer



import warnings
warnings.filterwarnings('ignore')

In [102]:
# reading the .data file using pandas

cols = ['MPG','Cylinders','Displacement','Horsepower','Weight',
                'Acceleration', 'Model Year', 'Origin']

df = pd.read_csv('./data folder/auto-mpg.data', names=cols, na_values = "?",
                comment = '\t',
                sep= " ",
                skipinitialspace=True)

data = df.copy()

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(data, data["Cylinders"]):
    strat_train_set = data.loc[train_index]
    strat_test_set = data.loc[test_index]

Seperating the features from the target variable

In [103]:
data = strat_train_set.drop('MPG', axis=1)
data_labels = strat_train_set['MPG'].copy()

data.head()


Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
145,4,83.0,61.0,2003.0,19.0,74,3
151,4,79.0,67.0,2000.0,16.0,74,2
388,4,156.0,92.0,2585.0,14.5,82,1
48,6,250.0,88.0,3139.0,14.5,71,1
114,4,98.0,90.0,2265.0,15.5,73,2


Separating the origin column in the data set

In [104]:
def preprocess_origin_cols(df):
    df['Origin'] = df['Origin'].map({1: "India", 2: "USA", 3: "Germany"})
    return df

Creating a custom attribute adder class

In [105]:
acc_ix, hpower_ix, cyl_ix = 4, 2, 0

class CustomAttrAdder(BaseEstimator, TransformerMixin):
    def __init__(self, acc_on_power=True):
        self.acc_on_power = acc_on_power
        
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        acc_on_cyl = X[:, acc_ix] / X[:, cyl_ix]
        if self.acc_on_power:
            acc_on_power = X[:, acc_ix] / X[:, hpower_ix]
            return np.c_[X, acc_on_cyl, acc_on_cyl]
        return np.c_[X, acc_on_cyl]

In [106]:
def num_pipeline_transformer(data):
    '''
    Function to process numerical transformations
    Argument:
        data: original dataframe 
    Returns:
        num_attrs: numerical dataframe
        num_pipeline: numerical pipeline object
        
    '''
    numerics = ['float64', 'int64']
    num_attrs = data.select_dtypes(include=numerics)

    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('attrs_adder', CustomAttrAdder()),
        ('std_scaler', StandardScaler()),
    ])
    return num_attrs, num_pipeline

    

In [107]:
def pipeline_transformer(data):
    '''
    Complete transformation pipeline for both
    nuerical and categorical data.
    
    Argument:
        data: original dataframe 
    Returns:
        prepared_data: transformed data, ready to use
    '''
    cat_attrs = ['Origin']
    num_attrs, num_pipeline = num_pipeline_transformer(data)

    full_pipeline = ColumnTransformer([
        ('num', num_pipeline, list(num_attrs)),
        ('cat', OneHotEncoder(), cat_attrs),
    ])
    prepared_data = full_pipeline.fit_transform(data)
    return prepared_data

Going from raw data to processed data in two steps

In [108]:
##from raw data to processed data in 2 steps
preprocessed_df = preprocess_origin_cols(data)
preprocessed_df.head()
prepared_data = pipeline_transformer(preprocessed_df)
prepared_data

array([[-0.85657842, -1.07804475, -1.15192977, ...,  1.        ,
         0.        ,  0.        ],
       [-0.85657842, -1.1174582 , -0.9900351 , ...,  0.        ,
         0.        ,  1.        ],
       [-0.85657842, -0.3587492 , -0.31547399, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.85657842, -0.56566984, -0.53133355, ...,  0.        ,
         1.        ,  0.        ],
       [-0.85657842, -0.78244384, -0.23452666, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.32260746, -0.45728283,  0.44003446, ...,  1.        ,
         0.        ,  0.        ]])

In [109]:
prepared_data[0]

array([-0.85657842, -1.07804475, -1.15192977, -1.17220298,  1.21586943,
       -0.54436373,  1.29565517,  1.29565517,  1.        ,  0.        ,
        0.        ])

Selecting and Training Models
1. Linear Regression
2. Decision Tree
3. Random Forest
4. SVM regressor

In [110]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(prepared_data, data_labels)

In [111]:
sample_data = data.iloc[:5]
sample_labels = data_labels[:5]

sample_data_prepared = pipeline_transformer(sample_data)

print('Prediction of sameples: ', lin_reg.predict(sample_data_prepared))

Prediction of sameples:  [30.46788598 28.65142911 24.86599749 12.94573202 22.44718936]


In [112]:
print("Actual Labels of samples: ", list(sample_labels))

Actual Labels of samples:  [32.0, 31.0, 26.0, 18.0, 26.0]


Mean Square Error

In [116]:
from sklearn.metrics import mean_squared_error

def mse(prepared_data, data_labels):
    mpg_predictions = lin_reg.predict(prepared_data)
    lin_mse = mean_squared_error(data_labels, mpg_predictions)
    print('MSE: ',lin_mse)
    lin_rmse = np.sqrt(lin_mse)
    return 'RMSE: ', lin_rmse

mse(prepared_data, data_labels)
    

MSE:  10.085389327691761


('RMSE: ', 3.1757501991957366)

Decision Tree

In [117]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(prepared_data, data_labels)


In [118]:
mpg_predictions = tree_reg.predict(prepared_data)
tree_mse = mean_squared_error(data_labels, mpg_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

But no model is perfect, this means that our model has overfit the data to a great extent.

We won't be touching out test data until we finalize our model. So, how do we check for what's happening?

#### Model Evaluation using Cross Validation

Scikit-Learn’s K-fold cross-validation feature randomly splits the training set into K distinct subsets called folds, then it trains and evaluates the model K times, picking a different fold for evaluation every time and training on the other K-1 folds.

The result is an array containing the K evaluation scores:

In [119]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg, prepared_data, data_labels, scoring='neg_mean_squared_error', cv=10)

tree_reg_rmse_scores = np.sqrt(-scores)


In [120]:
tree_reg_rmse_scores

array([2.9144575 , 3.22877686, 3.01936459, 2.90172363, 2.44342229,
       3.25341167, 3.31653057, 2.88898771, 4.4370144 , 2.48550637])

In [121]:
tree_reg_rmse_scores.mean()

3.0889195580969244

In [122]:
scores = cross_val_score(lin_reg, prepared_data, data_labels, scoring='neg_mean_squared_error', cv=10)
lin_reg_rmse_scores = np.sqrt(-scores)


In [123]:
lin_reg_rmse_scores

array([2.75806203, 3.68053303, 3.86343747, 2.70168768, 2.77754965,
       3.28024255, 3.65459843, 2.61688796, 4.0913401 , 3.19425607])

In [124]:
lin_reg_rmse_scores.mean()

3.2618594963723018

Random Forest Model

In [125]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor()
forest_reg.fit(prepared_data, data_labels)
forest_reg_cv_scores = cross_val_score(forest_reg,
                                         prepared_data,
                                         data_labels,
                                         scoring='neg_mean_squared_error',
                                         cv = 10)

forest_reg_rmse_scores = np.sqrt(-forest_reg_cv_scores)
forest_reg_rmse_scores.mean()

2.547129547707004

Support Vector Machine Regressor

In [126]:
from sklearn.svm import SVR 

svm_reg = SVR(kernel='linear')
svm_reg.fit(prepared_data, data_labels)

In [127]:
svm_cv_scores = cross_val_score(svm_reg, prepared_data, data_labels,
                                scoring='neg_mean_squared_error',
                                cv = 10)
svm_rmse_scores = np.sqrt(-svm_cv_scores)
svm_rmse_scores.mean()

3.359030205555343

Hyperparameter Tuning using GridSearchCV


In [128]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid, scoring='neg_mean_squared_error', return_train_score=True, cv=10)

grid_search.fit(prepared_data, data_labels)

In [129]:
grid_search.best_params_

{'max_features': 6, 'n_estimators': 30}

In [130]:
cv_scores = grid_search.cv_results_

# printing all parameters along with their scores

for mean_score, params in zip(cv_scores['mean_test_score'], cv_scores['params']):
    print(np.sqrt(-mean_score), params)

3.6440350096333725 {'max_features': 2, 'n_estimators': 3}
3.101514013088946 {'max_features': 2, 'n_estimators': 10}
3.0023189504816354 {'max_features': 2, 'n_estimators': 30}
3.3228545642560303 {'max_features': 4, 'n_estimators': 3}
2.8397857799858213 {'max_features': 4, 'n_estimators': 10}
2.7890928521962195 {'max_features': 4, 'n_estimators': 30}
3.063932807976068 {'max_features': 6, 'n_estimators': 3}
2.855334998494753 {'max_features': 6, 'n_estimators': 10}
2.6681910507725903 {'max_features': 6, 'n_estimators': 30}
3.014271070279753 {'max_features': 8, 'n_estimators': 3}
2.8427966037411005 {'max_features': 8, 'n_estimators': 10}
2.6895029757783186 {'max_features': 8, 'n_estimators': 30}
3.611281233930367 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
3.0270421355213695 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
3.279220412588473 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
2.8472762061933574 {'bootstrap': False, 'max_features': 3, 'n_estim

In [131]:
#### Checking feature importance

In [132]:
feature_importances = grid_search.best_estimator_.feature_importances_

feature_importances

array([0.2514844 , 0.22312775, 0.11961259, 0.17483422, 0.01903889,
       0.12086635, 0.04877422, 0.03459638, 0.00336399, 0.00283819,
       0.00146301])

In [133]:
extra_attrs = ['acc_on_power', 'acc_on_cyl']
numerics = ['float64', 'int64']
num_attrs = list(data.select_dtypes(include=numerics))

attrs = num_attrs + extra_attrs

sorted(zip(attrs, feature_importances), reverse=True)

[('acc_on_power', 0.04877422160046643),
 ('acc_on_cyl', 0.03459637920609204),
 ('Weight', 0.17483421920584308),
 ('Model Year', 0.12086635278482874),
 ('Horsepower', 0.11961258778084907),
 ('Displacement', 0.22312775283131253),
 ('Cylinders', 0.2514843985832042),
 ('Acceleration', 0.019038894529865926)]

#### Evaluating the entire system on test data

In [134]:
final_model = grid_search.best_estimator_

X_test = strat_test_set.drop('MPG', axis=1)
y_test = strat_test_set['MPG'].copy()

X_test_processed = preprocess_origin_cols(X_test)
X_test_prepared = pipeline_transformer(X_test_processed)

final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

In [135]:
final_mse

9.494764444444451

In [136]:
final_rmse

3.0813575651722815

Creating a function to cover the entire flow


In [140]:
def predict_mpg(config, model):
    if type(config) == dict:
        df = pd.DataFrame(config)
    else:
        df = config
    preproc_df = preprocess_origin_cols(df)
    prepared_df = pipeline_transformer(preproc_df)
    y_pred = model.predict(prepared_df)
    return y_pred


In [141]:
##checking it on a random sample
vehicle_config = {
    'Cylinders': [4, 6, 8],
    'Displacement': [155.0, 160.0, 165.5],
    'Horsepower': [93.0, 130.0, 98.0],
    'Weight': [2500.0, 3150.0, 2600.0],
    'Acceleration': [15.0, 14.0, 16.0],
    'Model Year': [81, 80, 78],
    'Origin': [3, 2, 1]
}

predict_mpg(vehicle_config, final_model)

array([35.28666667, 18.99333333, 22.08      ])

Saving the model

In [142]:
import pickle

In [144]:
# saving the model
with open('model.bin', 'wb') as f_out:
    pickle.dump(final_model, f_out)
    f_out.close()

In [145]:
# Loading the model from the saved file

with open('model.bin', 'rb') as f_in:
    model = pickle.load(f_in)
    

In [146]:
predict_mpg(vehicle_config, model)

array([35.28666667, 18.99333333, 22.08      ])