In [None]:
##importing a few general use case libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.metrics import r2_score



import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the auto-mpg data file into a DataFrame with pandas.
# Column names are supplied explicitly through `names`.
cols = [
    'MPG',
    'Cylinders',
    'Displacement',
    'Horsepower',
    'Weight',
    'Acceleration',
    'Model Year',
    'Origin',
]

df = pd.read_csv(
    '/content/drive/MyDrive/Project/auto-mpg.data',
    names=cols,
    sep=" ",
    skipinitialspace=True,  # skip the extra spaces that follow a delimiter
    na_values="?",          # "?" entries are read as missing values
    comment='\t',           # everything after a tab on a line is ignored
                            # NOTE(review): presumably this drops a trailing
                            # tab-separated text field - confirm against the file
)

# Work on a copy so the frame as loaded stays available in `df`.
data = df.copy()

In [None]:

# Single stratified 80/20 split, stratified on the "Cylinders" column and
# seeded so the split is reproducible.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair.
train_index, test_index = next(split.split(data, data["Cylinders"]))

# split() yields positional row indices, so select with .iloc. The original
# label-based .loc only gave the same rows because `data` carries the default
# 0..n-1 index.
strat_train_set = data.iloc[train_index]
strat_test_set = data.iloc[test_index]

In [None]:
# Separate the training split into predictors (`data`) and target (`data_labels`).
# drop() returns a new frame, so strat_train_set keeps its MPG column.
data = strat_train_set.drop(columns="MPG")
data_labels = strat_train_set["MPG"].copy()

In [None]:
def preprocess_origin_cols(df):
    """Replace the numeric codes in the "Origin" column with text labels.

    The column is rewritten in place on ``df`` and the same object is
    returned; other cells in this notebook rely on that in-place update.

    The function is idempotent: values that are already one of the labels are
    left as they are, so re-running the cell (or passing an already
    preprocessed frame) no longer turns the whole column into NaN. Any other
    value that is not a known code becomes NaN, as before.

    NOTE(review): the labels are kept exactly as in the original mapping;
    confirm against the dataset documentation that codes 1/2/3 really stand
    for these countries.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an "Origin" column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with "Origin" mapped to labels.
    """
    origin_labels = {1: "India", 2: "USA", 3: "Germany"}
    already_mapped = set(origin_labels.values())

    def to_label(value):
        # Pass labels through untouched so a second call is a no-op.
        if value in already_mapped:
            return value
        return origin_labels.get(value, np.nan)

    df["Origin"] = df["Origin"].map(to_label)
    return df

In [None]:
# Column positions read by CustomAttrAdder.transform.
# NOTE(review): these assume the numeric columns arrive in the order
# Cylinders, Displacement, Horsepower, Weight, Acceleration, Model Year
# (the order of `cols` without MPG and Origin) - confirm if columns change.
acc_ix, hpower_ix, cyl_ix = 4, 2, 0

##custom class inheriting the BaseEstimator and TransformerMixin
class CustomAttrAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio features to a 2D numeric array.

    Always appends column ``acc_ix`` divided by column ``cyl_ix``. When
    ``acc_on_power`` is true it also appends column ``acc_ix`` divided by
    column ``hpower_ix``, placed before the first ratio.

    Parameters
    ----------
    acc_on_power : bool, default=True
        Whether to add the acceleration-over-horsepower ratio column.
    """

    def __init__(self, acc_on_power=True):
        self.acc_on_power = acc_on_power  # new optional variable

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio column(s) appended on the right.

        ``X`` may be a NumPy array or anything ``np.asarray`` can convert
        (for example a DataFrame); the result is always a 2D NumPy array.
        """
        # Positional slicing below needs an ndarray; this is a no-op when X
        # already is one and lets DataFrame input work too.
        X = np.asarray(X)
        acc_on_cyl = X[:, acc_ix] / X[:, cyl_ix]  # required new variable
        if self.acc_on_power:
            acc_on_power = X[:, acc_ix] / X[:, hpower_ix]
            return np.c_[X, acc_on_power, acc_on_cyl]  # returns a 2D array

        return np.c_[X, acc_on_cyl]

In [None]:
def num_pipeline_transformer(df):
  """Select the numeric columns of ``df`` and build their preprocessing pipeline.

  The numeric columns are now taken from the ``df`` argument. The original
  code read the global ``data`` instead, so the result silently depended on
  notebook state rather than on the frame passed in.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame whose int64/float64 columns should be treated as numeric features.

  Returns
  -------
  num_data : pandas.DataFrame
      The int64/float64 columns of ``df``.
  num_pipeline : sklearn.pipeline.Pipeline
      Unfitted pipeline: median imputation -> CustomAttrAdder -> StandardScaler.
  """
  numerics = ['int64', 'float64']
  num_data = df.select_dtypes(include=numerics)
  # pipeline for numerical data:
  # impute (missing values) -> CustomAttrAdder (adds the custom columns) -> StandardScaler
  num_pipeline = Pipeline([
      ('imputer', SimpleImputer(strategy="median")),
      ('atters_adder', CustomAttrAdder()),
      ('std-scaler', StandardScaler()),
  ])
  return num_data, num_pipeline

In [None]:
def pipeline_transformer(df, pipeline=None, return_pipeline=False):
  """Turn a preprocessed frame into the numeric matrix the models consume.

  With the default arguments this behaves as before: a new ColumnTransformer
  (numeric pipeline + one-hot encoding of "Origin") is built, fitted on
  ``df`` and applied to it.

  Fitting on whatever frame is passed means that test or inference data gets
  its own imputer/scaler/encoder statistics. To reuse the transformers fitted
  on the training data, obtain them once with ``return_pipeline=True`` and
  pass them back through ``pipeline`` for every later frame.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame with the numeric feature columns and a categorical "Origin" column.
  pipeline : fitted transformer, optional
      Already-fitted transformer (as returned with ``return_pipeline=True``).
      When given, only ``pipeline.transform(df)`` is applied.
  return_pipeline : bool, default=False
      When true, return ``(prepared_data, pipeline)`` instead of only the data.

  Returns
  -------
  prepared_data, or ``(prepared_data, pipeline)`` when ``return_pipeline`` is true.
  """
  if pipeline is None:
    cat_attrs = ["Origin"]
    num_attrs, num_pipeline = num_pipeline_transformer(df)
    ##complete pipeline to transform
    ##both numerical and cat. attributes
    pipeline = ColumnTransformer([
        ("num", num_pipeline, list(num_attrs)),
        ("cat", OneHotEncoder(), cat_attrs),
    ])
    prepared_data = pipeline.fit_transform(df)
  else:
    # Reuse the supplied fit; nothing is learned from `df`.
    prepared_data = pipeline.transform(df)

  if return_pipeline:
    return prepared_data, pipeline
  return prepared_data

In [None]:
##from raw data to processed data in 2 steps
# Step 1: map the Origin codes to labels. preprocess_origin_cols assigns into
# the frame it receives and returns it, so `data` itself is modified here and
# `preprocessed_df` is the same object.
preprocessed_df = preprocess_origin_cols(data)
# Step 2: fit the column transformer on the training features and transform them.
prepared_data = pipeline_transformer(preprocessed_df)
prepared_data

array([[-0.85657842, -1.07804475, -1.15192977, ...,  1.        ,
         0.        ,  0.        ],
       [-0.85657842, -1.1174582 , -0.9900351 , ...,  0.        ,
         0.        ,  1.        ],
       [-0.85657842, -0.3587492 , -0.31547399, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.85657842, -0.56566984, -0.53133355, ...,  0.        ,
         1.        ,  0.        ],
       [-0.85657842, -0.78244384, -0.23452666, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.32260746, -0.45728283,  0.44003446, ...,  1.        ,
         0.        ,  0.        ]])

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the prepared training features against the MPG labels.
linear_reg=LinearRegression()
linear_reg.fit(prepared_data,data_labels)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [None]:
# Sanity-check the fitted linear model on the first five training rows.
sample_data=data.iloc[:5]
sample_data_label=data_labels.iloc[:5]

# Reuse the first five rows of prepared_data (they are the transformed form of
# data.iloc[:5]). Calling pipeline_transformer(sample_data) would re-fit the
# imputer, scaler and encoder on these five rows only, so the model would be
# fed features on a different scale from the one it was trained on.
sample_data_prepared=prepared_data[:5]

print("Prediction of sample values:",linear_reg.predict(sample_data_prepared))

Prediction of sample values: [29.08069379 27.78336755 26.08031176 12.70419279 22.23454159]


In [None]:
# .tolist() converts to plain Python floats; list(series) keeps NumPy scalars,
# which print as np.float64(...) under NumPy 2.
print("Actual Values:",sample_data_label.tolist())

Actual Values: [32.0, 31.0, 26.0, 18.0, 26.0]


In [None]:
from sklearn.metrics import mean_squared_error

# RMSE of the linear model on prepared_data, i.e. on the same rows it was
# fitted on - this is a training error, not a hold-out estimate.
mpg_predictions = linear_reg.predict(prepared_data)
lin_mse = mean_squared_error(data_labels, mpg_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

2.9590402225760872

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Seeded (same seed as the train/test split) so the fitted tree and the scores
# derived from it are reproducible across kernel restarts.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(prepared_data, data_labels)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [None]:
# RMSE of the decision tree on the rows it was fitted on (training error);
# it says how closely the tree fits the training data, not how it generalizes.
mpg_predictions = tree_reg.predict(prepared_data)
tree_mse = mean_squared_error(data_labels, mpg_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the decision tree. The scorer returns negated
# MSE values, so the sign is flipped before taking the square root to get
# one RMSE per fold.
score=cross_val_score(tree_reg,prepared_data,data_labels,scoring="neg_mean_squared_error",cv=10)
rmse_scores=np.sqrt(-score)

In [None]:
# Display the per-fold RMSE values currently held in rmse_scores.
rmse_scores

array([3.29246298, 2.87005009, 2.96431906, 3.26759659, 2.37480262,
       3.03865719, 3.44537734, 5.08806815, 4.18337724, 2.56005292])

In [None]:
# Average of the per-fold RMSE values.
rmse_scores.mean()

3.3084764190722575

In [None]:
# Same 10-fold cross-validation for the linear model: negate the scores
# (negated MSE), take the square root per fold, then average the fold RMSEs.
# Note that `score` and `rmse_scores` are overwritten here.
score=cross_val_score(linear_reg,prepared_data,data_labels,scoring="neg_mean_squared_error",cv=10)
rmse_scores=np.sqrt(-score)
rmse_scores.mean()

3.0757081793709333

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Seeded (same seed as the train/test split) so the forest and its
# cross-validated score are reproducible across kernel restarts.
forest_reg=RandomForestRegressor(random_state=42)
forest_reg.fit(prepared_data,data_labels)

# 10-fold CV; scores are negated MSE, so flip the sign before the square root.
score=cross_val_score(forest_reg,prepared_data,data_labels,scoring="neg_mean_squared_error",cv=10)
rmse_scores=np.sqrt(-score)
rmse_scores.mean()

2.579460743437091

In [None]:
from sklearn.svm import SVR

# Support vector regressor with a linear kernel, fitted on the training data
# and then scored with the same 10-fold cross-validation as the other models.
svm_reg = SVR(kernel='linear')
svm_reg.fit(prepared_data, data_labels)
svm_cv_scores = cross_val_score(svm_reg, prepared_data, data_labels,
                                scoring='neg_mean_squared_error',
                                cv = 10)
# Scores are negated MSE: flip the sign, take the root, average over folds.
svm_rmse_scores = np.sqrt(-svm_cv_scores)
svm_rmse_scores.mean()

3.08659162080283

In [None]:
# The random forest had the lowest cross-validated RMSE of the models tried,
# so we tune its hyperparameters with GridSearchCV to improve it further.

from sklearn.model_selection import GridSearchCV

# Two grids: the first varies n_estimators and max_features with the default
# bootstrap setting, the second repeats a smaller search with bootstrap off.
param_grid=[{'n_estimators':[3,10,30],'max_features':[2,4,6,8]},
            {'bootstrap':[False], 'n_estimators':[3,10], 'max_features':[2,3,4]}]

# Seeded (same seed as the train/test split) so the selected parameters and
# scores are reproducible across kernel restarts.
forest_reg=RandomForestRegressor(random_state=42)

# 10-fold CV for every parameter combination, scored by negated MSE.
grid_search=GridSearchCV(forest_reg,param_grid,scoring='neg_mean_squared_error',return_train_score=True,cv=10)

grid_search.fit(prepared_data,data_labels)

GridSearchCV(cv=10, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_j

In [None]:
# Parameter combination with the best mean cross-validated score.
grid_search.best_params_

{'max_features': 8, 'n_estimators': 30}

In [None]:
# cv_results_ holds the results of the grid search, including the
# 'mean_test_score' and 'params' entries read below.
cv_scores = grid_search.cv_results_

##printing all the parameters along with their scores
# mean_test_score is a negated MSE (see the scoring argument of the search),
# so negate it and take the square root to print an RMSE per combination.
for mean_score, params in zip(cv_scores['mean_test_score'], cv_scores["params"]):
    print(np.sqrt(-mean_score), params)

3.3310735451595255 {'max_features': 2, 'n_estimators': 3}
3.1317069446384624 {'max_features': 2, 'n_estimators': 10}
2.877361994799055 {'max_features': 2, 'n_estimators': 30}
3.2736159145376202 {'max_features': 4, 'n_estimators': 3}
2.991366554083116 {'max_features': 4, 'n_estimators': 10}
2.7990124624604587 {'max_features': 4, 'n_estimators': 30}
3.3067071143305324 {'max_features': 6, 'n_estimators': 3}
2.768742858433956 {'max_features': 6, 'n_estimators': 10}
2.753241980547627 {'max_features': 6, 'n_estimators': 30}
3.1744770510260487 {'max_features': 8, 'n_estimators': 3}
2.8133107809844304 {'max_features': 8, 'n_estimators': 10}
2.725966951973682 {'max_features': 8, 'n_estimators': 30}
3.333322177400686 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
2.8805578388051183 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
3.0613128779558534 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
2.8201015213509257 {'bootstrap': False, 'max_features': 3, 'n_estim

In [None]:
# Feature importances of the best random forest found by the grid search,
# one value per column of prepared_data.
feature_importance=grid_search.best_estimator_.feature_importances_
feature_importance

array([0.13735092, 0.29609195, 0.13084366, 0.24382932, 0.01564226,
       0.11475719, 0.03301246, 0.02172493, 0.0021654 , 0.00233959,
       0.00224233])

In [None]:
# Rebuild the column names of prepared_data in the order the pipeline emits them:
# numeric columns, then the two ratios appended by CustomAttrAdder
# (acc_on_power first, acc_on_cyl second), then the one-hot Origin columns.
extra_attrs=["acconpower","acconcylinder"]
numerics={'int64','float64'}
num_attrs=list(data.select_dtypes(include=numerics))

# OneHotEncoder emits one column per category, in sorted category order.
# Without these names zip() silently dropped the last three importances.
onehot_attrs=["Origin_"+str(label) for label in sorted(data["Origin"].dropna().unique())]

attrs=num_attrs+extra_attrs+onehot_attrs

# Rank by importance, highest first. The original sorted the (name, value)
# tuples themselves, which ordered them by attribute name instead.
sorted(zip(attrs, feature_importance), key=lambda pair: pair[1], reverse=True)

[('acconpower', 0.033012464883022785),
 ('acconcylinder', 0.021724931770768636),
 ('Weight', 0.24382932103421512),
 ('Model Year', 0.11475718846462013),
 ('Horsepower', 0.13084366267641367),
 ('Displacement', 0.29609194517379867),
 ('Cylinders', 0.1373509176178564),
 ('Acceleration', 0.01564225766012984)]

In [None]:
# Evaluate the best random forest from the grid search on the held-out test split.
final_model=grid_search.best_estimator_

# Separate the test predictors from the MPG target.
X_test=strat_test_set.drop("MPG",axis=1)
y_test=strat_test_set.copy()['MPG']

# NOTE(review): pipeline_transformer calls fit_transform, so the imputer,
# scaler and one-hot encoder are re-fit on the test set here instead of
# reusing the statistics learned from the training data. The RMSE below is
# therefore computed on features scaled differently from those the model
# was trained on.
X_test_preprocessed=preprocess_origin_cols(X_test)
X_test_prepared=pipeline_transformer(X_test_preprocessed)

# RMSE of the final model's predictions against the test labels.
final_prediction=final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_prediction)
final_rmse = np.sqrt(final_mse)
final_rmse

2.906789776635999

In [None]:
#Creating a function to cover this entire flow
def predict_mpg(config, model):
    """Predict MPG for one or more vehicle configurations.

    Parameters
    ----------
    config : dict or pandas.DataFrame
        Vehicle attributes. A dict (or dict subclass) is converted with
        ``pd.DataFrame(config)``; a DataFrame is copied so the caller's frame
        is not modified. Anything else is passed through unchanged, as before.
    model : fitted estimator
        Object exposing ``predict``.

    Returns
    -------
    The value returned by ``model.predict`` for the prepared rows.

    NOTE(review): pipeline_transformer fits its imputer, scaler and encoder on
    the frame it is given, so the features built here depend on the rows of
    ``config`` rather than on the training data.
    """
    if isinstance(config, dict):
        df = pd.DataFrame(config)
    elif isinstance(config, pd.DataFrame):
        # preprocess_origin_cols rewrites "Origin" in place; copying keeps the
        # caller's frame intact and makes repeated calls with it safe.
        df = config.copy()
    else:
        df = config

    preproc_df = preprocess_origin_cols(df)
    prepared_df = pipeline_transformer(preproc_df)
    y_pred = model.predict(prepared_df)
    return y_pred

In [None]:
##checking it on a random sample
# Three example vehicles, one per list position. The keys are the feature
# column names (every column of `cols` except MPG); Origin is given as the
# numeric code that preprocess_origin_cols maps to a label.
vehicle_config = {
    'Cylinders': [4, 6, 8],
    'Displacement': [155.0, 160.0, 165.5],
    'Horsepower': [93.0, 130.0, 98.0],
    'Weight': [2500.0, 3150.0, 2600.0],
    'Acceleration': [15.0, 14.0, 16.0],
    'Model Year': [81, 80, 78],
    'Origin': [3, 2, 1]
}

# Run the full flow (preprocess -> transform -> predict) with the final model.
predict_mpg(vehicle_config, final_model)

array([32.31333333, 17.37666667, 23.36333333])

In [None]:
#save the model

import pickle

In [None]:
# Serialize the final model to model.bin. The `with` block closes the file on
# exit, so no explicit close() call is needed.
with open("model.bin", 'wb') as f_out:
    pickle.dump(final_model, f_out)

In [None]:
##loading the model from the saved file
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load model.bin files that this notebook (or another trusted source) wrote.
with open('model.bin', 'rb') as f_in:
    model = pickle.load(f_in)

# Predict with the reloaded model on the same example configuration.
predict_mpg(vehicle_config, model)

array([32.31333333, 17.37666667, 23.36333333])