In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.pipeline import Pipeline



In [46]:
cols = ['MPG','Cylinders','Displacement','Horsepower','Weight',
                'Acceleration', 'Model Year', 'Origin']

df=pd.read_csv("auto-mpg.data",names=cols,na_values='?',comment='\t',sep=" ",skipinitialspace=True)
data=df.copy()

split=StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=42)
for train_index,test_index in split.split(data,data['Cylinders']):
    strat_train_set=data.loc[train_index]
    strat_test_set=data.loc[test_index]

In [47]:
data=strat_train_set.drop('MPG',axis=1)
data_label=strat_train_set["MPG"].copy()
data.head()

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
145,4,83.0,61.0,2003.0,19.0,74,3
151,4,79.0,67.0,2000.0,16.0,74,2
388,4,156.0,92.0,2585.0,14.5,82,1
48,6,250.0,88.0,3139.0,14.5,71,1
114,4,98.0,90.0,2265.0,15.5,73,2


In [48]:
def preprocess_origin_col(df):
    df["Origin"] = df["Origin"].map({1: "India", 2: "USA", 3: "Germany"})
    return df

In [49]:
index_acc,index_hp,index_cyl=4,2,0

class FeatureAdder(BaseEstimator,TransformerMixin):
    def __init__(self,acc_power=True):
        self.acc_power=acc_power
    def fit(self,X,y=None):
        return self
    def transform(self,X):
        acc_cylinder=X[:,index_acc]/X[:,index_cyl]
        if self.acc_power:
            acc_power=X[:,index_acc]/X[:,index_hp]
            return np.c_[X,acc_power,acc_cylinder]
        return np.c_[X,acc_cylinder]
    

def numeric_pipeline(df):
    numerics=['float64','int64']
    attr=df.select_dtypes(include=numerics)
    
    pipeline=Pipeline([
        ("Imputer", SimpleImputer(strategy="median")),
        ("Feature Add",FeatureAdder()),
        ("Scaler",StandardScaler()),
    ])
    return attr,pipeline


def data_pipeline(df):
    cat=["Origin"]
    attr,pipeline=numeric_pipeline(df)
    full_pipeline=ColumnTransformer([
        ("Numeric_transform",pipeline,list(attr))
        ("Encode",OneHotEncoder(),cat),
     ])
    data=full_pipeline.fit_transform(df)
    return data


In [50]:
def numeric_pipeline(data):
    '''
    Function to process numerical transformations
    Argument:
        data: original dataframe 
    Returns:
        num_attrs: numerical dataframe
        num_pipeline: numerical pipeline object
        
    '''
    numerics = ['float64', 'int64']

    num_attrs = data.select_dtypes(include=numerics)

    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attrs_adder', FeatureAdder()),
        ('std_scaler', StandardScaler()),
        ])
    return num_attrs, num_pipeline


def data_pipeline(data):
    '''
    Complete transformation pipeline for both
    nuerical and categorical data.
    
    Argument:
        data: original dataframe 
    Returns:
        prepared_data: transformed data, ready to use
    '''
    cat_attrs = ["Origin"]
    num_attrs, num_pipeline = numeric_pipeline(data)
    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, list(num_attrs)),
        ("cat", OneHotEncoder(), cat_attrs),
        ])
    prepared_data = full_pipeline.fit_transform(data)
    return prepared_data

In [51]:
preprocess_df=preprocess_origin_col(data)
prepared_data=data_pipeline(preprocess_df)
prepared_data[0]


array([-0.85657842, -1.07804475, -1.15192977, -1.17220298,  1.21586943,
       -0.54436373,  1.70952741,  1.29565517,  1.        ,  0.        ,
        0.        ])

Traning Models:
1.Linear Regression
2.Decision Tree
3.SVM
4.Random Forest

In [52]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(prepared_data, data_label)

sample_data = data.iloc[:5]
sample_labels = data_label.iloc[:5]

sample_data_prepared = data_pipeline(sample_data)

print("Prediction of samples: ", lin_reg.predict(sample_data_prepared))

print("Actual Labels of samples: ", list(sample_labels))

Prediction of samples:  [29.08069379 27.78336755 26.08031176 12.70419279 22.23454159]
Actual Labels of samples:  [32.0, 31.0, 26.0, 18.0, 26.0]


In [53]:
from sklearn.metrics import mean_squared_error

lg_prediction=lin_reg.predict(prepared_data)
lin_mse= mean_squared_error(data_label,lg_prediction)
lin_rmse= np.sqrt(lin_mse)
lin_rmse

2.9590402225760872

In [54]:
#Decision Tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

tree_reg= RandomForestRegressor()
tree_reg.fit(prepared_data,data_label)
tree_cv= cross_val_score(tree_reg,prepared_data,data_label,scoring='neg_mean_squared_error',cv=10)

tree_mse=np.sqrt(-tree_cv)
tree_mse.mean()

2.6202931042537623

In [55]:
from sklearn.model_selection import GridSearchCV

param_grid=[
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

forest_reg=RandomForestRegressor()

grid_search= GridSearchCV(forest_reg,param_grid,scoring='neg_mean_squared_error',return_train_score=True,cv=10)
grid_search.fit(prepared_data,data_label)

GridSearchCV(cv=10, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [56]:
grid_search.best_params_

{'max_features': 4, 'n_estimators': 30}

In [57]:
cv_scores=grid_search.cv_results_

for mean_square,param in zip(cv_scores['mean_test_score'],cv_scores['params']):
    print(np.sqrt(-mean_square),param)

3.7246683087030217 {'max_features': 2, 'n_estimators': 3}
2.971575714637367 {'max_features': 2, 'n_estimators': 10}
2.8715652462525845 {'max_features': 2, 'n_estimators': 30}
3.0696096429701045 {'max_features': 4, 'n_estimators': 3}
2.8877207515812366 {'max_features': 4, 'n_estimators': 10}
2.7114037477620583 {'max_features': 4, 'n_estimators': 30}
3.001799770024476 {'max_features': 6, 'n_estimators': 3}
2.896003522306158 {'max_features': 6, 'n_estimators': 10}
2.7519077636442932 {'max_features': 6, 'n_estimators': 30}
3.0678982121257152 {'max_features': 8, 'n_estimators': 3}
2.8614508589429177 {'max_features': 8, 'n_estimators': 10}
2.739046968631785 {'max_features': 8, 'n_estimators': 30}
3.341072031667161 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
2.8208932378369433 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
3.275290744023392 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
2.9780424128785046 {'bootstrap': False, 'max_features': 3, 'n_estim

In [58]:
#Feature Importance

feature_importance=grid_search.best_estimator_.feature_importances_
feature_importance

array([0.14221324, 0.2378602 , 0.11031219, 0.16277943, 0.01826813,
       0.11690313, 0.08145281, 0.11912565, 0.00496064, 0.0029929 ,
       0.0031317 ])

In [59]:
extra=["acc_power","acc_cyl"]
num=["float64","int64"]

num_aatr= list(data.select_dtypes(include=num))

attr=num_aatr+extra

sorted(zip(attr,feature_importance),reverse=True)

[('acc_power', 0.08145280978657944),
 ('acc_cyl', 0.11912564674824973),
 ('Weight', 0.16277942605230872),
 ('Model Year', 0.11690312860126634),
 ('Horsepower', 0.11031218543529696),
 ('Displacement', 0.23786019680265372),
 ('Cylinders', 0.1422132398118388),
 ('Acceleration', 0.018268125484568)]

In [60]:
final=grid_search.best_estimator_

X_test=strat_test_set.drop("MPG",axis=1)
y_test=strat_test_set["MPG"].copy()

X_test_process=preprocess_origin_col(X_test)
X_test_prepared=data_pipeline(X_test_process)

final_predict=final.predict(X_test_prepared)
final_mse=mean_squared_error(final_predict,y_test)
final_rmse=np.sqrt(final_mse)
final_rmse


2.881425307995257

In [61]:
#Function

def predict_func(config,model):
    if type(config)==dict:
        df=pd.DataFrame(config)
    else:
        df=config
    preproc_df=preprocess_origin_col(df)
    prep_data=data_pipeline(preproc_df)
    y_pred=model.predict(prep_data)
    return y_pred

In [62]:
vehicle_config = {
    'Cylinders': [4, 6, 8],
    'Displacement': [155.0, 160.0, 165.5],
    'Horsepower': [93.0, 130.0, 98.0],
    'Weight': [2500.0, 3150.0, 2600.0],
    'Acceleration': [15.0, 14.0, 16.0],
    'Model Year': [81, 80, 78],
    'Origin': [3, 2, 1]
}

predict_func(vehicle_config, final)

array([32.57333333, 17.54333333, 20.40333333])

In [63]:
import pickle

In [64]:
with open("model.bin","wb") as f_out:
    pickle.dump(final,f_out)
    f_out.close()

In [65]:
with open("model.bin","rb") as f_in:
    model=pickle.load(f_in)

predict_func(vehicle_config,model)

array([32.57333333, 17.54333333, 20.40333333])