Downloading Data

In [3]:
!wget "http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"

'wget' is not recognized as an internal or external command,
operable program or batch file.


In [None]:
#utility libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#sklearn libraries
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.base import BaseEstimator , TransformerMixin
from sklearn.pipeline import Pipeline

#preprocessing sklearn libs
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler , OneHotEncoder
from sklearn.compose import ColumnTransformer

#ignoring Warnings .
import warnings
warnings.filterwarnings('ignore')



In [None]:
cols = ['MPG' , 'Cylinders' , 'Displacement' , 'Horsepower' , 'Weight',
       'Accleration' , 'Model Year' , 'Origin' ]

df = pd.read_csv('auto-mpg.data' , names=cols , sep=' '  ,comment = '\t',
                 skipinitialspace = True)
data = df.copy()
#splitting Data using Stratified Shuffle Split
split = StratifiedShuffleSplit(n_splits = 1 , test_size=0.2 , random_state=12)
for train_index  , test_index in split.split(data , data['Cylinders']):
  strat_train_set = data.loc[train_index]
  strat_test_set = data.loc[test_index]


Creating Data Features and labels

In [None]:
data = strat_train_set.drop('MPG' , axis = 1)
data_labels = strat_train_set['MPG'].copy()
data

Preprocessing Origin Column into Countries and Adding the Custom Attrribute Adder Class

In [None]:
def preprocess_origin_cols(df):
  df['Origin'] = df['Origin'].map({1:'India' , 2:'USA' , 3:'Germany'})
  return df

In [None]:
#Adding Custom Attribute adder class
acc_col , hpower_col , cyl_col = 4 , 2 , 0

class CustomAttrAdder(BaseEstimator , TransformerMixin):
  def __init__(self , acc_on_power=True):
    self.acc_on_power = acc_on_power
  
  def fit(self , X , y=None):
    return self
  def transform(self , X):
    acc_on_cyl = X[:, acc_col] / X[: , cyl_col]
    if self.acc_on_power :
      acc_on_power = X[: , acc_col]/X[: , hpower_col]
      return np.c_[X , acc_on_power , acc_on_cyl]
    
    return np.c_[X , acc_on_cyl]

Creating 2 Functions to Preprocess Categroical Data and Numerical Data

In [None]:
def num_pipeline_transformer(data):
    '''
    Function to process numerical transformations
    Argument:
        data: original dataframe 
    Returns:
        num_attrs: numerical dataframe
        num_pipeline: numerical pipeline object
        
    '''
    numerics = ['float64', 'int64']

    num_attrs = data.select_dtypes(include=numerics)

    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attrs_adder', CustomAttrAdder()),
        ('std_scaler', StandardScaler()),
        ])
    return num_attrs, num_pipeline


def pipeline_transformer(data):
    '''
    Complete transformation pipeline for both
    nuerical and categorical data.
    
    Argument:
        data: original dataframe 
    Returns:
        prepared_data: transformed data, ready to use
    '''
    cat_attrs = ["Origin"]
    num_attrs, num_pipeline = num_pipeline_transformer(data)
    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, list(num_attrs)),
        ("cat", OneHotEncoder(), cat_attrs),
        ])
    prepared_data = full_pipeline.fit_transform(data)
    return prepared_data

Transforming Raw Data to Processed Data

In [None]:
preprocessed_df = preprocess_origin_cols(data)
processed_data = pipeline_transformer(preprocessed_df)
processed_data

In [None]:
processed_data[0]

## Using Different Kinds of Models
1. Linear Regression
2. Decision Tree
3. Random Forest
4. SVM Regressor

# 1. Linear Regression 

In [None]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(processed_data , data_labels)

In [None]:
data[45:55]['Origin'].value_counts()

In [None]:
sample_data = data.iloc[45:55].copy()

sample_labels = data_labels.iloc[45:55].copy()

sample_data_processe = pipeline_transformer(sample_data)
pred = lin_reg.predict(sample_data_processe)

print('The Predicted Values are ' , pred)
print('The Actual Values are ' , list(sample_labels))

Using Mean Squared Error


In [None]:
from sklearn.metrics import mean_squared_error as mse

mpg_pred = lin_reg.predict(processed_data)
lin_mse = mse(data_labels , mpg_pred)
#finding Root Mean Squared Error

lin_rmse = np.sqrt(lin_mse)
lin_rmse

# 2. Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

dec_tree = DecisionTreeRegressor()
dec_tree.fit(processed_data , data_labels)

In [None]:
pred = dec_tree.predict(processed_data)
dtree_mse = mse(data_labels , pred)

#rmse

dtree_rmse = np.sqrt(dtree_mse)
dtree_rmse #Model OverFits 

#Model Evaluation Using CV (K-Fold CV)

In [None]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(dec_tree , processed_data , data_labels , cv = 10 , scoring = 'neg_mean_squared_error')

tree_rmse_score = np.sqrt(-scores)
tree_rmse_score

In [None]:
#finding the mean rmse
tree_rmse_score.mean()

In [None]:
#Performing Cross_val_score on Linear Reg
lin_cv_sc = cross_val_score(lin_reg , processed_data , data_labels , cv = 10 , scoring = 'neg_mean_squared_error')
lin_rmse_cv = np.sqrt(-lin_cv_sc)
print(lin_rmse_cv)
lin_rmse_cv.mean()

# 3. Random Forest (Ensemble Method)

In [None]:
from sklearn.ensemble import RandomForestRegressor

forest = RandomForestRegressor()
forest.fit(processed_data , data_labels)
forest_cv_sc = cross_val_score(forest , processed_data , data_labels , cv = 10 , scoring = 'neg_mean_squared_error')
for_rmse_cv = np.sqrt(-forest_cv_sc)
for_rmse_cv.mean()


# 4. Support Vector Machine Regression

In [None]:
from sklearn.svm import SVR

svr = SVR(kernel = 'linear')
svr.fit(processed_data , data_labels)
svr_cv =  cross_val_score(svr ,  processed_data  , data_labels , cv=10 , scoring= 'neg_mean_squared_error')
svr_rmse = np.sqrt(-svr_cv)
svr_rmse.mean()

# HyperParameter Tuning 

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = [
              {'n_estimators':[3 , 10 , 30] , 'max_features':[2 , 4 , 6 , 8]},
              {'bootstrap':[False] , 'n_estimators':[3 , 10] , 'max_features':[2 , 3 ,4 ]}

]

forest = RandomForestRegressor()

grid_s = GridSearchCV(forest , param_grid , scoring = 'neg_mean_squared_error' , return_train_score = True , cv = 10)
grid_s.fit(processed_data , data_labels)
grid_s.best_params_

In [None]:
cv_scores = grid_s.cv_results_

#print all Parameters with Score
for mean_score , params in zip(cv_scores['mean_test_score'] , cv_scores['params']):
  print(np.sqrt(-mean_score) , params)

In [None]:
feature_importances = grid_s.best_estimator_.feature_importances_
feature_importances

In [None]:
extra_attrs = ['acc_on_power' , 'acc_on_cyl']
numerics = ['float64' , 'int64']

num_attrs = list(data.select_dtypes(include=numerics))

attrs = num_attrs + extra_attrs

sorted(zip(attrs , feature_importances) , reverse= True)

#Selecting Estimator with best Params
Testing on Test set

In [2]:
final_model = grid_s.best_estimator_

X_test = strat_test_set.drop('MPG' , axis = 1)
y_test = strat_test_set['MPG'].copy()

X_test_preprocessed = preprocess_origin_cols(X_test)
X_test_prep = pipeline_transformer(X_test_preprocessed)

final_preds = final_model.predict(X_test_prep)
final_mse = mse(y_test , final_preds)
rmse = np.sqrt(final_mse)
rmse

NameError: name 'grid_s' is not defined

Function to Automate for Prediciton

In [None]:
def predict_mpg(config, model):
    
    if type(config) == dict:
        df = pd.DataFrame(config)
    else:
        df = config
    
    preproc_df = preprocess_origin_cols(df)
    prepared_df = pipeline_transformer(preproc_df)
    y_pred = model.predict(prepared_df)
    return y_pred

In [None]:
##checking it on a random sample
vehicle_config = {
    'Cylinders': [4, 6, 8],
    'Displacement': [155.0, 160.0, 165.5],
    'Horsepower': [93.0, 130.0, 98.0],
    'Weight': [2500.0, 3150.0, 2600.0],
    'Acceleration': [15.0, 14.0, 16.0],
    'Origin': [3, 2, 1]
}

predict_mpg(vehicle_config, final_model)

Saving The Model

In [None]:
import pickle

In [None]:
#saving the model
with open('model.bin' ,'wb') as f_out:
  pickle.dump(final_model , f_out)
  f_out.close()

In [None]:
#loading the model and predciting
with open('model.bin' , 'rb') as f_in:
  model  = pickle.load(f_in)

predict_mpg(vehicle_config , model)