# Predicting Fuel Efficiency of Vehicles.

   Selecting and training models
   1. Select and train a few algorithms.
   2. Evaluation using mean squared error.
   3. Model Evaluation using Cross Validation.
   4. Hyperparameter Tuning
   5. Check Feature Importance 
   6. Evaluate final model
   7. Save the model

In [1]:
#Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.pipeline import Pipeline



#### EDA has been done on same data in different file. Please see the same for more information.

In [2]:
#Reading the .data file using pandas

cols = ['MPG','Cylinders','Displacement','Horsepower','Weight',
                'Acceleration', 'Model Year', 'Origin']

auto_df=pd.read_csv("auto-mpg.data",names=cols,na_values='?',comment='\t',sep=" ",skipinitialspace=True)
data=auto_df.copy()

split=StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=42)
for train_index,test_index in split.split(data,data['Cylinders']):
    strat_train_set=data.loc[train_index]
    strat_test_set=data.loc[test_index]

### Segregate independent and dependent variables.

In [3]:
data=strat_train_set.drop('MPG',axis=1)
data_label=strat_train_set["MPG"].copy()
data.head()

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
145,4,83.0,61.0,2003.0,19.0,74,3
151,4,79.0,67.0,2000.0,16.0,74,2
388,4,156.0,92.0,2585.0,14.5,82,1
48,6,250.0,88.0,3139.0,14.5,71,1
114,4,98.0,90.0,2265.0,15.5,73,2


### Encoding the categorical "Origin" column.

In [4]:
def preprocess_origin_col(df):
    df["Origin"] = df["Origin"].map({1: "India", 2: "USA", 3: "Germany"})
    return df

### Creating Custom Attribute Adder class
Creating new features, information about same is present in EDA.ipynb, refer to same to understand it more

In [5]:
index_acc,index_hp,index_cyl=4,2,0

class FeatureAdder(BaseEstimator,TransformerMixin):
    def __init__(self,acc_power=True):
        self.acc_power=acc_power
    def fit(self,X,y=None):
        return self
    def transform(self,X):
        acc_cylinder=X[:,index_acc]/X[:,index_cyl]
        if self.acc_power:
            acc_power=X[:,index_acc]/X[:,index_hp]
            return np.c_[X,acc_power,acc_cylinder]
        return np.c_[X,acc_cylinder]
    

## Function:
    1) numeric_pipeline:Function to process numerical transformations
                        Argument: data->original dataframe.
                        Returns: num_attrs-> numerical dataframe
                                 num_pipeline->numerical pipeline object
                         
    2) data_pipeline: Complete transformation pipeline for both numerical and categorical data.
                      Argument: data-> original dataframe 
                      Returns: prepared_data-> transformed data, ready to use


In [6]:
def numeric_pipeline(data):
    numerics = ['float64', 'int64']
    num_attrs = data.select_dtypes(include=numerics)

    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attrs_adder', FeatureAdder()),
        ('std_scaler', StandardScaler()),
        ])
    return num_attrs, num_pipeline


def data_pipeline(data):
    cat_attrs = ["Origin"]
    num_attrs, num_pipeline = numeric_pipeline(data) #Calling numeric_pipeline function
    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, list(num_attrs)),
        ("cat", OneHotEncoder(), cat_attrs),
        ])
    prepared_data = full_pipeline.fit_transform(data)
    return prepared_data

### From Raw Data to process data in 2 steps 

In [7]:
preprocess_df=preprocess_origin_col(data)
prepared_data=data_pipeline(preprocess_df)
prepared_data[0]


array([-0.85657842, -1.07804475, -1.15192977, -1.17220298,  1.21586943,
       -0.54436373,  1.70952741,  1.29565517,  1.        ,  0.        ,
        0.        ])

# Traning Models:
    1.Linear Regression
    2.Decision Tree
    3.Random Forest

# Linear Model

In [8]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()              #Calling Linear Regression
lin_reg.fit(prepared_data, data_label)    #Fitting our data to the model

LinearRegression()

In [9]:
# Testing the predictions with sample
sample_data = data.iloc[:5]
sample_labels = data_label.iloc[:5]

sample_data_prepared = data_pipeline(sample_data)

print("Prediction of samples: ", lin_reg.predict(sample_data_prepared))

print("Actual Labels of samples: ", list(sample_labels))

Prediction of samples:  [29.08069379 27.78336755 26.08031176 12.70419279 22.23454159]
Actual Labels of samples:  [32.0, 31.0, 26.0, 18.0, 26.0]


### Mean Squared Error

In [10]:
from sklearn.metrics import mean_squared_error

lg_prediction=lin_reg.predict(prepared_data)
lin_mse= mean_squared_error(data_label,lg_prediction)
lin_rmse= np.sqrt(lin_mse)
lin_rmse

2.9590402225760872

Through linear regression we get mean squared error of 2.95 which is good but still we make decision after comparing it with other models.

### RandomForestRegressor

In [11]:
#Random Forest
from sklearn.ensemble import RandomForestRegressor

tree_reg= RandomForestRegressor()
tree_reg.fit(prepared_data,data_label)
tree_prediction = tree_reg.predict(prepared_data)
tree_mse=mean_squared_error(data_label,tree_prediction)
tree_rmse=np.sqrt(tree_mse)
tree_rmse

0.9991628933369083

Although the error has reduced but still we will go with one more model and compare the performance

### DecisionTreeRegressor

In [12]:
from sklearn.tree import DecisionTreeRegressor

dec_tree= DecisionTreeRegressor()
dec_tree.fit(prepared_data,data_label)

DecisionTreeRegressor()

In [13]:
dec_prediction = dec_tree.predict(prepared_data)
dec_mse=mean_squared_error(data_label,dec_prediction)
dec_rmse=np.sqrt(dec_mse)
dec_rmse

0.0

Here error which we received is 0 but no model can be perfect. This means overfitting has occured.
Because of similar scenario, we don't touch our test data until we are sure of the efficiency of our model.

### Model Validation using  Cross Validation

K-cross valiation technique divides the training data into K distinct subsets called folds, then it trains on individual fold and evaluate the model K times, picking a different fold for evaluation every time and training on other K-1 folds.

Result is an array containing the K evaluation scores:

In [14]:
from sklearn.model_selection import cross_val_score

# Function for cross_validation
def cross_validation(estimator,independent,dependent,cv=None):
    scores=cross_val_score(estimator,independent,dependent,scoring="neg_mean_squared_error",cv=cv)
    return scores

# Function to calculate Root mean square error
def root_mean_square(scores):
    rmse_array=np.sqrt(-scores)
    rmse=np.mean(rmse_array)
    return rmse


### Validating both Decission Tree and Linear Regression

In [15]:
# DecisionTreeRegressor
dec_tree_scores=cross_val_score(dec_tree,prepared_data,data_label,scoring="neg_mean_squared_error",cv=10) 
dec_cross_rmse=root_mean_square(dec_tree_scores)
print("Mean Root mean Square of DecisionTreeRegressor:",dec_cross_rmse)

# LinearRegression
linear_reg_scores=cross_val_score(lin_reg,prepared_data,data_label,scoring="neg_mean_squared_error",cv=10) 
linear_reg_rmse=root_mean_square(linear_reg_scores)
print("-"*70)
print("Mean Root mean Square of LinearRegression:",linear_reg_rmse)

#RandomForestRegressor
tree_cv=cross_validation(tree_reg,prepared_data,data_label,cv=10)
tree_rmse=root_mean_square(tree_cv)
print("-"*70)
print("Mean Root mean Square of LinearRegression:",tree_rmse)
print("-"*70)

Mean Root mean Square of DecisionTreeRegressor: 3.2157097076785552
----------------------------------------------------------------------
Mean Root mean Square of LinearRegression: 3.0757081793709324
----------------------------------------------------------------------
Mean Root mean Square of LinearRegression: 2.5651381120764616
----------------------------------------------------------------------


### Comparing the results of all three model

In [16]:
root_mean_values=np.array([tree_rmse,dec_cross_rmse,linear_reg_rmse])
root_mean_data=pd.DataFrame([root_mean_values],columns=["RandomForestRegressor","DecisionTreeRegressor","LinearRegression"])
root_mean_data

Unnamed: 0,RandomForestRegressor,DecisionTreeRegressor,LinearRegression
0,2.565138,3.21571,3.075708


As we can see that RandomForestRegressor provides us with the minimum error so we will continue with the same.

### Hyperparameter Tuning using GridSearchCV 

Hyperparameters are parameters that are not directly learnt with the model. It is possible and recommended to search the 
hyper-parameter space or best cross validation score.
The grid search provided by GridSearchCV exhaustively generates candidates from a grid of parameter values specified with the param_grid parameter. For instance, the following param_grid

To find the names and current values for all parameters for a given estimator use:

In [17]:
tree_reg.get_params() # Output every parameter for RandomForestRegressor

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [18]:
from sklearn.model_selection import GridSearchCV

# We create 2 grids to be explored
param_grid=[
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

forest_reg=RandomForestRegressor()
grid_search= GridSearchCV(forest_reg,
                          param_grid,
                          scoring='neg_mean_squared_error',
                          return_train_score=True,
                          cv=10)
grid_search.fit(prepared_data,data_label)

GridSearchCV(cv=10, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [19]:
# Storing the results
cv_scores=grid_search.cv_results_

#### Printing all the parameters along with their scores

In [20]:
for mean_square,param in zip(cv_scores['mean_test_score'],cv_scores['params']):
    print(np.sqrt(-mean_square),param)

print("-"*60)
# TO check for the parameters selected by our estimator.
print(grid_search.best_estimator_)

3.6199329776316036 {'max_features': 2, 'n_estimators': 3}
3.0725535032350426 {'max_features': 2, 'n_estimators': 10}
2.874682901468295 {'max_features': 2, 'n_estimators': 30}
3.5210187486967226 {'max_features': 4, 'n_estimators': 3}
2.8531561824780822 {'max_features': 4, 'n_estimators': 10}
2.7717499718744514 {'max_features': 4, 'n_estimators': 30}
3.4351578857194887 {'max_features': 6, 'n_estimators': 3}
2.786296858098866 {'max_features': 6, 'n_estimators': 10}
2.7219653153725885 {'max_features': 6, 'n_estimators': 30}
3.141053942559995 {'max_features': 8, 'n_estimators': 3}
2.8086219625697284 {'max_features': 8, 'n_estimators': 10}
2.64981148375079 {'max_features': 8, 'n_estimators': 30}
3.307987238471828 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
2.949595813617465 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
3.2220672218157773 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
2.9353679685666494 {'bootstrap': False, 'max_features': 3, 'n_estima

#### Checking Feature Importance 

In [21]:
feature_importance=grid_search.best_estimator_.feature_importances_
feature_importance

array([0.13837777, 0.25679354, 0.14163484, 0.27639793, 0.01487727,
       0.12368389, 0.02538391, 0.01753645, 0.00228627, 0.00128474,
       0.0017434 ])

#### Viewing the Feature Importance 

In [22]:
extra=["acc_power","acc_cyl"]
num=["float64","int64"]

num_aatr= list(data.select_dtypes(include=num))

attr=num_aatr+extra

sorted(zip(attr,feature_importance),reverse=True)

[('acc_power', 0.025383910347666735),
 ('acc_cyl', 0.017536448121938126),
 ('Weight', 0.27639793248834454),
 ('Model Year', 0.1236838860172806),
 ('Horsepower', 0.14163484117213804),
 ('Displacement', 0.2567935376155446),
 ('Cylinders', 0.1383777712327786),
 ('Acceleration', 0.01487726597165504)]

We can see that features [Weight, Model Year, Horsepower, Displacement, Cylinders] have got the larger number.

### Evaluating model on the entire Test Data 

In [23]:
final=grid_search.best_estimator_

auto_test_data=strat_test_set.drop("MPG",axis=1)
auto_test_label=strat_test_set["MPG"].copy()

auto_test_process=preprocess_origin_col(auto_test_data)
auto_test_prepared=data_pipeline(auto_test_process)

final_predict=final.predict(auto_test_prepared)
final_mse=mean_squared_error(final_predict,auto_test_label)
final_rmse=np.sqrt(final_mse)
final_rmse


2.981193995514028

### Creating a function to cover entire workflow 

In [24]:
def predict_func(config,model):
    if type(config)==dict:                # Checking whether incoming data is a DataFrame or not, if not convert it into one
        df=pd.DataFrame(config)
    else:
        df=config
    preproc_df=preprocess_origin_col(df)  # Encoding
    prep_data=data_pipeline(preproc_df)   # Data and Numeric Pipeline
    pred=model.predict(prep_data)         # Prdiction
    return pred

In [25]:
#Creating our own example
vehicle_config = {
    'Cylinders': [4, 6, 8],
    'Displacement': [155.0, 160.0, 165.5],
    'Horsepower': [93.0, 130.0, 98.0],
    'Weight': [2500.0, 3150.0, 2600.0],
    'Acceleration': [15.0, 14.0, 16.0],
    'Model Year': [81, 80, 78],
    'Origin': [3, 2, 1]
}

values=predict_func(vehicle_config, final)
print("Predictions:")
print("-"*60)
for i in range(len(values)):
    print(i,"->" ,values[i])

Predictions:
------------------------------------------------------------
0 -> 33.84666666666666
1 -> 17.65333333333333
2 -> 19.25


### Save the Model 

In [26]:
import pickle
with open("model.bin","wb") as f_out:
    pickle.dump(final,f_out)
    f_out.close()

In [27]:
with open("model.bin","rb") as f_in:
    model=pickle.load(f_in)

predict_func(vehicle_config,model)

array([33.84666667, 17.65333333, 19.25      ])