In [None]:
# !pip install wget
# !pip install pandas
# !pip install matplotlib
# !pip install seaborn

In [None]:
#!pip install -U scikit-learn

In [None]:
# Data source: the Auto MPG file hosted on the UCI archive. It is later parsed
# as a header-less, whitespace-separated table.
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"


In [None]:
#download data to local machine
import wget
import os

# Fetch the raw file only when no local copy exists yet, creating ./data first.
if not os.path.isfile('data/car_data'):
    os.makedirs('data', exist_ok=True)
    wget.download(url=url, out='data/car_data')

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Column names for the header-less source file, in file order.
cols = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
        'Acceleration', 'Model Year', 'Origin', 'Car']

# Prefer the copy cached by the download cell; fall back to the remote URL so
# this cell still works when the download was skipped.
data_source = 'data/car_data' if os.path.isfile('data/car_data') else url

# The separator is a regex, so it must be a raw string: '\s' inside a plain
# string literal is an invalid escape sequence. '?' entries are read as NaN.
car = pd.read_table(data_source, header=None, sep=r'\s+', names=cols, na_values='?')

car.head()


In [None]:
# make a copy of the dataset

data = car.copy()

**Problem statement:**    
Our aim here is to predict the MPG value for a vehicle, given that we have other attributes of that vehicle.

**Exploratory Data Analysis**

In [None]:
data.info()

In [None]:
data.Horsepower.unique()

In [None]:
data.isna().sum()

In [None]:
data.describe(include='all')

In [None]:
## check the horsepower

sns.boxplot(x=data.Horsepower);

In [None]:
## replace missing Horsepower entries with the column median
hp_median = data['Horsepower'].median()
data = data.assign(Horsepower=data['Horsepower'].fillna(hp_median))
data.info()

#### Categorical variables

In [None]:
##category distribution

data["Cylinders"].value_counts() / len(data)

In [None]:
data['Origin'].value_counts()

In [None]:
##pairplots to get an intuition of potential correlations

sns.pairplot(data[["MPG", "Cylinders", "Displacement", "Weight", "Horsepower"]], diag_kind="kde");

#### Data Split 

In [None]:
# Keep the car name aside, then remove that column from the working frame.
car_type = data.loc[:, 'Car']
data = data.drop(columns=['Car'])

In [None]:
data.head()

In [None]:
# applying stratified splitting
from sklearn.model_selection import StratifiedShuffleSplit

# A single stratified 80/20 split on Cylinders; the fixed seed keeps the split
# reproducible between runs.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# split() yields positional row numbers, so select with .iloc. Label-based
# .loc only gives the same rows while the frame has its default 0..n-1 index.
train_index, test_index = next(split.split(data, data["Cylinders"]))
strat_train_set = data.iloc[train_index]
strat_test_set = data.iloc[test_index]

In [None]:
# Compare the share of each Cylinders value in the two splits.
train_share = strat_train_set['Cylinders'].value_counts() / len(strat_train_set)
test_share = strat_test_set["Cylinders"].value_counts() / len(strat_test_set)

print('training_dataset', train_share, '', 'test_dataset', test_share, sep='\n')

In [None]:
# strat_test_set.to_csv('ml_in_powerbi/data/test_data.csv', index=False)
# strat_train_set.to_csv('ml_in_powerbi/data/training_data.csv', index=False)

In [None]:
strat_train_set.columns

In [None]:
strat_train_set.head()

In [None]:
strat_test_set.head()

In [None]:
### Preview: converting the integer Origin classes to country labels
## Done on a small sample only, so strat_train_set itself stays untouched.
## The sample is seeded so that the displayed rows are the same on every run.
sample_data = strat_train_set.sample(10, random_state=42).assign(
    Origin=lambda df: df['Origin'].map({1: 'India', 2: 'USA', 3: 'Germany'})
)
sample_data.head(10)

#### Investigating Feature Engineering

In [None]:
##one hot encoding
## NOTE(review): Origin still appears to be integer-coded at this point (the
## label mapping above was applied to sample_data only), and get_dummies by
## default converts only object/string/category columns -- so this call
## presumably returns the frame unchanged. Confirm whether Origin was meant to
## be encoded here.
x = pd.get_dummies(strat_train_set, prefix='', prefix_sep='')
x.head()

In [None]:
## Candidate ratio features, judged by their correlation with MPG
x = x.assign(
    displacement_on_power=x['Displacement'] / x['Horsepower'],
    weight_on_cylinder=x['Weight'] / x['Cylinders'],
    acceleration_on_power=x['Acceleration'] / x['Horsepower'],
    acceleration_on_cyl=x['Acceleration'] / x['Cylinders'],
)

corr_matrix = x.corr()
mpg_correlations = corr_matrix['MPG']
mpg_correlations.sort_values(ascending=False)

In [None]:
 numerics = ['float64', 'int64']
x.select_dtypes(include=numerics)

### Creating Data Transformation Pipelines

#### Numeric data transformation

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

# Column positions of Acceleration, Horsepower and Cylinders in the 2D array
# passed to CustomAttrAdder.transform.
# NOTE(review): these positions fit a matrix whose first column is Cylinders
# (i.e. MPG already removed) -- confirm for every caller.
acc_ix, hpower_ix, cyl_ix = 4, 2, 0


class CustomAttrAdder(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends acceleration ratio columns.

    Always appends acceleration / cylinders. When ``acc_on_power`` is true,
    acceleration / horsepower is appended as well, placed before it.
    """

    def __init__(self, acc_on_power=True):
        # Switch for the optional acceleration / horsepower column.
        self.acc_on_power = acc_on_power

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio column(s) appended on the right (2D array)."""
        acceleration = X[:, acc_ix]
        appended = []
        if self.acc_on_power:
            appended.append(acceleration / X[:, hpower_ix])
        appended.append(acceleration / X[:, cyl_ix])
        # np.c_ turns each 1D ratio into a column next to X.
        return np.c_[tuple([X] + appended)]
    
# CustomAttrAdder reads fixed positions (acc_ix=4, hpower_ix=2, cyl_ix=0) that
# assume the matrix starts at Cylinders. x still carries MPG as its first
# column, which would shift every position by one (Weight / Displacement / MPG
# would be used instead), so drop MPG before handing over the values.
attr_adder = CustomAttrAdder(acc_on_power=True)
data_tr_extra_attrs = attr_adder.transform(x.drop(columns='MPG').values)
data_tr_extra_attrs[0]

In [None]:
##handling missing values
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

def num_pipeline_transformer(data):
    '''
    Pick the numeric columns of a frame and build the pipeline meant for them.

    Argument:
        data: original dataframe
    Returns:
        num_attrs: dataframe holding only the float64/int64 columns of data
        num_pipeline: unfitted Pipeline (median imputation -> extra ratio
                      attributes -> standard scaling)
    '''
    num_attrs = data.select_dtypes(include=['float64', 'int64'])

    steps = [
        ('imputer', SimpleImputer(strategy="median")),
        ('attrs_adder', CustomAttrAdder()),
        ('std_scaler', StandardScaler()),
    ]
    return num_attrs, Pipeline(steps)

### categorical variable transformation 

In [None]:
##preprocess the Origin column in data
from sklearn.preprocessing import OneHotEncoder

def preprocess_origin_cols(df):
    '''
    Replace the integer codes in the Origin column by country labels.

    Argument:
        df: dataframe with an integer-coded "Origin" column (1, 2 or 3)
    Returns:
        a new dataframe with Origin mapped to "India" / "USA" / "Germany";
        values outside the mapping become NaN. The input frame is not modified.
    '''
    # Work on a copy: writing into the caller's frame meant that passing the
    # same frame through this function a second time (e.g. re-running a cell)
    # mapped the already-converted labels to NaN.
    df = df.copy()
    # NOTE(review): the dataset's own description appears to list origin
    # 1/2/3 as American/European/Japanese -- the labels are kept unchanged here
    # because they only name the one-hot columns; confirm before renaming.
    df["Origin"] = df["Origin"].map({1: "India", 2: "USA", 3: "Germany"})
    return df

### Full data transformation pipeline

In [None]:
from sklearn.compose import ColumnTransformer

def pipeline_transformer(data, full_pipeline=None, return_pipeline=False):
    '''
    Complete transformation pipeline for both
    numerical and categorical data.

    By default a new pipeline is built and fitted on ``data`` itself, which is
    only appropriate for training data. To transform test or inference rows
    consistently, fit once on the training frame with ``return_pipeline=True``
    and pass the returned object back in as ``full_pipeline``.

    Argument:
        data: original dataframe (Origin already converted to labels)
        full_pipeline: optional, already fitted ColumnTransformer; when given
                       it is applied with transform() and is not refitted
        return_pipeline: when True, return (prepared_data, full_pipeline)
    Returns:
        prepared_data: transformed data, ready to use
        (plus the pipeline object when return_pipeline is True)
    '''
    if full_pipeline is None:
        cat_attrs = ["Origin"]
        num_attrs, num_pipeline = num_pipeline_transformer(data)
        full_pipeline = ColumnTransformer([
            ("num", num_pipeline, list(num_attrs)),
            ("cat", OneHotEncoder(), cat_attrs),
            ])
        # Learns medians, scaling statistics and categories from `data`.
        prepared_data = full_pipeline.fit_transform(data)
    else:
        # Reuse what was learned earlier; nothing is re-estimated here.
        prepared_data = full_pipeline.transform(data)

    if return_pipeline:
        return prepared_data, full_pipeline
    return prepared_data

In [None]:
##from raw data to processed data in 2 steps

raw_training_data= strat_train_set.copy()

In [None]:
raw_training_data.isna().sum()

In [None]:
# Drop the target first: with MPG left in, it would be fed to the pipeline as
# a feature and would also shift the column positions CustomAttrAdder relies
# on (acc_ix=4, hpower_ix=2, cyl_ix=0 assume the matrix starts at Cylinders).
preprocessed_df = preprocess_origin_cols(raw_training_data.drop(columns="MPG"))
prepared_data = pipeline_transformer(preprocessed_df)

prepared_data[:5]

### Modeling - Regression problem

The steps 
1. Import the model class
2. Create an instance of the model class
3. Train the model using the fit method
4. Make prediction by passing your raw training dataset through the data processing pipeline
5. Evaluate the result (e.g. RMSE)

#### Using Random Forest with GridSearchCV

In [None]:
## separate the target (MPG) from the features of the training set
y_train = strat_train_set["MPG"].copy()
X_train = strat_train_set.drop(columns=["MPG"])

## label the Origin codes, then run the full transformation pipeline
X_train_processed_df = preprocess_origin_cols(X_train)
X_train_prepared = pipeline_transformer(X_train_processed_df)


In [None]:
from sklearn.ensemble  import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Two sub-grids: 3 x 4 = 12 combinations with the default bootstrap setting,
# plus 2 x 3 = 6 combinations with bootstrapping switched off.
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

# Seed the forest: without it every run grows different trees, so the selected
# parameters and all downstream scores change from run to run.
forest_reg = RandomForestRegressor(random_state=42)

# 10-fold cross-validation, scored by negative MSE (higher is better).
grid_search = GridSearchCV(forest_reg, param_grid,
                           scoring='neg_mean_squared_error',
                           return_train_score=True,
                           cv=10,
                          )
grid_search.fit(X_train_prepared, y_train)

In [None]:
# feature importances of the best forest found by the grid search
feature_importances = grid_search.best_estimator_.feature_importances_

# Rebuild the column order produced by pipeline_transformer on the training
# features: numeric inputs, then the two ratios appended by CustomAttrAdder,
# then the one-hot Origin columns (OneHotEncoder orders categories by sorted
# value). Building the names from `data` instead would add MPG and the integer
# Origin column and omit the one-hot columns, pairing every importance with the
# wrong name.
numerics = ['float64', 'int64']
num_attrs = list(X_train_processed_df.select_dtypes(include=numerics))
extra_attrs = ["acc_on_power", "acc_on_cyl"]
cat_attrs = sorted(X_train_processed_df["Origin"].unique())

attrs = num_attrs + extra_attrs + cat_attrs
assert len(attrs) == len(feature_importances), "feature names out of sync with the model"

# Importance goes first in each pair so the sort ranks by importance, not by name.
sorted(zip(feature_importances, attrs), reverse=True)

### Entire ML Pipeline

In [None]:
##capturing the best configuration
final_model = grid_search.best_estimator_

##segregating the target variable from test set
X_test = strat_test_set.drop("MPG", axis=1)
y_test = strat_test_set["MPG"].copy()

##preprocessing the test data origin column
X_test_preprocessed = preprocess_origin_cols(X_test)

##preparing the data with final transformation
# The test rows must be transformed with the medians, scaling statistics and
# categories learned from the TRAINING rows. Calling pipeline_transformer() on
# the test frame would re-fit all of them on the test set, so the same
# pipeline is rebuilt here, fitted on the training features and only applied
# to the test features.
train_num_attrs, train_num_pipeline = num_pipeline_transformer(X_train_processed_df)
test_time_pipeline = ColumnTransformer([
    ("num", train_num_pipeline, list(train_num_attrs)),
    ("cat", OneHotEncoder(), ["Origin"]),
])
test_time_pipeline.fit(X_train_processed_df)
X_test_prepared = test_time_pipeline.transform(X_test_preprocessed)


In [None]:
from sklearn.metrics import mean_squared_error

##making final predictions
final_predictions = final_model.predict(X_test_prepared)
final_predictions

In [None]:
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
print('final rmse: ', final_rmse)

### Save Model as Pickle File

In [None]:
import pickle

##dump the model into a pickle file
os.makedirs('model', exist_ok=True)
with open('model/model.pkl', 'wb') as f_out:
    # The with-block closes the file on exit, so no explicit close() is needed.
    pickle.dump(final_model, f_out)

### Loading the Pickle File and using it to make Prediction

In [None]:
# data set for  3 vehicles 

# One list per attribute; position i in every list describes vehicle i.
# The keys are the training column names without MPG and Car.
vehicle_config = {
    'Cylinders': [4, 6, 8],
    'Displacement': [155.0, 160.0, 165.5],
    'Horsepower': [93.0, 130.0, 98.0],
    'Weight': [2500.0, 3150.0, 2600.0],
    'Acceleration': [15.0, 14.0, 16.0],
    'Model Year': [81, 80, 78],
    # integer codes; predict_mpg converts them to labels via preprocess_origin_cols
    'Origin': [3, 2, 1]
}

In [None]:
import pickle
##loading the model from the saved file
with open('model/model.pkl', 'rb') as f_in:
    model = pickle.load(f_in)

In [None]:
def predict_mpg(config, model):
    '''
    Predict MPG for one or more vehicle configurations.

    Argument:
        config: dict of column name -> list of values, or a dataframe with the
                same columns (Origin given as integer codes)
        model: fitted estimator exposing predict()
    Returns:
        y_pred: the model's predictions, one per vehicle
    '''
    # isinstance also accepts dict subclasses (e.g. OrderedDict). A dataframe
    # is copied so relabelling Origin never alters the caller's object.
    if isinstance(config, dict):
        df = pd.DataFrame(config)
    else:
        df = config.copy()

    preproc_df = preprocess_origin_cols(df)
    # NOTE(review): pipeline_transformer() fits its imputer, scaler and encoder
    # on the rows it receives, so these features are scaled with this batch's
    # own statistics rather than the training set's. Predictions therefore
    # depend on which vehicles are submitted together -- revisit before relying
    # on the returned values.
    prepared_df = pipeline_transformer(preproc_df)

    return model.predict(prepared_df)
    

In [None]:
predict_mpg(vehicle_config, model)