Predicted Prices for Cars using multiple algorithms

Objective: Predict the price of a Toyota Corolla using:
  1. Linear regression
  2. Regression tree
  3. Neural network (NNet)

Steps involved in creating the above models:
  1. Load the required packages
  2. Load the data
  3. Review the data
  4. Make any necessary changes (e.g. rename column headers)
  5. Split the dataset into training and validation sets
  6. Train the model
  7. Test the model
  8. Evaluate the performance of the model(s)


In [1]:
### STEP 1 - Load the packages
# !pip install dmba
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoCV, BayesianRidge

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import matplotlib.pylab as plt
from dmba import plotDecisionTree, classificationSummary, regressionSummary

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPClassifier, MLPRegressor

import statsmodels.formula.api as sm
import matplotlib.pylab as plt

  from pandas import Int64Index as NumericIndex


In [2]:
### Step 2 - Load the data
# The path is relative, so it is resolved against the kernel's working directory.
csv_path = 'ToyotaCorolla.csv'
car_df = pd.read_csv(csv_path)

In [3]:
### STEP 3 - review the data
# A notebook cell only renders its LAST bare expression, so each intermediate
# view is handed to display() explicitly; otherwise it is computed and thrown
# away. display() is provided by the IPython/Jupyter kernel without an import.
display(car_df.head())      # first rows
display(car_df.tail())      # last rows
display(car_df.describe())  # summary statistics of the numeric columns

# Count of missing values per column. isnull() is an alias of isna(), so a
# single call is enough; it stays as the final expression so it is rendered.
car_df.isna().sum()


Id                   0
Model                0
Price                0
Age_08_04            0
Mfg_Month            0
Mfg_Year             0
KM                   0
Fuel_Type            0
HP                   0
Met_Color            0
Color                0
Automatic            0
CC                   0
Doors                0
Cylinders            0
Gears                0
Quarterly_Tax        0
Weight               0
Mfr_Guarantee        0
BOVAG_Guarantee      0
Guarantee_Period     0
ABS                  0
Airbag_1             0
Airbag_2             0
Airco                0
Automatic_airco      0
Boardcomputer        0
CD_Player            0
Central_Lock         0
Powered_Windows      0
Power_Steering       0
Radio                0
Mistlamps            0
Sport_Model          0
Backseat_Divider     0
Metallic_Rim         0
Radio_cassette       0
Parking_Assistant    0
Tow_Bar              0
dtype: int64

In [5]:
# !pip install skimpy
# skimpy is a third-party package; uncomment the line above if it is missing.
# NOTE: the install log recorded below this cell shows skimpy 0.0.5 replacing
# click 8.0.4 with click 7.1.2, so installing it can change other packages.
from skimpy import skim
# Summarise car_df with skimpy (presumably one row of statistics per column;
# verify against the skimpy documentation).
skim(car_df)

Collecting skimpy
  Downloading skimpy-0.0.5-py3-none-any.whl (15 kB)
Collecting typeguard<3.0.0,>=2.12.1
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Collecting click==7.1.2
  Downloading click-7.1.2-py2.py3-none-any.whl (82 kB)
Collecting rich<11.0.0,>=10.9.0
  Downloading rich-10.16.2-py3-none-any.whl (214 kB)
Collecting commonmark<0.10.0,>=0.9.0
  Downloading commonmark-0.9.1-py2.py3-none-any.whl (51 kB)
Installing collected packages: commonmark, typeguard, rich, click, skimpy
  Attempting uninstall: click
    Found existing installation: click 8.0.4
    Uninstalling click-8.0.4:
      Successfully uninstalled click-8.0.4
Successfully installed click-7.1.2 commonmark-0.9.1 rich-10.16.2 skimpy-0.0.5 typeguard-2.13.3


In [6]:
# Show the column labels of car_df so the ones worth renaming can be picked out.
car_df.columns

Index(['Id', 'Model', 'Price', 'Age_08_04', 'Mfg_Month', 'Mfg_Year', 'KM',
       'Fuel_Type', 'HP', 'Met_Color', 'Color', 'Automatic', 'CC', 'Doors',
       'Cylinders', 'Gears', 'Quarterly_Tax', 'Weight', 'Mfr_Guarantee',
       'BOVAG_Guarantee', 'Guarantee_Period', 'ABS', 'Airbag_1', 'Airbag_2',
       'Airco', 'Automatic_airco', 'Boardcomputer', 'CD_Player',
       'Central_Lock', 'Powered_Windows', 'Power_Steering', 'Radio',
       'Mistlamps', 'Sport_Model', 'Backseat_Divider', 'Metallic_Rim',
       'Radio_cassette', 'Parking_Assistant', 'Tow_Bar'],
      dtype='object')

In [7]:
# Shorten the two unwieldy column names; every other column keeps its label.
column_aliases = {'Age_08_04': 'Age', 'Quarterly_Tax': 'tax'}
car_df = car_df.rename(columns=column_aliases)

# Show the resulting labels to confirm the rename took effect.
car_df.columns

Index(['Id', 'Model', 'Price', 'Age', 'Mfg_Month', 'Mfg_Year', 'KM',
       'Fuel_Type', 'HP', 'Met_Color', 'Color', 'Automatic', 'CC', 'Doors',
       'Cylinders', 'Gears', 'tax', 'Weight', 'Mfr_Guarantee',
       'BOVAG_Guarantee', 'Guarantee_Period', 'ABS', 'Airbag_1', 'Airbag_2',
       'Airco', 'Automatic_airco', 'Boardcomputer', 'CD_Player',
       'Central_Lock', 'Powered_Windows', 'Power_Steering', 'Radio',
       'Mistlamps', 'Sport_Model', 'Backseat_Divider', 'Metallic_Rim',
       'Radio_cassette', 'Parking_Assistant', 'Tow_Bar'],
      dtype='object')

In [8]:
# Columns used as model inputs. Fuel_Type is the only non-numeric one.
predictors = [
    'Age',
    'KM',
    'Fuel_Type',
    'HP',
    'Met_Color',
    'Automatic',
    'CC',
    'Doors',
    'tax',
    'Weight',
]

# Column the models are trained to predict.
outcome = 'Price'

In [10]:
# Check dtypes and non-null counts for just the chosen predictor columns.
predictor_frame = car_df[predictors]
predictor_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1436 entries, 0 to 1435
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Age        1436 non-null   int64 
 1   KM         1436 non-null   int64 
 2   Fuel_Type  1436 non-null   object
 3   HP         1436 non-null   int64 
 4   Met_Color  1436 non-null   int64 
 5   Automatic  1436 non-null   int64 
 6   CC         1436 non-null   int64 
 7   Doors      1436 non-null   int64 
 8   tax        1436 non-null   int64 
 9   Weight     1436 non-null   int64 
dtypes: int64(9), object(1)
memory usage: 112.3+ KB


In [None]:
# Partition the data: dummy-encode the categorical predictor (dropping the
# first level of each), then hold out 40% of the rows for validation.
X = pd.get_dummies(car_df[predictors], drop_first=True)
y = car_df[outcome]
train_X, valid_X, train_y, valid_y = train_test_split(
    X, y, test_size=0.4, random_state=1
)

### create the REGRESSION MODEL
# fit() returns the estimator itself, so construction and fitting can be chained.
car_lm = LinearRegression().fit(train_X, train_y)

## print the coefficients
print('intercept ', car_lm.intercept_)
coefficient_table = pd.DataFrame(
    {'Predictor': X.columns, 'coefficient': car_lm.coef_}
)
print(coefficient_table)

## print performance measures for the validation set (actual values first)
valid_pred = car_lm.predict(valid_X)
regressionSummary(valid_y, valid_pred)


intercept  -258.6042161509922
           Predictor  coefficient
0                Age  -124.110305
1                 KM    -0.016059
2                 HP    75.549218
3          Met_Color    47.715778
4          Automatic   462.441526
5                 CC    -5.027585
6              Doors    58.417871
7                tax    13.009195
8             Weight    14.156177
9   Fuel_Type_Diesel  4481.088703
10  Fuel_Type_Petrol  2413.063717

Regression statistics

                      Mean Error (ME) : 190.6887
       Root Mean Squared Error (RMSE) : 3315.7836
            Mean Absolute Error (MAE) : 1095.1592
          Mean Percentage Error (MPE) : 0.2537
Mean Absolute Percentage Error (MAPE) : 10.3465


In [None]:
### create the REGRESSION TREE MODEL
# Uses train_X / train_y / valid_X / valid_y from the partition cell above.

# Fixed seed for every tree built by the searches. scikit-learn permutes the
# features at each split, so without a seed equally good splits can be chosen
# differently from run to run and the "best" parameters are not reproducible.
TREE_SEED = 1

# Stage 1: coarse grid to find the right region of the parameter space.
param_grid = {
    'max_depth': [5, 10, 15, 20, 25],
    'min_impurity_decrease': [0, 0.001, 0.005, 0.01],
    'min_samples_split': [10, 20, 30, 40, 50],
}
gridSearch = GridSearchCV(DecisionTreeRegressor(random_state=TREE_SEED), param_grid, cv=5, n_jobs=-1)
gridSearch.fit(train_X, train_y)
print('Initial parameters: ', gridSearch.best_params_)

# Stage 2: finer grid.
# NOTE(review): this grid is hard-coded, not derived from the stage-1 result.
# The stage-1 output recorded in this notebook chose min_samples_split=10,
# which is not among the values below, so "Improved" is not guaranteed to
# beat stage 1 -- confirm the ranges bracket the stage-1 optimum.
param_grid = {
    'max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
    'min_impurity_decrease': [0, 0.001, 0.002, 0.003, 0.005, 0.006, 0.007, 0.008],
    'min_samples_split': [14, 15, 16, 18, 20, ],
}
gridSearch = GridSearchCV(DecisionTreeRegressor(random_state=TREE_SEED), param_grid, cv=5, n_jobs=-1)
gridSearch.fit(train_X, train_y)
print('Improved parameters: ', gridSearch.best_params_)

# Best estimator of the stage-2 search (GridSearchCV refits it by default).
regTree = gridSearch.best_estimator_
print(regTree)

# Label the two summaries so training and validation can be told apart.
print('\nTraining data')
regressionSummary(train_y, regTree.predict(train_X))
print('\nValidation data')
regressionSummary(valid_y, regTree.predict(valid_X))


Initial parameters:  {'max_depth': 5, 'min_impurity_decrease': 0, 'min_samples_split': 10}


In [None]:
# CREATE THE NNet Model
# Reloads the CSV so the cell starts from the raw columns.
# CAUTION: car_df is overwritten below with a reduced, dummy-encoded frame
# (no 'Fuel_Type' column), so earlier cells that index car_df[predictors]
# need the data reloaded before they can be re-run.
car_df = pd.read_csv('ToyotaCorolla.csv')
car_df = car_df.rename(columns={'Age_08_04':'Age','Quarterly_Tax':'tax' })

selected_var = ['Price', 'Age', 'KM', 'Fuel_Type', 'HP', 'Met_Color', 'Automatic', 'CC',
              'Doors', 'tax', 'Weight']
car_df = car_df[selected_var]

#then convert the categorical var
categorical_var = ['Fuel_Type']
car_df = pd.get_dummies(car_df, columns=categorical_var, drop_first=True)

# separate out predictors and response variables
X_df = car_df.drop(columns=['Price'])
Y_df = car_df[['Price']]   # kept 2-D because the scaler expects a column

# partition the data: choose the row positions first, so the scalers can be
# fitted on the training rows only
row_positions = np.arange(len(car_df))
train_rows, valid_rows = train_test_split(row_positions, test_size=0.4, random_state=12345)

# normalize the data; fitting on training rows only keeps the validation
# set's min/max out of the preprocessing (no data leakage). Validation values
# may therefore fall slightly outside [0, 1].
scaleInput = MinMaxScaler().fit(X_df.iloc[train_rows])
scaleOutput = MinMaxScaler().fit(Y_df.iloc[train_rows])
X = scaleInput.transform(X_df)
y = scaleOutput.transform(Y_df)

X_train, X_valid = X[train_rows], X[valid_rows]
y_train, y_valid = y[train_rows], y[valid_rows]
# printed explicitly: a bare expression in the middle of a cell is not shown
print(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape)

#train NNet with 2 hidden nodes (one hidden layer, written as a 1-tuple)
car_nnet = MLPRegressor(hidden_layer_sizes=(2,), activation='logistic', solver='lbfgs', random_state=1)
car_nnet.fit(X_train, y_train.ravel())

# Predictions come back 1-D on the scaled axis; reshape them to a column
# before undoing the scaling so errors are reported in price units.
# regressionSummary takes the actual values first, then the predictions
# (same order as the regression and tree cells).

#RMSE for training set
print('Training data (2)')
y_actual = scaleOutput.inverse_transform(y_train).ravel()
y_pred = scaleOutput.inverse_transform(car_nnet.predict(X_train).reshape(-1, 1)).ravel()
regressionSummary(y_actual, y_pred)

#RMSE for validation set
print('\nValidation data (2)')
y_actual = scaleOutput.inverse_transform(y_valid).ravel()
y_pred = scaleOutput.inverse_transform(car_nnet.predict(X_valid).reshape(-1, 1)).ravel()
regressionSummary(y_actual, y_pred)


((861, 11), (575, 11), (861, 1), (575, 1))