### Data Exploration

In [3]:
# libraries imported

import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, f1_score
from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

#### Basic Exploratory Analysis

In [5]:
# read in data file

toyota = pd.read_csv('C:/Users/ankit/Downloads/ToyotaCorolla.csv')

In [6]:
# check head of the data frame

toyota.head(10)

Unnamed: 0,Id,Price,Age_08_22,KM,Fuel_Type,HP,Color,Automatic,CC,Doors,...,Airbag_1,Airbag_2,Airco,CD_Player,Powered_Windows,Power_Steering,Radio,Mistlamps,Sport_Model,Metallic_Rim
0,,,,,,,,,,,...,,,,,,,,,,
1,1.0,13500.0,23.0,46986.0,Diesel,90.0,Blue,0.0,2000.0,3.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
2,,,,,,,,,,,...,,,,,,,,,,
3,2.0,13750.0,23.0,72937.0,Diesel,90.0,Silver,0.0,2000.0,3.0,...,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
4,,,,,,,,,,,...,,,,,,,,,,
5,3.0,13950.0,24.0,41711.0,Diesel,90.0,Blue,0.0,2000.0,3.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6,,,,,,,,,,,...,,,,,,,,,,
7,4.0,14950.0,26.0,48000.0,Diesel,90.0,Black,0.0,2000.0,3.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8,,,,,,,,,,,...,,,,,,,,,,
9,5.0,13750.0,30.0,38500.0,Diesel,90.0,Black,0.0,2000.0,3.0,...,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0


In [7]:
# as we can see, there are empty rows after every entry. we will remove them.

toyota = toyota.dropna(how='all')
toyota.reset_index(drop=True, inplace=True)

In [8]:
# print the shape of the data frame

toyota.shape

(1436, 24)

In [9]:
# print the data types of each column

toyota.dtypes

Id                 float64
Price              float64
Age_08_22          float64
KM                 float64
Fuel_Type           object
HP                 float64
Color               object
Automatic          float64
CC                 float64
Doors              float64
Cylinders          float64
Gears              float64
Mfr_Guarantee      float64
ABS                float64
Airbag_1           float64
Airbag_2           float64
Airco              float64
CD_Player          float64
Powered_Windows    float64
Power_Steering     float64
Radio              float64
Mistlamps          float64
Sport_Model        float64
Metallic_Rim       float64
dtype: object

In [10]:
# check for null values

missing_values = toyota.isnull().sum()
missing_values

Id                    0
Price                 1
Age_08_22             0
KM                    0
Fuel_Type             0
HP                    0
Color                 9
Automatic             0
CC                    5
Doors                 0
Cylinders             0
Gears                 0
Mfr_Guarantee         1
ABS                   0
Airbag_1              0
Airbag_2              0
Airco                 1
CD_Player             0
Powered_Windows       0
Power_Steering        0
Radio                 0
Mistlamps          1035
Sport_Model           0
Metallic_Rim          0
dtype: int64

In [11]:
# look at frequency of each occurence in the "Fuel_Type" column

print(toyota["Fuel_Type"].value_counts(dropna=False))

Fuel_Type
Petrol    1264
Diesel     155
CNG         17
Name: count, dtype: int64


In [12]:
# look at frequency of each occurence in the "Color" column

print(toyota["Color"].value_counts(dropna=False))

Color
Grey      297
Blue      281
Red       278
Green     218
Black     190
Silver    122
White      31
NaN         9
Violet      4
Yellow      3
Beige       3
Name: count, dtype: int64


In [13]:
# look at correlation between all of the predictors

predictors_df = toyota[['Id', 'Age_08_22', 'KM', 'Fuel_Type', 'HP', 'Color',
       'Automatic', 'CC', 'Doors', 'Cylinders', 'Gears', 'Mfr_Guarantee',
       'ABS', 'Airbag_1', 'Airbag_2', 'Airco', 'CD_Player', 'Powered_Windows',
       'Power_Steering', 'Radio', 'Mistlamps', 'Sport_Model', 'Metallic_Rim']]
response_df = toyota[ 'Price']

predictors_df.corr(numeric_only=True)

Unnamed: 0,Id,Age_08_22,KM,HP,Automatic,CC,Doors,Cylinders,Gears,Mfr_Guarantee,...,Airbag_1,Airbag_2,Airco,CD_Player,Powered_Windows,Power_Steering,Radio,Mistlamps,Sport_Model,Metallic_Rim
Id,1.0,0.906132,0.273298,-0.109375,0.066265,-0.116432,-0.130207,,-0.043343,-0.162997,...,-0.123465,-0.358316,-0.385579,-0.46452,-0.236723,-0.091587,-0.010971,-0.114208,-0.028704,-0.022232
Age_08_22,0.906132,1.0,0.505672,-0.156622,0.031717,-0.096328,-0.148359,,-0.005364,-0.16608,...,-0.105406,-0.329017,-0.402953,-0.510895,-0.283856,-0.069192,0.013791,-0.116181,-0.110988,-0.040045
KM,0.273298,0.505672,1.0,-0.333538,-0.081854,0.10345,-0.036197,,0.015023,-0.213188,...,-0.018012,-0.139275,-0.133222,-0.266826,-0.156242,0.007397,0.013661,-0.072343,-0.044784,-0.013599
HP,-0.109375,-0.156622,-0.333538,1.0,0.013144,0.036845,0.092424,,0.209477,0.139643,...,0.025137,0.017644,0.241429,0.1023,0.265593,0.04885,0.020998,0.232181,-0.006027,0.206784
Automatic,0.066265,0.031717,-0.081854,0.013144,1.0,0.067084,-0.027654,,-0.098555,0.02606,...,-0.011895,0.001171,-0.028196,-0.010967,-0.005864,-0.004469,-0.0146,0.06068,0.013175,-0.078095
CC,-0.116432,-0.096328,0.10345,0.036845,0.067084,1.0,0.078303,,0.014807,-0.056846,...,0.022492,0.024043,0.119354,0.059227,0.054373,0.032781,0.000127,-0.001237,-0.035626,0.002116
Doors,-0.130207,-0.148359,-0.036197,0.092424,-0.027654,0.078303,1.0,,-0.160141,0.038301,...,0.053828,0.021734,0.169967,0.094653,0.107626,0.059792,-0.008318,-0.005523,-0.129881,-0.039555
Cylinders,,,,,,,,,,,...,,,,,,,,,,
Gears,-0.043343,-0.005364,0.015023,0.209477,-0.098555,0.014807,-0.160141,,1.0,0.010744,...,0.002444,0.09521,0.145636,-0.047466,0.131423,0.0212,0.01509,0.141555,0.174117,0.295077
Mfr_Guarantee,-0.162997,-0.16608,-0.213188,0.139643,0.02606,-0.056846,0.038301,,0.010744,1.0,...,0.052203,0.20169,0.052436,0.155383,0.042085,0.029867,-0.050771,0.135895,0.05377,0.026442


### Data Preprocessing

#### Drop Predictors

In [16]:
# drop 'ID', 'Cylinders' and 'Mistlamps'

predictors_df = predictors_df.drop(['Id', 'Cylinders', 'Mistlamps'], axis=1)
predictors_df.columns

Index(['Age_08_22', 'KM', 'Fuel_Type', 'HP', 'Color', 'Automatic', 'CC',
       'Doors', 'Gears', 'Mfr_Guarantee', 'ABS', 'Airbag_1', 'Airbag_2',
       'Airco', 'CD_Player', 'Powered_Windows', 'Power_Steering', 'Radio',
       'Sport_Model', 'Metallic_Rim'],
      dtype='object')

#### Deal with NA values

In [18]:
# drop rows with na values

toyota = toyota.drop(['Mistlamps'], axis=1)
toyota = toyota.dropna()

predictors_df = toyota[['Age_08_22', 'KM', 'Fuel_Type', 'HP', 'Color', 'Automatic', 'CC',
       'Doors', 'Gears', 'Mfr_Guarantee', 'ABS', 'Airbag_1', 'Airbag_2',
       'Airco', 'CD_Player', 'Powered_Windows', 'Power_Steering', 'Radio',
       'Sport_Model', 'Metallic_Rim']]
response_df = toyota[ 'Price']

print(predictors_df.isnull().sum())

Age_08_22          0
KM                 0
Fuel_Type          0
HP                 0
Color              0
Automatic          0
CC                 0
Doors              0
Gears              0
Mfr_Guarantee      0
ABS                0
Airbag_1           0
Airbag_2           0
Airco              0
CD_Player          0
Powered_Windows    0
Power_Steering     0
Radio              0
Sport_Model        0
Metallic_Rim       0
dtype: int64


In [19]:
# check shape of predictors dataframe

predictors_df.shape

(1419, 20)

#### Deal with categorical variables

In [21]:
# one-hot encode the categorical variables

fuel_dummies = pd.get_dummies(predictors_df['Fuel_Type'], prefix='Fuel')
color_dummies = pd.get_dummies(predictors_df['Color'], prefix='Color')

In [22]:
# drop original columns and combine everything

predictors_df1 = predictors_df.drop(columns=['Fuel_Type', 'Color'])
predictors_df1 = pd.concat([predictors_df1, fuel_dummies, color_dummies], axis=1)

predictors_df1.head()

Unnamed: 0,Age_08_22,KM,HP,Automatic,CC,Doors,Gears,Mfr_Guarantee,ABS,Airbag_1,...,Color_Beige,Color_Black,Color_Blue,Color_Green,Color_Grey,Color_Red,Color_Silver,Color_Violet,Color_White,Color_Yellow
0,23.0,46986.0,90.0,0.0,2000.0,3.0,5.0,0.0,1.0,1.0,...,False,False,True,False,False,False,False,False,False,False
1,23.0,72937.0,90.0,0.0,2000.0,3.0,5.0,0.0,1.0,1.0,...,False,False,False,False,False,False,True,False,False,False
2,24.0,41711.0,90.0,0.0,2000.0,3.0,5.0,1.0,1.0,1.0,...,False,False,True,False,False,False,False,False,False,False
3,26.0,48000.0,90.0,0.0,2000.0,3.0,5.0,1.0,1.0,1.0,...,False,True,False,False,False,False,False,False,False,False
4,30.0,38500.0,90.0,0.0,2000.0,3.0,5.0,1.0,1.0,1.0,...,False,True,False,False,False,False,False,False,False,False


In [23]:
# shape of the predictors dataframe 1 & response dataframe

print('Predictors Dataframe:', predictors_df1.shape)
print('Response Dataframe:', response_df.shape)

Predictors Dataframe: (1419, 31)
Response Dataframe: (1419,)


In [24]:
# correlations of new predictors dataframe

predictors_df1.corr()

Unnamed: 0,Age_08_22,KM,HP,Automatic,CC,Doors,Gears,Mfr_Guarantee,ABS,Airbag_1,...,Color_Beige,Color_Black,Color_Blue,Color_Green,Color_Grey,Color_Red,Color_Silver,Color_Violet,Color_White,Color_Yellow
Age_08_22,1.0,0.502924,-0.156036,0.043666,-0.097749,-0.133099,-0.008561,-0.171304,-0.411433,-0.103782,...,0.022568,-0.025746,-0.035293,0.103253,-0.129805,0.096089,-0.024972,0.017124,0.048724,-0.042731
KM,0.502924,1.0,-0.337537,-0.08057,0.103691,-0.030873,0.013986,-0.213465,-0.174443,-0.016729,...,-0.007109,0.037488,-0.006108,-0.018948,-0.10506,0.049549,0.005765,0.018476,0.129816,-0.038519
HP,-0.156036,-0.337537,1.0,0.009248,0.035123,0.089607,0.21316,0.145907,0.056733,0.024776,...,0.026537,-0.00161,-0.027922,0.010416,0.027018,0.014345,0.016415,-0.012317,-0.094687,-0.00033
Automatic,0.043666,-0.08057,0.009248,1.0,0.067807,-0.035258,-0.099489,0.021621,-0.019531,-0.013233,...,-0.011025,-0.039373,-0.017148,0.053233,0.008033,-0.023395,0.050183,-0.012736,-0.035798,-0.011025
CC,-0.097749,0.103691,0.035123,0.067807,1.0,0.078904,0.014792,-0.057471,0.037441,0.022545,...,0.002545,-0.007727,0.030657,-0.011357,-0.007694,-0.010298,-0.003718,-0.015776,0.027496,-0.011857
Doors,-0.133099,-0.030873,0.089607,-0.035258,0.078904,1.0,-0.159654,0.041332,0.059066,0.052309,...,-0.033355,-0.098722,-0.045589,0.061439,0.056914,0.001087,0.02716,0.012662,-0.008714,0.014991
Gears,-0.008561,0.013986,0.21316,-0.099489,0.014792,-0.159654,1.0,0.010691,0.087171,0.002736,...,0.074442,0.13005,-0.023339,-0.049882,-0.026357,0.033898,-0.069666,-0.007512,-0.021114,-0.006503
Mfr_Guarantee,-0.171304,-0.213465,0.145907,0.021621,-0.057471,0.041332,0.010691,1.0,0.120506,0.052637,...,0.024005,0.008717,-0.02823,0.034122,0.012078,0.002893,-0.006271,0.036737,-0.095215,0.055199
ABS,-0.411433,-0.174443,0.056733,-0.019531,0.037441,0.059066,0.087171,0.120506,1.0,0.276941,...,-0.016989,-0.037621,0.003992,-0.024104,0.060091,0.014224,-0.021589,-0.008305,-0.026422,0.022211
Airbag_1,-0.103782,-0.016729,0.024776,-0.013233,0.022545,0.052309,0.002736,0.052637,0.276941,1.0,...,0.008039,0.019826,0.003004,0.028281,0.06876,-0.082266,-0.021643,-0.069144,-0.030791,0.008039


In [25]:
# drop 'Fuel_CNG' to avoid the dummy variable trap

predictors_df1 = predictors_df1.drop(columns=['Fuel_CNG'])

#### Normalize data

In [27]:
# normalize predictors using Z-score normalization

z_score_norm = preprocessing.StandardScaler()
predictor_df_normalized = z_score_norm.fit_transform(predictors_df1)
predictor_df_normalized = pd.DataFrame(predictor_df_normalized, columns = predictors_df1.columns)
predictor_df_normalized.head()

Unnamed: 0,Age_08_22,KM,HP,Automatic,CC,Doors,Gears,Mfr_Guarantee,ABS,Airbag_1,...,Color_Beige,Color_Black,Color_Blue,Color_Green,Color_Grey,Color_Red,Color_Silver,Color_Violet,Color_White,Color_Yellow
0,-1.81958,-0.581559,-0.770452,-0.239535,0.993976,-1.07477,-0.141281,-0.833871,0.482536,0.174646,...,-0.046029,-0.393189,2.016893,-0.426046,-0.511208,-0.491396,-0.303939,-0.053168,-0.149447,-0.046029
1,-1.81958,0.109448,-0.770452,-0.239535,0.993976,-1.07477,-0.141281,-0.833871,0.482536,0.174646,...,-0.046029,-0.393189,-0.495812,-0.426046,-0.511208,-0.491396,3.290137,-0.053168,-0.149447,-0.046029
2,-1.765016,-0.722019,-0.770452,-0.239535,0.993976,-1.07477,-0.141281,1.199227,0.482536,0.174646,...,-0.046029,-0.393189,2.016893,-0.426046,-0.511208,-0.491396,-0.303939,-0.053168,-0.149447,-0.046029
3,-1.655887,-0.554559,-0.770452,-0.239535,0.993976,-1.07477,-0.141281,1.199227,0.482536,0.174646,...,-0.046029,2.543309,-0.495812,-0.426046,-0.511208,-0.491396,-0.303939,-0.053168,-0.149447,-0.046029
4,-1.43763,-0.807519,-0.770452,-0.239535,0.993976,-1.07477,-0.141281,1.199227,0.482536,0.174646,...,-0.046029,2.543309,-0.495812,-0.426046,-0.511208,-0.491396,-0.303939,-0.053168,-0.149447,-0.046029


#### Splitting of dataset

In [29]:
# partition data into train and test sets

X = predictor_df_normalized
y = response_df
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=1)

### Model 1: Linear Regression

In [31]:
# train the linear regression model

linear_model = LinearRegression().fit(train_X, train_y)

In [32]:
print('intercept ', linear_model.intercept_)
print(pd.DataFrame({'Predictor': X.columns, 'coefficient': linear_model.coef_}))

intercept  10601.277894427347
          Predictor  coefficient
0         Age_08_22 -2389.299613
1                KM  -697.032227
2                HP   697.262390
3         Automatic   180.918080
4                CC    25.584690
5             Doors   178.990172
6             Gears    50.907699
7     Mfr_Guarantee   138.688901
8               ABS  -207.645341
9          Airbag_1    18.121007
10         Airbag_2   -53.264476
11            Airco   137.587089
12        CD_Player   134.946415
13  Powered_Windows   165.175399
14   Power_Steering   -18.880028
15            Radio   -61.888178
16      Sport_Model   313.147351
17     Metallic_Rim    90.633291
18      Fuel_Diesel   669.229549
19      Fuel_Petrol    98.827147
20      Color_Beige   -52.032692
21      Color_Black    67.514456
22       Color_Blue    25.609931
23      Color_Green   -94.734262
24       Color_Grey   121.191869
25        Color_Red   -64.067652
26     Color_Silver    -0.615930
27     Color_Violet    27.273111
28      Color

In [33]:
# print performance metrics on training set

predicted_y_training = linear_model.predict(train_X)
linear_train_rmse = round(mean_squared_error(train_y, predicted_y_training) ** 0.5, 4)
linear_train_mae = round(mean_absolute_error(train_y,predicted_y_training), 4)
linear_train_mape = round(mean_absolute_percentage_error(train_y, predicted_y_training) * 100, 4)

print("Root Mean Squared Error (RMSE): ", linear_train_rmse)
print("Mean Absolute Error (MAE): ", linear_train_mae)
print("Mean Absolute Percentage Error (MAPE): ", linear_train_mape)

Root Mean Squared Error (RMSE):  1304.0191
Mean Absolute Error (MAE):  976.7625
Mean Absolute Percentage Error (MAPE):  9.8429


In [34]:
# train the model on the test set

predicted_y_test = linear_model.predict(test_X)  # these are our y_hat values!!

result = pd.DataFrame({'Predicted': predicted_y_test, 'Actual': test_y,
                       'Residual': np.array(test_y) - np.array(predicted_y_test)})
result.head(20)

Unnamed: 0,Predicted,Actual,Residual
326,12223.654741,10950.0,-1273.654741
37,15876.557348,14950.0,-926.557348
126,16259.151833,15950.0,-309.151833
1102,7382.029312,6650.0,-732.029312
1175,7650.555389,9950.0,2299.444611
1097,7351.054526,7250.0,-101.054526
699,7809.517171,8950.0,1140.482829
624,8141.836568,7950.0,-191.836568
338,13357.702063,12750.0,-607.702063
1298,7526.632311,7500.0,-26.632311


In [35]:
# print performance metrics on test set

linear_test_rmse = round(mean_squared_error(test_y, predicted_y_test) ** 0.5, 4)
linear_test_mae = round(mean_absolute_error(test_y,predicted_y_test), 4)
linear_test_mape = round(mean_absolute_percentage_error(test_y, predicted_y_test) * 100, 4)

print("Root Mean Squared Error (RMSE): ", linear_test_rmse)
print("Mean Absolute Error (MAE): ", linear_test_mae)
print("Mean Absolute Percentage Error (MAPE): ", linear_test_mape)

Root Mean Squared Error (RMSE):  1435.7112
Mean Absolute Error (MAE):  1038.1893
Mean Absolute Percentage Error (MAPE):  9.8631


### Model 2: $k$-NN

In [37]:
# train the k-NN model

knn_r = KNeighborsRegressor(n_neighbors=5).fit(train_X, train_y)


Root Mean Squared Error (RMSE):  1370.4232


In [95]:
# performance on training dataset

predicted_y_training3 = knn_r.predict(train_X)

knn_train_rmse = round(mean_squared_error(train_y, predicted_y_training3) ** 0.5, 4)
knn_train_mae = round(mean_absolute_error(train_y, predicted_y_training3), 4)
knn_train_mape = round(mean_absolute_percentage_error(train_y, predicted_y_training3) * 100, 4)

print("Root Mean Squared Error (RMSE): ", knn_train_rmse)
print("Mean Absolute Error (MAE):", knn_train_mae)
print("Mean Absolute Percentage Error (MAPE):", knn_train_mape)

Root Mean Squared Error (RMSE):  1370.4232
Mean Absolute Error (MAE): 1021.5561
Mean Absolute Percentage Error (MAPE): 10.068


In [97]:
# performance on testing dataset

predicted_y_test3 = knn_r.predict(test_X)

knn_test_rmse = round(mean_squared_error(test_y, predicted_y_test3) ** 0.5, 4)
knn_test_mae = round(mean_absolute_error(test_y, predicted_y_test3), 4)
knn_test_mape = round(mean_absolute_percentage_error(test_y, predicted_y_test3) * 100, 4)

print("Root Mean Squared Error (RMSE): ", knn_test_rmse)
print("Mean Absolute Error (MAE):", knn_test_mae_k)
print("Mean Absolute Percentage Error (MAPE):", knn_test_mape_k)

Root Mean Squared Error (RMSE):  1779.9966
Mean Absolute Error (MAE): 1293.0927
Mean Absolute Percentage Error (MAPE): 12.483


#### Hyperparameter tuning

In [40]:
# find optimal k value

results = []
for k in range(1, 21):
    knn_r2 = KNeighborsRegressor(n_neighbors=k).fit(train_X, train_y)
    results.append({
        'k': k,
        'RMSE_train': round(mean_squared_error(train_y, knn_r2.predict(train_X)) ** 0.5, 4),
        'RMSE_test': round(mean_squared_error(test_y, knn_r2.predict(test_X)) ** 0.5, 4)
    })

results = pd.DataFrame(results)
print(results)

     k  RMSE_train  RMSE_test
0    1      0.0000  2266.3867
1    2   1034.2722  1892.3868
2    3   1222.7785  1806.4424
3    4   1293.6860  1771.4069
4    5   1370.4232  1779.9966
5    6   1431.3011  1800.1757
6    7   1485.5771  1806.3092
7    8   1521.9585  1830.6853
8    9   1525.0449  1850.9450
9   10   1544.4495  1859.5850
10  11   1578.3929  1867.9562
11  12   1590.4510  1869.6701
12  13   1608.0363  1885.9860
13  14   1637.8430  1894.9439
14  15   1660.0722  1913.4059
15  16   1680.4484  1920.9558
16  17   1702.6284  1925.3212
17  18   1721.4508  1938.6965
18  19   1744.7379  1954.3312
19  20   1755.0764  1953.5855


In [41]:
# extract optimal k value

optimal_k_row = results.loc[results['RMSE_test'].idxmin()]
optimal_k = optimal_k_row['k']

print("Optimal k =", optimal_k)

Optimal k = 4.0


In [42]:
# train the k-NN model with optimal k value

knn_r_k = KNeighborsRegressor(n_neighbors=int(optimal_k)).fit(train_X, train_y)

Root Mean Squared Error (RMSE):  1293.686


In [101]:
# performance on training dataset

predicted_y_training4 = knn_r_k.predict(train_X)

knn_train_rmse_k = round(mean_squared_error(train_y, predicted_y_training4) ** 0.5, 4)
knn_train_mae_k = round(mean_absolute_error(train_y, predicted_y_training4), 4)
knn_train_mape_k = round(mean_absolute_percentage_error(train_y, predicted_y_training4) * 100, 4)

print("Root Mean Squared Error (RMSE): ", knn_train_rmse_k)
print("Mean Absolute Error (MAE):", knn_train_mae_k)
print("Mean Absolute Percentage Error (MAPE):", knn_train_mape_k)

Root Mean Squared Error (RMSE):  1293.686
Mean Absolute Error (MAE): 973.9864
Mean Absolute Percentage Error (MAPE): 9.668


In [99]:
# performance on testing data set 

predicted_y_test4 = knn_r_k.predict(test_X)

knn_test_rmse_k = round(mean_squared_error(test_y, predicted_y_test4) ** 0.5, 4)
knn_test_mae_k = round(mean_absolute_error(test_y, predicted_y_test4), 4)
knn_test_mape_k = round(mean_absolute_percentage_error(test_y, predicted_y_test4) * 100, 4)

print("Root Mean Squared Error (RMSE): ", knn_test_rmse_k)
print("Mean Absolute Error (MAE):", knn_test_mae_k)
print("Mean Absolute Percentage Error (MAPE):", knn_test_mape_k)

Root Mean Squared Error (RMSE):  1771.4069
Mean Absolute Error (MAE): 1293.0927
Mean Absolute Percentage Error (MAPE): 12.483


### Model 3: Neural Networks

In [45]:
# train the neural network model

NN_r = MLPRegressor(hidden_layer_sizes=(128,128), activation='relu', solver='adam', 
                    learning_rate_init=0.01, max_iter=300, batch_size=64, verbose=True, random_state=1).fit(train_X, train_y)

Iteration 1, loss = 62241872.60959920
Iteration 2, loss = 55549043.39346359
Iteration 3, loss = 32532641.69537398
Iteration 4, loss = 9093557.36735852
Iteration 5, loss = 3595970.62966846
Iteration 6, loss = 2038654.21346403
Iteration 7, loss = 1516542.05526441
Iteration 8, loss = 1236383.27512160
Iteration 9, loss = 1079101.07627168
Iteration 10, loss = 968258.60865205
Iteration 11, loss = 887173.22421761
Iteration 12, loss = 824910.06757365
Iteration 13, loss = 781614.18244405
Iteration 14, loss = 737321.99198353
Iteration 15, loss = 704749.26804677
Iteration 16, loss = 680809.73883152
Iteration 17, loss = 657951.77920380
Iteration 18, loss = 644939.37162865
Iteration 19, loss = 641731.89840018
Iteration 20, loss = 611066.66130733
Iteration 21, loss = 596540.54516508
Iteration 22, loss = 583348.63527669
Iteration 23, loss = 577995.83176337
Iteration 24, loss = 562802.62523360
Iteration 25, loss = 550649.57637930
Iteration 26, loss = 544731.43659961
Iteration 27, loss = 540418.5023094

In [46]:
# performance on training dataset

predicted_y_training_NN = NN_r.predict(train_X)

nn_train_rmse = round(mean_squared_error(train_y, predicted_y_training_NN) ** 0.5, 4)
nn_train_mae = round(mean_absolute_error(train_y, predicted_y_training_NN), 4)
nn_train_mape = round(mean_absolute_percentage_error(train_y, predicted_y_training_NN) * 100, 4)

print("Root Mean Squared Error (RMSE): ", nn_train_rmse)
print("Mean Absolute Error (MAE): ", nn_train_mae)
print("Mean Absolute Percentage Error (MAPE): ", nn_train_mape)

Root Mean Squared Error (RMSE):  796.069
Mean Absolute Error (MAE):  598.5935
Mean Absolute Percentage Error (MAPE):  5.9188


In [47]:
# performance on test dataset

predicted_y_test_NN = NN_r.predict(test_X)

nn_test_rmse = round(mean_squared_error(test_y, predicted_y_test_NN) ** 0.5, 4)
nn_test_mae = round(mean_absolute_error(test_y, predicted_y_test_NN), 4)
nn_test_mape = round(mean_absolute_percentage_error(test_y, predicted_y_test_NN) * 100, 4)

print("Root Mean Squared Error (RMSE): ", nn_test_rmse)
print("Mean Absolute Error (MAE): ", nn_test_mae)
print("Mean Absolute Percentage Error (MAPE): ", nn_test_mape)

Root Mean Squared Error (RMSE):  1225.0593
Mean Absolute Error (MAE):  938.2758
Mean Absolute Percentage Error (MAPE):  9.266


#### Hyperparameter tuning

In [49]:
# tune the learning rate

learning_rates = [0.002, 0.005, 0.01, 0.05, 0.1]
nn_results = []

for lr in learning_rates:
    NN_r2 = MLPRegressor(hidden_layer_sizes=(128,128), activation='relu', solver='adam', 
                    learning_rate_init=0.01, max_iter=300, batch_size=64, verbose=True, random_state=1).fit(train_X, train_y)
    predicted_test = NN_r2.predict(test_X)
    test_rmse = round(mean_squared_error(test_y, predicted_test) ** 0.5, 4)
    nn_results.append({'learning_rate_init': lr,'test_RMSE': test_rmse})

nn_results_df = pd.DataFrame(nn_results)
print(nn_results_df)

Iteration 1, loss = 62241872.60959920
Iteration 2, loss = 55549043.39346359
Iteration 3, loss = 32532641.69537398
Iteration 4, loss = 9093557.36735852
Iteration 5, loss = 3595970.62966846
Iteration 6, loss = 2038654.21346403
Iteration 7, loss = 1516542.05526441
Iteration 8, loss = 1236383.27512160
Iteration 9, loss = 1079101.07627168
Iteration 10, loss = 968258.60865205
Iteration 11, loss = 887173.22421761
Iteration 12, loss = 824910.06757365
Iteration 13, loss = 781614.18244405
Iteration 14, loss = 737321.99198353
Iteration 15, loss = 704749.26804677
Iteration 16, loss = 680809.73883152
Iteration 17, loss = 657951.77920380
Iteration 18, loss = 644939.37162865
Iteration 19, loss = 641731.89840018
Iteration 20, loss = 611066.66130733
Iteration 21, loss = 596540.54516508
Iteration 22, loss = 583348.63527669
Iteration 23, loss = 577995.83176337
Iteration 24, loss = 562802.62523360
Iteration 25, loss = 550649.57637930
Iteration 26, loss = 544731.43659961
Iteration 27, loss = 540418.5023094

In [50]:
# tune the hidden layers 

hidden_layer_options = [(64,64), (128,128), (256,128), (256,256)]
results = []

for hidden_layers in hidden_layer_options:
    nn_model = MLPRegressor(hidden_layer_sizes=hidden_layers, activation='relu', solver='adam',
                            learning_rate_init=0.002, max_iter=2000, batch_size=64, random_state=1, verbose=False).fit(train_X, train_y)
    predicted_test = nn_model.predict(test_X)
    test_rmse = round(mean_squared_error(test_y, predicted_test) ** 0.5, 4)
    results.append({'hidden_layer_sizes': hidden_layers, 'Test_RMSE': test_rmse})

hidden_layer_tuning_results = pd.DataFrame(results)
print(hidden_layer_tuning_results)

  hidden_layer_sizes  Test_RMSE
0           (64, 64)  1435.3635
1         (128, 128)  1391.0915
2         (256, 128)  1256.3929
3         (256, 256)  1310.8242


In [103]:
# train the neural network model after tuning hyperparameters

NN_r_h = MLPRegressor(hidden_layer_sizes=(256,128), activation='relu', solver='adam', 
                    learning_rate_init=0.002, max_iter=2000, batch_size=64, verbose=True, random_state=1).fit(train_X, train_y)

Iteration 1, loss = 62778134.88572846
Iteration 2, loss = 62481179.56141254
Iteration 3, loss = 61680512.61134642
Iteration 4, loss = 59849702.63063973
Iteration 5, loss = 56308930.86064056
Iteration 6, loss = 50541185.45199186
Iteration 7, loss = 42425816.84839187
Iteration 8, loss = 32434921.01290184
Iteration 9, loss = 22398516.83784443
Iteration 10, loss = 14098002.75774028
Iteration 11, loss = 8960856.87116676
Iteration 12, loss = 6195388.11106335
Iteration 13, loss = 4536142.02676465
Iteration 14, loss = 3482792.16850174
Iteration 15, loss = 2818692.72128889
Iteration 16, loss = 2383748.53735909
Iteration 17, loss = 2099940.36312240
Iteration 18, loss = 1896417.82380031
Iteration 19, loss = 1747204.54852342
Iteration 20, loss = 1627508.96315037
Iteration 21, loss = 1526643.58598575
Iteration 22, loss = 1445231.74368730
Iteration 23, loss = 1367772.02361941
Iteration 24, loss = 1305112.54495194
Iteration 25, loss = 1249860.76549970
Iteration 26, loss = 1200420.28963689
Iteration 2

In [105]:
# performance on training dataset

predicted_y_training_NN_h = NN_r_h.predict(train_X)

nn_train_rmse_h = round(mean_squared_error(train_y, predicted_y_training_NN_h) ** 0.5, 4)
nn_train_mae_h = round(mean_absolute_error(train_y, predicted_y_training_NN_h), 4)
nn_train_mape_h = round(mean_absolute_percentage_error(train_y, predicted_y_training_NN_h) * 100, 4)

print("Root Mean Squared Error (RMSE): ", nn_train_rmse_h)
print("Mean Absolute Error (MAE): ", nn_train_mae_h)
print("Mean Absolute Percentage Error (MAPE): ", nn_train_mape_h)

Root Mean Squared Error (RMSE):  587.7752
Mean Absolute Error (MAE):  421.0617
Mean Absolute Percentage Error (MAPE):  4.0861


In [107]:
# performance on test dataset

predicted_y_test_NN_h = NN_r.predict(test_X)

nn_test_rmse_h = round(mean_squared_error(test_y, predicted_y_test_NN_h) ** 0.5, 4)
nn_test_mae_h = round(mean_absolute_error(test_y, predicted_y_test_NN_h), 4)
nn_test_mape_h = round(mean_absolute_percentage_error(test_y, predicted_y_test_NN_h) * 100, 4)

print("Root Mean Squared Error (RMSE): ", nn_test_rmse_h)
print("Mean Absolute Error (MAE): ", nn_test_mae_h)
print("Mean Absolute Percentage Error (MAPE): ", nn_test_mape_h)

Root Mean Squared Error (RMSE):  1225.0593
Mean Absolute Error (MAE):  938.2758
Mean Absolute Percentage Error (MAPE):  9.266


In [111]:
# final model comparison

model_comparison = pd.DataFrame({
    'Model': ['Linear Regression','k-NN (k=5)','k-NN (k=4)','Neural Network (128,128)','Neural Network (256,128)' ],
    'Tuning': ['No','Before tuning','After tuning','Before tuning','After tuning'],
    'Train_RMSE': [ linear_train_rmse, knn_train_rmse, knn_train_rmse_k, nn_train_rmse, nn_train_rmse_h ],
    'Test_RMSE': [ linear_test_rmse, knn_test_rmse, knn_test_rmse_k, nn_test_rmse, nn_test_rmse_h ]
})

model_comparison

Unnamed: 0,Model,Tuning,Train_RMSE,Test_RMSE
0,Linear Regression,No,1304.0191,1435.7112
1,k-NN (k=5),Before tuning,1370.4232,1779.9966
2,k-NN (k=4),After tuning,1293.686,1771.4069
3,"Neural Network (128,128)",Before tuning,796.069,1225.0593
4,"Neural Network (256,128)",After tuning,587.7752,1225.0593
