In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler

from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error, mean_absolute_error


import time

In [None]:
# Load the raw spreadsheet and keep only the columns that have at least 200
# non-null entries, then print a schema summary.
df = pd.read_excel("mpg.data.xlsx").dropna(axis=1, thresh=200)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 406 entries, 0 to 405
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     406 non-null    int64  
 2   displayments  406 non-null    float64
 3   horsepower    400 non-null    float64
 4   weight        406 non-null    int64  
 5   acceleration  406 non-null    float64
 6   model year    406 non-null    int64  
 7   origin        406 non-null    int64  
 8   car name      406 non-null    object 
dtypes: float64(4), int64(4), object(1)
memory usage: 28.7+ KB


# Preprocessing



1.   Handling missing values
2.   Drop the 'car name' feature
3.   Split the Dataset
4.   Encoding the categorical feature 'origin'
5.   Scaling the features




In [None]:
# Drop rows whose target ('mpg') is missing, then fill gaps in 'horsepower'
# with the median of the rows that remain.
# NOTE(review): the median is computed before the train/test split performed
# further down, so test rows contribute to the imputed value.
df = (
    df.dropna(axis=0, subset=['mpg'])
      .assign(horsepower=lambda frame: frame['horsepower'].fillna(frame['horsepower'].median()))
)
# Later preprocessing works on a separate copy so `df` stays un-encoded.
df_prepro = df.copy()

In [None]:
# Replace the numeric 'origin' codes with readable region labels; these
# labels are what the one-hot encoder sees later on.
dict_region = {
    1: 'USA',
    2: 'Europe',
    3: 'Japan',
}
df_prepro['origin'] = df_prepro['origin'].replace(to_replace=dict_region)

In [None]:
# Target is 'mpg'; features are every column except the target and the
# free-text 'car name'. Columns are selected by label rather than by position
# so the feature set cannot silently change if the column order does.
feature_columns = ['cylinders', 'displayments', 'horsepower', 'weight',
                   'acceleration', 'model year', 'origin']
y = df_prepro['mpg']
X = df_prepro[feature_columns]

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical between runs.
seed = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=seed, test_size=0.2
)

In [None]:
# One-hot encode 'origin'. The encoder is fitted on the training split only
# and reused for the test split; a category not seen during fitting is
# encoded as all zeros (handle_unknown="ignore").
encoder = OneHotEncoder(handle_unknown="ignore")
encoder.fit(X_train[['origin']])

# Give the dummy columns string names. Integer names (0, 1, 2) next to the
# existing string names produce mixed-type column labels, which recent
# scikit-learn versions reject when the frame is passed to an estimator.
origin_columns = [f'origin_{category}' for category in encoder.categories_[0]]

# Building each dummy frame on the split's own index lets `join` align rows
# directly, without a reset_index / set_index round trip.
enc = pd.DataFrame(encoder.transform(X_train[['origin']]).toarray(),
                   index=X_train.index, columns=origin_columns)
X_train = X_train.drop(columns=['origin']).join(enc)

enc = pd.DataFrame(encoder.transform(X_test[['origin']]).toarray(),
                   index=X_test.index, columns=origin_columns)
X_test = X_test.drop(columns=['origin']).join(enc)

In [None]:
# Standardize the features: the scaler learns its statistics from the
# training split only and applies them unchanged to the test split.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# ML process

In [None]:
# Baseline comparison: fit each regressor with default hyperparameters and
# print RMSE / MAE / R2, first for the held-out test split and then for the
# training split.
lassoReg = linear_model.Lasso()
linear = LinearRegression()
knr = KNeighborsRegressor()
# The tree-based estimators are randomized; pinning them to the split seed
# makes the baseline numbers reproducible across runs.
regr = DecisionTreeRegressor(random_state=seed)
svr = SVR()
rfr = RandomForestRegressor(random_state=seed)
hgbr = HistGradientBoostingRegressor()
model = [lassoReg, linear, knr, regr, svr, rfr, hgbr]

for estimator in model:
    estimator.fit(X_train, y_train)
    print(estimator)

    # First pass reports the test split, second pass the training split; the
    # trailing separator reproduces the original blank-line layout.
    for features, target, trailer in ((X_test, y_test, '\n'),
                                      (X_train, y_train, '\n\n\n')):
        predictions = estimator.predict(features)
        print(f'Root Mean Squared Error: {np.sqrt(mean_squared_error(target, predictions))}')
        print(f'Mean Absolute Error: {mean_absolute_error(target, predictions)}')
        print(f'R2:{r2_score(target, predictions)}{trailer}')
  

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)
Root Mean Squared Error: 2.941918461902098
Mean Absolute Error: 2.251484227647977
R2:0.8390281440832619

Root Mean Squared Error: 3.662204313444849
Mean Absolute Error: 2.709466472002321
R2:0.7860837558105516



LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
Root Mean Squared Error: 2.8876748114372144
Mean Absolute Error: 2.2880728919667046
R2:0.8449094781983056

Root Mean Squared Error: 3.369816650789179
Mean Absolute Error: 2.604850203627875
R2:0.8188780150628445



KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform')
Root Mean Squared Error: 2.278413922008027
Mean Absolute Error: 1.85
R2:0.9034496307001894

Root Mean Squared Err

# KNeighborsRegressor - Grid

In [None]:
# Exhaustive grid search over KNN hyperparameters with shuffled 5-fold CV;
# the elapsed wall-clock time is printed in minutes.
# NOTE(review): `algorithm` and `leaf_size` presumably affect only the speed
# of the neighbor lookup, not the predictions — confirm against the
# scikit-learn docs before keeping them in the grid.
start = time.time()
param_knr_grid = dict(
    n_neighbors=np.arange(3, 17, 2),
    algorithm=['ball_tree', 'kd_tree', 'brute'],
    leaf_size=np.arange(5, 35, 1),
    p=[1, 2, 3],
)

knr = KNeighborsRegressor()
gknr = GridSearchCV(
    estimator=knr,
    param_grid=param_knr_grid,
    cv=KFold(n_splits=5, shuffle=True, random_state=101),
)
gknr.fit(X_train, y_train)

end = time.time()
print(f'{(end - start)/60:.2f}m')

0.76m


In [None]:
# Score the tuned KNN model on the held-out test split.
pred_test = gknr.predict(X_test)
print(
    f'Root Mean Squared Error: {np.sqrt(mean_squared_error(y_test, pred_test))}',
    f'Mean Absolute Error: {mean_absolute_error(y_test, pred_test)}',
    f'R2:{r2_score(y_test,pred_test)}',
    sep='\n',
)

Root Mean Squared Error: 2.2463971153827633
Mean Absolute Error: 1.7474999999999998
R2:0.9061440622060856


In [None]:
# Score the tuned KNN model on the training split, for comparison with the
# test metrics. Despite its name, `pred_test` holds training predictions here.
pred_test = gknr.predict(X_train)
print(
    f'Root Mean Squared Error: {np.sqrt(mean_squared_error(y_train, pred_test))}',
    f'Mean Absolute Error: {mean_absolute_error(y_train, pred_test)}',
    f'R2:{r2_score(y_train, pred_test)}',
    sep='\n',
)

Root Mean Squared Error: 2.5280325210868937
Mean Absolute Error: 1.7334591194968554
R2:0.8980648565901117


In [None]:
# Show the KNN estimator (with its hyperparameters) selected by the grid search.
print(gknr.best_estimator_)

KNeighborsRegressor(algorithm='ball_tree', leaf_size=5, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=1,
                    weights='uniform')


# HistGradientBoostingRegressor - Grid

In [None]:
# Exhaustive grid search over histogram gradient-boosting hyperparameters
# with shuffled 5-fold CV; the elapsed wall-clock time is printed in minutes.
start = time.time()
param_hgbr_grid = dict(
    learning_rate=np.arange(0.1, 0.6, 0.1),
    max_iter=np.arange(90, 110, 5),
    min_samples_leaf=np.arange(16, 21, 1),
)

hgbr = HistGradientBoostingRegressor()
ghgbr = GridSearchCV(
    estimator=hgbr,
    param_grid=param_hgbr_grid,
    cv=KFold(n_splits=5, shuffle=True, random_state=101),
)
ghgbr.fit(X_train, y_train)
end = time.time()
print(f'{(end - start)/60:.2f}m')

1.10m


In [None]:
# Score the tuned gradient-boosting model on the held-out test split.
pred_test = ghgbr.predict(X_test)
print(
    f'Root Mean Squared Error: {np.sqrt(mean_squared_error(y_test, pred_test))}',
    f'Mean Absolute Error: {mean_absolute_error(y_test, pred_test)}',
    f'R2:{r2_score(y_test,pred_test)}',
    sep='\n',
)

Root Mean Squared Error: 2.1865367360895163
Mean Absolute Error: 1.6810654370524905
R2:0.9110794286395594


In [None]:
# Score the tuned gradient-boosting model on the training split, for
# comparison with the test metrics. Despite its name, `pred_test` holds
# training predictions here.
pred_test = ghgbr.predict(X_train)
print(
    f'Root Mean Squared Error: {np.sqrt(mean_squared_error(y_train, pred_test))}',
    f'Mean Absolute Error: {mean_absolute_error(y_train, pred_test)}',
    f'R2:{r2_score(y_train, pred_test)}',
    sep='\n',
)

Root Mean Squared Error: 1.5026043086660708
Mean Absolute Error: 1.0485981630450585
R2:0.96398794664418


In [None]:
# Show the gradient-boosting estimator (with its hyperparameters) selected by the grid search.
print(ghgbr.best_estimator_)

HistGradientBoostingRegressor(l2_regularization=0.0, learning_rate=0.1,
                              loss='least_squares', max_bins=255,
                              max_depth=None, max_iter=100, max_leaf_nodes=31,
                              min_samples_leaf=20, n_iter_no_change=None,
                              random_state=None, scoring=None, tol=1e-07,
                              validation_fraction=0.1, verbose=0,
                              warm_start=False)


# RandomForestRegressor - Grid

In [None]:
# Exhaustive grid search over random-forest hyperparameters with shuffled
# 5-fold CV; the elapsed wall-clock time is printed in minutes.
start = time.time()
param_rfr_grid = {'n_estimators': np.arange(90, 110, 2),
                  # 1.0 replaces the former 'auto' option: for regressors
                  # 'auto' meant "consider all features", and the string was
                  # removed from recent scikit-learn releases. The float form
                  # is accepted by old and new versions alike.
                  'max_features': [1.0, 'sqrt'],
                  'min_samples_split': np.arange(2, 5, 1),
                  'min_samples_leaf': np.arange(1, 4, 1),
                  'bootstrap': [True, False]}


# Seed the forest so that score differences between grid points come from the
# hyperparameters, not from run-to-run randomness, and the search is repeatable.
rfr = RandomForestRegressor(random_state=seed)
grfr = GridSearchCV(rfr, param_rfr_grid, cv=KFold(n_splits=5,
                                                  shuffle=True, random_state=101))
grfr.fit(X_train, y_train)
end = time.time()
print(f'{(end - start)/60:.2f}m')

4.39m


In [None]:
# Score the tuned random forest on the held-out test split.
pred_test = grfr.predict(X_test)
print(
    f'Root Mean Squared Error: {np.sqrt(mean_squared_error(y_test, pred_test))}',
    f'Mean Absolute Error: {mean_absolute_error(y_test, pred_test)}',
    f'R2:{r2_score(y_test,pred_test)}',
    sep='\n',
)

Root Mean Squared Error: 2.092585226440881
Mean Absolute Error: 1.5533653846153839
R2:0.9185567684134097


In [None]:
# Score the tuned random forest on the training split, for comparison with
# the test metrics. Despite its name, `pred_test` holds training predictions here.
pred_test = grfr.predict(X_train)
print(
    f'Root Mean Squared Error: {np.sqrt(mean_squared_error(y_train, pred_test))}',
    f'Mean Absolute Error: {mean_absolute_error(y_train, pred_test)}',
    f'R2:{r2_score(y_train, pred_test)}',
    sep='\n',
)

Root Mean Squared Error: 1.0651318958408704
Mean Absolute Error: 0.7417513304305754
R2:0.9819047159297446


In [None]:
# Show the random-forest estimator (with its hyperparameters) selected by the grid search.
print(grfr.best_estimator_)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='sqrt', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=104, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)
