In [6]:
import warnings, ydata_profiling, shap
import pandas as pd
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, VotingRegressor, StackingRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, make_scorer

from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

from tqdm import tqdm
from colorama import Fore, Back, Style

sns.set_style('dark')
warnings.filterwarnings('ignore')
shap.initjs()

ModuleNotFoundError: No module named 'pydantic.v1'

In [None]:
# Load the dataset; the 'index' column is dropped right after reading
DATA_PATH = r"C:\Users\admin\Documents\Bafokeng\Portfolio Projects\Used cars\autos.csv"

df = pd.read_csv(DATA_PATH).drop(columns='index')
df.head()

: 

In [None]:
# Column names, dtypes and non-null counts of the freshly loaded frame
df.info()

: 

In [None]:
# missingno bar chart of the frame, columns sorted in ascending order
msno.bar(df, sort='ascending')

: 

In [None]:
# Build the automated profiling report for the raw frame (shown as the cell output)
ydata_profiling.ProfileReport(df)

: 

Problems with the data


### Missing Values:

vehicleType has 37869 (10.2%) missing values
gearbox has 20209 (5.4%) missing values
model has 20484 (5.5%) missing values
fuelType has 33386 (9.0%) missing values
notRepairedDamage has 72060 (19.4%) missing values
### DateTime
Dtype of dateCrawled, lastSeen, and dateCreated columns is object --> Convert to DateTime
### Zeros

price has 10778 (2.9%) zeros
powerPS has 40820 (11.0%) zeros
monthOfRegistration has 37675 (10.1%) zeros
### High Correlation

price is highly overall correlated with yearOfRegistration and 1 other field
yearOfRegistration is highly overall correlated with price and 1 other field
powerPS is highly overall correlated with price
vehicleType is highly overall correlated with yearOfRegistration
### Other (Possible) Problems

German categorical values --> English
nrOfPictures is irrelevant --> Drop
I think the dateCrawled and dateCreated columns are the same (time doesn't matter here). These two, plus maybe even the lastSeen column, are irrelevant
I don't know how I feel about the abtest column! is it also irrelevant?

In [None]:
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(14, 6))

# (column, panel title) pairs, drawn left to right with the same two-colour bars
panels = [('seller', 'Seller Type'), ('offerType', 'Offer Type'), ('abtest', 'Test Type')]

for ax, (column, panel_title) in zip(axes, panels):
    df[column].value_counts().plot(kind='bar', title=panel_title,  color=['red', 'blue'], ax=ax, rot=30, xlabel='')

: 

#### I think later I'm gonna have to do more data preprocessing.
#### Since except for 3 rows all the other ads are private, I'm also gonna drop the seller column. It's basically useless.
#### As shown, the offerType column is also useless so I'm gonna drop this one, too.
#### Also, I noticed that the yearOfRegistration column has some odd and extreme values. So, I'm gonna define a solid range for this field. I think a range between 1980 and 2023 is decent.
#### German to English:
manuell --> Manual
automatik --> Automatic
ja --> Yes
nein --> No
benzin --> Petrol
andere --> Other

In [None]:
# 1. + 2. Drop 'dateCrawled', 'lastSeen', 'nrOfPictures', 'seller' and 'offerType'
#         in one pass (returns a new frame instead of mutating in place)
df = df.drop(columns=['lastSeen', 'dateCrawled', 'nrOfPictures', 'seller', 'offerType'])

# DateTime conversion of the one timestamp column that is kept
df['dateCreated'] = pd.to_datetime(df['dateCreated'])

# 3. Define a range for the 'yearOfRegistration'
# .copy() makes the filtered result an independent frame, so the column
# assignments below never write into a slice of the unfiltered data.
df = df[df['yearOfRegistration'].between(1980, 2023)].copy()

# 4. German to English, as best as I can
# Only the two known labels are mapped; anything else -- in particular a missing
# gearbox -- stays NaN instead of being relabelled as 'Automatic'.
df['gearbox'] = df['gearbox'].map({'manuell': 'Manual', 'automatik': 'Automatic'})
# Missing damage information is kept as the explicit placeholder string 'NaN'
df['notRepairedDamage'] = df['notRepairedDamage'].map({'ja': 'Yes', 'nein': 'No'}).fillna('NaN')
df.loc[df['fuelType'] == 'benzin', 'fuelType'] = 'Petrol'
df.loc[df['fuelType'] == 'andere', 'fuelType'] = 'Other'

df.head()

: 

In [None]:
# Fixed palettes reused by the bar charts below: two 10-colour sets and one 5-colour set
colors_10_1 = sns.color_palette("Spectral", 10)
colors_10_2 = sns.color_palette("Set3", 10)
colors_5 = sns.color_palette('Accent', 5)

: 

In [None]:
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))

# Count the ads per brand once, then draw the ten most and ten least frequent brands
brand_counts = df['brand'].value_counts()

brand_counts.head(10).plot(kind='bar', title='Top 10 Brands - Ads Count', color=colors_10_1, ax=axes[0], xlabel='Brand', ylabel='Count', grid=True)
brand_counts.tail(10).plot(kind='bar', title='Bottom 10 Brands - Ads Count', color=colors_10_1, ax=axes[1], xlabel='Brand', ylabel='Count', grid=True)

: 

In [None]:
# Ads per (brand, model) pair, most frequent first
group_by_brand = (
    df.groupby('brand')['model']
      .value_counts()
      .sort_values(ascending=False)
)

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(24, 8))

# Styling shared by both panels
pair_bar_style = dict(kind='bar', grid=True, rot=30, xlabel="(Brand, Model) Pairs", ylabel='Count', color=colors_10_2)

group_by_brand.head(10).plot(title='Top 10 Models by Brand - Ads Count', ax=axes[0], **pair_bar_style)
group_by_brand.tail(10).plot(title='Bottom 10 Models by Brand - Ads Count', ax=axes[1], **pair_bar_style)

: 

In [None]:
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(24, 8))

# Left panel: ads per vehicle type
vehicle_type_counts = df['vehicleType'].value_counts()
vehicle_type_counts.plot(kind='bar', color=colors_5, ax=axes[0], grid=True, rot=30, xlabel='Vehicle Type', ylabel='Count', title='')

# Right panel: the ten most frequent (vehicle type, model, brand) combinations
group_by_vehicle_type = (
    df.groupby('vehicleType')[['model', 'brand']]
      .value_counts()
      .sort_values(ascending=False)
)
group_by_vehicle_type.head(10).plot(kind='bar', grid=True, color=colors_10_1, ax=axes[1], title='Top 10 Cars by Vehicle Type, Brand, and Model - Ads Count')

: 

In [None]:
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(24, 8))

registration_year = df['yearOfRegistration']
yr_mean = registration_year.mean()
yr_median = registration_year.median()

# Left: histogram, right: kernel density estimate
registration_year.plot(kind='hist', bins=10, ax=axes[0], title='Histogram of the year of registration', xlabel='Year', color='blue')
registration_year.plot(kind='kde', ax=axes[1], title='KDE of the year of registration', xlabel='Year',  color='red')

# Mean (dashed) and median (solid) markers, coloured to contrast with each panel
for panel, marker_color, marker_alpha in ((axes[0], 'red', .7), (axes[1], 'blue', .5)):
    panel.axvline(yr_mean, color=marker_color, linestyle='dashed', linewidth=1, label='MEAN', alpha=marker_alpha)
    panel.axvline(yr_median, color=marker_color, linestyle='solid', linewidth=1, label='MEDIAN', alpha=marker_alpha)

axes[0].legend()
axes[1].legend()

: 

I think there's also something wrong with the kilometer field. As illustrated, the distribution is concentrated around 150K, and there are no values higher than this. Plus, the values do not seem to vary much. Either way, I will leave this field alone and just use it as is.

Min. of Kilometer: 5,000 -- Max. of Kilometer: 150,000

In [None]:
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(24, 8))

mileage = df['kilometer']
km_mean = mileage.mean()
km_median = mileage.median()

# Left: histogram, right: kernel density estimate
mileage.plot(kind='hist', bins= 5, ax=axes[0], title='Histogram of the Kilometer', xlabel='KM', color='blue')
mileage.plot(kind='kde', ax=axes[1], title='KDE of the Kilometer', xlabel='KM',  color='red')

# Mean (dashed) and median (solid) markers on both panels
for panel, marker_color in ((axes[0], 'red'), (axes[1], 'blue')):
    panel.axvline(km_mean, color=marker_color, linestyle='dashed', linewidth=2, label='MEAN', alpha=.5)
    panel.axvline(km_median, color=marker_color, linestyle='solid', linewidth=2, label='MEDIAN', alpha=.5)
    panel.legend()

# Observed range of the field, printed under the figure
print(f"Min. of Kilometer: {df['kilometer'].min()} -- Max. of Kilometer: {df['kilometer'].max()}\n")

: 

I think I should drop rows with the fuel types of lpg, cng, hybrid, other and elektro.

In [None]:
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(24, 8))

# (column, bar colours, title, x-axis label) for each panel, left to right
category_panels = [
    ('gearbox', ['red', 'blue'], 'No. of Cars by Gearbox', 'Gearbox Type'),
    ('fuelType', ['red', 'blue', 'green', 'purple'], 'No. of Cars by Fuel Type', 'Fuel Type'),
    ('notRepairedDamage', ['red', 'blue', 'green'], 'Repaired/Damaged?', 'Repaired/Damaged'),
]

for panel, (column, bar_colors, panel_title, x_label) in zip(axes, category_panels):
    df[column].value_counts().plot(kind='bar', ax=panel, color=bar_colors, title=panel_title, xlabel=x_label, ylabel='Count', rot=30, grid=True)

: 

In [None]:
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(24, 8))

# Rows whose month of registration is 0 are left out of both plots
months = df[df['monthOfRegistration'] > 0]
month_values = months['monthOfRegistration']

month_values.plot(kind='hist', ax=axes[0], title='Histogram of the month of registration', xlabel='Month', color='blue')
month_values.plot(kind='kde', ax=axes[1], title='KDE of the month of registration', xlabel='Month', color='red')

: 

There's no correlation between Price and Month of registration, so I'm gonna drop it later.

In [None]:
# Two-column frames whose correlation matrices are compared side by side
month_vs_price = df[['monthOfRegistration', 'price']]
year_vs_price = df[['yearOfRegistration', 'price']]
methods = ['kendall', 'spearman', 'pearson']

fig, axes = plt.subplots(3, 2, figsize=(14, 14), sharey=True)
fig.suptitle('Correlations', fontweight='bold', fontsize='14')

# One row per correlation method: month on the left, year on the right
for row, method in enumerate(methods):
    month_ax, year_ax = axes[row, 0], axes[row, 1]

    sns.heatmap(month_vs_price.corr(method=method), annot=True, ax=month_ax)
    month_ax.set_title(f"Price vs. Month of Registration ({method})")

    sns.heatmap(year_vs_price.corr(method=method), annot=True, ax=year_ax)
    year_ax.set_title(f"Price vs. Year of Registration ({method})")

: 

The range of the price bothers me:
No. of cars with a price value higher than 20K: 16,469
No. of cars with a price value higher than 30K: 5,556
No. of cars with a price value higher than 40K: 2,501
Mean of price: 16,611.7, Median of price: 2,944.0
Min. price: 0, Max. price: 2,147,483,647

In [None]:
# Price sanity check: how many listings exceed 20K / 30K / 40K, plus mean, median, min and max.
# Style.BRIGHT highlights each number; Style.RESET_ALL at the start of the next line clears it.
# NOTE(review): the last line ends in BRIGHT without a reset, so the style carries into later output.
print(f"No. of cars with a price value higher than 20K: {Style.BRIGHT}{len(df[df['price'] > 20_000])}")
print(f"{Style.RESET_ALL}No. of cars with a price value higher than 30K: {Style.BRIGHT}{len(df[df['price'] > 30_000])}")
print(f"{Style.RESET_ALL}No. of cars with a price value higher than 40K: {Style.BRIGHT}{len(df[df['price'] > 40_000])}")
print(f"{Style.RESET_ALL}Mean of price: {Style.BRIGHT}{df['price'].mean():.2f}{Style.RESET_ALL}, Median of price: {Style.BRIGHT}{df['price'].median()}")
print(f"{Style.RESET_ALL}Min. price: {Style.BRIGHT}{df['price'].min()}{Style.RESET_ALL}, Max. price: {Style.BRIGHT}{df['price'].max()}")

: 

As shown, the range of [200, 20K] is better than the other two and it eliminates a large proportion of outliers. We can go even lower, but I don't like doing so. Just remember that the prices are skewed.

In [None]:
# 95th and 5th percentile of the price, in that order
df['price'].quantile(.95), df['price'].quantile(.05)

: 

In [None]:
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(24, 12))

bins = 10

# One column of panels per upper price cap: (cap, histogram title, KDE title)
price_caps = [
    (20_000, 'Histogram of Price in Range [200, 20K]', 'KDE of Price in Range [200, 20K]'),
    (30_000, 'Histogram of Price in Range [200, 30K]', 'KDE of Price in Range [200, 30K]'),
    (40_000, 'Histogram of Price in Range [200, 40K]', 'KDE of Price in Range [200, 40K]'),
]

# Prices restricted to [200, cap] for each cap
price_20, price_30, price_40 = (
    df.loc[df['price'].between(200, cap), 'price'] for cap, hist_title, kde_title in price_caps
)
capped_prices = [price_20, price_30, price_40]

# Top row: histograms with mean (dashed) / median (solid) markers and a text summary
for col, (series, (cap, hist_title, kde_title)) in enumerate(zip(capped_prices, price_caps)):
    panel = axes[0, col]
    series.plot(kind='hist', bins=bins, ax=panel, title=hist_title, xlabel='Price', color=colors_5[col])
    panel.axvline(series.mean(), color='red', linestyle='dashed', linewidth=2, label='MEAN', alpha=.5)
    panel.axvline(series.median(), color='red', linestyle='solid', linewidth=2, label='MEDIAN', alpha=.5)
    panel.legend()
    txt = f'MEDIAN: {series.median()}\nMEAN: {series.mean():.2f}'
    panel.text(10_000, 85_000, txt)

# Bottom row: kernel density estimates with the same markers in blue
for col, (series, (cap, hist_title, kde_title)) in enumerate(zip(capped_prices, price_caps)):
    panel = axes[1, col]
    series.plot(kind='kde', ax=panel, title=kde_title, xlabel='Price',  color=colors_5[col])
    panel.axvline(series.mean(), color='blue', linestyle='dashed', linewidth=2, label='MEAN', alpha=.5)
    panel.axvline(series.median(), color='blue', linestyle='solid', linewidth=2, label='MEDIAN', alpha=.5)
    panel.legend()

: 

Data Preprocessing - Part 2

#### Drop null values
#### As mentioned above, the range of price will be set to [200, 20K]
#### Drop the extreme values of the powerPS
#### Drop the postalCode, dateCreated, name, and monthOfRegistration columns
#### Add a new column for the age of the car. Then drop the yearOfRegistration column
#### Convert categorical values to numerical (using LabelEncoder)

In [None]:
# Row filters, each returning a new frame (no in-place mutation of a slice)
# drop null values
df = df.dropna()

# Drop cars with unacceptable prices
df = df[df['price'].between(200, 20_000)]

# Drop cars with extreme powerPS
df = df[(df['powerPS'] > 0) & (df['powerPS'] <= 1000)]

# Drop the 'Other' fuel type and the rows whose damage status is the 'NaN' placeholder.
# .copy() gives an independent frame so the column assignments below are safe.
df = df[df['fuelType'] != 'Other']
df = df[df['notRepairedDamage'] != 'NaN'].copy()

# Add the 'age' column
current_year = 2023
df['age'] = current_year - df['yearOfRegistration']

# Drop unnecessary columns ('yearOfRegistration' is now represented by 'age')
df = df.drop(columns=['postalCode', 'dateCreated', 'name', 'monthOfRegistration', 'yearOfRegistration'])

# Categorical to Numerical
categorical_features = ['vehicleType', 'fuelType', 'brand', 'model', 'abtest', 'notRepairedDamage', 'gearbox']

# One fitted encoder per column is kept, so the integer codes can be turned back
# into labels later with label_encoders[column].inverse_transform(...)
label_encoders = {}

for categorical_feature in categorical_features:
    le = LabelEncoder()
    df[categorical_feature] = le.fit_transform(df[categorical_feature])
    label_encoders[categorical_feature] = le

: 

In [None]:
# First rows of the frame after filtering and label encoding
df.head()

: 

Now let's look at the correlations one more time, since I'm a bit skeptical about the brand.

In [None]:
methods = ['pearson', 'spearman', 'kendall']

fig, axes = plt.subplots(1, 3, figsize=(24, 6), sharey=True)
fig.suptitle('Correlations', fontweight='bold', fontsize='14')

# One heatmap per method; cells with |corr| < 0.2 or corr == 1 are blanked out
for panel, method in zip(axes, methods):
    corr = df.corr(method=method)
    notable = corr.where((corr.abs() >= 0.2) & (corr != 1))
    sns.heatmap(notable, annot=True, ax=panel)
    panel.set_title(f"{method}")

: 

A few notes before jumping into the prediction phase

- I think the data itself is quite ready for this phase. However, I should apply Feature Scaling with Z-Score Norm
 
- I will also split the data with a ratio of 70%, 15%, and 15%.

In [None]:
# Separate the target from the predictors
target = df['price']
features = df.drop(columns='price')

# Z-score the three numeric columns; the label-encoded columns are left as they are.
# NOTE(review): the scaler is fitted on all rows before any train/test split happens,
# so statistics of the later test rows enter the scaling.
numerical_features = ['kilometer', 'powerPS', 'age']
scaler = StandardScaler()
features[numerical_features] = scaler.fit_transform(features[numerical_features])

: 

In [None]:
# First rows of the predictor frame after scaling
features.head()

: 

In [None]:
# Step 1: hold out 15% as the test set.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    features, target, train_size=0.85, random_state=666
)
# Step 2: keep 82.35% of the remaining 85% for training (about 70% overall); the rest is validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, train_size=0.8235, random_state=666
)

: 

In [None]:
# Report the shapes of the three splits: labels in blue, shapes in green
print(f'{Fore.BLUE}X_train Shape: {Fore.GREEN}{X_train.shape}')
print(f'{Fore.BLUE}y_train Shape: {Fore.GREEN}{y_train.shape}')

# Separator: the whole colour-coded '--' string is repeated 13 times
print(f'{Fore.RED}{Back.LIGHTYELLOW_EX}--'*13)

# Back.RESET clears the separator's background before the next label
print(f'{Back.RESET}{Fore.BLUE}X_test Shape: {Fore.GREEN}{X_test.shape}')
print(f'{Fore.BLUE}y_test Shape: {Fore.GREEN}{y_test.shape}')

print(f'{Fore.RED}{Back.LIGHTYELLOW_EX}--'*13)

print(f'{Back.RESET}{Fore.BLUE}X_val Shape:  {Fore.GREEN}{X_val.shape}')
print(f'{Fore.BLUE}y_val Shape:  {Fore.GREEN}{y_val.shape}')

: 

🖥 Modeling 🖥

🤖 DNN 🤖

In this phase, I'm going to train different models in order to predict the price of a used car based on other features (10 in total). I plan to use:
- A Deep Neural Network
- Conventional ML methods for regression
- Ensembling
- XAI using SHAP


General Overview
A DNN with hidden layers of 32, 64, 128, 64, and 32 neurons
Dropout with a probability of 0.2 after each of the first four hidden layers
ReLU activation function for the hidden layers
A Linear activation function for the output layer
Loss: MSE, Metric: MAE
Adam optimizer with the default learning rate
Epoch: 50, Batch Size: 512
EarlyStopping with a focus on the val_loss
Saving checkpoints based on the validation MAE
A Learning Rate Scheduler (Visualized further down)

In [None]:
def _exp_decay(epoch_idx):
    """Learning rate for the given epoch index: 0.001 * exp(-epoch_idx / 10)."""
    return 0.001 * np.exp(-epoch_idx / 10.)


# Training budget
epoch = 50
batch_size = 512

# Stop after 10 epochs without a better val_loss and restore the best weights
es = EarlyStopping(monitor='val_loss', mode='min', patience=10, restore_best_weights=True, verbose=1)
# Keep only the checkpoint with the lowest validation MAE
mc = ModelCheckpoint('./best_model.h5', monitor='val_mean_absolute_error', mode='min', save_best_only=True, verbose=1)
lr_schedule = LearningRateScheduler(_exp_decay)

: 

In [None]:
# Fully connected regressor: 32 -> 64 -> 128 -> 64 -> 32 -> 1,
# with 20% dropout after each of the first four hidden layers
model = Sequential([
    Dense(32, input_dim=X_train.shape[1], activation='relu', name='Dense_1'),
    Dropout(0.2, name='Dropout_1'),
    Dense(64, 'relu', name='Dense_2'),
    Dropout(0.2, name='Dropout_2'),
    Dense(128, 'relu', name='Dense_3'),
    Dropout(0.2, name='Dropout_3'),
    Dense(64, 'relu', name='Dense_4'),
    Dropout(0.2, name='Dropout_4'),
    Dense(32, 'relu', name='Dense_5'),
    Dense(1, 'linear', name='Predictor'),
])

# MSE is the optimised loss; MAE is tracked as a metric
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['mean_absolute_error'],
)

: 

In [None]:
# Draw the layer graph with tensor shapes; rankdir="LR" requests a left-to-right layout
from tensorflow.keras.utils import plot_model
plot_model(model, show_shapes=True, rankdir="LR")

: 

In [None]:
# Train with checkpointing, the learning-rate schedule and early stopping;
# the validation split is evaluated after every epoch
training_callbacks = [mc, lr_schedule, es]

history = model.fit(
    X_train,
    y_train,
    epochs=epoch,
    batch_size=batch_size,
    validation_data=(X_val, y_val),
    callbacks=training_callbacks,
)

: 

In [None]:
# Reload the best checkpoint and score it on the test set

from tensorflow.keras.models import load_model

# Load from the same location ModelCheckpoint writes to ('./best_model.h5'), so the
# weights evaluated here are the ones this run produced.
dnn_model = load_model('./best_model.h5')

# Evaluate the reloaded checkpoint itself (not the in-memory `model`); evaluate()
# returns [loss (MSE), mean_absolute_error], matching the compile() configuration.
dnn_eval = dnn_model.evaluate(X_test, y_test)
dnn_preds = dnn_model.predict(X_test)

print(f"{Fore.BLUE}DNN - Test set\n\nMSE: {Fore.RED}{dnn_eval[0]}\n{Fore.BLUE}MAE: {Fore.RED}{dnn_eval[1]:.2f}")
print(f"{Fore.BLUE}R2-Score: {Fore.RED}{r2_score(y_test, dnn_preds):.3f}")

: 

In [None]:
# The scheduler's rate is logged as 'lr' by older Keras releases and as
# 'learning_rate' by newer ones; accept either so this cell does not raise KeyError.
lr = history.history.get('lr', history.history.get('learning_rate', []))
loss = history.history['loss']
val_loss = history.history['val_loss']
t_mae = history.history['mean_absolute_error']
v_mae = history.history['val_mean_absolute_error']

fig, axes = plt.subplots(3, 1, figsize=(14, 28))

# (curves, y-label, title, title font size, legend location) for each panel, top to bottom;
# each curve is (values, legend label, colour)
curve_panels = [
    ([(loss, 'Training MSE', 'r'), (val_loss, 'Validation MSE', 'b')],
     'MSE', 'Training & Validation Loss', 16, 'upper right'),
    ([(t_mae, 'Training MAE', 'r'), (v_mae, 'Validation MAE', 'b')],
     'MAE', 'Training & Validation MAE', 15, 'best'),
    ([(lr, 'Learning Rate', 'r')],
     'LR', 'Training Learning Rate', 15, 'upper right'),
]

for panel, (curves, y_label, panel_title, title_size, legend_loc) in zip(axes, curve_panels):
    for values, curve_label, curve_color in curves:
        panel.plot(values, label=curve_label, color=curve_color)
    panel.tick_params(labelsize=14)
    panel.legend(loc=legend_loc, fontsize=13)
    panel.set_ylabel(y_label, fontsize=16, weight='bold')
    panel.set_title(panel_title, fontsize=title_size, weight='bold')

# Only the bottom panel carries the shared x-axis label
axes[-1].set_xlabel('Epoch', fontsize=15, weight='bold')

plt.show()

: 

In [None]:
# Spot-check a few test residuals (actual - predicted) at positions 50, 100, 200, ...
# np.asarray accepts y_test both as a pandas Series and as a plain NumPy array.
actual_prices = np.asarray(y_test)

c = 50
for _ in range(10):
    # The position doubles on every pass (up to 25,600); stop before it runs past the data
    if c >= len(dnn_preds):
        break
    print(actual_prices[c] - dnn_preds[c])
    c += c

: 

🦾 ML 🦾

General Overview
Regression Models: Decision Tree, Random Forest, Extra Trees, AdaBoost, XGBoost, CatBoost, and KNeighbors
Metrics: MSE, MAE, and R-squared

 
Cross-validation

In [None]:
# Fresh 80/20 split on plain NumPy arrays (.values) for the models below.
# NOTE(review): this rebinds X_train / X_test / y_train / y_test, which the DNN cells
# above also use; re-running those cells afterwards would pick up this split instead.
X_train, X_test, y_train, y_test = train_test_split(features.values, target.values, test_size=0.2, random_state=15)
print(f"Train Shape: {X_train.shape}\nTest Shape: {X_test.shape}")

: 

In [None]:
# Scorers handed to cross_validate, keyed by the label used when results are printed
scores_metric = {
    label: make_scorer(metric_fn)
    for label, metric_fn in (
        ("R2-Score", r2_score),
        ("MAE", mean_absolute_error),
        ("MSE", mean_squared_error),
    )
}

: 

In [None]:
def score_vis(score, names=None):
    """Draw a bar chart comparing the test R2 of several models, in percent.

    Parameters
    ----------
    score : sequence of float
        R2 scores as fractions (the values ``r2_score`` returns), one per model.
    names : sequence of str, optional
        Model labels in the same order as ``score``. Defaults to the seven
        models listed below.

    Raises
    ------
    ValueError
        If ``names`` and ``score`` do not have the same length.
    """
    if names is None:
        names = ['Decision Tree', 'Random Forest', 'Extra Trees','AdaBoost', 'XGBoost', 'CatBoost', 'KNN']
    names = list(names)

    # The axis label and bar annotations are in percent, so scale the fractional scores
    score_pct = np.asarray(score, dtype=float) * 100
    if len(names) != len(score_pct):
        raise ValueError(f"got {len(score_pct)} scores for {len(names)} model names")

    # Size this figure explicitly instead of changing the global rcParams
    fig, ax = plt.subplots(figsize=(20, 8))
    sns.barplot(x=names, y=score_pct, palette = "plasma", saturation =2.0, ax=ax)

    ax.set_xlabel('Model', fontsize = 20 )
    ax.set_ylabel('R2-Score(%)', fontsize = 20)
    ax.set_title('Model Performance Comparison - Test set', fontsize = 20)
    plt.xticks(fontsize = 12, horizontalalignment = 'center', rotation = 8)
    plt.yticks(fontsize = 12)

    # Write each bar's value just beyond its end
    for bar in ax.patches:
        width, height = bar.get_width(), bar.get_height()
        x, y = bar.get_xy()
        ax.annotate(f'{round(height,2)}%', (x + width/2, y + height*1.02), ha='center', fontsize='x-large')

    plt.show()

: 

In [None]:
def trainer(X_train, y_train, X_test, y_test):
    """Cross-validate, fit and test a fixed line-up of regressors.

    For every model this runs 5-fold cross-validation on the training data with
    the module-level ``scores_metric`` scorers and prints mean +- std of each
    entry ``cross_validate`` returns. It then fits the model on the full
    training data and prints its R2 and MAE on the test data.

    Parameters
    ----------
    X_train, y_train
        Training features and target.
    X_test, y_test
        Held-out features and target used for the final scores.

    Returns
    -------
    list of float
        Test R2 of each model, in the order the models are listed below.
    """
    models= [['Decision Tree', DecisionTreeRegressor()],
             ['Random Forest', RandomForestRegressor(n_estimators=124)],
             ['Extra Trees', ExtraTreesRegressor(n_estimators=124)],
             ['AdaBoost', AdaBoostRegressor(loss='square', n_estimators=1000)],
             ['XGBoost', XGBRegressor(tree_method='hist', n_estimators=1000)],
             ['CatBoost', CatBoostRegressor(logging_level='Silent')],
             ['KNN', KNeighborsRegressor(weights='distance')]]

    scores = []

    print(Back.RED + Fore.BLACK + Style.BRIGHT + '⁜⁜ STARTING THE PROCESS... ⁜⁜\n\n')
    print(Back.RESET)

    for model_name, model in models:

        print(Fore.LIGHTRED_EX + 'Cross validating the ' + model_name + ' model...\n')
        result = cross_validate(model, X_train, y_train, cv=5, verbose=1, scoring=scores_metric)

        for key, value in result.items():
            print(f"{Fore.BLUE}{key}: {Fore.RED}{np.mean(value):.3f} {Fore.BLUE}+- {Fore.RED}{np.std(value):.3f}")

        print(Fore.LIGHTGREEN_EX + '\nTraining...')
        model.fit(X_train, y_train)

        print(Fore.LIGHTGREEN_EX + 'Predicting...\n')
        pred = model.predict(X_test)

        # Compute each test metric once and reuse it for both the report and the return value
        test_r2 = r2_score(y_test, pred)
        test_mae = mean_absolute_error(y_test, pred)
        scores.append(test_r2)
        print(f"{Fore.BLUE}{model_name} Test R2-Score: {Fore.RED}{test_r2:.3f}")
        print(f"{Fore.BLUE}{model_name} Test MAE:      {Fore.RED}{test_mae:.3f}")

        print('\n' + Fore.BLACK + Back.WHITE + '⁜⁜⁜⁜⁜⁜⁜⁜⁜⁜⁜⁜⁜⁜⁜⁜⁜⁜⁜⁜⁜⁜⁜⁜⁜⁜⁜⁜⁜⁜⁜⁜⁜⁜⁜⁜⁜\n' + Back.RESET)

    # Reset every colour/style attribute so later output is not printed black-on-red
    print('\n\n\n' + Back.RED + Fore.BLACK + Style.BRIGHT + '⁜⁜ PROCESS COMPLETED! ⁜⁜' + Style.RESET_ALL)

    return scores

: 

In [None]:
# Run the model comparison; `scores` receives the list that trainer() returns
scores = trainer(X_train, y_train, X_test, y_test)

: 

In [None]:
# Bar chart of the scores returned by trainer()
score_vis(scores)

: 

As shown, XGBoostRegressor offered the best performance, whereas AdaBoost is the worst. Here, we see that the R-squared score of the AdaBoost model is negative. Weird, right? Mathematically speaking, this is possible: R-squared becomes negative whenever a model predicts worse than simply predicting the mean of the target. Here, it happened because the AdaBoost model completely failed to fit the data.

🧑🏻‍🏫 XAI 🧑🏻‍🏫

In this part, I'm going to use SHAP to only explain a few models, starting with XGBoost

In [None]:
# Refit an XGBoost regressor with the same settings used in trainer(), kept for the SHAP analysis
xgbr_model = XGBRegressor(tree_method='hist', n_estimators=1000)
xgbr_model.fit(X_train, y_train)

: 

In [None]:
# This is gonna take quite some time

# Raw margin predictions, used only for the consistency check on the last line
xgbr_pred = xgbr_model.predict(X_test, output_margin=True)
explainer = shap.TreeExplainer(xgbr_model)
shap_values = explainer.shap_values(X_test)
# Largest absolute gap between (sum of a row's SHAP values + expected value) and the
# margin prediction for that row; shown as the cell output
np.abs(shap_values.sum(1) + explainer.expected_value - xgbr_pred).max()

: 

In [None]:
# X_test is a bare NumPy array (built from features.values), so SHAP gets its labels
# only from this list. Taking the names from the feature frame keeps them aligned
# with the real column order.
features_list = list(features.columns)
print(Fore.BLUE + 'SHAP - XGBoost')
shap.summary_plot(shap_values=shap_values, features=X_test, feature_names=features_list, plot_size=(10, 10)) # setting max_display=3 will only show the top 3 features.

: 

As shown, the age and powerPS features contributed the most to the prediction of the XGBoost Regressor model. In case you want to visualize fewer features, define the max_display argument and set the no. of top features that you want to see.

CatBoost

In [None]:
# Fit CatBoost on the training split and keep its test predictions
cbr_model = CatBoostRegressor(logging_level='Silent')
cbr_model.fit(X_train, y_train)

cbr_pred = cbr_model.predict(X_test)

# Explain the CatBoost model with its own explainer; the plain `explainer` variable
# belongs to the XGBoost model and would reproduce the XGBoost SHAP values.
explainer_cbr = shap.TreeExplainer(cbr_model)
shap_values_cbr = explainer_cbr.shap_values(X_test)

: 

In [None]:
# SHAP summary plot for the values stored in shap_values_cbr
print(Fore.BLUE + 'SHAP - CatBoost\n')
shap.summary_plot(shap_values=shap_values_cbr, features=X_test, feature_names=features_list, plot_size=(10, 10))

: 

The story is the same here as before.

🏅 Ensemble Learning 🏅

1. Simple Averaging
2. Weighted Averaging --> I will manually set the weights here based on my take from the previous step. Note that the weights must sum up to 1.
3. Voting
4. Stacking

In [None]:
# Unfitted base models for the ensembles, configured like their counterparts in trainer()
# (AdaBoost is not included here)
dtr_model = DecisionTreeRegressor()
rfr_model = RandomForestRegressor(n_estimators=124)
etr_model = ExtraTreesRegressor(n_estimators=124)
xbr_model = XGBRegressor(tree_method='hist', n_estimators=1000)
# Rebinds cbr_model, which the SHAP section above also assigned
cbr_model = CatBoostRegressor(logging_level='Silent')
knr_model = KNeighborsRegressor(weights='distance')

: 

# Averaging
1. Simple Average
2. Weighted Average

In [None]:
%%time

# (progress message, estimator) pairs, fitted in this order on the training split
fit_queue = [
    ('Fitting the Decision Tree model...\n', dtr_model),
    ('Fitting the Random Forest model...\n', rfr_model),
    ('Fitting the Extra Trees model...\n', etr_model),
    ('Fitting the XGBRegressor model...\n', xbr_model),
    ('Fitting the CatBoostRegressor model...\n', cbr_model),
    ('Fitting the KNeighborsRegressor model...\n', knr_model),
]

for progress_message, base_estimator in fit_queue:
    print(progress_message)
    base_estimator.fit(X_train, y_train)

: 

In [None]:
%%time

# Test-set predictions of every base model, unpacked in the same order as the tuple
fitted_base_models = (dtr_model, rfr_model, etr_model, xbr_model, cbr_model, knr_model)

dtr_pred, rfr_pred, etr_pred, xbr_pred, cbr_pred, knr_pred = (
    fitted.predict(X_test) for fitted in fitted_base_models
)

: 

In [None]:
# 1. Simple Average
# Equal-weight mean of the six base-model predictions, rounded to whole numbers
sum_preds = dtr_pred + rfr_pred + etr_pred + xbr_pred + cbr_pred + knr_pred
pred_final = np.round(sum_preds / 6)

print(Back.RED + Fore.BLACK + Style.BRIGHT + 'Simple Averaging Performance' + Back.RESET)

# Test-set R2 and MAE of the averaged prediction
print(f"{Fore.BLUE}Test R2-Score: {Fore.RED}{r2_score(y_test, pred_final):.3f}")
print(f"{Fore.BLUE}Test MAE:      {Fore.RED}{mean_absolute_error(y_test, pred_final):.3f}")

: 

In [None]:
# Hand-picked weights (0.4 + 0.2 + 0.2 + 0.1 + 0.05 + 0.05 = 1.0); XGBoost gets the largest share
pred_final_w = np.round(0.4*xbr_pred + 0.2*cbr_pred + 0.2*rfr_pred + 0.1*etr_pred + 0.05*knr_pred + 0.05*dtr_pred)

: 

In [None]:
# 2. Weighted Average

print(Back.RED + Fore.BLACK + Style.BRIGHT + 'Weighted Averaging Performance' + Back.RESET)

# Test-set R2 and MAE of the weighted prediction stored in pred_final_w
print(f"{Fore.BLUE}Test R2-Score: {Fore.RED}{r2_score(y_test, pred_final_w):.3f}")
print(f"{Fore.BLUE}Test MAE:      {Fore.RED}{mean_absolute_error(y_test, pred_final_w):.3f}")

: 

Voting

In [None]:
%%time

# (name, model) pairs shared by the voting and stacking ensembles
estimators = [('dtr', dtr_model), ('rfr', rfr_model), ('etr', etr_model),
              ('xbr', xbr_model), ('cbr', cbr_model), ('knr', knr_model)]

# No weights are passed to the VotingRegressor
voting_model = VotingRegressor(estimators=estimators)

voting_model.fit(X_train, y_train)

voting_pred = voting_model.predict(X_test)

: 

In [None]:
# Test-set R2 and MAE of the voting ensemble
print(Back.RED + Fore.BLACK + Style.BRIGHT + 'Voting Performance' + Back.RESET)

print(f"{Fore.BLUE}Test R2-Score: {Fore.RED}{r2_score(y_test, voting_pred):.3f}")
print(f"{Fore.BLUE}Test MAE:      {Fore.RED}{mean_absolute_error(y_test, voting_pred):.3f}")

: 

Stacking

In [None]:
%%time

# Gradient boosting is the final estimator that combines the base models' outputs
final_estimator = GradientBoostingRegressor(random_state=666)

# Reuses the `estimators` list defined in the voting cell
stacking_model = StackingRegressor(estimators=estimators, final_estimator=final_estimator)

stacking_model.fit(X_train, y_train)

stacking_pred = stacking_model.predict(X_test)

: 

In [None]:
# Test-set R2 and MAE of the stacking ensemble
print(Back.RED + Fore.BLACK + Style.BRIGHT + 'Stacking Performance' + Back.RESET)

print(f"{Fore.BLUE}Test R2-Score: {Fore.RED}{r2_score(y_test, stacking_pred):.3f}")
print(f"{Fore.BLUE}Test MAE:      {Fore.RED}{mean_absolute_error(y_test, stacking_pred):.3f}")

: 

🙌🏻 Conclusion 🙌🏻

In this notebook, we analyzed a dataset of used cars and, based on that, performed data preprocessing for DL/ML.
We then tried to predict the price of a used car based on several features. A DNN and several regression models were used for this task. I was expecting the DNN model to outperform the rest; however, the XGBoost model offered the best performance.
We later used the SHAP framework to explain two of our models.
Finally, different ensemble learning techniques were used. However, this approach did not manage to outperform the other ones, at least in terms of the R-squared scores, though the MAE was slightly decreased using these approaches.
