In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import keras.backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy.stats import chi2_contingency
import numpy as np
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense, Dropout, Input, Concatenate, Embedding, Flatten
from tensorflow.keras.models import Model
from sklearn.ensemble import HistGradientBoostingRegressor

In [2]:
df = pd.read_csv('../../Dataset/cleaned_used_cars_v3.csv')

In [3]:
df.shape

(236195, 329)

In [4]:
df.shape

(236195, 329)

In [5]:
def r2(y_true, y_pred):
    SS_res =  K.sum(K.square(y_true - y_pred)) 
    SS_tot = K.sum(K.square(y_true - K.mean(y_true))) 
    return ( 1 - SS_res/(SS_tot + K.epsilon()) )

In [6]:
X = df.drop(['price'], axis=1)
y = df['price']
X

Unnamed: 0,seller,offerType,abtest,gearbox,powerPS,kilometer,notRepairedDamage,vehicleType.andere,vehicleType.bus,vehicleType.cabrio,...,"year_range.[1970, 1975)","year_range.[1975, 1980)","year_range.[1980, 1985)","year_range.[1985, 1990)","year_range.[1990, 1995)","year_range.[1995, 2000)","year_range.[2000, 2005)","year_range.[2005, 2010)","year_range.[2010, 2015)","year_range.[2015, 2020)"
0,0,0,0,0,-0.300689,0.590703,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,-0.343274,-1.085768,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,-0.109056,0.590703,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,-0.059373,0.590703,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,-0.478126,-2.482827,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236190,0,0,0,0,-0.811710,0.590703,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
236191,0,0,1,0,0.763938,0.590703,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
236192,0,0,1,0,-0.833002,0.590703,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
236193,0,0,0,1,-0.116153,-0.107827,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0


In [7]:
def correlation_ratio(categories, measurements):
    fcat, _ = pd.factorize(categories)
    cat_num = np.max(fcat) + 1
    y_avg_array = np.zeros(cat_num)
    n_array = np.zeros(cat_num)
    for i in range(0, cat_num):
        cat_measures = measurements[np.argwhere(fcat == i).flatten()]
        n_array[i] = len(cat_measures)
        y_avg_array[i] = np.average(cat_measures)
    y_total_avg = np.sum(np.multiply(y_avg_array, n_array)) / np.sum(n_array)
    numerator = np.sum(np.multiply(n_array, np.power(np.subtract(y_avg_array, y_total_avg), 2)))
    denominator = np.sum(np.power(np.subtract(measurements, y_total_avg), 2))
    if numerator == 0:
        eta = 0.0
    else:
        eta = np.sqrt(numerator / denominator)
    return eta

def check_multicollinearity(df, threshold=0.7):
    corr = df.corr()
    n_vars = len(df.columns)
    for i in range(n_vars):
        for j in range(i+1, n_vars):
            if i != j:
                var1 = df.columns[i]
                var2 = df.columns[j]
                if df[var1].dtype == 'O' and df[var2].dtype == 'O': # Categorical vs categorical
                    contingency_table = pd.crosstab(df[var1], df[var2])
                    _, p, _, _ = chi2_contingency(contingency_table)
                    eta = correlation_ratio(df[var1], df[var2])
                    if eta > threshold and p < 0.05:
                        print("Categorical")
                        print(f"Multicollinearity detected between {var1} and {var2} (correlation ratio: {eta:.2f}, p-value: {p:.2f})")
                elif df[var1].dtype != 'O' and df[var2].dtype != 'O': # Numeric vs numeric
                    corr_coef = np.abs(corr[var1][var2])
                    if corr_coef > threshold:
                        print("Numerical")
                        print(f"Multicollinearity detected between {var1} and {var2} (correlation coefficient: {corr_coef:.2f})")
                else: # Categorical vs numeric
                    eta = correlation_ratio(df[var1], df[var2])
                    if eta > threshold:
                        print("Categorical vs Numerical")
                        print(f"Multicollinearity detected between {var1} and {var2} (correlation ratio: {eta:.2f})")

check_multicollinearity(X)

Numerical
Multicollinearity detected between fuelType.benzin and fuelType.diesel (correlation coefficient: 0.96)
Numerical
Multicollinearity detected between brand.bmw and model.3er (correlation coefficient: 0.73)
Numerical
Multicollinearity detected between brand.lada and model.niva (correlation coefficient: 0.81)
Numerical
Multicollinearity detected between brand.mini and model.cooper (correlation coefficient: 0.78)
Numerical
Multicollinearity detected between brand.smart and model.fortwo (correlation coefficient: 0.93)
Numerical
Multicollinearity detected between brand.trabant and model.601 (correlation coefficient: 0.92)


In [8]:
X.columns

Index(['seller', 'offerType', 'abtest', 'gearbox', 'powerPS', 'kilometer',
       'notRepairedDamage', 'vehicleType.andere', 'vehicleType.bus',
       'vehicleType.cabrio',
       ...
       'year_range.[1970, 1975)', 'year_range.[1975, 1980)',
       'year_range.[1980, 1985)', 'year_range.[1985, 1990)',
       'year_range.[1990, 1995)', 'year_range.[1995, 2000)',
       'year_range.[2000, 2005)', 'year_range.[2005, 2010)',
       'year_range.[2010, 2015)', 'year_range.[2015, 2020)'],
      dtype='object', length=328)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = Sequential()
# model.add(Dense(2048, activation='relu'))
# # model.add(Dropout(0.5))
# model.add(Dense(1024, activation='relu'))
# # model.add(Dropout(0.3))
# model.add(Dense(512, activation='relu'))
# # model.add(Dropout(0.1))
# model.add(Dense(256, activation='relu'))
# model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(1))

print('--------------------Compile------------------------------------')
model.compile(optimizer=tf.keras.optimizers.Adam(amsgrad=True),
              loss='mean_squared_error',
              metrics=['mae', r2])

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

early_stop = EarlyStopping(monitor='val_loss', patience=100)

checkpoint = ModelCheckpoint(filepath='best_weights_amsgrad.h5', 
                             monitor='val_loss', 
                             save_best_only=True, 
                             mode='min', 
                             verbose=1)

print('--------------------Fit------------------------------------')
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=1000, epochs=1000)

print('--------------------Evaluate------------------------------------')
score = model.evaluate(X_test, y_test)

print(f'Deep Learning Regression Mean Absolute Error: {score[1]:.2f}')
print(f'Deep Learning Regression R-squared Score: {score[2]:.2f}')

--------------------Compile------------------------------------
--------------------Fit------------------------------------
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epo

In [10]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                21056     
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 16)                528       
                                                                 
 dense_3 (Dense)             (None, 1)                 17        
                                                                 
Total params: 23,681
Trainable params: 23,681
Non-trainable params: 0
_________________________________________________________________


In [11]:
model.get_weights()

[array([[-0.55976   , -0.65854746, -0.7287473 , ..., -0.7431623 ,
         -0.6170755 , -0.61567885],
        [-0.78788865, -1.1726027 , -0.91060054, ..., -0.74649763,
         -0.5237418 , -0.69479007],
        [-0.06538297, -0.05624077,  0.06125941, ...,  0.01927553,
          0.19356324, -0.22595827],
        ...,
        [ 0.08143279,  0.5425322 ,  0.9148786 , ...,  0.4068833 ,
          0.8891997 ,  0.37311143],
        [ 2.1473625 ,  1.3629584 ,  2.2634723 , ...,  2.298304  ,
          1.6415069 ,  1.498184  ],
        [ 0.78416705,  0.3276847 ,  1.7070639 , ...,  2.2917602 ,
          0.5899661 ,  0.6736646 ]], dtype=float32),
 array([ 0.08838929,  0.4338774 ,  0.4082818 ,  0.51016796,  0.10346008,
        -0.6011889 ,  0.27982435, -0.42692435,  0.226335  ,  0.02450352,
         0.28171003,  0.26629305, -0.12586905,  0.1699366 , -0.01599055,
        -0.11751546,  0.10028981,  0.31581572,  0.09131601, -0.07324186,
         0.1886101 ,  0.20560905,  0.30394128, -0.39681575,  0.499

In [12]:
predictions = model.predict(X_test)

# Print the predicted prices
predictions



array([[12394.412 ],
       [ 1593.1005],
       [ 8358.843 ],
       ...,
       [ 3013.1333],
       [ 1303.0057],
       [ 7057.763 ]], dtype=float32)

In [13]:
y_test

57150     11800.0
174552     1100.0
232771     4360.0
3924       6999.0
3195       1899.0
           ...   
210308      555.0
20689      7750.0
94695      2100.0
159766     2499.0
229751     7200.0
Name: price, Length: 47239, dtype: float64

In [14]:
import plotly.express as px
import pandas as pd

# create a dataframe with the training history
df_graph = pd.DataFrame(history.history)

# create line charts for loss and MAE
fig = px.line(df_graph, x=df_graph.index+1, y=['loss', 'val_loss'], labels={'x':'Epoch', 'value':'Loss'},
              title='Training and Validation Loss')
fig.update_traces(mode='lines')
fig.show()

fig = px.line(df_graph, x=df_graph.index+1, y=['mae', 'val_mae'], labels={'x':'Epoch', 'value':'MAE'},
              title='Training and Validation MAE')
fig.update_traces(mode='lines')
fig.show()

In [15]:
est = HistGradientBoostingRegressor(max_iter=2000).fit(X_train, y_train)
print(est.score(X_train, y_train))
print(est.score(X_test, y_test))

0.8609412916041119
0.8439048679696662
