In [86]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import keras.backend as K
# %pip install tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy.stats import chi2_contingency
import numpy as np
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense, Dropout, Input, Concatenate, Embedding, Flatten
from tensorflow.keras.models import Model
from sklearn.ensemble import HistGradientBoostingRegressor

In [87]:
df = pd.read_csv('../../Dataset/cleaned_used_cars_v3.csv')

In [88]:
df.shape

(236195, 25)

In [91]:
df.shape

(236195, 25)

In [92]:
def r2(y_true, y_pred):
    SS_res =  K.sum(K.square(y_true - y_pred)) 
    SS_tot = K.sum(K.square(y_true - K.mean(y_true))) 
    return ( 1 - SS_res/(SS_tot + K.epsilon()) )

In [93]:
X = df.drop(['price'], axis=1)
y = df['price']
X

Unnamed: 0,seller,offerType,abtest,yearOfRegistration,gearbox,powerPS,model,kilometer,brand,notRepairedDamage,...,vehicleType.kleinwagen,vehicleType.kombi,vehicleType.limousine,vehicleType.suv,fuelType.benzin,fuelType.cng,fuelType.diesel,fuelType.elektro,fuelType.hybrid,fuelType.lpg
0,0,0,0,2001.0,0,75.0,117,150000.0,37,0,...,1,0,0,0,1,0,0,0,0,0
1,0,0,0,2008.0,0,69.0,102,90000.0,31,0,...,1,0,0,0,0,0,1,0,0,0
2,0,0,0,1995.0,0,102.0,11,150000.0,2,1,...,0,0,1,0,1,0,0,0,0,0
3,0,0,0,2004.0,0,109.0,8,150000.0,25,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,1980.0,0,50.0,40,40000.0,37,0,...,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236190,0,0,0,2005.0,0,3.0,11,150000.0,2,0,...,0,1,0,0,0,0,1,0,0,0
236191,0,0,1,2004.0,0,225.0,141,150000.0,30,1,...,0,0,1,0,1,0,0,0,0,0
236192,0,0,1,2000.0,0,0.0,248,150000.0,24,0,...,0,0,0,0,1,0,0,0,0,0
236193,0,0,0,2000.0,1,101.0,107,125000.0,32,0,...,0,0,0,0,1,0,0,0,0,0


In [94]:
def correlation_ratio(categories, measurements):
    fcat, _ = pd.factorize(categories)
    cat_num = np.max(fcat) + 1
    y_avg_array = np.zeros(cat_num)
    n_array = np.zeros(cat_num)
    for i in range(0, cat_num):
        cat_measures = measurements[np.argwhere(fcat == i).flatten()]
        n_array[i] = len(cat_measures)
        y_avg_array[i] = np.average(cat_measures)
    y_total_avg = np.sum(np.multiply(y_avg_array, n_array)) / np.sum(n_array)
    numerator = np.sum(np.multiply(n_array, np.power(np.subtract(y_avg_array, y_total_avg), 2)))
    denominator = np.sum(np.power(np.subtract(measurements, y_total_avg), 2))
    if numerator == 0:
        eta = 0.0
    else:
        eta = np.sqrt(numerator / denominator)
    return eta

def check_multicollinearity(df, threshold=0.7):
    corr = df.corr()
    n_vars = len(df.columns)
    for i in range(n_vars):
        for j in range(i+1, n_vars):
            if i != j:
                var1 = df.columns[i]
                var2 = df.columns[j]
                if df[var1].dtype == 'O' and df[var2].dtype == 'O': # Categorical vs categorical
                    contingency_table = pd.crosstab(df[var1], df[var2])
                    _, p, _, _ = chi2_contingency(contingency_table)
                    eta = correlation_ratio(df[var1], df[var2])
                    if eta > threshold and p < 0.05:
                        print("Categorical")
                        print(f"Multicollinearity detected between {var1} and {var2} (correlation ratio: {eta:.2f}, p-value: {p:.2f})")
                elif df[var1].dtype != 'O' and df[var2].dtype != 'O': # Numeric vs numeric
                    corr_coef = np.abs(corr[var1][var2])
                    if corr_coef > threshold:
                        print("Numerical")
                        print(f"Multicollinearity detected between {var1} and {var2} (correlation coefficient: {corr_coef:.2f})")
                else: # Categorical vs numeric
                    eta = correlation_ratio(df[var1], df[var2])
                    if eta > threshold:
                        print("Categorical vs Numerical")
                        print(f"Multicollinearity detected between {var1} and {var2} (correlation ratio: {eta:.2f})")

check_multicollinearity(X)

Numerical
Multicollinearity detected between fuelType.benzin and fuelType.diesel (correlation coefficient: 0.96)


In [96]:
X.columns

Index(['seller', 'offerType', 'abtest', 'yearOfRegistration', 'gearbox',
       'powerPS', 'model', 'kilometer', 'brand', 'notRepairedDamage',
       'vehicleType.andere', 'vehicleType.bus', 'vehicleType.cabrio',
       'vehicleType.coupe', 'vehicleType.kleinwagen', 'vehicleType.kombi',
       'vehicleType.limousine', 'vehicleType.suv', 'fuelType.benzin',
       'fuelType.cng', 'fuelType.diesel', 'fuelType.elektro',
       'fuelType.hybrid', 'fuelType.lpg'],
      dtype='object')

In [97]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = Sequential()
model.add(Dense(2048, activation='relu', input_shape=(X_train.shape[1],)))
# model.add(Dropout(0.5))
model.add(Dense(1024, activation='relu'))
# model.add(Dropout(0.3))
model.add(Dense(512, activation='relu'))
# model.add(Dropout(0.1))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(1))

print('--------------------Compile------------------------------------')
model.compile(optimizer=tf.keras.optimizers.Adam(amsgrad=True),
              loss='mean_squared_error',
              metrics=['mae', r2])

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

early_stop = EarlyStopping(monitor='val_loss', patience=100)

checkpoint = ModelCheckpoint(filepath='best_weights_amsgrad.h5', 
                             monitor='val_loss', 
                             save_best_only=True, 
                             mode='min', 
                             verbose=1)

print('--------------------Fit------------------------------------')
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=1000, epochs=1000)

print('--------------------Evaluate------------------------------------')
score = model.evaluate(X_test, y_test)

print(f'Deep Learning Regression Mean Absolute Error: {score[1]:.2f}')
print(f'Deep Learning Regression R-squared Score: {score[2]:.2f}')

--------------------Compile------------------------------------
--------------------Fit------------------------------------
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epo

In [98]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_19 (Dense)            (None, 2048)              51200     
                                                                 
 dense_20 (Dense)            (None, 1024)              2098176   
                                                                 
 dense_21 (Dense)            (None, 512)               524800    
                                                                 
 dense_22 (Dense)            (None, 256)               131328    
                                                                 
 dense_23 (Dense)            (None, 128)               32896     
                                                                 
 dense_24 (Dense)            (None, 64)                8256      
                                                                 
 dense_25 (Dense)            (None, 32)               

In [99]:
model.get_weights()

[array([[-6.8707764e-04, -1.4297612e+00, -1.8611215e-03, ...,
         -3.2746688e-02, -1.3046762e+00,  4.3449473e-02],
        [-4.8662014e-02,  6.7794590e+00, -7.4763224e-04, ...,
         -2.7869582e-02,  1.1999612e+00,  1.6671684e-02],
        [-3.5558410e-02, -9.4111693e-01,  1.6186759e-03, ...,
         -1.9049317e-02,  2.6142571e+00, -2.1454155e-02],
        ...,
        [-7.2385743e-04, -2.0731094e+00,  5.2706409e-02, ...,
         -2.4329409e-02, -8.9537907e-01,  5.3758148e-02],
        [-1.9783903e-02, -1.7462719e+01,  2.9086765e-02, ...,
         -3.9369099e-02,  6.7881870e+00,  1.4646277e-03],
        [ 4.6696778e-02,  2.1386471e+01, -2.7129738e-02, ...,
         -4.4168480e-02, -6.1851206e+00, -4.0568419e-02]], dtype=float32),
 array([-0.0100932 ,  0.14918603,  0.        , ..., -0.01530097,
        -0.15583947,  0.        ], dtype=float32),
 array([[-0.02814582, -0.02154461,  0.00106142, ..., -0.02108421,
          0.02205649, -0.04037539],
        [ 0.00151951,  0.0085739

In [102]:
predictions = model.predict(X_test)

# Print the predicted prices
predictions



array([[12234.914 ],
       [ 2227.3027],
       [ 5611.0435],
       ...,
       [ 2322.776 ],
       [ 2074.1843],
       [ 5509.486 ]], dtype=float32)

In [103]:
y_test

57150     11800.0
174552     1100.0
232771     4360.0
3924       6999.0
3195       1899.0
           ...   
210308      555.0
20689      7750.0
94695      2100.0
159766     2499.0
229751     7200.0
Name: price, Length: 47239, dtype: float64

In [104]:
import plotly.express as px
import pandas as pd

# create a dataframe with the training history
df_graph = pd.DataFrame(history.history)

# create line charts for loss and MAE
fig = px.line(df_graph, x=df_graph.index+1, y=['loss', 'val_loss'], labels={'x':'Epoch', 'value':'Loss'},
              title='Training and Validation Loss')
fig.update_traces(mode='lines')
fig.show()

fig = px.line(df_graph, x=df_graph.index+1, y=['mae', 'val_mae'], labels={'x':'Epoch', 'value':'MAE'},
              title='Training and Validation MAE')
fig.update_traces(mode='lines')
fig.show()

In [105]:
est = HistGradientBoostingRegressor(max_iter=2000).fit(X_train, y_train)
print(est.score(X_train, y_train))
print(est.score(X_test, y_test))

0.8863050691411567
0.8649421911409246
