# Goals and Overview

# Project

## Initialization

In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import OrdinalEncoder, MaxAbsScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor

## Reading Data

In [2]:
df = pd.read_csv('./datasets/car_data.csv')

In [3]:
df = df.rename(columns={'DateCrawled': 'date_crawled', 'Price': 'price', 'VehicleType': 'vehicle_type', 'RegistrationYear': 'registration_year', 'Gearbox': 'gearbox', 'Power': 'power', 'Model': 'model', 'Mileage': 'mileage', 'RegistrationMonth': 'registration_month', 'FuelType': 'fuel_type', 'Brand': 'brand', 'NotRepaired': 'not_repaired', 'DateCreated': 'date_created', 'NumberOfPictures': 'number_of_pictures', 'PostalCode': 'postal_code', 'LastSeen': 'last_seen'})

In [4]:
df.sample(5)

Unnamed: 0,date_crawled,price,vehicle_type,registration_year,gearbox,power,model,mileage,registration_month,fuel_type,brand,not_repaired,date_created,number_of_pictures,postal_code,last_seen
291808,28/03/2016 12:06,1300,small,1996,manual,75,golf,150000,9,petrol,volkswagen,no,28/03/2016 00:00,0,56412,06/04/2016 06:45
65813,08/03/2016 16:56,10500,bus,2011,manual,163,other,70000,11,gasoline,chevrolet,no,08/03/2016 00:00,0,74196,10/03/2016 12:16
38316,24/03/2016 13:53,1950,small,1999,auto,54,fortwo,150000,9,petrol,smart,no,24/03/2016 00:00,0,78467,07/04/2016 05:46
237282,11/03/2016 22:40,1550,sedan,1999,manual,170,5er,150000,6,petrol,bmw,no,11/03/2016 00:00,0,44269,14/03/2016 21:18
119586,27/03/2016 23:55,999,wagon,1996,manual,75,astra,150000,3,petrol,opel,no,27/03/2016 00:00,0,71672,06/04/2016 09:16


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 354369 entries, 0 to 354368
Data columns (total 16 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   date_crawled        354369 non-null  object
 1   price               354369 non-null  int64 
 2   vehicle_type        316879 non-null  object
 3   registration_year   354369 non-null  int64 
 4   gearbox             334536 non-null  object
 5   power               354369 non-null  int64 
 6   model               334664 non-null  object
 7   mileage             354369 non-null  int64 
 8   registration_month  354369 non-null  int64 
 9   fuel_type           321474 non-null  object
 10  brand               354369 non-null  object
 11  not_repaired        283215 non-null  object
 12  date_created        354369 non-null  object
 13  number_of_pictures  354369 non-null  int64 
 14  postal_code         354369 non-null  int64 
 15  last_seen           354369 non-null  object
dtypes:

In [6]:
df.describe()

Unnamed: 0,price,registration_year,power,mileage,registration_month,number_of_pictures,postal_code
count,354369.0,354369.0,354369.0,354369.0,354369.0,354369.0,354369.0
mean,4416.656776,2004.234448,110.094337,128211.172535,5.714645,0.0,50508.689087
std,4514.158514,90.227958,189.850405,37905.34153,3.726421,0.0,25783.096248
min,0.0,1000.0,0.0,5000.0,0.0,0.0,1067.0
25%,1050.0,1999.0,69.0,125000.0,3.0,0.0,30165.0
50%,2700.0,2003.0,105.0,150000.0,6.0,0.0,49413.0
75%,6400.0,2008.0,143.0,150000.0,9.0,0.0,71083.0
max,20000.0,9999.0,20000.0,150000.0,12.0,0.0,99998.0


__Missing Values__

In [7]:
df.isna().sum()

date_crawled              0
price                     0
vehicle_type          37490
registration_year         0
gearbox               19833
power                     0
model                 19705
mileage                   0
registration_month        0
fuel_type             32895
brand                     0
not_repaired          71154
date_created              0
number_of_pictures        0
postal_code               0
last_seen                 0
dtype: int64

__Duplicate Values__

## Data Preparation

In [9]:
df_encoded = df.copy()

In [10]:
freq_encoded = df_encoded['vehicle_type'].value_counts() / len(df_encoded)
df_encoded['vehicle_type'] = df_encoded['vehicle_type'].map(freq_encoded)

freq_encoded = df_encoded['gearbox'].value_counts() / len(df_encoded)
df_encoded['gearbox'] = df_encoded['gearbox'].map(freq_encoded)

freq_encoded = df_encoded['model'].value_counts() / len(df_encoded)
df_encoded['model'] = df_encoded['model'].map(freq_encoded)

freq_encoded = df_encoded['fuel_type'].value_counts() / len(df_encoded)
df_encoded['fuel_type'] = df_encoded['fuel_type'].map(freq_encoded)

freq_encoded = df_encoded['brand'].value_counts() / len(df_encoded)
df_encoded['brand'] = df_encoded['brand'].map(freq_encoded)

freq_encoded = df_encoded['not_repaired'].value_counts() / len(df_encoded)
df_encoded['not_repaired'] = df_encoded['not_repaired'].map(freq_encoded)

df_encoded

Unnamed: 0,date_crawled,price,vehicle_type,registration_year,gearbox,power,model,mileage,registration_month,fuel_type,brand,not_repaired,date_created,number_of_pictures,postal_code,last_seen
0,24/03/2016 11:52,480,,1993,0.756982,0,0.082490,150000,0,0.610527,0.217324,,24/03/2016 00:00,0,70435,07/04/2016 03:16
1,24/03/2016 10:58,18300,0.045611,2011,0.756982,190,,125000,5,0.278580,0.083122,0.101741,24/03/2016 00:00,0,66954,07/04/2016 01:46
2,14/03/2016 12:52,9800,0.033852,2004,0.187051,163,0.001583,125000,8,0.278580,0.001910,,14/03/2016 00:00,0,90480,05/04/2016 12:47
3,17/03/2016 16:54,1500,0.225276,2001,0.756982,75,0.082490,150000,6,0.610527,0.217324,0.697468,17/03/2016 00:00,0,91074,17/03/2016 17:40
4,31/03/2016 17:25,3600,0.225276,2008,0.756982,69,0.005870,90000,7,0.278580,0.015521,0.697468,31/03/2016 00:00,0,60437,06/04/2016 10:17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
354364,21/03/2016 09:50,0,,2005,0.756982,0,0.002452,150000,7,0.610527,0.008528,0.101741,21/03/2016 00:00,0,2694,21/03/2016 10:42
354365,14/03/2016 17:48,2200,,2005,,0,,20000,1,,0.009521,,14/03/2016 00:00,0,39576,06/04/2016 00:46
354366,05/03/2016 19:56,1199,0.057011,2000,0.187051,101,0.012239,125000,3,0.610527,0.014804,0.697468,05/03/2016 00:00,0,26135,11/03/2016 18:17
354367,19/03/2016 18:57,9200,0.081201,1996,0.756982,102,0.013607,150000,3,0.278580,0.217324,0.697468,19/03/2016 00:00,0,87439,07/04/2016 07:15


In [11]:
feature_names = ['vehicle_type', 'registration_year', 'gearbox', 'power', 'model' ,'mileage', 'registration_month', 'fuel_type', 'brand', 'not_repaired', ]

transformer_mas = MaxAbsScaler().fit(df_encoded[feature_names].to_numpy())

df_scaled = df_encoded.copy()
df_scaled.loc[:, feature_names] = transformer_mas.transform(df_encoded[feature_names].to_numpy())

In [12]:
df_scaled

Unnamed: 0,date_crawled,price,vehicle_type,registration_year,gearbox,power,model,mileage,registration_month,fuel_type,brand,not_repaired,date_created,number_of_pictures,postal_code,last_seen
0,24/03/2016 11:52,480,,0.19932,1.000000,0.00000,1.000000,1.000000,0.000000,1.000000,1.000000,,24/03/2016 00:00,0,70435,07/04/2016 03:16
1,24/03/2016 10:58,18300,0.176728,0.20112,1.000000,0.00950,,0.833333,0.416667,0.456293,0.382481,0.145873,24/03/2016 00:00,0,66954,07/04/2016 01:46
2,14/03/2016 12:52,9800,0.131165,0.20042,0.247101,0.00815,0.019191,0.833333,0.666667,0.456293,0.008791,,14/03/2016 00:00,0,90480,05/04/2016 12:47
3,17/03/2016 16:54,1500,0.872880,0.20012,1.000000,0.00375,1.000000,1.000000,0.500000,1.000000,1.000000,1.000000,17/03/2016 00:00,0,91074,17/03/2016 17:40
4,31/03/2016 17:25,3600,0.872880,0.20082,1.000000,0.00345,0.071155,0.600000,0.583333,0.456293,0.071417,1.000000,31/03/2016 00:00,0,60437,06/04/2016 10:17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
354364,21/03/2016 09:50,0,,0.20052,1.000000,0.00000,0.029728,1.000000,0.583333,1.000000,0.039240,0.145873,21/03/2016 00:00,0,2694,21/03/2016 10:42
354365,14/03/2016 17:48,2200,,0.20052,,0.00000,,0.133333,0.083333,,0.043811,,14/03/2016 00:00,0,39576,06/04/2016 00:46
354366,05/03/2016 19:56,1199,0.220902,0.20002,0.247101,0.00505,0.148365,0.833333,0.250000,1.000000,0.068118,1.000000,05/03/2016 00:00,0,26135,11/03/2016 18:17
354367,19/03/2016 18:57,9200,0.314629,0.19962,1.000000,0.00510,0.164956,1.000000,0.250000,0.456293,1.000000,1.000000,19/03/2016 00:00,0,87439,07/04/2016 07:15


### Prep for LGBM and Cat GBM

In [13]:
df

Unnamed: 0,date_crawled,price,vehicle_type,registration_year,gearbox,power,model,mileage,registration_month,fuel_type,brand,not_repaired,date_created,number_of_pictures,postal_code,last_seen
0,24/03/2016 11:52,480,,1993,manual,0,golf,150000,0,petrol,volkswagen,,24/03/2016 00:00,0,70435,07/04/2016 03:16
1,24/03/2016 10:58,18300,coupe,2011,manual,190,,125000,5,gasoline,audi,yes,24/03/2016 00:00,0,66954,07/04/2016 01:46
2,14/03/2016 12:52,9800,suv,2004,auto,163,grand,125000,8,gasoline,jeep,,14/03/2016 00:00,0,90480,05/04/2016 12:47
3,17/03/2016 16:54,1500,small,2001,manual,75,golf,150000,6,petrol,volkswagen,no,17/03/2016 00:00,0,91074,17/03/2016 17:40
4,31/03/2016 17:25,3600,small,2008,manual,69,fabia,90000,7,gasoline,skoda,no,31/03/2016 00:00,0,60437,06/04/2016 10:17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
354364,21/03/2016 09:50,0,,2005,manual,0,colt,150000,7,petrol,mitsubishi,yes,21/03/2016 00:00,0,2694,21/03/2016 10:42
354365,14/03/2016 17:48,2200,,2005,,0,,20000,1,,sonstige_autos,,14/03/2016 00:00,0,39576,06/04/2016 00:46
354366,05/03/2016 19:56,1199,convertible,2000,auto,101,fortwo,125000,3,petrol,smart,no,05/03/2016 00:00,0,26135,11/03/2016 18:17
354367,19/03/2016 18:57,9200,bus,1996,manual,102,transporter,150000,3,gasoline,volkswagen,no,19/03/2016 00:00,0,87439,07/04/2016 07:15


In [14]:
df['vehicle_type'] = df['vehicle_type'].astype('category')
df['gearbox'] = df['gearbox'].astype('category')
df['model'] = df['model'].astype('category')
df['fuel_type'] = df['fuel_type'].astype('category')
df['brand'] = df['brand'].astype('category')

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 354369 entries, 0 to 354368
Data columns (total 16 columns):
 #   Column              Non-Null Count   Dtype   
---  ------              --------------   -----   
 0   date_crawled        354369 non-null  object  
 1   price               354369 non-null  int64   
 2   vehicle_type        316879 non-null  category
 3   registration_year   354369 non-null  int64   
 4   gearbox             334536 non-null  category
 5   power               354369 non-null  int64   
 6   model               334664 non-null  category
 7   mileage             354369 non-null  int64   
 8   registration_month  354369 non-null  int64   
 9   fuel_type           321474 non-null  category
 10  brand               354369 non-null  category
 11  not_repaired        283215 non-null  object  
 12  date_created        354369 non-null  object  
 13  number_of_pictures  354369 non-null  int64   
 14  postal_code         354369 non-null  int64   
 15  last_seen        

### Data Split

In [16]:
s_target = df_scaled['price']

s_features = df_scaled.drop(columns=['price', 'last_seen', 'postal_code', 'date_created', 'date_crawled', 'number_of_pictures'])

s_features_train, s_features_temp, s_target_train, s_target_temp = train_test_split(
    s_features, s_target, test_size=0.4, random_state=12345
)

s_features_valid, s_features_test, s_target_valid, s_target_test = train_test_split(
    s_features_temp, s_target_temp, test_size=0.5, random_state=12345
)

## Model Training

### Linear Regression Model

In [17]:
%%time

param_grid = {
    'fit_intercept': [True, False],  
    'normalize': [True, False]  
}

lr_model = LinearRegression()

grid_search = GridSearchCV(lr_model, param_grid, cv=5, scoring='neg_root_mean_squared_error')
grid_search.fit(s_features_train, s_target_train)

best_params = grid_search.best_params_

best_lr_model = LinearRegression(**best_params)
best_lr_model.fit(s_features_train, s_target_train)

tr_pred = best_lr_model.predict(s_features_train)
pred = best_lr_model.predict(s_features_valid)

rmse_train = mean_squared_error(s_target_train, tr_pred, squared=False)
rmse_valid = mean_squared_error(s_target_valid, pred, squared=False)

print("RMSE (Training Set):", rmse_train)
print("RMSE (Validation Set):", rmse_valid)

ValueError: Invalid parameter 'normalize' for estimator LinearRegression(). Valid parameters are: ['copy_X', 'fit_intercept', 'n_jobs', 'positive'].

### Random Forest Regression Model

In [18]:
%%time

param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1],
    'bootstrap': [True]
}

rf_model = RandomForestRegressor(random_state=12345)

grid_search = GridSearchCV(rf_model, param_grid, cv=5, scoring='neg_root_mean_squared_error')
grid_search.fit(s_features_train, s_target_train)

best_params = grid_search.best_params_

best_rf_model = RandomForestRegressor(**best_params, random_state=12345)
best_rf_model.fit(s_features_train, s_target_train)

tr_pred = best_rf_model.predict(s_features_train)
pred = best_rf_model.predict(s_features_valid)

rmse_train = mean_squared_error(s_target_train, tr_pred, squared=False)
rmse_valid = mean_squared_error(s_target_valid, pred, squared=False)

print("RMSE (Training Set):", rmse_train)
print("RMSE (Validation Set):", rmse_valid)

KeyboardInterrupt: 

### Decision Tree Model

In [None]:
%%time

param_grid = {
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 6, 8],
    'max_features': [None, 'auto', 'sqrt', 'log2']
}

dtr_model = DecisionTreeRegressor(random_state=12345)

grid_search = GridSearchCV(dtr_model, param_grid, cv=5, scoring='neg_root_mean_squared_error')
grid_search.fit(s_features_train, s_target_train)

best_params = grid_search.best_params_

best_dtr_model = DecisionTreeRegressor(**best_params, random_state=12345)
best_dtr_model.fit(s_features_train, s_target_train)

tr_pred = best_dtr_model.predict(s_features_train)
pred = best_dtr_model.predict(s_features_valid)

rmse_train = mean_squared_error(s_target_train, tr_pred, squared=False)
rmse_valid = mean_squared_error(s_target_valid, pred, squared=False)

print("RMSE (Training Set):", rmse_train)
print("RMSE (Validation Set):", rmse_valid)

### Light Gradient Boosting Model

In [None]:
target = df['price']

features = df.drop(columns=['price', 'last_seen', 'postal_code', 'date_created', 'date_crawled', 'number_of_pictures'])

features_train, features_temp, target_train, target_temp = train_test_split(
    features, target, test_size=0.4, random_state=12345
)

features_valid, features_test, target_valid, target_test = train_test_split(
    features_temp, target_temp, test_size=0.5, random_state=12345
)

In [None]:
categorical_features = ['vehicle_type', 'gearbox', 'model', 'fuel_type', 'brand', 'not_repaired']

for col in categorical_features:
    features_train[col] = features_train[col].astype('category')
    features_valid[col] = features_valid[col].astype('category')
    features_test[col] = features_test[col].astype('category')

In [None]:
%%time

lgb_model = lgb.LGBMRegressor(boosting_type='gbdt', objective='regression', metric='rmse')

param_grid = {
    'num_leaves': [31, 50, 100],
    'learning_rate': [0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 300]
}

grid_search = GridSearchCV(estimator=lgb_model, param_grid=param_grid, cv=5, scoring='neg_root_mean_squared_error')
grid_search.fit(features_train, target_train)

best_params = grid_search.best_params_
best_lgb_model = lgb.LGBMRegressor(**best_params)

best_lgb_model.fit(features_train, target_train)

target_pred_train = best_lgb_model.predict(features_train)
target_pred_valid = best_lgb_model.predict(features_valid)

rmse_train = mean_squared_error(target_train, target_pred_train, squared=False)
rmse_valid = mean_squared_error(target_valid, target_pred_valid, squared=False)

print(f"Best Params:", best_params)
print(f"RMSE (Training):", rmse_train)
print(f"RMSE (Validation):", rmse_valid)

### Cat Boost Model

In [None]:
%%time

catboost_model = CatBoostRegressor(verbose=0, random_state=12345)

catboost_model.fit(features_train, target_train, cat_features=categorical_features)

features_pred_train = catboost_model.predict(features_train)
features_pred_valid = catboost_model.predict(features_valid)

rmse_train = np.sqrt(mean_squared_error(target_train, features_pred_train))
rmse_valid = np.sqrt(mean_squared_error(target_valid, features_pred_valid))

print(f"RMSE (Training):", rmse_train)
print(f"RMSE (Validation):", rmse_valid)

### XGB Model

In [19]:
#data_ohe = pd.get_dummies(df, drop_first=True)
#data_ohe

In [None]:
%%time

xgb_model = xgb.XGBRegressor(objective='reg:squarederror', eval_metric='rmse')

xgb_model.fit(s_features_train, s_target_train)

features_pred_train = xgb_model.predict(s_features_train)
features_pred_valid = xgb_model.predict(s_features_valid)

rmse_train = np.sqrt(mean_squared_error(s_target_train, features_pred_train))
rmse_valid = np.sqrt(mean_squared_error(s_target_valid, features_pred_valid))

print(f"RMSE (Training):", rmse_train)
print(f"RMSE (Validation):", rmse_valid)

## Model Evaluation

### Random Forest Final

In [None]:
%%time

best_rf_model.fit(s_features_train, s_target_train)
predicted_test = best_rf_model.predict(s_features_test)

rmse_test = mean_squared_error(s_target_test, predicted_test, squared=False)

print(f"RMSE (Test):", rmse_test)

### Decision Tree Final

In [None]:
%%time

best_dtr_model.fit(s_features_train, s_target_train)
predicted_test = best_dtr_model.predict(s_features_test)

rmse_test = mean_squared_error(s_target_test, predicted_test, squared=False)

print(f"RMSE (Test):", rmse_test)

### Light Gradient Boosting Model Final

In [None]:
%%time

best_lgb_model.fit(features_train, target_train)
predicted_test = best_lgb_model.predict(features_test)

rmse_test = mean_squared_error(target_test, predicted_test, squared=False)

print(f"RMSE (Test):", rmse_test)

### Cat Boost Model Final

In [None]:
%%time

catboost_model.fit(features_train, target_train, cat_features=categorical_features)
predicted_test = catboost_model.predict(features_test)

rmse_test = mean_squared_error(target_test, predicted_test, squared=False)

print(f"RMSE (Test):", rmse_test)

### XGB Model Final

In [None]:
%%time

xgb_model.fit(xgb_features_train, s_target_train)
predicted_test = xgb_model.predict(s_features_test)

rmse_test = mean_squared_error(target_test, predicted_test, squared=False)

print(f"RMSE (Test):", rmse_test)

## Conclusion

Light Gradient Boosting Model offers the best trade-off between accuracy (lowest RMSE) and a reasonable training time (16 minutes).

Cat Boost Model stands out as a good alternative if you need a faster training time with slightly lower accuracy than the Light Gradient Boosting Model.

The XGB Model has the shortest training time, making it the most efficient in terms of computation.

The Random Forest Model had the 2nd best RMSE, however it has the longest training time at 30 min.