# Used cars prices analysis and prediction

In this project, we are going to use three machine learning algorithm to do the prediction:
Multiple Linear Regression with LASSO panelty;
Random Forest;
XGboost

In [127]:
# import any tool if needed
import matplotlib
import matplotlib.pyplot as plt 
%matplotlib inline
import scipy.stats as ss
from scipy.optimize import minimize
import pandas as pd 
import numpy as np
import seaborn as sns

import sklearn as sk 
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import make_scorer, mean_absolute_error

# Dataset Manipulation

In [128]:
df = pd.read_csv('cars_11_18.csv')
df.head

<bound method NDFrame.head of        Unnamed: 0  Unnamed: 0.1  price    year manufacturer     model  \
0               1             5  13995  2012.0         ford     f-150   
1               2             6   7995  2010.0    chevrolet   equinox   
2               3             7   8995  2011.0    chevrolet  traverse   
3               4             8  10995  2014.0         ford  explorer   
4               5             9  12995  2004.0          ram      2500   
...           ...           ...    ...     ...          ...       ...   
19557       61918        423777   5000  2009.0       toyota   corolla   
19558       61923        423803  25871  2017.0         ford  explorer   
19559       61928        423821  12995  2014.0       subaru   outback   
19560       61929        423823   9584  2012.0       toyota     camry   
19561       61930        423824   1000  2004.0         ford     f-150   

       condition    cylinders    fuel  odometer title_status transmission  \
0           good

In [129]:
# elminate duplicate rows and set the price limits to >50 and < 79980

df.drop(df.columns[df.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)

df.drop_duplicates()

df = df[df.price <79980]
df = df[df.price > 50]

df.head
#df.shape

<bound method NDFrame.head of        price    year manufacturer     model  condition    cylinders    fuel  \
0      13995  2012.0         ford     f-150       good  6 cylinders     gas   
1       7995  2010.0    chevrolet   equinox       good  4 cylinders     gas   
2       8995  2011.0    chevrolet  traverse       good  6 cylinders     gas   
3      10995  2014.0         ford  explorer       good  6 cylinders     gas   
4      12995  2004.0          ram      2500       good  6 cylinders  diesel   
...      ...     ...          ...       ...        ...          ...     ...   
19557   5000  2009.0       toyota   corolla       good  4 cylinders     gas   
19558  25871  2017.0         ford  explorer  excellent  6 cylinders     gas   
19559  12995  2014.0       subaru   outback  excellent  4 cylinders     gas   
19560   9584  2012.0       toyota     camry  excellent  4 cylinders     gas   
19561   1000  2004.0         ford     f-150       fair  8 cylinders     gas   

       odometer title

# Insight of predictors and Data Visualization

Histogram of response/major predictors

# Methods
Multiple Linear Regression with Lasso panelty/ Random forest / XGboost

In [130]:
# Multiple linear regression code

In [131]:
# Random forest code

In [132]:
# XGboost code
data=df

n = ['price', 'odometer']
c = [i for i in data.columns.values if i not in n]
print(c)
print(n)
data[c] = data[c].astype('str')
data[n] = data[n].astype('float')

f = [i for i in data.columns.values if i not in n]

for col in f:
    print(data.groupby(col, as_index=False).agg({'price': ["mean"]}))

['year', 'manufacturer', 'model', 'condition', 'cylinders', 'fuel', 'title_status', 'transmission', 'drive', 'size', 'type', 'paint_color']
['price', 'odometer']
      year         price
                    mean
0   2000.0   4664.023256
1   2001.0   4640.145110
2   2002.0   5207.703704
3   2003.0   5437.128492
4   2004.0   6210.379209
5   2005.0   6186.873385
6   2006.0   8005.791277
7   2007.0   7387.309592
8   2008.0   9225.525570
9   2009.0   8039.509009
10  2010.0   9324.161002
11  2011.0  11070.032216
12  2012.0  11724.986815
13  2013.0  13783.539955
14  2014.0  14622.337849
15  2015.0  18584.741310
16  2016.0  19094.360689
17  2017.0  22959.329630
18  2018.0  24881.927132
19  2019.0  28086.332579
20  2020.0  34203.173913
   manufacturer         price
                         mean
0           bmw  11883.194175
1         buick  14457.606299
2      cadillac  18377.315789
3     chevrolet  14613.156896
4      chrysler   6848.048980
5         dodge  10971.994643
6          ford  14658.

In [133]:
print(data.shape)
data = pd.get_dummies(data)
print(data.shape)

(18208, 14)
(18208, 176)


In [134]:
from xgboost import XGBRegressor
XGB = XGBRegressor(max_depth=3,
                   learning_rate=0.1,
                   n_estimators=100,
                   verbosity=0,
                   objective='reg:linear',
                   booster='gbtree',
                   n_jobs=-1,
                   gamma=0.001,
                   subsample=0.632,
                   colsample_bytree=1,
                   colsample_bylevel=1,
                   colsample_bynode=1,
                   reg_alpha=1,                  # Regularizer for first fit. alpha = 1, lambda = 0 is LASSO.
                   reg_lambda=0,                 # Regularizer for first fit.
                   scale_pos_weight=1,
                   base_score=0.5,               # Global bias. Set to average of the target rate.
                   random_state=0,
                   missing=None
                  )

In [135]:
from sklearn.metrics import mean_absolute_error, r2_score

Xtrain, Xtest, ytrain, ytest = train_test_split(data.iloc[:, 1:], data['price'], test_size=0.25, random_state=0)

param_grid = dict({'n_estimators': [250, 300, 350],
                   'max_depth': [2, 3, 4],
                 'learning_rate' : [0.2, 0.25, 0.3]
                  })

from sklearn.model_selection import GridSearchCV
GridXGB = GridSearchCV(XGB,
                       param_grid,
                       cv = 10,  
                       scoring = 'neg_mean_absolute_error',
                       n_jobs = -1,
                       refit = False,
                       verbose = 0
                      )

GridXGB.fit(Xtrain, ytrain)

GridSearchCV(cv=10,
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1, gamma=0.001,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=0.1, max_delta_step=None,
                                    max_depth=3, min_child_weight=None,
                                    missing=None, monotone_constraints=None,
                                    n_estimators=100, n_jobs=-1,
                                    num_parallel_tree=None,
                                    objective='reg:linear', random_state=0,
                                    reg_alpha=1, reg_lambda=0,
                                    scale_pos_weight=1, subsample=0.632,
                                    tree_method=None, validate_

In [136]:
sk.metrics.SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted'])

In [137]:
GridXGB.best_params_

{'learning_rate': 0.3, 'max_depth': 4, 'n_estimators': 350}

In [138]:
from xgboost import XGBRegressor
XGB = XGBRegressor(max_depth=GridXGB.best_params_.get('max_depth'),
                   learning_rate=GridXGB.best_params_.get('learning_rate'),
                   n_estimators=GridXGB.best_params_.get('n_estimators'),
                   verbosity=0,
                   objective='reg:linear',
                   booster='gbtree',
                   n_jobs=-1,
                   gamma=0.001,
                   subsample=0.632,
                   colsample_bytree=1,
                   colsample_bylevel=1,
                   colsample_bynode=1,
                   reg_alpha=1,                  # Regularizer for first fit. alpha = 1, lambda = 0 is LASSO.
                   reg_lambda=0,                 # Regularizer for first fit.
                   scale_pos_weight=1,
                   base_score=0.5,               # Global bias. Set to average of the target rate.
                   random_state=0,
                   missing=None
                  )

XGB.fit(Xtrain, ytrain)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0.001, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.3, max_delta_step=0, max_depth=4,
             min_child_weight=1, missing=None, monotone_constraints='()',
             n_estimators=350, n_jobs=-1, num_parallel_tree=1,
             objective='reg:linear', random_state=0, reg_alpha=1, reg_lambda=0,
             scale_pos_weight=1, subsample=0.632, tree_method='exact',
             validate_parameters=1, verbosity=0)

In [139]:
predict = XGB.predict(Xtest)

print(mean_absolute_error(ytest.values, predict))
print(r2_score(ytest.values, predict))

1917.4645684639054
0.8991326740943864


In [140]:
# R squared score + MSE evaluation 