In [2]:
import category_encoders as ce
from sklearn.pipeline import make_pipeline 
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import r2_score
from pdpbox.pdp import pdp_isolate, pdp_plot
from pdpbox.pdp import pdp_interact,pdp_interact_plot
import eli5
from eli5.sklearn import PermutationImportance
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import shap
from joblib import dump
from joblib import load
%config IPCompleter.greedy=True

In [3]:
# Load the listings, discard rows without a usable price, trim extreme price
# outliers, and carve out reproducible train / validation / test splits.
plt.rcParams['figure.dpi'] = 72

# Seed for both splits so the partitions (and every metric computed from
# them) are identical after Restart Kernel -> Run All.
SPLIT_SEED = 30

df1 = pd.read_csv('vehicles_trimmed1.csv')

# Work on a copy: a plain `df2 = df1` only aliases the frame, so the price
# replacement below would silently rewrite the raw data held in `df1`.
df2 = df1.copy()

# A price of 0 is treated as missing. `np.nan` is used because the `np.NaN`
# alias was removed in NumPy 2.0.
df2['price'] = df2['price'].replace(0, np.nan)
df2 = df2.dropna(subset=['price'])

# Keep prices between the 0.05th and 99.95th percentiles (inclusive); both
# bounds are computed on the same, not-yet-trimmed price column.
price_low, price_high = np.percentile(df2['price'], [0.05, 99.95])
df2 = df2[df2['price'].between(price_low, price_high)]

# 90 % / 10 % hold-out test split, then 90 % / 10 % train / validation split
# of the remainder. 'county' and 'Unnamed: 0' are dropped before splitting.
train_orig, test = train_test_split(
    df2.drop(columns=['county', 'Unnamed: 0']),
    test_size=0.1,
    train_size=0.9,
    random_state=SPLIT_SEED,
)
train, val = train_test_split(
    train_orig,
    train_size=0.9,
    test_size=0.1,
    random_state=SPLIT_SEED,
)
train

Unnamed: 0,id,region,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,image_url,state,lat,long
266162,7048248436,western maryland,23200.0,2012.0,gmc,sierra 1500 crew cab slt,good,8 cylinders,other,73013.0,clean,other,4wd,,pickup,white,https://images.craigslist.org/00909_bzfZpc1DRO...,md,33.7865,-84.4454
248764,7036158133,huntington-ashland,10950.0,2010.0,chevrolet,silverado 2500hd,good,8 cylinders,gas,201421.0,clean,automatic,4wd,full-size,truck,white,https://images.craigslist.org/00i0i_jvm6abB3vI...,ky,38.6926,-82.8577
207785,7049056581,spokane / coeur d'alene,18998.0,2014.0,ford,f-150 stx extended cab sho,,,gas,89853.0,clean,automatic,4wd,,,,https://images.craigslist.org/00l0l_4pRJbQp5C7...,id,47.7989,-116.7420
322900,7043977188,missoula,10998.0,2015.0,kia,soul +,,4 cylinders,gas,77286.0,clean,automatic,fwd,,sedan,,https://images.craigslist.org/00g0g_lyyrsa2tJm...,mt,47.6961,-116.7810
43464,7042187039,madison,12950.0,2011.0,toyota,tacoma sr5,,6 cylinders,gas,206762.0,clean,automatic,4wd,,truck,silver,https://images.craigslist.org/00808_9Z9wnnui65...,wi,42.9918,-89.0209
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
424777,7049368826,medford-ashland,5200.0,2010.0,buick,lucerne cxl,,,gas,137200.0,clean,manual,,,,,https://images.craigslist.org/00F0F_ftnQZbcVzw...,or,42.3384,-122.8760
285534,7044816665,grand rapids,1600.0,2001.0,mercedes-benz,m-class,,,gas,201013.0,clean,automatic,,,,green,https://images.craigslist.org/01313_5H2XdKlzqg...,mi,43.0120,-85.6309
281146,7047327301,detroit metro,8488.0,2012.0,chevrolet,cruze,,,gas,94963.0,clean,automatic,fwd,,,red,https://images.craigslist.org/00I0I_9RSPCVrrtt...,mi,42.5572,-82.8980
220829,7047995727,"st louis, MO",3200.0,2006.0,honda,odyssey ex-l,,,gas,,clean,automatic,,,,,https://images.craigslist.org/00L0L_eARZmYIaqr...,il,38.5855,-90.2929


In [4]:
## Final model for app usage

# `price` is the label; the predictors are restricted to the six columns
# listed in `features1`. The same column selection is applied to each split.
target = 'price'
features1 = [
    'year',
    'manufacturer',
    'cylinders',
    'fuel',
    'odometer',
    'drive',
]

X_train1, y_train = train[features1], train[target]
X_val1, y_val = val[features1], val[target]
X_test1, y_test = test[features1], test[target]

In [15]:
# Three-stage pipeline: target encoding (no `cols` passed, so the encoder
# chooses which columns to encode), median imputation of remaining gaps,
# then a random-forest regressor with a fixed seed.
forest = RandomForestRegressor(
    n_estimators=40,
    max_depth=19,
    n_jobs=-3,
    random_state=30,
)
pipeline1 = make_pipeline(
    ce.TargetEncoder(),
    SimpleImputer(strategy='median'),
    forest,
)
pipeline1.fit(X_train1, y_train)

# Score on the validation split only; the test split is not touched here.
y_pred = pipeline1.predict(X_val1)
val_mae = mean_absolute_error(y_val, y_pred)
val_r2 = r2_score(y_val, y_pred)
print('MAE:', val_mae)
print('R^2:', val_r2)

MAE: 2991.0025998271412
R^2: 0.7736409571422505


In [20]:
# Persist the fitted pipeline to disk (compressed) under 'pipeline_drive.joblib'.
dump(pipeline1, filename='pipeline_drive.joblib', compress=True)

In [8]:
# Show the 50 smallest `year` values in the training features, in ascending order.
years_ascending = X_train1['year'].sort_values()
years_ascending.head(50)

36150     1900.0
404914    1900.0
37497     1900.0
240181    1900.0
135695    1900.0
375986    1900.0
216206    1900.0
493254    1900.0
172538    1900.0
98284     1901.0
198529    1901.0
99710     1902.0
440886    1903.0
270222    1911.0
221201    1912.0
506734    1912.0
277414    1915.0
270221    1916.0
63015     1917.0
10432     1917.0
243577    1918.0
112962    1918.0
225221    1918.0
63014     1919.0
189185    1920.0
385092    1920.0
66728     1920.0
222484    1922.0
229251    1922.0
108603    1923.0
368819    1923.0
87525     1923.0
91438     1923.0
184011    1923.0
460177    1923.0
328492    1923.0
389042    1923.0
81850     1923.0
68831     1923.0
78202     1923.0
59450     1923.0
161130    1923.0
228143    1923.0
87001     1923.0
146181    1923.0
87093     1923.0
155142    1923.0
168781    1923.0
417388    1923.0
461146    1923.0
Name: year, dtype: float64