In [2]:
import category_encoders as ce
from sklearn.pipeline import make_pipeline 
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import r2_score
from pdpbox.pdp import pdp_isolate, pdp_plot
from pdpbox.pdp import pdp_interact,pdp_interact_plot
import eli5
from eli5.sklearn import PermutationImportance
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import shap
from joblib import dump
from joblib import load
%config IPCompleter.greedy=True

In [3]:
plt.rcParams['figure.dpi'] = 72

# Seed shared by both splits so the train/val/test partitions (and every
# metric computed from them) are reproducible after Restart & Run All.
SPLIT_SEED = 42

df1 = pd.read_csv('vehicles_trimmed1.csv')

# Work on an explicit copy: a plain `df2 = df1` only aliases the frame, so the
# price clean-up below would also rewrite the raw data held in df1.
df2 = df1.copy()

# A price of 0 is treated as missing, and rows without a price are dropped.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df2['price'] = df2['price'].replace(0, np.nan)
df2 = df2.dropna(subset=['price'])

# Trim extreme prices. np.percentile takes q on a 0-100 scale, so these bounds
# are the 0.05th and 99.95th percentiles (both ends inclusive).
# NOTE(review): if the 5th/95th percentiles were intended, use 5 and 95 - confirm.
price_low, price_high = np.percentile(df2['price'], [0.05, 99.95])
df2 = df2[(df2['price'] >= price_low) & (df2['price'] <= price_high)]

# 90/10 split into train_orig/test, then 90/10 again into train/val.
# 'county' and 'Unnamed: 0' are excluded from every split.
train_orig, test = train_test_split(
    df2.drop(columns=['county', 'Unnamed: 0']),
    train_size=0.9,
    test_size=0.1,
    random_state=SPLIT_SEED,
)
train, val = train_test_split(
    train_orig,
    train_size=0.9,
    test_size=0.1,
    random_state=SPLIT_SEED,
)
train

Unnamed: 0,id,region,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,image_url,state,lat,long
460424,7041984523,greenville / upstate,6988.0,2014.0,ford,focus,,,gas,93606.0,clean,automatic,,compact,hatchback,blue,https://images.craigslist.org/00606_aROTCCIfJk...,sc,34.8024,-82.4288
391825,7041243371,akron / canton,33500.0,2019.0,toyota,tacoma double cab trd,good,6 cylinders,gas,4449.0,clean,other,4wd,,pickup,,https://images.craigslist.org/01010_fs8g5MP7k0...,oh,41.0800,-81.5200
346582,7049691434,las vegas,2995.0,2007.0,chrysler,300,excellent,6 cylinders,gas,132436.0,clean,automatic,rwd,,sedan,silver,https://images.craigslist.org/00M0M_lNQ0HuEZSq...,nv,36.1520,-115.1090
194704,7040003434,columbus,20799.0,2015.0,cadillac,srx,,6 cylinders,gas,49240.0,clean,automatic,fwd,,SUV,silver,https://images.craigslist.org/00000_42TcqtbjRc...,ga,32.5488,-84.9241
217874,7038572235,southern illinois,13849.0,2013.0,ford,mustang,excellent,6 cylinders,gas,44995.0,clean,manual,rwd,mid-size,coupe,black,https://images.craigslist.org/00U0U_5DT73w4xuG...,il,38.0844,-89.3398
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
255462,7038325760,lake charles,2400.0,2002.0,pontiac,gran prix gt,excellent,6 cylinders,gas,93000.0,clean,automatic,fwd,,,black,https://images.craigslist.org/00808_gRogy7EwYQ...,la,29.8565,-95.5660
497378,7042513346,laredo,2500.0,1993.0,ford,ranger,good,6 cylinders,gas,33100.0,clean,manual,rwd,,,,https://images.craigslist.org/01616_iR9alxBP8H...,tx,27.5595,-99.5031
465120,7049879070,chattanooga,3000.0,2000.0,ford,ranger,fair,4 cylinders,gas,111000.0,clean,automatic,rwd,,pickup,green,https://images.craigslist.org/00n0n_hS1NjsQCBP...,tn,35.0942,-85.1757
116930,7042123685,san luis obispo,25990.0,2011.0,ford,f-250 super duty,good,,gas,38388.0,clean,automatic,,,pickup,silver,https://images.craigslist.org/00F0F_jXrTIFXSw5...,ca,36.5336,-119.5810


In [4]:
## Final model for app usage

# Column to predict, and the predictor columns used by the model.
target = 'price'
features1 = ['year', 'manufacturer', 'cylinders','fuel', 'odometer', 'drive']

# Feature frame and target series for each of the three splits.
X_train1, y_train = train[features1], train[target]
X_val1, y_val = val[features1], val[target]
X_test1, y_test = test[features1], test[target]

In [15]:
# Pipeline steps, in order: target encoding, median imputation, random forest.
encoder = ce.TargetEncoder()
imputer = SimpleImputer(strategy='median')
forest = RandomForestRegressor(
    n_estimators=40,
    max_depth=19,
    n_jobs=-3,
    random_state=30,
)

pipeline1 = make_pipeline(encoder, imputer, forest)
pipeline1.fit(X_train1, y_train)

# Score the fitted pipeline on the validation split.
y_pred = pipeline1.predict(X_val1)
for label, metric in (('MAE:', mean_absolute_error), ('R^2:', r2_score)):
    print(label, metric(y_val, y_pred))

MAE: 2991.0025998271412
R^2: 0.7736409571422505


In [20]:
# Persist the fitted pipeline to disk (compressed) for later loading.
dump(
    value=pipeline1,
    filename='pipeline_drive.joblib',
    compress=True,
)