In [1]:
import category_encoders as ce
from sklearn.pipeline import make_pipeline 
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import r2_score
from pdpbox.pdp import pdp_isolate, pdp_plot
from pdpbox.pdp import pdp_interact,pdp_interact_plot
import eli5
from eli5.sklearn import PermutationImportance
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import shap
from joblib import dump
from joblib import load
%config IPCompleter.greedy=True

In [2]:
plt.rcParams['figure.dpi'] = 72

# Fixed seed so the train/validation/test partitions (and every score computed
# from them) are identical after Restart Kernel -> Run All.
RANDOM_STATE = 30

df1 = pd.read_csv('vehicles_trimmed1.csv')

# Work on a copy: a plain `df2 = df1` only aliases the frame, so the price
# clean-up below would silently rewrite df1 as well.
df2 = df1.copy()

# A price of 0 is treated as missing, and rows without a price are dropped.
# (np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.)
df2['price'] = df2['price'].replace(0, np.nan)
df2 = df2.dropna(subset=['price'])

# Keep only prices between the 0.05th and 99.95th percentiles (inclusive).
# Both bounds are computed on the same, not-yet-filtered price column.
price_lo, price_hi = np.percentile(df2['price'], [0.05, 99.95])
df2 = df2[(df2['price'] >= price_lo) & (df2['price'] <= price_hi)]

# 90/10 split into train_orig/test, then 90/10 of train_orig into train/val.
train_orig, test = train_test_split(
    df2.drop(columns=['county', 'Unnamed: 0']),
    train_size=0.9, test_size=0.1, random_state=RANDOM_STATE,
)
train, val = train_test_split(
    train_orig, train_size=0.9, test_size=0.1, random_state=RANDOM_STATE,
)
train

Unnamed: 0,id,region,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,image_url,state,lat,long
342720,7039828073,winston-salem,12500.0,2008.0,gmc,yukon denali,excellent,8 cylinders,gas,170000.0,clean,automatic,4wd,,,black,https://images.craigslist.org/00202_6N5uzlBkMt...,nc,35.8713,-80.0913
315721,7037670388,springfield,1200.0,1999.0,volvo,s70,excellent,5 cylinders,gas,188251.0,clean,automatic,fwd,,sedan,black,https://images.craigslist.org/01515_9VulAssReZ...,mo,38.0571,-91.4017
33967,7049504841,appleton-oshkosh-FDL,2000.0,2005.0,chevrolet,cobalt,good,4 cylinders,gas,110000.0,clean,automatic,fwd,,,blue,https://images.craigslist.org/00404_7x8Csvw5Dc...,wi,44.3003,-88.5365
131379,7042115085,boulder,14200.0,2017.0,ford,escape s sport utility 4d,good,,gas,39673.0,clean,automatic,fwd,,other,orange,https://images.craigslist.org/00z0z_gAfIVcqk1m...,co,33.7792,-84.4118
8628,7049047317,norfolk / hampton roads,26866.0,2018.0,chevrolet,express 12 pass van,excellent,8 cylinders,gas,19591.0,clean,automatic,rwd,full-size,van,white,https://images.craigslist.org/00t0t_73qLnDUrpH...,va,36.6883,-76.2305
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345135,7040555205,lincoln,11496.0,2011.0,ford,f150 supercrew cab,,8 cylinders,other,207123.0,clean,automatic,4wd,,pickup,white,https://images.craigslist.org/00b0b_iVh2xQ3ey0...,ne,40.7378,-96.8445
227166,7043531495,lafayette / west lafayette,1999.0,1995.0,jeep,cherokee,,,gas,195000.0,clean,automatic,,,,,https://images.craigslist.org/00o0o_1S6zyeAuv1...,in,40.5734,-86.6788
417509,7045559064,bend,19950.0,2017.0,kia,sportage ex,,4 cylinders,gas,31944.0,clean,automatic,4wd,,SUV,,https://images.craigslist.org/00s0s_8vZvD5EK8F...,or,45.4983,-122.7650
342771,7039379159,winston-salem,9800.0,2011.0,honda,accord,,6 cylinders,gas,120997.0,clean,automatic,fwd,,sedan,black,https://images.craigslist.org/00909_gr7iTQ2n2E...,nc,35.8353,-80.2427


In [3]:
## Final model for app usage

# Column to predict and the predictor columns used by the final model.
target = 'price'
features1 = ['year', 'manufacturer', 'cylinders', 'fuel', 'odometer']

# For each partition, pull out the feature frame and the target series.
X_train1, y_train = train[features1], train[target]
X_val1, y_val = val[features1], val[target]
X_test1, y_test = test[features1], test[target]

In [9]:
# Pipeline stages, in order: target-encode the features, fill remaining gaps
# with the column median, then fit a random forest regressor.
pipeline1 = make_pipeline(
    ce.TargetEncoder(),
    SimpleImputer(strategy='median'),
    RandomForestRegressor(
        n_estimators=50,
        max_depth=30,
        random_state=30,
        n_jobs=-3,
    ),
)

# Fit on the training partition and predict the validation partition
# (fit() returns the fitted pipeline itself, so the calls can be chained).
y_pred = pipeline1.fit(X_train1, y_train).predict(X_val1)

# Report validation error and explained variance.
print('MAE:', mean_absolute_error(y_val, y_pred))
print('R^2:', r2_score(y_val, y_pred))

MAE: 2886.2397449386103
R^2: 0.7651232850307356


In [5]:
# Disabled on purpose: uncomment the next line to write the fitted pipeline1
# to a compressed joblib file on disk.
# NOTE(review): the file name says 'drive', but features1 has no 'drive'
# column -- confirm the name is still the intended one before saving.
# dump(pipeline1, 'pipeline_drive.joblib', compress=True)

In [14]:
# Distinct manufacturer values (NaN included) present in the training features.
X_train1.loc[:, 'manufacturer'].unique()

array(['gmc', 'volvo', 'chevrolet', 'ford', 'mercedes-benz', 'kia',
       'jeep', 'mazda', 'acura', 'ram', 'bmw', 'hyundai', 'chrysler',
       'honda', 'subaru', nan, 'volkswagen', 'toyota', 'nissan',
       'infiniti', 'lincoln', 'mini', 'audi', 'dodge', 'cadillac',
       'buick', 'lexus', 'pontiac', 'rover', 'ferrari', 'fiat', 'saturn',
       'mitsubishi', 'harley-davidson', 'mercury', 'jaguar', 'alfa-romeo',
       'porche', 'tesla', 'datsun', 'aston-martin', 'land rover',
       'morgan', 'hennessey'], dtype=object)