In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json

In [2]:
# Open data
raw_data = pd.read_csv('vehicles.csv')

raw_data.head()

Unnamed: 0,id,url,region,region_url,price,year,manufacturer,model,condition,cylinders,...,drive,size,type,paint_color,image_url,description,county,state,lat,long
0,7034441763,https://saltlakecity.craigslist.org/cto/d/salt...,salt lake city,https://saltlakecity.craigslist.org,17899,2012.0,volkswagen,golf r,excellent,4 cylinders,...,4wd,compact,hatchback,black,https://images.craigslist.org/00G0G_fTLDWM5Xyv...,PRICE REDUCED! -Garage kept -Low Miles (63K)...,,ut,40.7372,-111.858
1,7034440610,https://saltlakecity.craigslist.org/ctd/d/sand...,salt lake city,https://saltlakecity.craigslist.org,0,2016.0,ford,f-150,excellent,,...,4wd,,,,https://images.craigslist.org/00v0v_7Cu0buIofU...,Drive it home today. Call (Or Text) us now !!C...,,ut,40.5881,-111.884
2,7034440588,https://saltlakecity.craigslist.org/ctd/d/sand...,salt lake city,https://saltlakecity.craigslist.org,46463,2015.0,gmc,sierra 1500,excellent,,...,4wd,,,white,https://images.craigslist.org/01515_lPvJ9bfbdY...,Drive it home today. Call (Or Text) us now !!C...,,ut,40.5881,-111.884
3,7034440546,https://saltlakecity.craigslist.org/ctd/d/sand...,salt lake city,https://saltlakecity.craigslist.org,0,2016.0,ford,f-150,excellent,,...,4wd,,,,https://images.craigslist.org/00T0T_6Rjfp3NS4O...,Drive it home today. Call (Or Text) us now !!C...,,ut,40.5881,-111.884
4,7034406932,https://saltlakecity.craigslist.org/ctd/d/evan...,salt lake city,https://saltlakecity.craigslist.org,49999,2018.0,ford,f-450,,,...,4wd,,pickup,white,https://images.craigslist.org/00W0W_8yIUwRBXXd...,2018 Ford F-350 F350 F 350 SD Lariat Crew Cab ...,,ut,40.3744,-104.694


In [3]:
# Get Pickups & Trucks

trucks = raw_data[(raw_data['type'] == 'pickup') | (raw_data['type'] == 'truck')]

trucks.type.value_counts()

pickup    49547
truck     49510
Name: type, dtype: int64

In [4]:
# Top 20 Manufacturers

manu = trucks['manufacturer'].value_counts()

top_manu = manu[0:20]

top_manu.index

Index(['ford', 'chevrolet', 'ram', 'gmc', 'toyota', 'nissan', 'dodge', 'honda',
       'jeep', 'mitsubishi', 'cadillac', 'mazda', 'volvo', 'lincoln',
       'harley-davidson', 'mercedes-benz', 'subaru', 'volkswagen', 'bmw',
       'datsun'],
      dtype='object')

In [5]:
# Restrict data to top 20 manufacturers

top_trucks = trucks[trucks['manufacturer'].isin(top_manu.index)]

top_trucks.head()

Unnamed: 0,id,url,region,region_url,price,year,manufacturer,model,condition,cylinders,...,drive,size,type,paint_color,image_url,description,county,state,lat,long
4,7034406932,https://saltlakecity.craigslist.org/ctd/d/evan...,salt lake city,https://saltlakecity.craigslist.org,49999,2018.0,ford,f-450,,,...,4wd,,pickup,white,https://images.craigslist.org/00W0W_8yIUwRBXXd...,2018 Ford F-350 F350 F 350 SD Lariat Crew Cab ...,,ut,40.3744,-104.694
5,7034406582,https://saltlakecity.craigslist.org/ctd/d/evan...,salt lake city,https://saltlakecity.craigslist.org,13999,2009.0,ram,,,,...,4wd,,pickup,silver,https://images.craigslist.org/00y0y_RR9x2lk7eh...,"2009 RAM 1500 4WD Quad Cab 140.5"" ST"" Offe...",,ut,40.3744,-104.694
6,7034405619,https://saltlakecity.craigslist.org/ctd/d/evan...,salt lake city,https://saltlakecity.craigslist.org,34500,2017.0,ford,f-350,,,...,4wd,,pickup,white,https://images.craigslist.org/00k0k_lIBatiU90s...,2017 Ford Super Duty F-350 F350 F 350 SRW Crew...,,ut,40.3744,-104.694
7,7034405349,https://saltlakecity.craigslist.org/ctd/d/evan...,salt lake city,https://saltlakecity.craigslist.org,14500,2007.0,gmc,sierra,,,...,4wd,,pickup,,https://images.craigslist.org/00202_jTNQzQVWZs...,2007 GMC Sierra 1500 SLE Crew Cab 4WD Offe...,,ut,40.3744,-104.694
8,7034404595,https://saltlakecity.craigslist.org/ctd/d/evan...,salt lake city,https://saltlakecity.craigslist.org,12500,2013.0,ford,f-250,,,...,4wd,,pickup,white,https://images.craigslist.org/00F0F_99lwfqrZWc...,2013 Ford F-250 F250 F 250 SD XLT 4WD Offe...,,ut,40.3744,-104.694


In [6]:
# Get relevant columns
data = top_trucks[['price', 'year', 'condition', 'odometer', 'cylinders', 'drive', 'manufacturer', 'model']].dropna()

data.head()

Unnamed: 0,price,year,condition,odometer,cylinders,drive,manufacturer,model
48,28000,2004.0,good,67348.0,10 cylinders,4wd,ford,f550 mechanics service
69,24700,2010.0,excellent,262000.0,6 cylinders,4wd,ram,diesel cummins 3500
135,38995,2016.0,good,38086.0,8 cylinders,4wd,ford,f-150
161,9995,2008.0,excellent,200000.0,4 cylinders,rwd,gmc,w4500 hd dsl reg
163,29000,2014.0,excellent,32000.0,8 cylinders,4wd,ford,f-150


In [7]:
# Condition function

def conv_cond(val):
    if val == 'new':
        return 5
    elif val == 'like new':
        return 4
    elif val == 'excellent':
        return 3
    elif val == 'good':
        return 2
    elif val == 'fair':
        return 1
    else:
        return 0

In [8]:
# Condition

data['condition'] = data['condition'].apply(conv_cond)

print(data.condition.value_counts().sort_index())

data.head()

0       83
1     1568
2    15463
3    19676
4     3699
5      176
Name: condition, dtype: int64


Unnamed: 0,price,year,condition,odometer,cylinders,drive,manufacturer,model
48,28000,2004.0,2,67348.0,10 cylinders,4wd,ford,f550 mechanics service
69,24700,2010.0,3,262000.0,6 cylinders,4wd,ram,diesel cummins 3500
135,38995,2016.0,2,38086.0,8 cylinders,4wd,ford,f-150
161,9995,2008.0,3,200000.0,4 cylinders,rwd,gmc,w4500 hd dsl reg
163,29000,2014.0,3,32000.0,8 cylinders,4wd,ford,f-150


In [9]:
# Price

data = data[(data['price'] > 100) & (data['price'] <= 80000)]

print(len(data))

data['price'].describe()

38198


count    38198.000000
mean     17372.574507
std      11552.543713
min        130.000000
25%       7995.000000
50%      15400.000000
75%      24900.000000
max      78800.000000
Name: price, dtype: float64

In [10]:
# Odometer

data = data[data['odometer'] < 900000]

data['odometer'].describe()

count     38132.000000
mean     122203.308507
std       70171.879135
min           0.000000
25%       71009.000000
50%      118492.000000
75%      167025.000000
max      885148.000000
Name: odometer, dtype: float64

In [11]:
# Cylinders Function

def cyl_conv(val):
    items = val.split(' ')
    return int(items[0])

In [12]:
# Cylinders

data = data[data['cylinders'] != 'other']
data['cylinders'] = data['cylinders'].apply(cyl_conv)

data.head()

Unnamed: 0,price,year,condition,odometer,cylinders,drive,manufacturer,model
48,28000,2004.0,2,67348.0,10,4wd,ford,f550 mechanics service
69,24700,2010.0,3,262000.0,6,4wd,ram,diesel cummins 3500
135,38995,2016.0,2,38086.0,8,4wd,ford,f-150
161,9995,2008.0,3,200000.0,4,rwd,gmc,w4500 hd dsl reg
163,29000,2014.0,3,32000.0,8,4wd,ford,f-150


# ML - Reg

In [14]:
# ML Imports
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing


In [15]:
# Get dummies (removed veh model)

data_ml = pd.get_dummies(data.iloc[:, 0:7])

data_ml.head()

Unnamed: 0,price,year,condition,odometer,cylinders,drive_4wd,drive_fwd,drive_rwd,manufacturer_bmw,manufacturer_cadillac,...,manufacturer_lincoln,manufacturer_mazda,manufacturer_mercedes-benz,manufacturer_mitsubishi,manufacturer_nissan,manufacturer_ram,manufacturer_subaru,manufacturer_toyota,manufacturer_volkswagen,manufacturer_volvo
48,28000,2004.0,2,67348.0,10,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,24700,2010.0,3,262000.0,6,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
135,38995,2016.0,2,38086.0,8,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
161,9995,2008.0,3,200000.0,4,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
163,29000,2014.0,3,32000.0,8,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Split data

X = data_ml.iloc[:, 1:]
y = data_ml.iloc[:, 0]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

In [17]:
# Fit Model

model1 = LinearRegression()
model1.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [18]:
# Eval

print(f'''
Train: {model1.score(X_train, y_train)}
Test: {model1.score(X_test, y_test)}''')


Train: 0.5479050993637659
Test: 0.5299792603400852


In [19]:
from joblib import dump, load
dump(model1, 'model2.joblib')

['model2.joblib']

# ML - KNN

In [20]:
# Imports

from sklearn.neighbors import KNeighborsClassifier

In [21]:
data1 = data.copy()
data1.head()

Unnamed: 0,price,year,condition,odometer,cylinders,drive,manufacturer,model
48,28000,2004.0,2,67348.0,10,4wd,ford,f550 mechanics service
69,24700,2010.0,3,262000.0,6,4wd,ram,diesel cummins 3500
135,38995,2016.0,2,38086.0,8,4wd,ford,f-150
161,9995,2008.0,3,200000.0,4,rwd,gmc,w4500 hd dsl reg
163,29000,2014.0,3,32000.0,8,4wd,ford,f-150


In [22]:
# Price bins

bin1 = [500, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000, 15000, 20000, 25000, 30000, 35000, 40000, 45000, 50000, 55000, 60000, 65000, 70000, 75000, 80000]

len(bin1)

25

In [23]:
# Price Converter

def price_conv(val, binner):
    for i in range(len(binner)):
        if val <= binner[i]:
            return i

In [24]:
data1['price_range'] = data1['price'].apply(price_conv, args=(bin1,))

data1.head()

Unnamed: 0,price,year,condition,odometer,cylinders,drive,manufacturer,model,price_range
48,28000,2004.0,2,67348.0,10,4wd,ford,f550 mechanics service,14
69,24700,2010.0,3,262000.0,6,4wd,ram,diesel cummins 3500,13
135,38995,2016.0,2,38086.0,8,4wd,ford,f-150,16
161,9995,2008.0,3,200000.0,4,rwd,gmc,w4500 hd dsl reg,10
163,29000,2014.0,3,32000.0,8,4wd,ford,f-150,14


In [25]:
# Get dummies
data1_ml = pd.get_dummies(data1.drop(columns=['model', 'price']))

data1_ml.head()

Unnamed: 0,year,condition,odometer,cylinders,price_range,drive_4wd,drive_fwd,drive_rwd,manufacturer_bmw,manufacturer_cadillac,...,manufacturer_lincoln,manufacturer_mazda,manufacturer_mercedes-benz,manufacturer_mitsubishi,manufacturer_nissan,manufacturer_ram,manufacturer_subaru,manufacturer_toyota,manufacturer_volkswagen,manufacturer_volvo
48,2004.0,2,67348.0,10,14,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,2010.0,3,262000.0,6,13,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
135,2016.0,2,38086.0,8,16,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
161,2008.0,3,200000.0,4,10,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
163,2014.0,3,32000.0,8,14,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# Split data

X = data1_ml.drop(columns=['price_range'])
y = data1_ml['price_range']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

In [27]:
# Fit model
neigh2 = KNeighborsClassifier(n_neighbors=1)
neigh2.fit(X, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [28]:
# Evaluation

print(f'''
Train: {neigh2.score(X_train, y_train)}
Test: {neigh2.score(X_test, y_test)}''')


Train: 0.9826358115870671
Test: 0.979256925298674


In [29]:
from joblib import dump, load
dump(neigh2, 'neigh2.joblib')

['neigh2.joblib']