## Modeling Using Only Year and Odometer

In [1]:
import random
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [2]:
n = 375619 #number of records in file
s = 10000 #desired sample size
filename = '../data/clean/df1.csv'
skip = sorted(random.sample(range(1,n+1),n-s))
df=pd.read_csv(filename, header=0, skiprows=skip)

In [3]:
df.shape

(10000, 21)

In [4]:
df.columns

Index(['state_area', 'price', 'year', 'manufacturer', 'model', 'condition',
       'cylinders', 'fuel', 'odometer', 'title', 'transmission', 'drive',
       'size', 'type', 'color', 'posting_date', 'State', 'region', 'division',
       'VIN_p', 'image_url_p'],
      dtype='object')

In [5]:
# Create target object and call it y
y = df['price']
# Create X
features = ['year','odometer']
X = df[features]

### DecisionTreeRegressor

In [6]:
# Split into validation and training data
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

In [7]:
# Specify Model
dtr = DecisionTreeRegressor(random_state=1)
# Fit Model
dtr.fit(train_X, train_y)

DecisionTreeRegressor(random_state=1)

In [8]:
val_predictions = dtr.predict(val_X)
val_mae = mean_absolute_error(val_predictions, val_y)
print("Validation MAE: {:,.0f}".format(val_mae))

Validation MAE: 9,280


In [9]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return(mae)

In [10]:
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]
# Write loop to find the ideal tree size from candidate_max_leaf_nodes
scores = {leaf_size: get_mae(leaf_size, train_X, val_X, train_y, val_y) for leaf_size in candidate_max_leaf_nodes}
best_tree_size = min(scores, key=scores.get)

# Store the best value of max_leaf_nodes (it will be either 5, 25, 50, 100, 250 or 500)
print(best_tree_size)

50


In [11]:
# Fill in argument to make optimal size and uncomment
final_model = DecisionTreeRegressor(max_leaf_nodes=5, random_state=1)

# fit the final model and uncomment the next two lines
final_model.fit(X, y)

DecisionTreeRegressor(max_leaf_nodes=5, random_state=1)

In [12]:
val_predictions = final_model.predict(val_X)
val_mae = mean_absolute_error(val_predictions, val_y)
print("Validation MAE: {:,.0f}".format(val_mae))

Validation MAE: 8,890


### Random Forest

In [13]:
from sklearn.ensemble import RandomForestRegressor

# Define the model. Set random_state to 1
rf_model = RandomForestRegressor(random_state=1)

# fit your model
rf_model.fit(train_X, train_y)

# Calculate the mean absolute error of your Random Forest model on the validation data
rf_val_predictions = rf_model.predict(val_X)
rf_val_mae = mean_absolute_error(rf_val_predictions, val_y)

print("Validation MAE for Random Forest Model: {}".format(rf_val_mae))

Validation MAE for Random Forest Model: 8407.30112552554


In [14]:
# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                      random_state=0)

In [15]:
from sklearn.ensemble import RandomForestRegressor

# Define the models
model_1 = RandomForestRegressor(n_estimators=50, random_state=0)
model_2 = RandomForestRegressor(n_estimators=100, random_state=0)
model_3 = RandomForestRegressor(n_estimators=100, criterion='mae', random_state=0)
model_4 = RandomForestRegressor(n_estimators=200, min_samples_split=20, random_state=0)
model_5 = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=0)

models = [model_1, model_2, model_3, model_4, model_5]

from sklearn.metrics import mean_absolute_error

# Function for comparing different models
def score_model(model, X_t=X_train, X_v=X_valid, y_t=y_train, y_v=y_valid):
    model.fit(X_t, y_t)
    preds = model.predict(X_v)
    return mean_absolute_error(y_v, preds)

for i in range(0, len(models)):
    mae = score_model(models[i])
    print("Model %d MAE: %d" % (i+1, mae))

## Modeling With Features

In [16]:
df.isna().sum().sort_values()

state_area         0
division           0
region             0
State              0
posting_date       0
VIN_p              0
odometer           0
image_url_p        0
year               0
price              0
transmission      29
fuel              53
model            112
title            162
manufacturer     241
color           1207
type            1284
condition       1849
drive           2283
cylinders       3768
size            6856
dtype: int64

In [17]:
#fill na with mode
df=df.fillna(df.mode().iloc[0])
df.isna().sum().sort_values()

state_area      0
division        0
region          0
State           0
posting_date    0
color           0
type            0
size            0
drive           0
VIN_p           0
transmission    0
odometer        0
fuel            0
cylinders       0
condition       0
model           0
manufacturer    0
year            0
price           0
title           0
image_url_p     0
dtype: int64

In [18]:
cat_to_dummy=df[['price','odometer','manufacturer', 'model', 'condition',
       'cylinders', 'size','fuel','title', 'transmission', 'drive', 'type', 'color','State']]

In [19]:
df1_final = pd.get_dummies(df, drop_first=True)
print(df1_final.columns)

Index(['price', 'year', 'odometer', 'VIN_p', 'image_url_p',
       'state_area_abilene', 'state_area_akron / canton', 'state_area_albany',
       'state_area_albuquerque', 'state_area_altoona-johnstown',
       ...
       'region_South', 'region_West', 'division_East South Central',
       'division_Middle Atlantic', 'division_Mountain', 'division_New England',
       'division_Pacific', 'division_South Atlantic',
       'division_West North Central', 'division_West South Central'],
      dtype='object', length=13933)


In [20]:
df1_final.sample(15)

Unnamed: 0,price,year,odometer,VIN_p,image_url_p,state_area_abilene,state_area_akron / canton,state_area_albany,state_area_albuquerque,state_area_altoona-johnstown,...,region_South,region_West,division_East South Central,division_Middle Atlantic,division_Mountain,division_New England,division_Pacific,division_South Atlantic,division_West North Central,division_West South Central
3902,15590,2016,53066,True,True,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
3662,7100,2007,150200,True,True,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9566,24997,2017,46564,True,True,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
747,3000,1998,144000,True,True,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
3505,5695,2009,140852,True,True,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4671,21990,2018,51773,True,True,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
7082,5495,2003,90000,True,True,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
5902,45999,2018,72234,True,True,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
9232,29990,2017,31622,True,True,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
9784,37590,2015,62718,True,True,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0


In [21]:
# Create target object and call it y
y = df1_final.price
# Create X
X = df1_final

In [22]:
X.columns

Index(['price', 'year', 'odometer', 'VIN_p', 'image_url_p',
       'state_area_abilene', 'state_area_akron / canton', 'state_area_albany',
       'state_area_albuquerque', 'state_area_altoona-johnstown',
       ...
       'region_South', 'region_West', 'division_East South Central',
       'division_Middle Atlantic', 'division_Mountain', 'division_New England',
       'division_Pacific', 'division_South Atlantic',
       'division_West North Central', 'division_West South Central'],
      dtype='object', length=13933)

In [23]:
X.drop(['price'],axis=1, inplace=True)

In [24]:
X.columns

Index(['year', 'odometer', 'VIN_p', 'image_url_p', 'state_area_abilene',
       'state_area_akron / canton', 'state_area_albany',
       'state_area_albuquerque', 'state_area_altoona-johnstown',
       'state_area_amarillo',
       ...
       'region_South', 'region_West', 'division_East South Central',
       'division_Middle Atlantic', 'division_Mountain', 'division_New England',
       'division_Pacific', 'division_South Atlantic',
       'division_West North Central', 'division_West South Central'],
      dtype='object', length=13932)

### DecisionTreeRegressor

In [25]:
# Split into validation and training data
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

In [26]:
# Specify Model
dtr = DecisionTreeRegressor(random_state=1)
# Fit Model
dtr.fit(train_X, train_y)

DecisionTreeRegressor(random_state=1)

In [27]:
val_predictions = dtr.predict(val_X)
val_mae = mean_absolute_error(val_predictions, val_y)
print("Validation MAE: {:,.0f}".format(val_mae))

Validation MAE: 5,499


In [28]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return(mae)

In [29]:
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]
# Write loop to find the ideal tree size from candidate_max_leaf_nodes
scores = {leaf_size: get_mae(leaf_size, train_X, val_X, train_y, val_y) for leaf_size in candidate_max_leaf_nodes}
best_tree_size = min(scores, key=scores.get)

# Store the best value of max_leaf_nodes (it will be either 5, 25, 50, 100, 250 or 500)
print(best_tree_size)

500


In [30]:
# Fill in argument to make optimal size and uncomment
final_model = DecisionTreeRegressor(max_leaf_nodes=500, random_state=1)

# fit the final model and uncomment the next two lines
final_model.fit(X, y)

DecisionTreeRegressor(max_leaf_nodes=500, random_state=1)

In [31]:
val_predictions = final_model.predict(val_X)
val_mae = mean_absolute_error(val_predictions, val_y)
print("Validation MAE: {:,.0f}".format(val_mae))

Validation MAE: 3,519


In [None]:
# Import necessary modules
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge

# Instantiate a ridge regressor: ridge
ridge = Ridge(alpha=0.5, normalize=True)

# Perform 5-fold cross-validation: ridge_cv
ridge_cv = cross_val_score(ridge, X, y, cv=5)

# Print the cross-validated scores
print(ridge_cv)

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Ridge())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * n_samples. 
If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Ridge())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alp

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Define the model. Set random_state to 1
rf_model = RandomForestRegressor(random_state=1)

# fit your model
rf_model.fit(train_X, train_y)

# Calculate the mean absolute error of your Random Forest model on the validation data
rf_val_predictions = rf_model.predict(val_X)
rf_val_mae = mean_absolute_error(rf_val_predictions, val_y)

print("Validation MAE for Random Forest Model: {}".format(rf_val_mae))

In [None]:
# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                      random_state=0)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Define the models
model_1 = RandomForestRegressor(n_estimators=50, random_state=0)
model_2 = RandomForestRegressor(n_estimators=100, random_state=0)
model_3 = RandomForestRegressor(n_estimators=100, criterion='mae', random_state=0)
model_4 = RandomForestRegressor(n_estimators=200, min_samples_split=20, random_state=0)
model_5 = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=0)

models = [model_1, model_2, model_3, model_4, model_5]

from sklearn.metrics import mean_absolute_error

# Function for comparing different models
def score_model(model, X_t=X_train, X_v=X_valid, y_t=y_train, y_v=y_valid):
    model.fit(X_t, y_t)
    preds = model.predict(X_v)
    return mean_absolute_error(y_v, preds)

for i in range(0, len(models)):
    mae = score_model(models[i])
    print("Model %d MAE: %d" % (i+1, mae))

## Pre-processing and training data development
Standardize and train your dataset

In [None]:
# calculate correlation matrix
corr = df.corr()# plot the heatmap
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns, annot=True, cmap=sns.diverging_palette(220, 20, as_cmap=True))

In [None]:
cat_features=df.columns

In [None]:
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
encoded=df[cat_features].apply(encoder.fit_transform)
dfud=df.drop(cat_features,axis=1)
dfuc=pd.concat([encoded,dfud],axis=1)
dfuc.head(1)

In [None]:
# calculate correlation matrix
sns.set(style='white')
corr = dfuc.drop(columns=['VIN_p','image_url_p']).corr()# plot the heatmap
mask=np.zeros_like(corr,dtype=bool)
mask[np.triu_indices_from(mask)]=True
f,ax=plt.subplots(figsize=(18,15))
cmap=sns.diverging_palette(220,10,as_cmap=True)
sns.heatmap(corr, mask=mask,cmap=cmap,xticklabels=corr.columns, yticklabels=corr.columns, annot=True)

In [None]:
from sklearn.preprocessing import StandardScaler
X_head = df1_final.iloc[:, df1_final.columns != 'price']
X = df1_final.loc[:, df1_final.columns != 'price']
y = df1_final['price']
X = StandardScaler().fit_transform(X)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error as mae
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=0)
model = RandomForestRegressor(random_state=1)
model.fit(X_train, y_train)
pred = model.predict(X_test)

In [None]:
print(mae(y_test, pred))
print(df1_final['price'].mean())
print(model.score(X_test,y_test))

In [None]:
#We chose Random Forest algorith for this project
#Let's do Cross Validation to check the overal score in the Training Set
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

scores = []
forest = RandomForestRegressor(n_estimators=20, random_state=0)
acc = cross_val_score(forest, X_train, y_train, scoring='r2', cv=5)
scores.append(round(acc.mean()*100,2))

In [None]:
results = pd.DataFrame({
    'Metrics': ['R2'],
    'Accuracy': scores})
results

In [None]:
feat_importances = pd.Series(model.feature_importances_, index=X_head.columns)
feat_importances.nlargest(10).plot(kind='barh',figsize=(10,10))