In [11]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
%matplotlib inline
import random as rand
from sklearn.preprocessing import OneHotEncoder,LabelBinarizer,StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR,LinearSVR
from sklearn.linear_model import Ridge,Lasso,QuantileRegressor,HuberRegressor,LinearRegression
from sklearn.ensemble import RandomForestRegressor,BaggingRegressor,GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor

In [12]:
# df = pd.read_csv('vehicles.csv')
# # df.groupby(['manufacturer']).size()

# df = df[df['manufacturer']=='ford']
# df = df.loc[:, df.columns.isin(all_label)]
# df.to_csv('ford.csv',index=False)

# Read Data

In [235]:
# read data
data = pd.read_csv('toyota.csv')
data.shape

(34202, 20)

# Preprocessing

In [236]:
# all useful labels
all_label = [
         'price',
         'region',
         'year', 
         'manufacturer',     
         'model', 
         'condition', 
         'cylinders', 
         'fuel', 
         'odometer', 
         'title_status',
         'transmission', 
         'drive', 
         'size', 
         'type', 
         'paint_color',
         'description', 
         'state', 
         'lat', 
         'long']

labels_real = ['odometer']

# labels_cate = [
#             'model',
#             'region',
#             'year',  
#             'condition', 
#             'cylinders', 
#             'fuel',  
#             'title_status',
#             'transmission', 
#             'drive', 
#             'size', 
#             'type', 
#             'paint_color',
#             'state']

labels_cate = ['region',
               'year',
              'model',
              'cylinders',
              'condition', 
              'fuel', 
              'title_status',
              'transmission', 
              'drive', 
              'paint_color']

# labels_cate = ['model',
#               'region',
#               'condition', 
#               'title_status',
#               'paint_color']

In [237]:
# only look at useful labels above
data = data.loc[:, data.columns.isin(labels_real+labels_cate+['price','model'])]

# drop columns with N/A
data = data.dropna()

# drop price outliers
Q1 = data['price'].quantile(0.025)
Q3 = data['price'].quantile(0.975)

data = data[(data['price']>Q1) & (data['price']<Q3)]

# drop duplicated rows
#data = data.drop_duplicates()

# filter out model with sample <=10
data = data.groupby('model').filter(lambda x: len(x) > 10)

data = data.sort_values(by=['price'])
data.shape

(8569, 12)

In [238]:
data

Unnamed: 0,region,price,year,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,paint_color
8907,daytona beach,1,2015.0,camry,excellent,4 cylinders,gas,50566.0,clean,automatic,fwd,black
9621,lakeland,1,2016.0,rav4,excellent,10 cylinders,gas,68339.0,clean,automatic,rwd,blue
9620,lakeland,1,2015.0,camry,excellent,4 cylinders,gas,50566.0,clean,automatic,fwd,black
23610,chillicothe,1,1999.0,4runner,good,6 cylinders,gas,256000.0,clean,automatic,4wd,silver
9286,gainesville,1,2017.0,tundra 4wd,excellent,8 cylinders,gas,60453.0,clean,automatic,4wd,white
...,...,...,...,...,...,...,...,...,...,...,...,...
11992,hawaii,40000,2016.0,tundra 4wd truck,excellent,8 cylinders,gas,14450.0,clean,automatic,4wd,brown
30694,tyler / east TX,40000,2017.0,sequoia,excellent,8 cylinders,gas,138412.0,clean,automatic,4wd,white
28075,greenville / upstate,40000,2017.0,tundra,excellent,8 cylinders,gas,51500.0,clean,automatic,4wd,grey
29227,austin,40000,2007.0,fj cruiser,excellent,6 cylinders,gas,94509.0,lien,manual,4wd,blue


In [239]:
# get real value columns
data_real = np.asarray(data.loc[:, data.columns.isin(labels_real)])

# get categorical columns
data_cate = np.asarray(data.loc[:, data.columns.isin(labels_cate)])

# one-hot encode categorical data
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(data_cate)
data_cate_enc = enc.transform(data_cate).toarray()

# get X
X = np.concatenate((data_real, data_cate_enc),axis=1)
print('categorical feature size:', data_cate_enc.shape[1])

# normalize data
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

y = np.asarray(data['price'])
print('X shape:', X.shape)
print('y shape:', y.shape)

categorical feature size: 589
X shape: (8569, 590)
y shape: (8569,)


In [240]:
features = ['odometer']
l = enc.categories_
for i in l:
    features = np.concatenate((features,i))
    
len(features)

590

# Train&Test Split


In [241]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train Models

In [242]:
# This function computes the mean squared error
def RMSE(y, pred):
    return np.sqrt(np.mean(np.square(y-pred)))

In [243]:
def run_OLS(train_y, test_y, train_vals, test_vals):
    # add offset
    m,n = len(train_vals), len(test_vals)
    train_vals = np.concatenate( (train_vals, np.ones((m,1))) , axis = 1)
    test_vals = np.concatenate( (test_vals, np.ones((n,1))) , axis = 1)

    ols_model = sm.regression.linear_model.OLS(train_y, train_vals)
    while True: # Bypasses SVD convergence assertion error
        try:
            results = ols_model.fit()
            break
        except:
            None
            
    w = np.array(results.params).reshape([len(results.params),1])

    train_pred = np.matmul(train_vals,w)
    test_pred = np.matmul(test_vals,w)

    train_RMSE = RMSE(train_y, train_pred.flatten())
    test_RMSE = RMSE(test_y, test_pred.flatten())
    
    return train_RMSE, test_RMSE, test_pred

In [244]:
run_OLS(y_train, y_test, X_train, X_test)

(3326.6262567936274,
 4160.537398363917,
 array([[29115.12579255],
        [18999.78106832],
        [28567.27409238],
        ...,
        [14626.24770147],
        [13799.18877325],
        [  915.30133279]]))

In [259]:
# use different models to predict

# model = RandomForestRegressor(n_estimators = 50,
#                               criterion = 'absolute_error',
#                               max_depth=50,
#                               max_leaf_nodes = 10000)


# model = Ridge(alpha=8)
# model = Lasso(alpha=10, max_iter = 10000)

# model = HuberRegressor(alpha=0.001,
#                       epsilon = 2,
#                       max_iter = 10**100
#                       )

# model = LinearRegression()

# model = GradientBoostingRegressor(n_estimators=500,
#                                  loss='huber')

# model = MLPRegressor(hidden_layer_sizes = (5,2), max_iter=10000)

# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)

# RMSE(y_pred , y_test)


ols_model = sm.regression.linear_model.OLS(y_train, X_train)
results = ols_model.fit()
w = np.array(results.params).reshape([len(results.params),1])

l = model.coef_
A = np.argsort(np.abs(l))
A

array([100, 169, 296, 249, 241,   3, 319, 356, 235, 317, 390, 291, 109,
       262, 396, 131, 323, 218, 144, 350,  54, 557, 335, 360, 353, 175,
        44, 588,  72, 425, 163, 149,  56, 346, 228, 343,  35, 213, 244,
        18, 332,  74,  85, 107, 242, 118, 257, 289, 221, 333, 511,  62,
       187, 302, 344, 308, 174,  47, 373, 198, 178, 259, 553, 334,   5,
       202, 313,  24,  79, 113, 309, 364, 368, 438,  97, 254,  70,  36,
       564, 126, 137,  17, 297, 157, 158, 106, 306, 153,  91, 326, 173,
        40,  82, 342, 150,  41, 328, 264, 284, 247, 307, 123, 366, 141,
       270, 117, 164, 182, 582, 166, 355,  75, 108, 565, 379, 209, 580,
       386,  14, 204,  67, 188,  48,  68, 219, 195, 290, 181, 305, 505,
       345, 200, 348, 377,  71,  99, 114, 180, 273, 359, 283,  51,  83,
       101, 159, 251, 365,  23, 170, 189,  22, 171, 132,   8, 381, 183,
         6, 253, 330,  21,  87,  19, 476, 120, 298,  88, 263,  13, 145,
       197, 336,  11, 395,  33, 147, 234,  12, 201, 215, 155,  8

In [260]:
for i in A[-5:]:
    print(features[i])

2016.0
tacoma double cab pickup
2017.0
2018.0
odometer


In [268]:
print(l[521])

1828.4080351724626


In [258]:
# y_pred = model.predict(X_train)

# RMSE(y_pred , y_train)

In [249]:
model2 = BaggingRegressor(base_estimator=model,
                            n_estimators=10,
                            max_samples=1000)

model2.fit(X_train,y_train)
y2 = model2.predict(X_train)
int(np.mean((y_train - y2)**2) // 1000)

118514641692046208

In [250]:
def var(model, y, X):
    trees = model.estimators_
    v = []
    for t in trees:
        v.append(np.mean((t.predict(X) - y)**2))
    return np.mean(v)

In [251]:
int(var(model2, y2, X_train) // 1000)

856707727237916672

# Final Result

| Algorithm     | Train RMSE  | Test RMSE | 
| --------------| ----------- | --------- | 
| Linear        | 3220        |   3633    | 
| Ridge         | 3220        |   3520    | 
| Lasso         | 3232        |   3504    | 
| Huber         | 3318        |   3430    |   
| Random Forest | 1199        |   3152    |     
| Boosting      | 2606        |   3126    |     


# car modelwise model

### Read Data (makewise)

In [4]:
# read data
data = pd.read_csv('toyota.csv')

# Preprocessing(modelwise)


In [3]:
# labels_real = ['odometer','lat','long']

labels_real = ['odometer', 'lat', 'long']

# labels_cate = ['region',
#                'year',
#               'model',
#               'condition', 
#               'fuel', 
#               'title_status',
#               'transmission', 
#               'drive', 
#               'paint_color']

# labels_cate = [
#             'region',
#             'year',  
#             'condition', 
#             'cylinders', 
#             'fuel',  
#             'title_status',
#             'transmission', 
#             'drive', 
#             'size', 
#             'type', 
#             'paint_color',
#             'state']

# labels_cate = ['region',
#                'year',
#               'condition', 
#               'title_status',
#               'paint_color']

labels_cate = ['year',
              'paint_color',
              'condition', 
              'title_status']

In [5]:
# only look at useful labels above
data = data.loc[:, data.columns.isin(labels_real+labels_cate+['price','model'])]

# drop columns with N/A
data = data.dropna()

# drop price outliers
Q1 = data['price'].quantile(0.025)
Q3 = data['price'].quantile(0.975)

data = data[(data['price']>Q1) & (data['price']<Q3)]

# drop duplicated rows
#data = data.drop_duplicates()

# filter out model with sample <=10
data = data.groupby('model').filter(lambda x: len(x) > 10)

data = data.sort_values(by=['price'])
data.shape

(13095, 9)

In [6]:
# get real value columns
data_real = data.loc[:, data.columns.isin(labels_real)]

# get categorical columns and use one-hot encoding
label = labels_cate[0]
data_cate = pd.get_dummies(data[label], prefix=label)

for label in labels_cate[1:]:
    # one-hot encode
    data_cate = pd.concat([data_cate, pd.get_dummies(data[label], prefix=label)], axis=1)
    
print('categorical feature size:', data_cate.shape[1])

# get X
data_final = pd.concat([data_real, data_cate],axis=1)
# normalize data
data_final = (data_final-data_final.mean())/data_final.std()

# add model column to X
data_final['model'] = data['model']
data_final['price'] = data['price']

print('data_final shape:', data_final.shape)

categorical feature size: 71
data_final shape: (13095, 76)


# Train&Test Split(modelwise)


In [7]:
train, test = train_test_split(data_final, test_size=0.2)

# Train Models(modelwise)

In [159]:
def fit_linear(X_train, y_train):
    model = sm.regression.linear_model.OLS(y_train, X_train)
    results = model.fit()
    w = np.array(results.params).reshape([len(results.params),1])
    return w

def fit_ridge(X_train, y_train):
    model = Ridge(alpha=10)
    model.fit(X_train, y_train)
    return model

def fit_lasso(X_train, y_train):
    model = Lasso(alpha=200, max_iter= 10000)
    model.fit(X_train, y_train)
    return model

def fit_rf(X_train, y_train):
    model = RandomForestRegressor(n_estimators = 100,
                              #criterion = 'absolute_error',
                              max_leaf_nodes = 1000)
    model.fit(X_train, y_train)
    return model

def fit_huber(X_train, y_train):
    model = HuberRegressor(alpha=0.01,
                      epsilon = 10,
                      max_iter = 10000
                      )
    model.fit(X_train, y_train)
    return model

def fit_boost(X_train, y_train):
    model = GradientBoostingRegressor(n_estimators=150,
                                 loss='huber')
    model.fit(X_train, y_train)
    return model

In [163]:
# group rows by model
train_gb = train.groupby(['model'])
groups = dict(list(train_gb))
gp_model = {k:None for k in groups.keys()}

# train a model for each car-model
for model in groups.keys():
    gp_df = groups[model]
    gp_X = np.asarray(gp_df.loc[:, ~gp_df.columns.isin(["model", "price"])])
    gp_y = np.asarray(gp_df.loc[:, gp_df.columns == 'price']).flatten()
    gp_model[model] = fit_boost(gp_X, gp_y)

In [161]:
# get test RMSE
test_gb = test.groupby(['model'])
groups = dict(list(test_gb))
errors = [[0,0]]

for model in groups.keys():
    gp_df = groups[model]
    gp_X = np.asarray(gp_df.loc[:, ~gp_df.columns.isin(["model", "price"])])
    gp_y = np.asarray(gp_df.loc[:, gp_df.columns == 'price']).flatten()
    
    y_pred = gp_model[model].predict(gp_X)
    
    # use below for linear model
#     w = gp_model[model]
#     y_pred = np.matmul(gp_X,w)
    
    gp_y = gp_y.reshape((len(gp_y),1))
    y_pred = y_pred.reshape((len(y_pred),1))
    
    e = np.concatenate((gp_y,y_pred), axis=1)
    errors = np.concatenate((errors,e), axis=0)
    
RMSE(errors[:,0] , errors[:,1])

3082.3178016561196

In [162]:
# get train RMSE
train_gb = train.groupby(['model'])
groups = dict(list(train_gb))
errors = [[0,0]]

for model in groups.keys():
    gp_df = groups[model]
    gp_X = np.asarray(gp_df.loc[:, ~gp_df.columns.isin(["model", "price"])])
    gp_y = np.asarray(gp_df.loc[:, gp_df.columns == 'price']).flatten()
    
    y_pred = gp_model[model].predict(gp_X)

    # use below for linear model
#     w = gp_model[model]
#     y_pred = np.matmul(gp_X,w)
    
    gp_y = gp_y.reshape((len(gp_y),1))
    y_pred = y_pred.reshape((len(y_pred),1))
    
    e = np.concatenate((gp_y,y_pred), axis=1)
    errors = np.concatenate((errors,e), axis=0)
    
RMSE(errors[:,0] , errors[:,1])

1308.3543777852926

# Final Results(modelwise)

| Algorithm     | Train RMSE  | Test RMSE | Train RMSE(modelwise) | Test RMSE (modelwise)|
| --------------| ----------- | --------- | --------------------- | -------------------- |
| Linear        | 3220        |   3633    |        2069           |           59184      |
| Ridge         | 3220        |   3520    |        2196           |           3175       |
| Lasso         | 3232        |   3504    |        2392           |           3842       |
| Huber         | 3318        |   3430    |        2350           |           3565       |
| Random Forest | 1199        |   3152    |        1180           |           3047       |
| Boosting      | 2606        |   3126    |        1132           |           3067       |
