In [83]:
import pandas as pd
import statsmodels.formula.api as sm
import statsmodels.stats.api as sms
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.pylab as pylab
import numpy as np
pd.set_option('display.max_columns', 500)
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

After exploring of basic variables, good start is to build linear moder in order to understand how good or bad we can forecast car price based on few obvious factors. Than it would be useful to compare results of various machine learning techniques with simple linear model forecast.

In [84]:
dt_path = 'C:/Users/Vasiliy Poteriaev/Documents/GitHub/processed data/auto_matrix_2016-04-13.pickle'
dt = pd.read_pickle(dt_path)
#remove bulletproof expensive cars
dt = dt[dt['Price_rub'] < 4 * 10 ** 6]
#obviously warranty end should be 0 for 8 years old  car
dt.loc[dt['Warranty_end'] > 6, 'Warranty_end'] = 0

# regression approach

In [85]:
#generate dummy for no warranty
dt['no_warranty'] = dt['Warranty_end'] == 0
dt['person_seller'] = dt['Seller'] == 'person'

In [86]:
dt['CarName'].value_counts()

Nissan Teana II       537
Toyota Camry VII      513
Ford Mondeo IV        510
Mazda 6 II            385
Mazda 6 I             352
Audi A6 III           348
BMW 5er VI            347
Toyota Camry VI       306
Honda Accord VII      277
Hyundai Sonata IV     277
Audi A6 IV            269
Honda Accord VIII     268
Volvo S60 I           263
Opel Insignia I       261
BMW 5er V             246
Audi A6 II            244
BMW 5er IV            222
Ford Mondeo III       208
Mazda 6 III           170
Nissan Teana I        169
Kia Optima III        168
Infiniti G IV         153
Hyundai i40 I         147
Volvo S60 II          144
Nissan Teana III      127
Ford Mondeo V         104
BMW 5er III            96
Hyundai Sonata V       79
Hyundai Sonata VI      67
Toyota Camry V         64
Ford Mondeo II         58
Audi A6 I              52
Honda Accord V         40
Honda Accord VI        38
Kia Optima IV          37
Honda Accord IX        32
Ford Mondeo I          30
Infiniti G III         20
Hyundai Sona

In [None]:
#remove small car classes from analysis. It's old, rare and would only alter model results
dt = dt[~dt['CarName'].isin(['Kia Optima I','Toyota Camry II','BMW 5er II','Hyundai Sonata II',
                             'Honda Accord I','Infiniti G I','Infiniti G II'])]

In [88]:
#saving feature matrix for other algorithms
dt.to_pickle('C:/Users/Vasiliy Poteriaev/Documents/GitHub/processed data/dt.pickle')

Before estimating model we should split data into train and test set in order to get out of sample forecast

In [58]:
dt_train, dt_test = train_test_split(dt,test_size=0.2, random_state=777, stratify = dt['CarName'])

In [59]:
mod1_formula = 'np.log(Price_rub) ~ CarName + Car_age + run_km + np.log(Engine_capacity) + np.log(Engine_hp) + ' + \
'Advertising_num_days + np.log(Views) + np.log(Last_owner_time_years + 1) + np.log(Warranty_end + 1) + no_warranty +' + \
'Gear_box + Owners_N'

mod2_formula = 'np.log(Price_rub) ~ CarName + Car_age + run_km + np.log(Engine_capacity) + np.log(Engine_hp) + ' + \
'Advertising_num_days + np.log(Views) + np.log(Last_owner_time_years + 1) + np.log(Warranty_end + 1) + no_warranty +' + \
'Gear_box + Owners_N + Body_type + Swap_type + Original_car_passport + Wheel_drive_type + Left_steering +' + \
'is_metallic_color+person_seller + Engine_fuel_type'

In [60]:
mod1 = sm.ols(formula = mod1_formula, data = dt_train)
mod1_res = mod1.fit()
print(mod1_res.summary())

                            OLS Regression Results                            
Dep. Variable:      np.log(Price_rub)   R-squared:                       0.951
Model:                            OLS   Adj. R-squared:                  0.951
Method:                 Least Squares   F-statistic:                     2115.
Date:                Sun, 17 Apr 2016   Prob (F-statistic):               0.00
Time:                        11:11:08   Log-Likelihood:                 3043.3
No. Observations:                6154   AIC:                            -5973.
Df Residuals:                    6097   BIC:                            -5589.
Df Model:                          56                                         
Covariance Type:            nonrobust                                         
                                                       coef    std err          t      P>|t|      [95.0% Conf. Int.]
---------------------------------------------------------------------------------------------

In [61]:
mod2 = sm.ols(formula = mod2_formula, data = dt_train)
mod2_res = mod2.fit()
print(mod2_res.summary())

                            OLS Regression Results                            
Dep. Variable:      np.log(Price_rub)   R-squared:                       0.956
Model:                            OLS   Adj. R-squared:                  0.955
Method:                 Least Squares   F-statistic:                     1732.
Date:                Sun, 17 Apr 2016   Prob (F-statistic):               0.00
Time:                        11:11:11   Log-Likelihood:                 3362.9
No. Observations:                6154   AIC:                            -6572.
Df Residuals:                    6077   BIC:                            -6054.
Df Model:                          76                                         
Covariance Type:            nonrobust                                         
                                                       coef    std err          t      P>|t|      [95.0% Conf. Int.]
---------------------------------------------------------------------------------------------

Estimate out of sample prediction

In [73]:
def MAPE(y_true, y_pred):
    mape = np.sum(np.abs((y_true - y_pred) / y_true) / len(y_true))
    return mape

In [79]:
mod1_mape = MAPE(dt_train['Price_rub'], np.exp(mod1_res.predict(dt_train)))
mod2_mape = MAPE(dt_train['Price_rub'], np.exp(mod2_res.predict(dt_train)))

In [80]:
mod1_mape

0.10561797295936229

In [81]:
mod2_mape

0.09913721783602256

We can see that first model which contains basic car features returns pretty good result. Addition some extra features improved forecast but only for 0.65%. MAPE less than 10 % is a nice start and machine learning techniques obviously should improve this solutioun. Otherwise it's useless.