In [38]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer

data = pd.read_csv("used_car_train_20200313_revised.csv", index_col='SaleID')

# data.count()

In [46]:
y = data['price']
X = data.drop('price',axis=1) #dropped about 15000 rows with missing values

def preprocess(X_train):# handling missing values: transform 'notRepairedDamage column from 0, -, 1 to 0,1,2
    new_col = X_train.notRepairedDamage.map(lambda x: 1 if x == '-' else int(float(x))*2)
    X_train = X_train.drop('notRepairedDamage',axis=1)
    X_train = X_train.join(new_col)   
    return X_train

imputer = SimpleImputer()
(X_train, X_valid, y_train, y_valid) = train_test_split(X, y, test_size=0.1)
X_train = preprocess(X_train)
X_valid = preprocess(X_valid)
cols = X_train.columns

X_train = pd.DataFrame(imputer.fit_transform(X_train))
X_valid = pd.DataFrame(imputer.transform(X_valid))
X_train.columns = cols
X_valid.columns = cols

Unnamed: 0,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,regionCode,...,v_6,v_7,v_8,v_9,v_10,v_11,v_12,v_13,v_14,notRepairedDamage
0,31295.0,20101210.0,74.0,25.0,3.0,1.0,0.0,75.0,3.0,5571.0,...,0.101673,0.007121,0.075449,0.106063,-4.3071,0.520999,0.691595,2.252183,-0.332481,0.0
1,42444.0,20020206.0,27.0,1.0,0.0,0.0,1.0,260.0,15.0,628.0,...,0.121852,0.120218,0.045802,0.052288,-5.497704,1.331098,0.401225,-1.008238,-2.298182,2.0
2,60393.0,19950707.0,61.0,6.0,4.0,0.0,1.0,147.0,15.0,1110.0,...,0.0,0.068012,0.056879,0.041295,2.236324,-2.730259,-0.255799,-1.457576,-1.83332,0.0
3,135750.0,19950602.0,29.0,0.0,1.0,0.0,0.0,55.0,15.0,2053.0,...,0.0,0.125295,0.037867,0.063233,3.742125,-0.206057,-2.282649,-0.229403,0.103536,1.0
4,109114.0,20050911.0,100.0,31.0,1.0,0.0,0.0,58.0,7.0,2747.0,...,0.000609,0.022328,0.074624,0.137033,3.407483,-0.512371,-1.196948,3.529501,1.030333,0.0


### Random Forest

In [None]:
#Fitting rf_model without feature selection
%%time
rf_model = RandomForestRegressor(n_estimators=10, n_jobs=-1, criterion='mae',verbose=0, random_state=0)
rf_model.fit(X_train, y_train)

#save model to file
import pickle
from joblib import dump, load

#dump(rf_model, "rf_model.joblib")

In [8]:
# validation set predictions and score ～678
preds = rf_model.predict(X_valid)
mae = mean_absolute_error(preds, y_valid)
print("n_estimators {}, MAE is {}".format(10, mae))

# training set predictions and score ~227
preds_train = rf_model.predict(X_train)
mae = mean_absolute_error(preds_train, y_train)
print("mae on Training set is {}".format(mae))

n_estimators 10, MAE is 673.2532526308042


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.1s finished


## Investigating on feature selections


In [18]:
# Feature Importance to selected features to prevent overfitting
import eli5
from eli5.sklearn import PermutationImportance

rf_model = load("rf_model.joblib")
perm = PermutationImportance(rf_model, random_state=0).fit(X_valid, y_valid)
eli5.show_weights(perm,feature_names=X_valid.columns.tolist(),top=40)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.2s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.2s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.2s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]

[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.2s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.2s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using ba

[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.2s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.2s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.2s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.2s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using ba

Weight,Feature
1.4648  ± 0.0276,v_12
0.1196  ± 0.0024,v_10
0.0560  ± 0.0026,regDate
0.0136  ± 0.0010,kilometer
0.0106  ± 0.0006,v_0
0.0106  ± 0.0015,v_14
0.0102  ± 0.0004,power
0.0086  ± 0.0012,v_8
0.0074  ± 0.0010,v_1
0.0073  ± 0.0014,v_5


In [66]:
%%time
selected_cols=['v_12','v_10','regDate','kilometer','v_0','v_14','power','v_8','v_1','v_5','v_3','v_11',
                   'v_9','v_6','v_4','notRepairedDamage','model','v_2','v_13','name','brand','v_7','fuelType']
selected_X_train = X_train[selected_cols]
selected_X_valid = X_valid[selected_cols]

# train on a sample set to tune feature selections (valid mae on 1/3 training set ~780)

sample_X_train = selected_X_train.iloc[0:40000]
sample_y_train = y_train.iloc[0:40000]

sample_rf_model = RandomForestRegressor(n_estimators=10, criterion='mae', n_jobs=-1, random_state=42)
sample_rf_model.fit(sample_X_train,sample_y_train)

preds_valid = sample_rf_model.predict(selected_X_valid)
preds_train = sample_rf_model.predict(sample_X_train)
mae_valid = mean_absolute_error(preds_valid, y_valid)
mae_train = mean_absolute_error(preds_train, sample_y_train)
print("with selected features and imputation:")
print("train set mae on sample training set is {}".format(mae_train))
print("valid set mae on full validation set is {}".format(mae_valid))

# with selected features:
# train set mae on sample training set is 305.35938550226916
# valid set mae on full validation set is 745.396195452204
# CPU times: user 35min 1s, sys: 358 ms, total: 35min 2s
# Wall time: 12min 49s

with selected features and imputation:
train set mae on sample training set is 306.24827
valid set mae on full validation set is 741.5131599999999
CPU times: user 33min 32s, sys: 3.38 s, total: 33min 35s
Wall time: 10min 16s


In [31]:
%%time
seg_X_train = X_train.iloc[0:X_train.shape[0]//3]
seg_y_train = y_train[seg_X_train.index]

seg_rf_model = RandomForestRegressor(n_estimators=10, criterion='mae', n_jobs=-1, random_state=42)
seg_rf_model.fit(seg_X_train,seg_y_train)

preds_valid = seg_rf_model.predict(X_valid)
preds_train = seg_rf_model.predict(seg_X_train)
mae_valid = mean_absolute_error(preds_valid, y_valid)
mae_train = mean_absolute_error(preds_train, seg_y_train)
print("with full features:")
print("train set mae on sample training set is {}".format(mae_train))
print("valid set mae on full validation set is {}".format(mae_valid))

with full features:
train set mae on sample training set is 310.0793008708451
valid set mae on full validation set is 756.756946795202
CPU times: user 39min 53s, sys: 2.45 s, total: 39min 55s
Wall time: 11min 40s


In [None]:
# ❯ python rf_train.py
# [Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
# [Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 215.1min finished
# [Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
# [Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.2s finished
# n_estimators 10, MAE is 681.503317535545

### Fit RF model with feature selectoin

In [67]:
%%time
#Fitting rf_model with feature selection
final_X_train = X_train[selected_cols]
final_X_valid = X_valid[selected_cols]
rf_model_with_FE = RandomForestRegressor(n_estimators=10, n_jobs=-1, criterion='mae',verbose=0, random_state=42)
rf_model_with_FE.fit(final_X_train, y_train)

#save model to file
import pickle
from joblib import dump, load

dump(rf_model_with_FE, "rf_model_with_FE.joblib")

CPU times: user 15h 13min 39s, sys: 1min 8s, total: 15h 14min 47s
Wall time: 4h 20min 46s


['rf_model_with_FE.joblib']

In [70]:
preds = rf_model_with_FE.predict(final_X_valid)
mae_full = mean_absolute_error(preds, y_valid)
print(mae_full)

657.57699


In [None]:
%%time
data_test = pd.read_csv("used_car_testA_20200313_revised.csv", index_col='SaleID')

final_X = X[selected_cols]
final_y = y.copy()

final_imputer = SimpleImputer()
final_X = pd.DataFrame(imputer.fit_transform(final_X))
final_X.columns = selected_cols

final_rf_model = RandomForestRegressor(n_estimators=10, criterion='mae', n_jobs=-1, random_state=42)
final_rf_model.fit(final_X, final_y)

import pickle
from joblib import dump, load
dump(final_rf_model, "final_rf_model.joblib")
