In [16]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer

# Load the revised training set; rows are keyed by their SaleID.
# Original author's note: about 15000 rows with missing values were dropped
# (presumably when the "_revised" CSV was produced -- nothing is dropped here;
# TODO confirm).
data = pd.read_csv("used_car_train_20200313_revised.csv", index_col='SaleID')

# Separate the feature columns from the regression target.
X = data.drop(columns='price')
y = data['price']

def preprocess(X_train):
    """Re-encode the ``notRepairedDamage`` column as the numbers 0, 1, 2.

    Mapping applied to each cell:

    * ``'-'`` (surrounding whitespace ignored) -> ``1``
    * any other value ``v`` that ``float()`` can parse -> ``int(float(v)) * 2``,
      i.e. ``0`` stays ``0`` and ``1`` becomes ``2``
    * missing values (NaN/None) stay NaN so that a later imputation step can
      fill them, instead of crashing on ``int(float('nan'))``

    Parameters
    ----------
    X_train : pandas.DataFrame
        Frame that contains a ``notRepairedDamage`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same rows; the re-encoded ``notRepairedDamage``
        column is placed last (it is dropped and then joined back).

    Raises
    ------
    ValueError
        If a non-missing value is neither ``'-'`` nor parseable by ``float()``.
    """
    def _encode(value):
        # Keep missing entries missing; int(float(nan)) would raise.
        if pd.isna(value):
            return np.nan
        if isinstance(value, str) and value.strip() == '-':
            return 1
        return int(float(value)) * 2

    new_col = X_train['notRepairedDamage'].map(_encode)
    # Drop-then-join (on the index) moves the encoded column to the end.
    return X_train.drop('notRepairedDamage', axis=1).join(new_col)

# Hold out 10% of the rows for validation. The split is seeded so that a
# Restart & Run All reproduces the same partition.
(X_train, X_valid, y_train, y_valid) = train_test_split(X, y, test_size=0.1, random_state=42)
X_train = preprocess(X_train)
X_valid = preprocess(X_valid)
cols = X_train.columns

# Fill remaining gaps with SimpleImputer's default strategy (column mean).
# The imputer is fitted on the training part only and then reused for the
# validation part. fit_transform/transform return bare arrays, so the column
# names and the original row index are restored explicitly; this keeps
# X_train/X_valid label-aligned with y_train/y_valid.
imputer = SimpleImputer()
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=cols, index=X_train.index)
X_valid = pd.DataFrame(imputer.transform(X_valid), columns=cols, index=X_valid.index)

# Feature subset (and order) that is fed to the model.
selected_cols=['v_12','v_10','regDate','kilometer','v_0','v_14','power','v_8','v_1','v_5','v_3','v_11',
                   'v_9','v_6','v_4','notRepairedDamage','model','v_2','v_13','name','brand','v_7','fuelType']
X_train = X_train[selected_cols]
X_valid = X_valid[selected_cols]

In [17]:
# Tune HyperParameters
from pprint import pprint

# Candidate values for each RandomForestRegressor hyper-parameter that the
# randomized search samples from.
n_estimators = [int(x) for x in np.linspace(start=10, stop=100, num=10)]
# 1.0 means "consider every feature at each split", which is what 'auto'
# meant for a regressor. The string 'auto' is deprecated and later removed in
# scikit-learn and then fails parameter validation, so the float is used.
max_features = [1.0, 'sqrt']
max_depth = [int(x) for x in np.linspace(10,110,num=11)]
max_depth.append(None)  # None = no depth limit
bootstrap = [True, False]
max_leaf_nodes = [int(x) for x in np.linspace(start=100,stop=1000,num=10)]
max_leaf_nodes.append(None)  # None = no limit on the number of leaves
min_impurity_decrease = [float(x/100) for x in np.linspace(start=0, stop=50, num=11)]
# NOTE(review): warm_start only changes behaviour when fit() is called again
# on the same estimator; presumably it has no effect on the clones fitted by
# a search -- TODO confirm and consider dropping it from the grid.
warm_start=[True, False]

random_grid = {'n_estimators':n_estimators,
              'max_features':max_features,
              'max_depth':max_depth,
              'bootstrap':bootstrap,
              'max_leaf_nodes':max_leaf_nodes,
              'min_impurity_decrease':min_impurity_decrease,
              'warm_start':warm_start}

pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': [1.0, 'sqrt'],
 'max_leaf_nodes': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, None],
 'min_impurity_decrease': [0.0,
                           0.05,
                           0.1,
                           0.15,
                           0.2,
                           0.25,
                           0.3,
                           0.35,
                           0.4,
                           0.45,
                           0.5],
 'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
 'warm_start': [True, False]}


In [None]:
# Randomized search over random_grid: 100 sampled settings x 3-fold CV
# (300 fits), run in parallel on all available cores.
# The forest is seeded as well as the parameter sampler; otherwise the CV
# scores -- and therefore the winning parameters -- change from run to run.
# No `scoring` is passed, so candidates are ranked by the estimator's default
# score (R^2 for a regressor).
# NOTE(review): mean_absolute_error is imported at the top of the notebook;
# if MAE is the metric that matters, consider
# scoring='neg_mean_absolute_error' -- TODO confirm.
rf = RandomForestRegressor(random_state=42)
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 14.4min


In [23]:
# Show the hyper-parameter combination that scored best in the randomized search.
rf_random.best_params_

{'warm_start': True,
 'n_estimators': 50,
 'min_impurity_decrease': 0.1,
 'max_leaf_nodes': 700,
 'max_features': 'sqrt',
 'max_depth': 110,
 'bootstrap': False}

In [None]:
# Best parameters noted down from earlier search runs. The numeric labels are
# kept as originally written; what they count is not recorded here
# (presumably the number of training rows used for the search -- TODO confirm).

# 1500:
{'warm_start': True,
 'n_estimators': 80,
 'min_impurity_decrease': 0.0,
 'max_leaf_nodes': 500,
 'max_features': 'sqrt',
 'max_depth': 80,
 'bootstrap': True}

# 4000:
{'warm_start': True,
 'n_estimators': 90,
 'min_impurity_decrease': 0.3,
 'max_leaf_nodes': 900,
 'max_features': 'sqrt',
 'max_depth': 80,
 'bootstrap': False}

# 10000:
{'warm_start': True,
 'n_estimators': 50,
 'min_impurity_decrease': 0.1,
 'max_leaf_nodes': 700,
 'max_features': 'sqrt',
 'max_depth': 110,
 'bootstrap': False}