In [None]:
%%time
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import pandas as pd
import numpy as np

def preprocess(X_train):
    """Recode the ``notRepairedDamage`` column as the integers 0, 1, 2.

    The marker ``'-'`` becomes 1; every other value is parsed as a number,
    truncated to an int and doubled, so 0 stays 0 and 1 becomes 2.

    Args:
        X_train: DataFrame containing a ``notRepairedDamage`` column.

    Returns:
        A new DataFrame with the recoded column placed last. The input
        frame is not modified.

    Raises:
        ValueError: if a value is neither ``'-'`` nor convertible to a
            finite number (this includes NaN).
    """
    def _encode(value):
        return 1 if value == '-' else int(float(value)) * 2

    encoded = X_train['notRepairedDamage'].map(_encode)
    remaining = X_train.drop('notRepairedDamage', axis=1)
    # assign() attaches the column row-for-row. An index-based join would
    # multiply rows whenever the frame carries repeated index labels.
    return remaining.assign(notRepairedDamage=encoded)

def validate(model, X_train, X_valid, y_train, y_valid):
    """Print the mean absolute error of ``model`` on both data splits.

    Args:
        model: fitted estimator (or pipeline) exposing ``predict``.
        X_train: features of the training split.
        X_valid: features of the validation split.
        y_train: true targets for ``X_train``.
        y_valid: true targets for ``X_valid``.
    """
    preds_valid = model.predict(X_valid)
    preds_train = model.predict(X_train)
    # scikit-learn metrics take (y_true, y_pred); keep that order so the
    # call stays correct if the metric is later swapped for an asymmetric one.
    mae_valid = mean_absolute_error(y_valid, preds_valid)
    mae_train = mean_absolute_error(y_train, preds_train)
    print("Validation result:")
    print("train set mae on training set is {}".format(mae_train))
    print("valid set mae on validation set is {}".format(mae_valid))
    
# Load the training data; rows without a target price cannot be used.
data = pd.read_csv('./data/used_car_train_20200313_revised.csv')
data.dropna(subset=['price'], axis=0, inplace=True)
y = data['price']
X = data.drop('price', axis=1)
X = preprocess(X)
# Hold out 10% for validation. The split is seeded, like the model below,
# so the reported errors are reproducible from run to run.
(X_train, X_valid, y_train, y_valid) = train_test_split(
    X, y, test_size=0.1, random_state=42)
cols = X.columns
# Feature subset used for training.
selected_cols = ['v_12', 'v_10', 'regDate', 'kilometer', 'v_0', 'v_14',
                 'power', 'v_8', 'v_1', 'v_5', 'v_3', 'v_11',
                 'v_9', 'v_6', 'v_4', 'notRepairedDamage', 'model',
                 'v_2', 'v_13', 'name', 'brand', 'v_7', 'fuelType']

# Random-forest hyper-parameters.
final_params = {'warm_start': False,
                'n_estimators': 50,
                'min_impurity_decrease': 0.05,
                'max_leaf_nodes': None,
                'max_features': 'sqrt',
                'max_depth': 30,
                'bootstrap': False}

X_train = X_train[selected_cols]
X_valid = X_valid[selected_cols]

# pipeline
# Only 'fuelType' is imputed ('gearbox' and 'bodyType' are not in
# selected_cols). SimpleImputer() fills missing values with the column mean.
num_cols = ['fuelType']
num_transformer = SimpleImputer()

# remainder='passthrough' forwards every other selected column unchanged.
# With the default remainder='drop' the forest would be trained on
# 'fuelType' alone.
# NOTE(review): passed-through columns are not imputed; if any other selected
# column contains NaN, add it to num_cols.
preprocessor = ColumnTransformer(
    transformers=[('num', num_transformer, num_cols)],
    remainder='passthrough')


# NOTE(review): criterion='mae' was renamed 'absolute_error' in newer
# scikit-learn releases; update it when upgrading the library.
model = RandomForestRegressor(criterion='mae', random_state=42, n_jobs=-1, **final_params)
pip = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)])
pip.fit(X_train, y_train)


In [None]:
# Score the fitted pipeline rather than the bare forest: the forest was
# trained on the preprocessor's output, so the raw feature frames must go
# through the same preprocessing step before prediction.
validate(pip, X_train, X_valid, y_train, y_valid)