In [0]:
#!pip install --upgrade tables
#!pip install eli5
#!pip install xgboost

In [0]:
import pandas as pd
import numpy as np

from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

import xgboost as xgb

from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import cross_val_score, KFold

import eli5
from eli5.sklearn import PermutationImportance

In [0]:
cd "/content/drive/My Drive/Colab Notebooks/dw_matrix/matrix_two/dw_matrix_car"

/content/drive/My Drive/Colab Notebooks/dw_matrix/matrix_two/dw_matrix_car


In [0]:
# Read the table stored in data/car.h5 (path relative to the current working
# directory) and display its (rows, columns) shape as the cell output.
df = pd.read_hdf(path_or_buf='data/car.h5')
df.shape

(106494, 155)

In [0]:
# Naive baseline: a DummyRegressor with default settings, fitted and scored on
# the very same rows. Only the `car_id` column is passed as the feature matrix.
# Note: this cell runs on the frame as loaded, i.e. before the currency filter.
y = df['price_value'].values
X = df[['car_id']].values

model = DummyRegressor().fit(X, y)  # fit() returns the estimator itself
y_pred = model.predict(X)

print('Benchmark: ', mae(y_true=y, y_pred=y_pred))

Benchmark:  39465.934630440985


In [0]:
# Keep only the rows whose price_currency is 'PLN'. The explicit .copy() makes
# the result an independent frame, so the column assignments performed in the
# following cells do not raise pandas' SettingWithCopyWarning.
df = df[df['price_currency'] == 'PLN'].copy()

In [0]:
SUFFIX_CAT = '__cat'

# Integer-encode every column with pandas' factorize():
#   * a column whose name already contains SUFFIX_CAT is overwritten in place,
#   * any other column gets a new sibling column named <column>__cat.
# The column list is snapshotted because the loop adds columns to `df`.
for feat in list(df.columns):
  # Columns whose first value is a Python list are skipped (only the first row
  # is inspected). Positional .iloc[0] is used instead of the label lookup
  # [0]: index label 0 may no longer exist once rows have been filtered out.
  if len(df) and isinstance(df[feat].iloc[0], list):
    continue

  factorized_values = df[feat].factorize()[0]
  if SUFFIX_CAT in feat:
    df[feat] = factorized_values
  else:
    df[feat + SUFFIX_CAT] = factorized_values

In [0]:
# Candidate model inputs: every column carrying the categorical suffix, except
# those with 'price' in their name (they are derived from the target).
cat_feats = [
  col for col in df.columns
  if SUFFIX_CAT in col and 'price' not in col
]

In [0]:
def run_model(model, feats, data=None, target='price_value', cv=3,
              scoring='neg_mean_absolute_error'):
  """Cross-validate `model` on the given feature columns.

  Parameters
  ----------
  model   : estimator handed to cross_val_score.
  feats   : list of column names used as the feature matrix.
  data    : frame to read columns from; defaults to the notebook-level `df`.
  target  : name of the target column (default 'price_value').
  cv      : `cv` argument forwarded to cross_val_score (default 3).
  scoring : `scoring` argument forwarded to cross_val_score
            (default 'neg_mean_absolute_error').

  Returns
  -------
  (mean, std) of the per-fold scores. With the default scoring the values are
  negated MAE, so a score closer to zero is better.
  """
  if data is None:
    data = df  # backward-compatible fallback to the global frame

  X = data[feats].values
  y = data[target].values

  scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
  return np.mean(scores), np.std(scores)

### Decision Tree

In [0]:
# Depth-limited single tree scored on the categorical features. random_state is
# pinned, as in the forest and XGBoost cells, so that repeated runs of the
# notebook produce the same score.
run_model(DecisionTreeRegressor(max_depth=5, random_state=0), cat_feats)

(-19566.588937368324, 90.6181486516617)

### Random Forest

In [0]:
# Random forest: 50 trees, each capped at depth 5, with a fixed random_state.
model = RandomForestRegressor(
  n_estimators=50,
  max_depth=5,
  random_state=0,
)
run_model(model=model, feats=cat_feats)

(-18734.2072708522, 109.87074106274046)

### XGBoost

In [0]:
# Gradient-boosted trees with the same depth / tree count as the forest above.
xgb_params = {
    'max_depth': 5,
    'n_estimators': 50,
    'learning_rate': 0.1,
    # scikit-learn wrapper name for the RNG seed; `seed` is the deprecated
    # spelling in XGBRegressor.
    'random_state': 0,
}

model = xgb.XGBRegressor(**xgb_params)
run_model(model, cat_feats)



(-13039.290196724838, 109.36715375706265)

In [0]:
# Build X / y for exactly the features being ranked. Without this the cell
# silently reuses the X / y left over from the benchmark cell (the single
# `car_id` column of the unfiltered frame), which does not match `cat_feats`.
X = df[cat_feats].values
y = df['price_value'].values

n = xgb.XGBRegressor(**xgb_params)
n.fit(X, y)

# Permutation importance is computed on the same data the model was fitted on;
# random_state makes the shuffles, and therefore the ranking, reproducible.
imp = PermutationImportance(n, random_state=0).fit(X, y)
eli5.show_weights(imp, feature_names=cat_feats)



Weight,Feature
0.1286  ± 0.0023,param_napęd__cat
0.0900  ± 0.0025,param_stan__cat
0.0736  ± 0.0018,param_faktura-vat__cat
0.0592  ± 0.0011,param_rok-produkcji__cat
0.0576  ± 0.0014,param_skrzynia-biegów__cat
0.0435  ± 0.0009,param_moc__cat
0.0214  ± 0.0002,feature_kamera-cofania__cat
0.0192  ± 0.0006,param_typ__cat
0.0153  ± 0.0005,feature_światła-led__cat
0.0141  ± 0.0005,seller_name__cat


In [0]:
# Hand-picked feature subset (all factorized `__cat` columns), one name per line.
feats = [
  'param_napęd__cat',
  'param_stan__cat',
  'param_faktura-vat__cat',
  'param_rok-produkcji__cat',
  'param_skrzynia-biegów__cat',
  'param_moc__cat',
  'feature_kamera-cofania__cat',
  'param_typ__cat',
  'feature_światła-led__cat',
  'seller_name__cat',
  'feature_wspomaganie-kierownicy__cat',
  'feature_system-start-stop__cat',
  'param_pojemność-skokowa__cat',
  'feature_łopatki-zmiany-biegów__cat',
  'feature_regulowane-zawieszenie__cat',
  'param_marka-pojazdu__cat',
  'feature_asystent-pasa-ruchu__cat',
  'feature_hud-(wyświetlacz-przezierny)__cat',
  'param_model-pojazdu__cat',
  'param_kod-silnika__cat',
]

In [0]:
# Same XGBoost setup as before but with 100 trees, scored on the hand-picked
# feature subset.
xgb_params = {
    'max_depth': 5,
    'n_estimators': 100,
    'learning_rate': 0.1,
    # scikit-learn wrapper name for the RNG seed; `seed` is the deprecated
    # spelling in XGBRegressor.
    'random_state': 0,
}

model = xgb.XGBRegressor(**xgb_params)
run_model(model, feats)



(-12035.500323196715, 93.38505577106305)

In [0]:
def _parse_int(value, extract=None):
  """Convert one raw cell value to an int, mapping missing values to -1.

  value   -- the raw cell: None / NaN / the string "None" (all treated as
             missing), a text value, or an already numeric value.
  extract -- optional callable that cuts the numeric text out of a string
             value before it is passed to int().
  """
  if pd.isna(value) or str(value) == "None":
    return -1
  if not isinstance(value, str):
    # Already converted: lets the cell be re-run without calling .split()
    # on an int.
    return int(value)
  return int(extract(value) if extract is not None else value)


# Replace three text columns with integer versions (missing -> -1):
#   year           : the whole string is the number,
#   power          : text before the first space,
#   engine capacity: text before 'cm', with inner spaces removed.
df['param_rok-produkcji'] = df['param_rok-produkcji'].map(_parse_int)
df['param_moc'] = df['param_moc'].map(
  lambda x: _parse_int(x, lambda s: s.split(' ')[0]))
df['param_pojemność-skokowa'] = df['param_pojemność-skokowa'].map(
  lambda x: _parse_int(x, lambda s: s.split('cm')[0].replace(' ', '')))

In [0]:
# Same feature subset as before, except that production year, power and engine
# capacity now use the parsed integer columns instead of their `__cat` codes.
feats = [
  'param_napęd__cat',
  'param_stan__cat',
  'param_faktura-vat__cat',
  'param_rok-produkcji',
  'param_skrzynia-biegów__cat',
  'param_moc',
  'feature_kamera-cofania__cat',
  'param_typ__cat',
  'feature_światła-led__cat',
  'seller_name__cat',
  'feature_wspomaganie-kierownicy__cat',
  'feature_system-start-stop__cat',
  'param_pojemność-skokowa',
  'feature_łopatki-zmiany-biegów__cat',
  'feature_regulowane-zawieszenie__cat',
  'param_marka-pojazdu__cat',
  'feature_asystent-pasa-ruchu__cat',
  'feature_hud-(wyświetlacz-przezierny)__cat',
  'param_model-pojazdu__cat',
  'param_kod-silnika__cat',
]

In [0]:
# Re-score XGBoost (same hyper-parameters) on the feature list that now
# contains the parsed numeric columns.
model = xgb.XGBRegressor(**xgb_params)
run_model(model=model, feats=feats)



(-8909.117679528028, 37.89844059788225)

In [0]:
# Fit one XGBoost model on all rows with the final feature list and rank the
# features by permutation importance (measured on the same data it was fitted
# on).
X = df[feats].values
y = df['price_value'].values

n = xgb.XGBRegressor(**xgb_params)
n.fit(X, y)

# random_state pins the shuffles so the reported weights are reproducible.
imp = PermutationImportance(n, random_state=0).fit(X, y)
eli5.show_weights(imp, feature_names=feats)



Weight,Feature
0.6426  ± 0.0095,param_rok-produkcji
0.2882  ± 0.0018,param_moc
0.0613  ± 0.0004,param_pojemność-skokowa
0.0391  ± 0.0011,param_marka-pojazdu__cat
0.0202  ± 0.0008,seller_name__cat
0.0175  ± 0.0008,param_kod-silnika__cat
0.0153  ± 0.0007,param_typ__cat
0.0144  ± 0.0004,param_napęd__cat
0.0115  ± 0.0004,param_model-pojazdu__cat
0.0095  ± 0.0003,param_stan__cat
