In [1]:
!pip install --upgrade tables
!pip install eli5
!install xgboost

Collecting tables
[?25l  Downloading https://files.pythonhosted.org/packages/ed/c3/8fd9e3bb21872f9d69eb93b3014c86479864cca94e625fd03713ccacec80/tables-3.6.1-cp36-cp36m-manylinux1_x86_64.whl (4.3MB)
[K     |████████████████████████████████| 4.3MB 2.9MB/s 
Installing collected packages: tables
  Found existing installation: tables 3.4.4
    Uninstalling tables-3.4.4:
      Successfully uninstalled tables-3.4.4
Successfully installed tables-3.6.1
Collecting eli5
[?25l  Downloading https://files.pythonhosted.org/packages/97/2f/c85c7d8f8548e460829971785347e14e45fa5c6617da374711dec8cb38cc/eli5-0.10.1-py2.py3-none-any.whl (105kB)
[K     |████████████████████████████████| 112kB 2.8MB/s 
Installing collected packages: eli5
Successfully installed eli5-0.10.1
install: missing destination file operand after 'xgboost'
Try 'install --help' for more information.


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

import xgboost as xgb

from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import cross_val_score, KFold

import eli5
from eli5.sklearn import PermutationImportance

Using TensorFlow backend.


In [3]:
cd "/content/drive/My Drive/Colab Notebooks/dw_matrix/matrix_two/dw_matrix_car"


/content/drive/My Drive/Colab Notebooks/dw_matrix/matrix_two/dw_matrix_car


In [4]:

# List the current working directory to confirm the `data` folder is visible.
!ls


cars_visualisation.ipynb  day3_simple_model.ipynb  LICENSE
data			  day4_XGB_model.ipynb	   README.md


## Reading data

In [5]:
# Load the car listings from the HDF5 file (path is relative to the working directory).
df = pd.read_hdf('data/car.h5')
# Last expression of the cell: displays (rows, columns) of the loaded frame.
df.shape

(106494, 155)

In [0]:
#df.columns.values

## Dummy Model

In [0]:
  # Baseline: a DummyRegressor with default settings, fitted and scored on the
  # same rows. The single 'car_id' column is only there to give X a valid shape.
  feats = ['car_id']
  X, y = df[feats].values, df['price_value'].values

  model = DummyRegressor()
  model.fit(X, y)

  # Mean absolute error of the baseline predictions against the true prices.
  y_pred = model.predict(X)
  mae(y, y_pred)

39465.934630440985

In [7]:
# Keep only the rows whose price is quoted in PLN, so every price_value is
# expressed in the same currency.
# .copy() detaches the filtered result from the frame it was sliced from: the
# cells below add and overwrite columns on df, and doing that on a filtered
# slice triggers pandas' SettingWithCopyWarning.
df = df[ df['price_currency'] == 'PLN' ].copy()
df.shape


(106290, 155)

In [0]:
# Suffix that marks a column as holding integer category codes.
SUFIX_CAT = '_cat'

# Encode every column as integer codes via Series.factorize().
# - Columns whose first value is a list are skipped (lists are not hashable,
#   so they cannot be factorized).
# - A column that already carries the suffix is overwritten in place.
# - Any other column gets a sibling column named <column>_cat.
for feat in df.columns:

  # .iloc[0] looks at the first row by position; a label lookup (df[feat][0])
  # would raise KeyError whenever index label 0 has been filtered out.
  if isinstance(df[feat].iloc[0], list): continue

  factorized_values = df[ feat ].factorize()[0]

  if SUFIX_CAT in feat:
    # Previously referenced the undefined name `factorized_value` (NameError).
    df[feat] = factorized_values
  else:
    df[feat + SUFIX_CAT] = factorized_values

In [9]:
# Candidate model inputs: every factorized column, except anything whose name
# mentions 'price' (those columns describe the target and would leak it).
cat_feats = [
    col for col in df.columns
    if SUFIX_CAT in col and 'price' not in col
]
len(cat_feats)


151

In [0]:
def run_model(model, feats):
  """Cross-validate `model` on the notebook-level DataFrame `df`.

  Args:
    model: estimator accepted by sklearn's cross_val_score.
    feats: list of column names of `df` used as the feature matrix.

  Returns:
    (mean, std) of the 3-fold 'neg_mean_absolute_error' scores; the target is
    df['price_value']. Scores are negated MAE, so values closer to 0 are better.
  """
  features = df[feats].values
  target = df['price_value'].values

  cv_scores = cross_val_score(
      model, features, target, cv=3, scoring='neg_mean_absolute_error'
  )
  return cv_scores.mean(), cv_scores.std()


In [0]:
#Decision tree

In [15]:
# Shallow decision tree on all factorized features. random_state is pinned like
# for the other models in this notebook: the tree permutes features at each
# split, so equally good splits can otherwise be resolved differently per run.
run_model(DecisionTreeRegressor(max_depth=5, random_state=0), cat_feats)

(-19566.588937368324, 90.6181486516617)

In [16]:
# Random forest: 50 trees of depth <= 5, seeded for reproducibility and
# cross-validated on all factorized features.
model = RandomForestRegressor(
    n_estimators=50,
    max_depth=5,
    random_state=0,
)
run_model(model, cat_feats)


(-18734.2072708522, 109.87074106274046)

In [18]:
# XGBoost regressor cross-validated on all factorized features.
xgb_params = dict(
    max_depth=5,
    n_estimators=50,
    learning_rate=0.1,
    seed=0,
)

model = xgb.XGBRegressor(**xgb_params)
run_model(model, cat_feats)



(-13039.290196724838, 109.36715375706265)

In [19]:
xgb_params = {
    'max_depth':5, 
    'n_estimators':50, 
    'learning_rate':0.1,
    'seed':0,
    }

# Build the design matrix here instead of relying on whatever X / y an earlier
# cell left in the kernel: the columns of X must line up one-to-one with the
# feature_names passed to eli5 below.
X = df[ cat_feats ].values
y = df['price_value'].values

m = xgb.XGBRegressor(**xgb_params)
m.fit(X, y)

# Permutation importance of the fitted model, computed on the same rows it was
# trained on; rendered as a weight table labelled with the cat_feats names.
imp = PermutationImportance(m, random_state=0).fit(X, y)
eli5.show_weights(imp, feature_names=cat_feats)



Weight,Feature
0.1209  ± 0.0019,param_napęd_cat
0.1175  ± 0.0030,param_rok-produkcji_cat
0.1113  ± 0.0013,param_stan_cat
0.0625  ± 0.0019,param_skrzynia-biegów_cat
0.0527  ± 0.0016,param_faktura-vat_cat
0.0461  ± 0.0015,param_moc_cat
0.0275  ± 0.0008,param_marka-pojazdu_cat
0.0230  ± 0.0004,param_typ_cat
0.0227  ± 0.0007,feature_kamera-cofania_cat
0.0191  ± 0.0007,param_pojemność-skokowa_cat


In [23]:
# Production year as an integer: entries whose str() is 'None' become the -1
# sentinel, everything else is cast with int(). The last line displays the
# distinct values that result.
df['param_rok-produkcji'] = df['param_rok-produkcji'].map(
    lambda year: int(year) if str(year) != 'None' else -1
)
df['param_rok-produkcji'].unique()

array([2018, 2011, 2015, 2009, 2017, 2012, 2013, 2007, 2001, 2016, 2006,
       2008, 2004, 1999, 2000, 2010, 2005, 2002, 1998, 2014, 2003, 1982,
       1995, 1997, 1992, 1993, 1994, 1996, 1989, 1988, 1967, 1987, 1959,
       1990, 1991, 1974,   -1, 1975, 1973, 1985, 1984, 1986, 1981, 1979,
       1960, 1983, 1978, 1964, 1980, 1972, 1969, 1956, 1966, 1977, 1971,
       1963, 1953, 1961, 1952, 1949, 1976, 1965, 1937, 1968, 1958, 1962,
       1955, 1970, 1933, 1929, 1957, 1944, 1954, 1932, 1936, 1947, 1948])

In [38]:
# Display the 'param_moc' column to inspect its values and dtype.
# NOTE(review): presumably engine power ("moc") -- confirm against the data source.
df['param_moc']

0          90
2         115
3         262
4         110
5         310
         ... 
160609     75
160610     75
160611    120
160614    150
160615     88
Name: param_moc, Length: 106290, dtype: int64

In [0]:
# Turn two text columns into integers, using -1 for missing entries (those whose
# str() is 'None'). Wrapping the value in str() keeps the cell idempotent: on a
# second run the values are already ints (including -1) and have no .split().

# param_moc: keep the leading token before the first space.
# NOTE(review): raw values are presumably like "90 KM" -- confirm on the raw data.
df['param_moc'] = df['param_moc'].map(lambda x: -1 if str(x)=='None' else int(str(x).split(' ')[0]))

# param_pojemność-skokowa: raw values look like '1 560 cm3'; drop the unit and
# the thousands-separating spaces, then cast so the column is numeric rather
# than a mix of strings and the int -1.
df['param_pojemność-skokowa'] = df['param_pojemność-skokowa'].map(lambda x: -1 if str(x)=='None' else int(str(x).split('cm')[0].replace(' ','')))


In [0]:
# Hand-picked feature set for the final model: the top columns from the
# permutation-importance table, with the year / power / displacement columns
# used in their numeric (non-factorized) form, plus a few extra candidates.
feats = [
    'param_napęd_cat',
    'param_rok-produkcji',
    'param_stan_cat',
    'param_skrzynia-biegów_cat',
    'param_faktura-vat_cat',
    'param_moc',
    'param_marka-pojazdu_cat',
    'param_typ_cat',
    'feature_kamera-cofania_cat',
    'param_pojemność-skokowa',
    'seller_name_cat',
    'param_kod-silnika_cat',
    'feature_wspomaganie-kierownicy_cat',
    'feature_asystent-pasa-ruchu_cat',
    'feature_regulowane-zawieszenie_cat',
    'feature_system-start-stop_cat',
    'feature_światła-led_cat',
]

In [47]:
# Same XGBoost configuration as before, now cross-validated on the hand-picked
# `feats` list instead of every factorized column.
xgb_params = {'max_depth': 5, 'n_estimators': 50, 'learning_rate': 0.1, 'seed': 0}

model = xgb.XGBRegressor(**xgb_params)
run_model(model, feats)



(-9610.499296539281, 77.58748249892398)

array(['898 cm3', '1 560 cm3', '3 000 cm3', ..., '5 992 cm3', '1 966 cm3',
       '142 280 cm3'], dtype=object)