In [None]:
import pandas as pd
import numpy as np
import random

import matplotlib.pyplot as plt

from tqdm import tqdm
import time

import helper as h
import utils as u

from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb

from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import cross_val_score, train_test_split, KFold

from itertools import product
import eli5
from tqdm import tqdm

import gc
from datetime import datetime

In [None]:
# Load the interim train and test tables; both files are pipe-delimited.
df_train, df_test = (
    pd.read_csv(csv_path, sep='|')
    for csv_path in ("../interim/02_train_data.csv", "../interim/02_test_data.csv")
)

In [None]:
# Stack train and test into a single frame. Each input carries its own
# 0-based index, so the combined frame gets a fresh continuous index.
df = pd.concat([df_train, df_test], ignore_index=True)
print(df.shape)

### Wybór cech do modelowania

In [None]:
# Candidate model features, assembled in a fixed order:
#   1. every train column whose name contains "_cat",
#   2. hand-picked numeric columns (areas, ceiling height, floor info),
#   3. every column of the combined frame whose name contains 'agency' or 'repl',
#   4. publication-date indicators.
feats = (
    [col for col in df_train.columns if "_cat" in col]
    + ['Общая площадь:', 'Высота потолков:', 'Площадь кухни:', 'Жилая комната:']
    + ['flat_floor', 'build_floor', 'floor_rte', 'if_first_last_floor']
    + [col for col in df if 'agency' in col]
    + [col for col in df if 'repl' in col]
    + ['is_today', 'is_yesterday', 'publish_days']
)

### Wybór (ew. przekształcenie) zmiennej celu

In [None]:
# The model is trained on the natural log of the price per square metre;
# `price` holds the name of the column used as the target from here on.
df['log_price_per_m2'] = np.log(df['price_per_m2'])
price = 'log_price_per_m2'

### Dodanie cech losowych (dla odcięcia zbędnych cech)

In [None]:
# Add five pure-noise columns as a reference point for feature selection
# (per the section heading, they are used to cut off useless features).
# A dedicated, seeded generator makes the noise reproducible from run to run
# without touching NumPy's global random state.
noise_rng = np.random.RandomState(0)
for i in range(5):
    noise_col = 'random_{}'.format(i)
    df[noise_col] = noise_rng.normal(size=len(df))
    # Guard the append so that re-running this cell does not register the
    # same column name in `feats` more than once.
    if noise_col not in feats:
        feats.append(noise_col)

In [None]:
# Columns excluded from the feature list, grouped by name prefix.
# NOTE(review): presumably chosen by comparing feature importance against the
# random_* noise columns - confirm. Names must match the data exactly (one
# entry intentionally contains a double space).
col_to_drop = (
    # breadcrumbs_* columns
    [
        'breadcrumbs_0_repl_count',
        'breadcrumbs_2_repl_count',
        'breadcrumbs_3_cat',
        'breadcrumbs_4_cat',
        'breadcrumbs_4_repl_count',
    ]
    # date_* columns
    + [
        'date_0_cat',
        'date_1_cat',
        'date_2_cat',
        'date_3_cat',
    ]
    # geo_block_* columns
    + [
        'geo_block_0_repl_count',
        'geo_block_2_cat',
        'geo_block_2_repl_count',
        'geo_block_3_cat',
        'geo_block_4_cat',
        'geo_block_4_repl_count',
        'geo_block_4_repl_mean',
        'geo_block_4_repl_median',
    ]
    # 'Адрес:' columns
    + [
        'Адрес:_13_cat',
        'Адрес:_13_repl_count',
        'Адрес:_13_repl_mean',
        'Адрес:_13_repl_median',
        'Адрес:_7_cat',
        'Адрес:_8_cat',
        'Адрес:_9_cat',
    ]
    # remaining listing attributes
    + [
        'Дата  обновления:_cat',
        'Дата публикации:_cat',
        'Детская площадка:_cat',
        'Лифт:_cat',
        'Мусоропровод:_cat',
        'Площадь кухни:',
        'Серия:_cat',
        'Тип балкона:_repl_count',
        'Тип балкона:_repl_mean',
        'Тип балкона:_repl_median',
        'Тип дома:_cat',
        'Тип комнат:_cat',
        'Тип фундамента:_cat',
        'Управляющая компания:_cat',
    ]
)

In [None]:
# Further exclusions appended to the drop list: a few more data columns,
# the 'is_today' flag and all five random_* noise columns.
col_to_drop.extend([
    'breadcrumbs_0_cat',
    'Возможна ипотека:_cat',
    'Адрес:_11_cat',
    'Стиральная машина:_repl_count',
    'is_today',
    'random_0',
    'random_1',
    'random_2',
    'random_3',
    'random_4',
])

In [None]:
# Remove the excluded columns from the feature list.
#
# Filtering replaces the former `feats.remove(col)` loop, which raised
# ValueError as soon as this cell was run a second time (the names were
# already gone) and removed only the first occurrence of a name.
# A name that is neither a current feature nor a column of `df` is still
# reported loudly, since it is most likely a typo in `col_to_drop`.
unknown_cols = [
    col for col in col_to_drop
    if col not in feats and col not in df.columns
]
if unknown_cols:
    raise ValueError(
        "Columns to drop were not found in the data: {}".format(unknown_cols)
    )
feats = [col for col in feats if col not in col_to_drop]

## Modelowanie

In [None]:
# Baseline: a single regression tree limited to depth 20, scored with the
# project's check_model helper on the log target; the helper receives
# log_price_factr_area_trans as the transform for the original price scale.
model = DecisionTreeRegressor(max_depth=20)
u.check_model(
    df,
    feats,
    price,
    model,
    orig_price_trans=u.log_price_factr_area_trans,
)

In [None]:
# Hyper-parameters of the XGBoost regressor evaluated below.
xgb_params = dict(
    n_estimators=375,
    colsample_bytree=0.5,
    learning_rate=0.1,
    max_depth=15,
    min_child_weight=8,
    subsample=0.95,
)

# Score the boosted model with the same helper as the baseline, using 5 splits.
model = xgb.XGBRegressor(**xgb_params)
u.check_model(
    df,
    feats,
    price,
    model,
    n_splits=5,
    orig_price_trans=u.log_price_factr_area_trans,
)

In [None]:
# (2.355716375929465, 0.233763690588422) <-- submit 3 (1.88070)
# (2.2888983069605917, 0.27275301838603083) <-- submit 4 (1.86007)
# (2.10085497423071, 0.29354647610120993) <-- submit 5 (1.71501)

# xgb_params={
#   'n_estimators': 350,
#   'colsample_bytree': 0.3,
#   'learning_rate': 0.05,
#   'max_depth': 15,
#   'min_child_weight': 3,
#   'subsample': 0.75}

# (2.031405362730201, 0.27745594834270715) <-- submit 6 (1.68999) after cleaning out the random variables

# xgb_params={
#   'n_estimators': 400,
#   'colsample_bytree': 0.3,
#   'learning_rate': 0.05,
#   'max_depth': 15,
#   'min_child_weight': 3,
#   'subsample': 0.75}

# (2.026766325034033, 0.2786812280083919) <-- submit 7 (1.68381) best with manually chosen parameters


# xgb_params={
#     'n_estimators': 350, 
#     'colsample_bytree': 0.3, 
#     'learning_rate': 0.075, 
#     'max_depth': 15, 
#     'min_child_weight': 6, 
#     'subsample': 0.85}


# (2.0658578335869375, 0.31226221389120995) <-- submit 8 (1.70633) best from Random Search


# xgb_params={
#     'n_estimators': 375, 
#     'colsample_bytree': 0.5, 
#     'learning_rate': 0.1, 
#     'max_depth': 15, 
#     'min_child_weight': 8, 
#     'subsample': 0.95}

# (2.0983975078759416, 0.27381624010553185) <-- submit 9 (1.70633) (almost) the most stable from Random Search

In [None]:
# One-column table of the fitted model's feature importances, indexed by the
# feature names stored in the underlying booster.
feat_imprt = pd.DataFrame(
    {'feat_importance': model.feature_importances_},
    index=model.get_booster().feature_names,
)

In [None]:
# Show the five features with the highest importance.
feat_imprt.sort_values(by='feat_importance', ascending=False).head(5)

In [None]:
# col_to_drop = feat_imprt[feat_imprt['feat_importance'] <= feat_imprt.iloc[-5:].max().values[0]].index

In [None]:
# Display eli5's weight table for the current `model`, labelling the features
# with the names in `feats`.
eli5.show_weights(model, feature_names=feats)

In [None]:
# Build the submission file from the current `model`.
#
# Rows without a target value (NaN in the `price` column) are treated as the
# test set; predictions are made on the log scale and converted back with the
# project's log_price_factr_area_trans helper.
X_test = df.loc[df[price].isna(), feats].reset_index(drop=True)
y_pred = model.predict(X_test)
y_pred = u.log_price_factr_area_trans(y_pred, X_test)

# Timestamp used in the output file name. The explicit month/day/year codes
# produce the same text as the former "%D" with '/' replaced by '_', but
# "%D" is a platform-specific directive that is not available everywhere.
now = datetime.now()
current_time = now.strftime("%m_%d_%y_%H_%M_%S")

# Re-read the test file so that ids come from an untouched frame.
df_test = pd.read_csv("../interim/02_test_data.csv", sep='|')

# The predictions are attached positionally, so their count must equal the
# number of test rows. Fail early instead of writing a misaligned file
# (e.g. when some training rows also have a missing target).
if len(y_pred) != len(df_test):
    raise ValueError(
        "Got {} predictions for {} test rows".format(len(y_pred), len(df_test))
    )

df_test["price"] = y_pred
df_test[["id", "price"]].to_csv("../output/xgb_area_{}.csv".format(current_time), index=False)

### Random search

In [None]:
# Candidate values for every tuned XGBoost hyper-parameter. The key order is
# significant: it defines the order in which combinations are enumerated.
xgb_params_space = dict(
    n_estimators=[350, 375, 400, 425, 450, 500],
    colsample_bytree=[0.3, 0.5],
    learning_rate=[0.01, 0.05, 0.075, 0.1, 0.15],
    max_depth=[10, 12, 15, 20],
    min_child_weight=[4, 6, 8, 10],
    subsample=[0.5, 0.6, 0.75, 0.85, 0.95],
)

In [None]:
# Enumerate the full grid: one parameter dict per combination of candidate
# values, in the order produced by itertools.product.
# Keys are taken from `xgb_params_space` itself rather than being repeated by
# hand, so adding, removing or reordering a hyper-parameter in the space can
# neither break the unpacking nor attach a value to the wrong name.
xgb_params_dicts = [
    dict(zip(xgb_params_space, combo))
    for combo in product(*xgb_params_space.values())
]

In [None]:
# (2.026766325034033, 0.2786812280083919) <-- best_score

In [None]:
# Random search: evaluate up to 30 parameter sets drawn without replacement
# from `xgb_params_dicts` and collect their cross-validation scores.
#
# Results are gathered in a plain list and turned into a DataFrame once.
# The former per-iteration `DataFrame.append` no longer exists in pandas 2.0
# and copied the whole frame on every step.
search_records = []
try:
    # Never ask for more draws than there are untried candidates left.
    for idx in range(min(30, len(xgb_params_dicts))):

        # Draw one candidate and take it out of the pool so that it is not
        # evaluated again (also not when this cell is run another time).
        xgb_params = xgb_params_dicts.pop(random.randrange(len(xgb_params_dicts)))

        print(idx, xgb_params)

        model = xgb.XGBRegressor(**xgb_params)
        mean_mae, std_mae = u.check_model(df, feats, price, model, n_splits=5, orig_price_trans=u.log_price_factr_area_trans)

        print(idx, mean_mae, std_mae)

        search_records.append(dict(xgb_params, mean_mae=mean_mae, std_mae=std_mae))
        print("It's {}. model ready!".format(idx))
finally:
    # Built in `finally` so that the scores collected so far stay available
    # even if the long-running loop is interrupted or fails part-way.
    GridSearchResults = pd.DataFrame(
        search_records,
        columns=['n_estimators', 'colsample_bytree', 'learning_rate', 'max_depth', 'min_child_weight', 'subsample', 'mean_mae', 'std_mae'],
    )

In [None]:
# Rank the evaluated parameter sets by mean MAE, lowest first.
GridSearchResults.sort_values(by='mean_mae')