In [1]:
from pathlib import Path
import sys

# Absolute path of the parent of the current working directory; it is put on
# sys.path so project-local packages can be imported (presumably where the
# `utils` package imported below lives — confirm against the repo layout).
PROJECT_DIR = Path('..').resolve()
# Guard the append so re-running this cell does not stack duplicate entries
# on sys.path.
if str(PROJECT_DIR) not in sys.path:
    sys.path.append(str(PROJECT_DIR))

%load_ext autoreload 
%autoreload 2

In [2]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

from sklearn.dummy import DummyRegressor
import xgboost as xgb
import gc

import numpy as np
import pandas as pd
np.random.seed(0)

import mlflow
import eli5

import seaborn as sns
import utils.helper as h
import utils.mlflow_helper as flow_h

## Przygotowanie danych

In [3]:
# Fetch the train / test / combined data through the project helper, reading
# from the configured input directory. In the cells shown in this notebook only
# `df_all` is consumed directly; `df_train` and `df_test` are rebound by
# h.split_df_all_to_train_test before they are read again.
df_train, df_test, df_all = h.get_data(h.Const.INPUT_DIR)

In [4]:
# Run the project's preprocessing step on the combined frame.
# NOTE(review): `df_all` is rebound to its own transformed version, so
# re-running this cell on its own feeds already-preprocessed data back in.
# Re-run the loading cell first, or confirm preprocessing_data is idempotent.
df_all = h.preprocessing_data(df_all)
# Optional persistence of the preprocessed frame, currently disabled:
# h.save_to_interim(df_all)

## Modelowanie

In [5]:
# Select the MLflow experiment used by this notebook (the helper's name
# suggests it is created when missing — confirm in utils.mlflow_helper).
# The trailing `;` suppresses the cell's output.
flow_h.get_or_create_experiment("dwsolution_property");

In [6]:
# Take 'price_median' off the shared black list so it is no longer dropped
# from the feature set built from that list.
# The membership guard keeps this cell re-runnable: a bare .remove() raises
# once the entry is already gone (second run in the same kernel), while the
# guarded form is a no-op in that case and still removes the entry after the
# module has been reloaded.
if 'price_median' in h.Const.BLACK_LIST:
    h.Const.BLACK_LIST.remove('price_median')

In [7]:
# Candidate features: every numeric / boolean column of the combined frame,
# minus the columns listed on the project's black list.
numeric_columns = df_all.select_dtypes(include=["number", 'boolean']).columns
feats = list(numeric_columns.drop(h.Const.BLACK_LIST))

In [8]:
# Experiment knobs: the values meant to be varied between runs.
h.Const.TARGET_NAME = 'price'
convert_target_method = 'log1p'
model = xgb.XGBRegressor(
    n_estimators=400,
    max_depth=8,
    random_state=0,
)

In [9]:
# Bundle the data, estimator and run settings, then hand them to the project's
# MLflow helper (what it logs/evaluates is defined in utils.mlflow_helper).
run_settings = dict(
    df=df_all,
    model=model,
    feats=feats,
    target=h.Const.TARGET_NAME,
    convert_target_method=convert_target_method,
)
flow_h.mlflow_start_run(**run_settings)

(13947, 152) (46275, 151)


In [10]:
# Build the train/test feature matrices and the untransformed target, apply
# the configured target transform, then fit the estimator on the result.
X_train, X_test, y_train_raw = h.get_X_y(df_all, feats, h.Const.TARGET_NAME)
y_train = flow_h.convert_target(y_train_raw, method=convert_target_method)
model.fit(X_train, y_train)

(13947, 152) (46275, 151)


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=8,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=400, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

## Zapisanie predykcji

In [12]:
# Split the preprocessed combined frame back into its train and test parts;
# this rebinds the `df_train` / `df_test` names created when the data was loaded.
df_train, df_test = h.split_df_all_to_train_test(df_all)

In [14]:
# Predict on the test features and undo the target transform before storing
# the result in the 'price' column of the test frame.
test_pred = model.predict(X_test)
df_test.loc[:, 'price'] = flow_h.unconvert_target(test_pred, method=convert_target_method)

# When the model was trained on 'price_m2', multiply by 'area_norm' to get
# 'price' (presumably price per square metre times area — confirm the units
# of 'area_norm').
if h.Const.TARGET_NAME == 'price_m2':
    df_test.loc[:, 'price'] = df_test['price'] * df_test['area_norm']

In [15]:
# Sanity check: summary statistics of the predicted prices.
df_test['price'].describe()

count    4.627500e+04
mean     3.756072e+05
std      2.735151e+05
min      2.160057e+04
25%      2.340789e+05
50%      3.153389e+05
75%      4.308963e+05
max      8.728767e+06
Name: price, dtype: float64

In [16]:
# Post-process the predictions with the project's data-leak helper.
# NOTE(review): judging by the name it replaces some predicted prices with
# values known from a leak — confirm in utils.helper. `df_test` is rebound to
# the helper's return value, so later cells see the overwritten frame.
df_test = h.overwrite_prediction_by_data_leak(df_test)

In [17]:
# Export the id / price columns as CSV under <project>/output.
# The location is anchored on PROJECT_DIR (defined in the first cell) instead
# of a bare '../output' relative path, and the directory is created when
# missing so the export does not fail on a fresh checkout.
# NOTE(review): the file name encodes the model settings by hand
# (md8 / ne400 / log1p / price) — keep it in sync with the experiment cell.
output_dir = PROJECT_DIR / 'output'
output_dir.mkdir(parents=True, exist_ok=True)
df_test[['id', 'price']].to_csv(output_dir / 'xgb_md8_ne400_log1p_price+.csv', index=False)