In [1]:
import xgboost
from xgboost import XGBRegressor, XGBClassifier

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy import stats
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_log_error, r2_score, mean_squared_error

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from category_encoders import MEstimateEncoder, TargetEncoder
from sklearn.inspection import permutation_importance

import seaborn as sns
import shap

from tqdm import tqdm


from utils import get_na_cols, process_train, process, params2, categorical_cols, one_hot_cols, target_encoding_cols, ordinal_cols, useless_cols, numeric_cols

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


In [2]:
def cal_mean_errors(train_df, models, n_splits=5, random_state=None):
    """Cross-validate each model and collect its per-fold RMSLE on ``price_doc``.

    Rows whose ``build_year`` is greater than 2018 are removed before splitting.
    For every fold, the training part is passed through ``process_train`` and the
    held-out part through ``process`` together with the two extra objects that
    ``process_train`` returned (presumably the fitted ordinal and target
    encoders -- confirm against ``utils``).

    Parameters
    ----------
    train_df : pandas.DataFrame
        Raw training data. Must contain the ``build_year`` and ``price_doc``
        columns. The caller's frame is not modified.
    models : sequence
        Estimators whose ``fit`` accepts ``eval_set`` and ``verbose`` keyword
        arguments and that expose ``predict``. Each model is refitted on every
        fold.
    n_splits : int, default 5
        Number of cross-validation folds.
    random_state : int or None, default None
        Seed for the shuffled ``KFold``. Pass an integer to make the splits
        (and therefore the scores) reproducible; ``None`` keeps the previous
        unseeded behaviour.

    Returns
    -------
    list of list of float
        ``result[fold][i]`` is the RMSLE of ``models[i]`` on that fold.
    """
    # Filter with a boolean mask. Passing the `id` values to DataFrame.drop would
    # treat them as index *labels*, which do not coincide with `id`, and would
    # silently remove unrelated rows while keeping the bad ones.
    # Rows with a missing build_year compare as False and are therefore kept.
    implausible_year = train_df['build_year'] > 2018
    train_df = train_df.loc[~implausible_year].reset_index(drop=True)

    cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    rmsles = []

    # Wrap the split iterator itself and give tqdm a total so the bar shows progress.
    for train_idx, test_idx in tqdm(cv.split(train_df), total=n_splits):
        # Preprocessing does not depend on the model, so do it once per fold.
        processed_df, oe, te = process_train(train_df.iloc[train_idx])
        X_train = processed_df.drop(['price_doc'], axis=1)
        y_train = processed_df['price_doc']

        held_out = train_df.iloc[test_idx]
        y_test = held_out['price_doc']
        X_test = process(held_out.drop(['price_doc'], axis=1), te, oe)

        # NOTE(review): the held-out fold doubles as the eval_set. If a model is
        # configured with early stopping, its stopping point is chosen on the
        # same rows it is scored on, so the reported RMSLE is optimistic.
        evaluation = [(X_test, y_test)]

        fold_rmsles = []
        for model in models:
            model.fit(X_train, y_train, eval_set=evaluation, verbose=False)
            pred = model.predict(X_test)
            # Take the root explicitly: the `squared=False` flag is deprecated
            # and removed in recent scikit-learn releases.
            fold_rmsles.append(float(np.sqrt(mean_squared_log_error(y_test, pred))))

        rmsles.append(fold_rmsles)

    return rmsles


In [3]:
# Load the raw training set.
train_df = pd.read_csv('../Dataset/train.csv/train.csv')

# Build the regressor from the shared `params2` settings. Only the keys listed
# here are forwarded; gamma, reg_alpha and min_child_weight are not passed.
model1 = XGBRegressor(**{
    key: params2[key]
    for key in (
        'n_estimators',
        'device',
        'objective',
        'eval_metric',
        'enable_categorical',
        'early_stopping_rounds',
        'eta',
        'max_depth',
        'colsample_bytree',
    )
})

# Cross-validate every candidate model (currently just one).
models = [model1]
rmsles = cal_mean_errors(train_df, models)

          id   timestamp  full_sq  life_sq  floor  max_floor  material  \
10092  10096  2013-08-27       39     23.0    9.0       12.0       1.0   
15223  15228  2014-02-19       79      NaN   17.0       25.0       4.0   

       build_year  num_room  kitch_sq  ...  cafe_count_5000_price_2500  \
10092      1966.0       2.0       7.0  ...                          25   
15223      2014.0       2.0      10.0  ...                          19   

      cafe_count_5000_price_4000 cafe_count_5000_price_high  \
10092                          4                          0   
15223                          7                          2   

       big_church_count_5000  church_count_5000  mosque_count_5000  \
10092                     17                 36                  0   
15223                      3                 17                  0   

       leisure_count_5000  sport_count_5000  market_count_5000  price_doc  
10092                  11               109                 14    6900000  
1

0it [00:00, ?it/s]is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead


price_doc not found


is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_categorical_dtype is deprecat

price_doc not found


is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_categorical_dtype is deprecat

price_doc not found


is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_categorical_dtype is deprecat

price_doc not found


is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_categorical_dtype is deprecat

price_doc not found


is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_categorical_dtype is deprecat

In [6]:
# Turn the nested per-fold score lists into a 2-D array
# (one row per fold, one column per model) and print it.
rmsles = np.array(rmsles)
print(rmsles)

[[0.48647076]
 [0.46749353]
 [0.49221381]
 [0.47940274]
 [0.46941295]]
