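This notebook trains an XGBoost regressor on the processed training set and uses it to generate `price_doc` predictions for the test set.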

In [4]:
import os

import xgboost
from xgboost import XGBRegressor, XGBClassifier

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error, r2_score, mean_squared_error

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from category_encoders import MEstimateEncoder, TargetEncoder
from sklearn.inspection import permutation_importance

import seaborn as sns
import shap

from xgb_utils import get_na_cols, xgb_params, categorical_cols, one_hot_cols, target_encoding_cols, ordinal_cols, useless_cols, numeric_cols

In [5]:
# --- Input / output locations -------------------------------------------
test_df_path = '../../Dataset/test.csv/test.csv'
process_train_path = './processed/processed_train.csv'
processed_test_path = './processed/processed_test.csv'
output_path = './output/xgb_pred.csv'

# --- Feature selection --------------------------------------------------
# CSV with a `col_name` column listing candidate features; only the first
# `top_n` rows are kept.
# Alternative list that was tried: 'Ordinal_only_have_NANs.csv'
feats_path = './best_feats/Onehot_only_with_NANs.csv'
top_n = 40

feats_df = pd.read_csv(feats_path)
feats = feats_df['col_name'].tolist()[:top_n]


In [6]:
# Load the preprocessed training data and split it into target and features.
processed_df = pd.read_csv(process_train_path)
y = processed_df['price_doc']

# The target column is dropped before selecting `feats`, so a feature list
# that names `price_doc` raises a KeyError here instead of leaking the target.
X = processed_df.drop(columns=['price_doc'])[feats]

In [7]:

def train(X, y):
    """Fit an ``XGBRegressor`` on ``X`` / ``y`` and return the fitted model.

    Hyperparameters are read from the shared ``xgb_params`` mapping. Only the
    keys listed below are forwarded; ``verbosity``, ``early_stopping_rounds``,
    ``gamma``, ``reg_alpha`` and ``min_child_weight`` are not passed, so the
    library defaults apply to them.

    Parameters
    ----------
    X : feature matrix passed straight to ``XGBRegressor.fit``.
    y : target values passed straight to ``XGBRegressor.fit``.

    Returns
    -------
    The fitted ``XGBRegressor`` instance.
    """
    forwarded_keys = (
        'n_estimators',
        'device',
        'objective',
        'eval_metric',
        'enable_categorical',
        'eta',
        'max_depth',
        'colsample_bytree',
    )
    regressor_kwargs = {key: xgb_params[key] for key in forwarded_keys}

    regressor = XGBRegressor(**regressor_kwargs)
    regressor.fit(X, y)
    return regressor

def test(model, X_test, y_test):
    """Print RMSE and R2 for ``model`` on the given data and plot SHAP importances.

    Parameters
    ----------
    model : fitted tree-based model exposing ``predict`` and accepted by
        ``shap.TreeExplainer``.
    X_test : features to evaluate on; also used as the SHAP background data.
    y_test : ground-truth target values aligned with ``X_test``.

    Returns
    -------
    None. The metrics are printed and the SHAP bar summary plot is drawn.
    """
    pred = model.predict(X_test)

    # The `squared=False` keyword of mean_squared_error was deprecated in
    # scikit-learn 1.4 and removed in 1.6; taking the square root explicitly
    # gives the same RMSE on every version.
    rmse = np.sqrt(mean_squared_error(y_test, pred))
    r2 = r2_score(y_test, pred)

    print(f"RMSE: {rmse}")
    print(f"R2: {r2}")

    # Mean |SHAP value| per feature, rendered as a bar chart.
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)

    shap.summary_plot(shap_values, X_test, plot_type="bar")

In [8]:
# Fit the final model on all rows of the processed training set (no hold-out
# split is made in this notebook).
model = train(X, y)

is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.


In [9]:
# Raw test set: only its `id` column is used later, when building the
# submission frame.
test_df = pd.read_csv(test_df_path)
# Preprocessed test set: the source of the feature columns fed to the model.
processed_test_df = pd.read_csv(processed_test_path)
# Preview the first rows as the cell output.
processed_test_df.head()

Unnamed: 0,id,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,...,water_1line_yes,big_road1_1line_no,big_road1_1line_yes,railroad_1line_no,railroad_1line_yes,ecology_excellent,ecology_good,ecology_no data,ecology_poor,ecology_satisfactory
0,30474,39.0,20.7,2,9,1,1998.0,1,8.9,3.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,30475,79.2,34.404467,8,17,1,0.0,3,1.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,30476,40.5,25.1,3,5,2,1960.0,2,4.8,2.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,30477,62.8,36.0,17,17,1,2016.0,2,62.8,3.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,30478,40.0,40.0,17,17,1,0.0,1,1.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [10]:
# Predict on the same feature subset (`feats`) the model was trained on.
pred = model.predict(processed_test_df[feats])
# Show the prediction array as the cell output.
pred

is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




array([5509913., 8750029., 5435474., ..., 4222409., 5602239., 8712463.],
      dtype=float32)

In [11]:
# Pair each raw test `id` with its prediction.
# NOTE(review): assumes rows of `test_df` and `processed_test_df` are in the
# same order, since `pred` was computed from the latter — TODO confirm.
prediction_df = test_df[['id']].assign(price_doc=pred)

In [12]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)

# Write the `id` / `price_doc` pairs without the pandas index column.
prediction_df.to_csv(output_path, index=False)