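This notebook trains an XGBoost regressor on the processed training data and uses it to generate the test-set predictions written to `xgb_pred.csv`.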

In [10]:
import xgboost
from xgboost import XGBRegressor, XGBClassifier

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error, r2_score, mean_squared_error

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from category_encoders import MEstimateEncoder, TargetEncoder
from sklearn.inspection import permutation_importance

import seaborn as sns
import shap



from utils import get_na_cols, params2, categorical_cols, one_hot_cols, target_encoding_cols, ordinal_cols, useless_cols, numeric_cols

In [11]:
# Processed feature tables (presumably written by an earlier preprocessing step -- confirm).
process_train_path = 'processed_train.csv'
processed_test_path = 'processed_test.csv'

# Raw test file; it is read further down only to recover the `id` column.
test_df_path = '../Dataset/test.csv/test.csv'

In [12]:
# Load the processed training table, then separate the regression target
# (`price_doc`) from the remaining columns, which are all used as features.
processed_df = pd.read_csv(process_train_path)
y = processed_df['price_doc']
X = processed_df.drop(columns=['price_doc'])

In [13]:
def train(X, y):
    """Fit an ``XGBRegressor`` on ``(X, y)`` with settings taken from ``params2``.

    Only the keys listed in ``forwarded`` are read from ``params2`` and passed
    to the estimator. ``gamma``, ``reg_alpha`` and ``min_child_weight`` are
    deliberately not forwarded, so the estimator's own defaults apply to them.

    Args:
        X: Feature table handed to ``XGBRegressor.fit``.
        y: Target values handed to ``XGBRegressor.fit``.

    Returns:
        The fitted ``XGBRegressor`` instance.

    Raises:
        KeyError: If ``params2`` lacks one of the forwarded keys.
    """
    forwarded = (
        'n_estimators',
        'device',
        'objective',
        'eval_metric',
        'enable_categorical',
        'eta',
        'max_depth',
        'colsample_bytree',
    )
    settings = {key: params2[key] for key in forwarded}

    regressor = XGBRegressor(**settings)
    regressor.fit(X, y)
    return regressor

def test(model, X_test, y_test):
    pred = model.predict(X_test)
    rmse = mean_squared_error(y_test, pred, squared=False)
    r2 = r2_score(y_test, pred)

    print(f"RMSE: {rmse}")
    print(f"R2: {r2}")

    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)

    shap.summary_plot(shap_values, X_test, plot_type="bar")

In [14]:
# Fit on the full processed training table; this cell keeps no hold-out split.
model = train(X=X, y=y)

is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.


In [15]:
# `processed_test_df` holds the model features; the raw `test_df` is loaded
# only so its `id` column can be attached to the predictions later.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks like
# a saved row index. It is passed to the model as a feature -- confirm this is
# intended, or read both processed CSVs with index_col=0 together.
processed_test_df = pd.read_csv(processed_test_path)
test_df = pd.read_csv(test_df_path)
processed_test_df.head(5)

Unnamed: 0,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,area_m,...,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,year,month,product_type_Investment,product_type_OwnerOccupier,sub_area_te
0,39.0,20.7,2.0,9.0,0.0,1998,1,8.9,2.0,26155140.0,...,10,1,0,14,1,2015,6.0,1,0,6813803.0
1,79.2,32.660037,8.0,17.0,0.0,0,3,1.0,0.0,25536300.0,...,11,0,1,12,1,2015,6.0,0,1,6215746.0
2,40.5,25.1,3.0,5.0,1.0,1960,2,4.8,1.0,9946335.0,...,21,0,10,71,11,2015,6.0,1,0,6074524.0
3,62.8,36.0,17.0,17.0,0.0,2016,2,62.8,2.0,21494090.0,...,10,0,0,2,0,2015,6.0,0,1,5322553.0
4,40.0,40.0,17.0,17.0,0.0,0,1,1.0,0.0,25536300.0,...,12,0,1,11,1,2015,6.0,0,1,6215746.0


In [16]:
# Predict `price_doc` for every row of the processed test table and show the array.
pred = model.predict(X=processed_test_df)
pred

is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead


array([5626352.5, 8832323. , 5395107.5, ..., 4983416.5, 5783618. ,
       9489654. ], dtype=float32)

In [17]:
# Pair each raw test `id` with its prediction by position.
# Assumes `test_df` and `processed_test_df` share the same row order -- TODO confirm.
prediction_df = test_df[['id']].assign(price_doc=pred)

In [18]:
# Write the id / price_doc table without the DataFrame index.
prediction_df.to_csv(path_or_buf='xgb_pred.csv', index=False)