In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
import xgboost
from category_encoders import MEstimateEncoder, TargetEncoder
from scipy import stats
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_log_error, r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
from tqdm import tqdm
from xgboost import XGBRegressor, XGBClassifier

from xgb_utils import  process_train, process_test, one_hot_cols, cal_mean_errors
from xgb_params import xgb_params_inv, xgb_params_own, num_inv_top_features, num_own_top_features

In [2]:
# Input/output locations used by the rest of the notebook.
# NOTE: `invesment_train_path` is misspelled; the name is kept as-is because
# a later cell reads it under exactly this spelling.

# Training CSVs (investment, owner)
invesment_train_path, owner_train_path = (
    'processed/processed_train_inv.csv',
    'processed/processed_train_own.csv',
)

# Test CSVs (investment, owner)
investment_test_path, owner_test_path = (
    'processed/processed_test_inv.csv',
    'processed/processed_test_own.csv',
)

# Feature-list CSVs (investment, owner); each is read for its `col_name` column
investment_feat_path, owner_feat_path = (
    './best_feats/Inv_Onehot_only_mean_NANs.csv',
    './best_feats/Own_Onehot_only_mean_NANs.csv',
)

# Destination of the combined prediction file
output_path = './output/xgb_split_pred.csv'

# Investment Dataset

In [3]:
# Feature selection: keep the first `num_inv_top_features` names listed in the
# `col_name` column of the investment feature file.
inv_feats_df = pd.read_csv(investment_feat_path)
feats = inv_feats_df['col_name'].values.tolist()[:num_inv_top_features]

# Investment training data: `price_doc` is the target, the selected columns
# are the model inputs.
processed_df = pd.read_csv(invesment_train_path)
y = processed_df['price_doc']
X = processed_df.drop(columns=['price_doc'])[feats]

In [4]:
def train(X, y, params=None):
    """Fit an ``XGBRegressor`` on ``X`` / ``y`` and return the fitted model.

    Parameters
    ----------
    X
        Feature matrix handed to ``XGBRegressor.fit``.
    y
        Target values handed to ``XGBRegressor.fit``.
    params : mapping, optional
        Hyper-parameter mapping. Only the keys ``n_estimators``, ``device``,
        ``objective``, ``eval_metric``, ``enable_categorical``, ``eta``,
        ``max_depth`` and ``colsample_bytree`` are read; any other entries
        are ignored. When omitted, ``xgb_params_inv`` is used, which is what
        this function previously hard-coded, so existing calls behave the
        same. Pass ``xgb_params_own`` to train with the owner settings.

    Returns
    -------
    XGBRegressor
        The fitted regressor.

    Raises
    ------
    KeyError
        If ``params`` lacks one of the keys listed above.
    """
    if params is None:
        params = xgb_params_inv

    # Keys forwarded to the estimator; everything else in the mapping is
    # intentionally left at the XGBoost default.
    forwarded_keys = (
        'n_estimators',
        'device',
        'objective',
        'eval_metric',
        'enable_categorical',
        'eta',
        'max_depth',
        'colsample_bytree',
    )
    model = XGBRegressor(**{key: params[key] for key in forwarded_keys})

    model.fit(X, y)

    return model

def test(model, X_test, y_test):
    """Report RMSE and R2 for ``model`` on a held-out set and plot SHAP importances.

    Parameters
    ----------
    model
        Fitted tree model exposing ``predict``; it is also passed to
        ``shap.TreeExplainer``.
    X_test
        Features to predict on and to explain.
    y_test
        True target values matching ``X_test``.

    Returns
    -------
    None
        Metrics are printed and a SHAP bar summary plot is drawn.
    """
    pred = model.predict(X_test)

    # Take the square root explicitly: the ``squared=False`` flag of
    # ``mean_squared_error`` was deprecated in scikit-learn 1.4 and removed
    # in 1.6. For a single target this gives the same RMSE value.
    rmse = float(np.sqrt(mean_squared_error(y_test, pred)))
    r2 = r2_score(y_test, pred)

    print(f"RMSE: {rmse}")
    print(f"R2: {r2}")

    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)

    shap.summary_plot(shap_values, X_test, plot_type="bar")

In [5]:
# Fit the investment model on the selected investment features.
model = train(X=X, y=y)

is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.


In [6]:
# Read the investment test set and keep a separate deep copy of it
# (`DataFrame.copy()` is deep by default); the copy supplies the `id`
# column when the prediction frame is built.
processed_inv_test_df = pd.read_csv(investment_test_path)
processed_inv_test_df_copy = processed_inv_test_df.copy()
processed_inv_test_df.head()

Unnamed: 0,id,full_sq,life_sq,floor,max_floor,build_year,num_room,kitch_sq,state,area_m,...,big_road1_1line_no,big_road1_1line_yes,railroad_1line_no,railroad_1line_yes,material_1.0,material_2.0,material_4.0,material_5.0,material_6.0,material_nan
0,30474,39.0,20.7,2,9,1998.0,1,8.9,3.0,26155140.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,30476,40.5,25.1,3,5,1960.0,2,4.8,2.0,9946335.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,30482,45.4,28.5,9,12,1972.0,2,6.0,2.0,15319900.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,30487,39.8,18.9,4,17,2013.0,1,9.9,3.0,11391680.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,30491,94.5,61.9,2,12,2000.0,4,10.3,2.480988,5704502.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Predict on the same feature columns the investment model was trained on.
# NOTE(review): the recorded output of this cell looks like an XGBoost
# device-mismatch warning (model device vs. input data) - confirm.
pred = model.predict(processed_inv_test_df.loc[:, feats])
pred

is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




array([5958187. , 5750031. , 4919771.5, ..., 8218507. , 5593338. ,
       8896945. ], dtype=float32)

In [8]:
# Pair each investment test id with its predicted price.
inv_prediction_df = processed_inv_test_df_copy[['id']].assign(price_doc=pred)

# Owner Dataset

In [9]:
# Feature selection: keep the first `num_own_top_features` names listed in the
# `col_name` column of the owner feature file.
# NOTE: `feats`, `processed_df`, `X` and `y` are rebound here and now refer to
# the owner dataset rather than the investment one.
own_feats_df = pd.read_csv(owner_feat_path)
feats = own_feats_df['col_name'].values.tolist()[:num_own_top_features]

# Owner training data: `price_doc` is the target, the selected columns are
# the model inputs.
processed_df = pd.read_csv(owner_train_path)
y = processed_df['price_doc']
X = processed_df.drop(columns=['price_doc'])[feats]

In [10]:
# Fit the owner model (rebinds `model`, replacing the investment model).
# NOTE(review): this call trains with the `xgb_params_inv` settings that
# train() uses by default; `xgb_params_own` is imported but never used in this
# notebook - confirm whether the owner model should use it instead.
model = train(X=X, y=y)

is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.


In [11]:
# Read the owner test set and keep a separate deep copy of it
# (`DataFrame.copy()` is deep by default); the copy supplies the `id`
# column when the prediction frame is built.
processed_own_test_df = pd.read_csv(owner_test_path)
processed_own_test_df_copy = processed_own_test_df.copy()
processed_own_test_df.head()

Unnamed: 0,id,full_sq,life_sq,floor,max_floor,build_year,num_room,kitch_sq,state,area_m,...,big_road1_1line_yes,railroad_1line_no,railroad_1line_yes,material_1.0,material_2.0,material_3.0,material_4.0,material_5.0,material_6.0,material_nan
0,30475,79.2,49.632838,8,17,0.0,3,1.0,1.0,25536300.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,30477,62.8,36.0,17,17,2016.0,2,62.8,3.0,21494090.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,30478,40.0,40.0,17,17,0.0,1,1.0,1.0,25536300.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,30479,48.43,49.632838,21,1,2015.0,1,1.0,1.0,9629358.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,30480,38.8,49.632838,15,17,1493.225347,1,1.0,1.0,11324090.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Predict on the same feature columns the owner model was trained on
# (rebinds `pred`, replacing the investment predictions).
pred = model.predict(processed_own_test_df.loc[:, feats])
pred

is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead


array([8435192. , 6734083. , 5405712. , ..., 5277606.5, 5196483.5,
       4732529. ], dtype=float32)

In [13]:
# Pair each owner test id with its predicted price.
own_prediction_df = processed_own_test_df_copy[['id']].assign(price_doc=pred)


## Combine both predictions and sort by id

In [14]:
# Stack both prediction frames, order the rows by id, and rebuild a clean
# 0..n-1 index. Without `ignore_index` / `reset_index` the result keeps the
# duplicate row labels of the two inputs (0, 0, 1, 1, ...). Chaining instead
# of `inplace=True` keeps the cell idempotent on re-run.
overall_preds = (
    pd.concat([inv_prediction_df, own_prediction_df], ignore_index=True)
    .sort_values(by='id')
    .reset_index(drop=True)
)

overall_preds.head()

Unnamed: 0,id,price_doc
0,30474,5958187.0
0,30475,8435192.0
1,30476,5750031.0
1,30477,6734083.0
2,30478,5405712.0


In [16]:
# `to_csv` does not create missing folders, so make sure the output directory
# exists before writing (matters on a fresh checkout / Restart & Run All).
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
overall_preds.to_csv(output_path, index=False)