In [1]:
import os
import sys
from pathlib import Path

# Put the directory two levels above the notebook's working directory on
# sys.path so project-local packages (e.g. `src`) can be imported.
project_root = Path.cwd().parent.parent
mod_path = os.fspath(project_root)
if mod_path not in sys.path:
    sys.path.append(mod_path)

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold

from src.data import *
from src.features.utils import *
from src.model.tree_based import ModelXgBoost

In [3]:
# Load the processed train/test tables [transaction related features].
# train is 28007 x 33 once the exported 'Unnamed: 0' column is dropped below.
train_csv_path = '../../data/processed/train1.csv'
test_csv_path = '../../data/processed/test1.csv'
train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

In [4]:
# Inspect the columns of the freshly loaded test frame
# (swap in `train.columns` to inspect the training frame instead).
test.columns

Index(['Unnamed: 0', 'ID', 'Deposit', 'AccessoryRate', 'rateTypeEntity',
       'RatePerUnit', 'DaysOnDeposit', 'MainApplicantGender', 'Age', 'Region',
       'Occupation', 'Term', 'TotalContractValue', 'SplitPaymentsHistory',
       'nb_payments', 'amount_paid', 'percent_amt_paid', 'mean_amt_paid',
       'median_amt_paid', 'max_amt_paid', 'min_amt_paid', 'stddev_amt_paid',
       'nb_skipped_months', 'b1', 'b2', 'b3', 'b4', 'b5'],
      dtype='object')

In [5]:
# 'Unnamed: 0' shows up in the loaded column listing; it carries no feature
# information for the model (presumably a row index saved with the CSV), so
# it is removed from both frames.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell re-runnable: a second execution no longer raises KeyError.
train = train.drop(columns='Unnamed: 0', errors='ignore')
test = test.drop(columns='Unnamed: 0', errors='ignore')
test.columns

Index(['ID', 'Deposit', 'AccessoryRate', 'rateTypeEntity', 'RatePerUnit',
       'DaysOnDeposit', 'MainApplicantGender', 'Age', 'Region', 'Occupation',
       'Term', 'TotalContractValue', 'SplitPaymentsHistory', 'nb_payments',
       'amount_paid', 'percent_amt_paid', 'mean_amt_paid', 'median_amt_paid',
       'max_amt_paid', 'min_amt_paid', 'stddev_amt_paid', 'nb_skipped_months',
       'b1', 'b2', 'b3', 'b4', 'b5'],
      dtype='object')

In [6]:
# Sanity check: size of the test frame after dropping the 'Unnamed: 0' column.
test.shape

(9336, 27)

In [7]:
# Percentage of training rows whose Region is missing.
n_region_missing_train = train['Region'].isna().sum()
print(n_region_missing_train / train.shape[0] * 100)

5.162994965544328


In [8]:
# Percentage of test rows whose Region is missing (train: ~5%, 1446 rows).
print(test['Region'].isna().sum() / test.shape[0] * 100)

# Missing Region values would cause issues while encoding. Dropping the rows
# (train.dropna(subset=['Region'])) was tried earlier; instead the NaNs are
# replaced by the string 'Null' so they form an ordinary category. With
# drop="first" in the one-hot encoder, the reverse transform threw an error
# because NaN was read as a separate category.
for frame in (train, test):
    frame['Region'] = frame['Region'].fillna('Null')

5.227077977720651


In [9]:
# Sanity check: missing Region values were filled rather than dropped, so no
# training rows are lost at this point.
train.shape

(28007, 33)

In [10]:
# Separate identifiers, the six target columns (m1..m6) and the model features.
target_columns = ['m1', 'm2', 'm3', 'm4', 'm5', 'm6']
non_feature_columns = ['ID', 'SplitPaymentsHistory']

# Keep the IDs so they can be re-attached to predictions by index later on.
id_arr = train[["ID"]]
target = train[target_columns]
train_arr = train.drop(columns=target_columns + non_feature_columns)

test_id = test[["ID"]]
# NOTE: `test` is modified in place here, so re-running this cell without
# reloading the data fails on the missing 'ID' column.
test.drop(columns=non_feature_columns, inplace=True)

In [11]:
# Sanity check: test frame size once 'ID' and 'SplitPaymentsHistory' are removed.
test.shape

(9336, 25)

## Approach 1

### Model training with the initial (default) hyperparameters

In [12]:
def approach_one_model(x_train, y_train, x_test):
    """Fit a ModelXgBoost on the training data and predict for ``x_test``.

    Parameters
    ----------
    x_train : feature matrix passed to ``ModelXgBoost`` as ``train_array``.
    y_train : target passed to ``ModelXgBoost`` as ``train_target``.
    x_test : feature matrix handed to the fitted model's ``predict``.

    Returns
    -------
    tuple
        ``(model, predictions)`` — the ``ModelXgBoost`` wrapper after
        ``train_model()`` has been called, and the output of
        ``model.trained_model.predict(x_test)``.
    """
    xgb_wrapper = ModelXgBoost(train_array=x_train, train_target=y_train)
    # train_model() is called without arguments, i.e. with the wrapper's
    # default hyperparameters (see the ModelXgBoost source).
    xgb_wrapper.train_model()
    return xgb_wrapper, xgb_wrapper.trained_model.predict(x_test)

In [13]:
def encode_and_drop(full_array, data_type, tr_encoder=None):
    """One-hot encode the non-numeric columns of ``full_array``.

    Parameters
    ----------
    full_array : pandas.DataFrame
        Frame mixing numeric and non-numeric columns.
    data_type : forwarded to ``one_hot_encoding`` as ``type_of_data``
        (the notebook passes "train" or "test").
    tr_encoder : forwarded to ``one_hot_encoding`` as ``fitted_encoder``;
        ``None`` by default.

    Returns
    -------
    tuple
        ``(numerical_array, final_array, encoder)`` where ``numerical_array``
        holds only the numeric columns, ``final_array`` is the numeric columns
        followed by the encoded columns (indexed like ``full_array``), and
        ``encoder`` is whatever ``one_hot_encoding`` returned as its second
        value.
    """
    non_numeric_cols = full_array.select_dtypes(exclude=['number']).columns
    categorical_array = full_array[non_numeric_cols]
    numerical_array = full_array.drop(columns=non_numeric_cols)

    encoded_array, encoder = one_hot_encoding(
        categorical_frame=categorical_array,
        type_of_data=data_type,
        fitted_encoder=tr_encoder,
        conv=True,
        drop="first",
        handle_unknown="error",
    )

    # Join the two parts positionally (both re-indexed 0..n-1), then restore
    # the original row labels so results can be aligned with the source frame.
    final_array = pd.concat(
        [part.reset_index(drop=True) for part in (numerical_array, encoded_array)],
        axis=1,
    )
    final_array.index = numerical_array.index
    return numerical_array, final_array, encoder

In [14]:
# Hold out 45% of the labelled rows as a validation split; the fixed seed
# makes the split reproducible across runs.
from sklearn.model_selection import train_test_split

seed = 10
X_train, X_test, y_train, y_test = train_test_split(
    train_arr,
    target,
    random_state=seed,
    test_size=0.45,
)

In [17]:
# Train one model per target column (m1..m6) on the training split.
#
# The encoded feature matrices do not depend on the target column, so they
# are built once up front instead of being recomputed on every iteration.
og_frame, encoded_train, encoder_model = encode_and_drop(X_train, "train", None)
og_frame_test, encoded_test, encoder_model = encode_and_drop(X_test, "test", encoder_model)

model_dict = dict()
predict_dict = dict()
for model_no, target_col in enumerate(["m1", "m2", "m3", "m4", "m5", "m6"]):
    print(f"MODEL {model_no+1}")
    # NOTE(review): predictions are made on `encoded_train`, i.e. the same rows
    # the model was fitted on, so the RMSE computed from them is an in-sample
    # figure. `encoded_test` (the held-out split) is prepared above but not
    # scored. The RMSE cells index the predictions by X_train, so switching to
    # `encoded_test` here requires switching those cells to X_test / y_test too.
    # Keys: models are stored as M_0..M_5, predictions as m1_pred..m6_pred.
    model_dict[f"M_{model_no}"], predict_dict[f"m{model_no+1}_pred"] = approach_one_model(
        x_train=encoded_train,
        y_train=y_train[[target_col]],
        x_test=encoded_train
    )

MODEL 1
{'objective': 'reg:squarederror', 'base_score': None, 'booster': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'gamma': None, 'gpu_id': None, 'importance_type': 'gain', 'interaction_constraints': None, 'learning_rate': None, 'max_delta_step': None, 'max_depth': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'n_estimators': 100, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': None, 'reg_alpha': 50, 'reg_lambda': 0, 'scale_pos_weight': None, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': 0, 'use_label_encoder': False}
MODEL 2
{'objective': 'reg:squarederror', 'base_score': None, 'booster': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'gamma': None, 'gpu_id': None, 'importance_type': 'gain', 'interaction_constraints': None, 'learning_rate': None, 'max_delta_step': None, 'max_depth': None, 'min_child_weight': None, 'missing': na

model_dict = dict()
predict_dict = dict()
for model_no, target_col in enumerate(["m1", "m2", "m3", "m4", "m5", "m6"]):
    og_frame, encoded_train, encoder_model = encode_and_drop(X_train, "train", None)
    og_frame_test, encoded_test, encoder_model = encode_and_drop(X_test, "test", encoder_model)
    model_dict[f"M_{model_no}"], predict_dict[f"m{model_no+1}_pred"] = approach_one_model(
        x_train=encoded_train,
        y_train=y_train[[target_col]],
        x_test=encoded_train
    )

In [18]:
# Display the per-target prediction arrays (keys m1_pred .. m6_pred).
predict_dict

{'m1_pred': array([ 696.7633 , 3020.8875 ,   86.29766, ..., 2108.0122 ,  841.9431 ,
        1246.153  ], dtype=float32),
 'm2_pred': array([ 832.72327 , 2507.6167  ,   55.434147, ..., 1350.4679  ,
         671.1714  , 1149.3551  ], dtype=float32),
 'm3_pred': array([ 861.92004, 1817.4036 ,   30.50992, ..., 1635.627  ,  690.8228 ,
        1159.1213 ], dtype=float32),
 'm4_pred': array([ 973.28925, 2124.0352 ,   76.56692, ..., 1516.7856 ,  701.21466,
        1097.7303 ], dtype=float32),
 'm5_pred': array([ 814.0458  , 2576.7205  ,   18.634806, ..., 2108.123   ,
         837.8979  , 1182.604   ], dtype=float32),
 'm6_pred': array([ 461.53244, 2750.6143 ,  -80.74959, ..., 1169.5126 ,  509.15338,
         749.5798 ], dtype=float32)}

### Calculation of RMSE

In [19]:
# One column per target; rows are labelled with X_train's index because the
# predictions were generated from the training features
# (use X_test.index instead when predicting on the held-out split).
pred_frame = pd.DataFrame(predict_dict).set_axis(X_train.index, axis=0)

In [20]:
# Put features, true targets and predictions side by side (aligned on index).
# For a held-out evaluation, use [X_test, y_test, pred_frame] instead.
evaluation_parts = [X_train, y_train, pred_frame]
full_test_array = pd.concat(evaluation_parts, axis="columns")

In [21]:
# Re-attach the customer IDs by row index (left join keeps every evaluated row).
full_test_array = full_test_array.merge(
    id_arr,
    how='left',
    left_index=True,
    right_index=True,
)

In [22]:
# Build the evaluation table from the combined frame using the project's
# SubmissionFile helper in 'validation' mode.
validation_builder = SubmissionFile(
    validation_data=full_test_array,
    type_of_data='validation',
)
sub_file = validation_builder.execute()

In [23]:
# Size of the frame returned by SubmissionFile(...).execute() in validation mode.
sub_file.shape

(92418, 3)

In [24]:
# Per-row squared difference between the true target and the prediction.
residual = sub_file['Target'] - sub_file['Prediction']
sub_file['SquaredError'] = np.square(residual)

In [25]:
# RMSE = sqrt(sum of squared errors / number of rows).
total_squared_error = np.sum(sub_file['SquaredError'])
n_rows = sub_file.shape[0]
rmse = np.sqrt(total_squared_error / n_rows)
print('Final RMSE --> ', rmse)

Final RMSE -->  243.56665476102563


### Preparing Submission

In [None]:
# Inspect the feature columns produced by the one-hot encoding step.
encoded_train.columns

In [None]:
# Refit one model per target column (m1..m6) on the full labelled data and
# predict for the competition test set.
#
# The encoded feature matrices are identical for every target column, so they
# are built once up front instead of being recomputed on every iteration.
og_frame, encoded_train, encoder_model = encode_and_drop(train_arr, "train", None)
og_frame_test, encoded_test, encoder_model = encode_and_drop(test, "test", encoder_model)

model_dict_test = dict()
predict_dict_test = dict()
for model_no, target_col in enumerate(["m1", "m2", "m3", "m4", "m5", "m6"]):
    # Keys: models are stored as M_0..M_5, predictions as m1_pred..m6_pred.
    model_dict_test[f"M_{model_no}"], predict_dict_test[f"m{model_no+1}_pred"] = approach_one_model(
        x_train=encoded_train,
        y_train=target[[target_col]],
        x_test=encoded_test
    )

In [None]:
# One prediction column per target, labelled with the test frame's row index.
pred_frame_test = pd.DataFrame(predict_dict_test).set_axis(test.index, axis=0)

In [None]:
# Combine test features with their predictions, then re-attach the IDs by
# row index (left join keeps every test row).
features_with_predictions = pd.concat([test, pred_frame_test], axis="columns")
full_test_array_test = features_with_predictions.merge(
    test_id,
    how='left',
    left_index=True,
    right_index=True,
)

In [None]:
# Build the submission table using the project's SubmissionFile helper in
# 'test' mode, then renumber the rows 0..n-1.
test_builder = SubmissionFile(
    validation_data=full_test_array_test,
    type_of_data='test',
)
sub_file = test_builder.execute()
sub_file.reset_index(drop=True, inplace=True)

In [None]:
# Write the submission to disk.
# NOTE(review): the row index is written as an extra unnamed first column
# (to_csv default); pass index=False if the submission format does not
# expect it — confirm against the competition's sample submission.
sub_file.to_csv('../../submissions/submission_approach_1_default_tree_method.csv')

In [None]:
# Display the submission frame for a final visual check.
sub_file

In [None]:
# Spot-check: show the submission rows whose ID contains 'ID_6L67PAA'.
sub_file[sub_file["ID"].str.contains('ID_6L67PAA')]