In [2]:
import os
import sys
from pathlib import Path

mod_path = os.path.join(Path.cwd().parent.parent)
if mod_path not in sys.path:
    sys.path.append(mod_path)

In [3]:
import pandas as pd
import numpy as np
from copy import deepcopy

from sklearn.model_selection import StratifiedKFold

from src.data import *
from src.features.utils import *
from src.model.tree_based import ModelXgBoost

In [4]:
# data with shape 28007, 33 [transaction related features]
train = pd.read_csv('../../data/processed/train.csv')

In [5]:
train

Unnamed: 0.1,Unnamed: 0,ID,Deposit,AccessoryRate,rateTypeEntity,RatePerUnit,DaysOnDeposit,MainApplicantGender,Age,Region,...,mean_amt_paid,median_amt_paid,max_amt_paid,min_amt_paid,stddev_amt_paid,b1,b2,b3,b4,b5
0,0,ID_MR53LEX,2500,0.0,DAILY,55,3,Male,,Coast Region,...,517.26,350.0,3600.0,55.0,652.49,770.0,280.0,1320.0,1200.0,660.0
1,1,ID_3D7NQUH,2500,0.0,DAILY,55,3,Male,26.0,South Rift,...,737.87,655.0,2940.0,380.0,452.03,770.0,655.0,660.0,660.0,605.0
2,2,ID_0IWQNPI,2400,0.0,DAILY,50,3,Male,21.0,Mount Kenya Region,...,1126.67,980.0,2850.0,200.0,1005.32,250.0,200.0,610.0,1350.0,1500.0
3,3,ID_IY8SYB9,2000,0.0,DAILY,40,7,Female,26.0,Mount Kenya Region,...,1126.00,1140.0,2200.0,380.0,511.30,380.0,600.0,1100.0,1300.0,780.0
4,4,ID_9XHL7VZ,2000,0.0,DAILY,40,7,Male,27.0,North Rift,...,353.59,190.0,2640.0,40.0,559.24,520.0,80.0,40.0,40.0,80.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28002,28002,ID_CDZ24L9,2500,0.0,DAILY,55,3,Male,,Coast Region,...,1753.21,1635.0,5235.0,800.0,1069.37,1750.0,1600.0,1500.0,800.0,1670.0
28003,28003,ID_0XINELS,2400,0.0,DAILY,50,3,Female,,Nairobi Region,...,1166.25,1025.0,2980.0,450.0,786.66,550.0,1200.0,1250.0,850.0,1050.0
28004,28004,ID_PAU9JJU,2000,0.0,DAILY,40,3,Male,29.0,South Rift,...,952.22,1020.0,2580.0,200.0,695.70,1020.0,1110.0,1080.0,660.0,200.0
28005,28005,ID_K866QHS,2000,0.0,DAILY,40,7,Female,57.0,Mount Kenya Region,...,1241.25,1180.0,2000.0,970.0,323.04,970.0,1240.0,1200.0,1000.0,1120.0


In [6]:
train.drop(columns = 'Unnamed: 0', inplace=True)
train.columns

Index(['ID', 'Deposit', 'AccessoryRate', 'rateTypeEntity', 'RatePerUnit',
       'DaysOnDeposit', 'MainApplicantGender', 'Age', 'Region', 'Occupation',
       'Term', 'TotalContractValue', 'ExpectedTermDate', 'FirstPaymentDate',
       'LastPaymentDate', 'm1', 'm2', 'm3', 'm4', 'm5', 'm6',
       'SplitPaymentsHistory', 'nb_payments', 'amount_paid',
       'percent_amt_paid', 'mean_amt_paid', 'median_amt_paid', 'max_amt_paid',
       'min_amt_paid', 'stddev_amt_paid', 'b1', 'b2', 'b3', 'b4', 'b5'],
      dtype='object')

In [7]:
train.shape

(28007, 35)

In [8]:
# Region has certain NaN values which might cause issues while encoding
# As total NaNs constitute ~5% of the data (1446) we remove it as of now
print(train['Region'].isna().sum() / train.shape[0] * 100)
train.dropna(subset=['Region'], how='all', inplace=True)

5.162994965544328


In [9]:
train['SplitPaymentsHistory'][0]

'[3600.0, 750.0, 350.0, 65.0, 95.0, 135.0, 85.0, 55.0, 345.0, 155.0, 55.0, 630.0, 585.0, 185.0, 440.0, 483.0, 660.0, 515.0, 292.0, 505.0, 210.0, 120.0, 260.0, 610.0, 230.0, 390.0, 660.0, 1200.0, 1320.0, 280.0, 770.0]'

In [10]:
train

Unnamed: 0,ID,Deposit,AccessoryRate,rateTypeEntity,RatePerUnit,DaysOnDeposit,MainApplicantGender,Age,Region,Occupation,...,mean_amt_paid,median_amt_paid,max_amt_paid,min_amt_paid,stddev_amt_paid,b1,b2,b3,b4,b5
0,ID_MR53LEX,2500,0.0,DAILY,55,3,Male,,Coast Region,Teacher,...,517.26,350.0,3600.0,55.0,652.49,770.0,280.0,1320.0,1200.0,660.0
1,ID_3D7NQUH,2500,0.0,DAILY,55,3,Male,26.0,South Rift,Farmer,...,737.87,655.0,2940.0,380.0,452.03,770.0,655.0,660.0,660.0,605.0
2,ID_0IWQNPI,2400,0.0,DAILY,50,3,Male,21.0,Mount Kenya Region,Business,...,1126.67,980.0,2850.0,200.0,1005.32,250.0,200.0,610.0,1350.0,1500.0
3,ID_IY8SYB9,2000,0.0,DAILY,40,7,Female,26.0,Mount Kenya Region,Farmer,...,1126.00,1140.0,2200.0,380.0,511.30,380.0,600.0,1100.0,1300.0,780.0
4,ID_9XHL7VZ,2000,0.0,DAILY,40,7,Male,27.0,North Rift,Farmer,...,353.59,190.0,2640.0,40.0,559.24,520.0,80.0,40.0,40.0,80.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28002,ID_CDZ24L9,2500,0.0,DAILY,55,3,Male,,Coast Region,Labourer,...,1753.21,1635.0,5235.0,800.0,1069.37,1750.0,1600.0,1500.0,800.0,1670.0
28003,ID_0XINELS,2400,0.0,DAILY,50,3,Female,,Nairobi Region,Business,...,1166.25,1025.0,2980.0,450.0,786.66,550.0,1200.0,1250.0,850.0,1050.0
28004,ID_PAU9JJU,2000,0.0,DAILY,40,3,Male,29.0,South Rift,Business,...,952.22,1020.0,2580.0,200.0,695.70,1020.0,1110.0,1080.0,660.0,200.0
28005,ID_K866QHS,2000,0.0,DAILY,40,7,Female,57.0,Mount Kenya Region,Business,...,1241.25,1180.0,2000.0,970.0,323.04,970.0,1240.0,1200.0,1000.0,1120.0


## Approach 2

In [13]:
s = pd.DataFrame(np.arange(0, len(train)), columns=['m1'])
df = train[['b1', 'b2', 'b3', 'b4', 'b5']]
df.drop(columns=['b5'], inplace=True)
df.rename(columns={'b1': 'b2', 'b2':'b3', 'b3': 'b4', 'b4': 'b5'}, inplace=True)
df.insert(loc=0, column='b1', value=s.values)
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Unnamed: 0,b1,b2,b3,b4,b5
0,0,770.0,280.0,1320.0,1200.0
1,1,770.0,655.0,660.0,660.0
2,2,250.0,200.0,610.0,1350.0
3,3,380.0,600.0,1100.0,1300.0
4,4,520.0,80.0,40.0,40.0
...,...,...,...,...,...
28002,26556,1750.0,1600.0,1500.0,800.0
28003,26557,550.0,1200.0,1250.0,850.0
28004,26558,1020.0,1110.0,1080.0,660.0
28005,26559,970.0,1240.0,1200.0,1000.0


In [None]:
def slide_variable_window(
    predictor_array: pd.DataFrame,
    var_to_add: pd.DataFrame
) -> pd.DataFrame:
    predictor_array.drop(columns=['b5'], inplace=True)  # We drop the first payment
    predictor_array.rename(columns={'b1': 'b2', 'b2':'b3', 'b3': 'b4', 'b4': 'b5'}, inplace=True)
    predictor_array.insert(loc=0, column='b1', value=var_to_add.values)  # And add the new variable (mn)
    
    return predictor_array

In [11]:
# split_payment_history_df = train[["ID", "SplitPaymentsHistory"]]
# id_arr = train[["ID"]]

target = train[['m1', 'm2', 'm3', 'm4', 'm5', 'm6']]
train_arr = train.drop(columns=['m1', 'm2', 'm3', 'm4', 'm5', 'm6', 
                                'SplitPaymentsHistory',
                                'ExpectedTermDate', 
                                'FirstPaymentDate',
                                'LastPaymentDate'])

In [12]:
# split data into train and test sets
from sklearn.model_selection import train_test_split

seed = 10
X_train, X_test, y_train, y_test = train_test_split(train_arr, target, test_size=0.45, random_state=seed)

In [None]:
X_train

In [None]:
t = pd.DataFrame([[1]], columns=['a'])
q = pd.DataFrame([[1]], columns=['b'])
r = pd.DataFrame([['aum']], columns=['name'])

k = pd.concat([t, q])
kk = pd.merge(k, r, how='left', left_index=True, right_index=True)
kk

In [13]:
def create_data_with_sliding_approach(data_without_target: pd.DataFrame,
                                      target_data: pd.DataFrame):
    frame = pd.DataFrame(None)
    new_df = deepcopy(data_without_target)
    target_df = pd.DataFrame(None)
    target_features = target_data.columns.tolist()
    for itr, col in enumerate(target_features):
        if itr == 0:
            target_df = pd.concat([target_df, target_data[[col]]])
            frame = pd.concat([frame, data_without_target])
        else:
            filter_df = new_df[['b1', 'b2', 'b3', 'b4', 'b5']]  # Intermediate df
            new_df.drop(columns=['b1', 'b2', 'b3', 'b4', 'b5'], inplace=True)
            concatinating_df = slide_variable_window(predictor_array=filter_df, 
                                                     var_to_add=target_data[[target_features[itr-1]]])
            new_df = pd.concat([new_df, concatinating_df], axis=1)  # We add the newly created rows
            target_df = pd.concat([target_df, target_data[[col]]])
            frame = pd.concat([frame, new_df])
#             print(new_df.shape)

    target_df = pd.DataFrame(target_df.sum(axis=1).astype(int), columns=['target'])
#     print(frame.shape)  # Should be 6 * original data's no. of rows
    
    frame.reset_index(drop=True, inplace=True)
    target_df.reset_index(drop=True, inplace=True)
    
    return frame, target_df

In [None]:
frame

### Model train on initial hp :: Approach 2

In [None]:
def approach_two_model(x_train, y_train, x_test):
    model = ModelXgBoost(train_array=x_train, train_target=y_train)
    model.train_model()  # Default h.params (Checkout the code)
    predict = model.trained_model.predict(x_test)
    
    return model, predict

In [16]:
def encode_and_drop(full_array, data_type, tr_encoder=None):
#     print(tr_encoder)
    categorical_array = full_array[full_array.select_dtypes(exclude=['number']).columns]
    numerical_array = full_array.drop(columns=full_array.select_dtypes(exclude=['number']).columns)
#     print(numerical_array.shape)
    encoded_array, encoder = one_hot_encoding(
        categorical_frame=categorical_array, 
        type_of_data=data_type,
        fitted_encoder=tr_encoder,
        conv=True
    )
#     print(encoded_array.shape)
    final_array = pd.concat([numerical_array.reset_index(drop=True), 
                             encoded_array.reset_index(drop=True)], axis=1)
    final_array.index = numerical_array.index
#     print(final_array.shape)
    return numerical_array, encoded_array, encoder

In [17]:
# Preparing the TRAIN data for approach two and fitting the model
train_data, target_frame = create_data_with_sliding_approach(data_without_target=X_train, 
                                                             target_data=y_train)
id_array = train_data[["ID"]]
train_data.drop(columns=["ID"], inplace=True)
og_frame, encoded_train, encoder_model = encode_and_drop(train_data, "train", None)

model_two_obj = ModelXgBoost(train_array=encoded_train, 
                             train_target=target_frame)
model_two_obj.train_model()  # Default h.params (Checkout the code)
model_two = model_two_obj.trained_model

{'objective': 'reg:squarederror', 'base_score': None, 'booster': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'gamma': None, 'gpu_id': None, 'importance_type': 'gain', 'interaction_constraints': None, 'learning_rate': None, 'max_delta_step': None, 'max_depth': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'n_estimators': 100, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': None, 'reg_alpha': 50, 'reg_lambda': 0, 'scale_pos_weight': None, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': 0, 'use_label_encoder': False}


### Prediction using Model :: Approach 2

In [20]:
X_test.columns

Index(['ID', 'Deposit', 'AccessoryRate', 'rateTypeEntity', 'RatePerUnit',
       'DaysOnDeposit', 'MainApplicantGender', 'Age', 'Region', 'Occupation',
       'Term', 'TotalContractValue', 'nb_payments', 'amount_paid',
       'percent_amt_paid', 'mean_amt_paid', 'median_amt_paid', 'max_amt_paid',
       'min_amt_paid', 'stddev_amt_paid', 'b1', 'b2', 'b3', 'b4', 'b5'],
      dtype='object')

In [21]:
encoded_train

Unnamed: 0,DAILY,MONTHLY,WEEKLY,Female,Male,Coast Region,Mount Kenya Region,Nairobi Region,North Rift,Nyanza,South Rift,Western,Business,Driver/Motorbike Rider,Farmer,Government Employee,Labourer,Other,Teacher
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87643,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
87644,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
87645,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
87646,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [18]:
test_id_array = X_test[["ID"]]
X_test.drop(columns=["ID"], inplace=True)

# Encoding and re-attaching using train encoding model
og_frame_test, encoded_test, encoder_model = encode_and_drop(X_test, "test", encoder_model)

predict_dict = dict()

for col_no, predict_col in enumerate(y_test.columns):
    predict_dict[f"m{col_no+1}_pred"] = model_two.predict(encoded_test)

Unnamed: 0,m1,m2,m3,m4,m5,m6
19204,300.0,300.0,50.0,140.0,100.0,260.0
15721,1265.0,1925.0,1650.0,1100.0,1485.0,2600.0
7879,210.0,150.0,55.0,110.0,150.0,55.0
25250,10.0,250.0,1345.0,600.0,200.0,100.0
19371,110.0,55.0,55.0,55.0,55.0,165.0
...,...,...,...,...,...,...
22564,760.0,985.0,1115.0,1185.0,905.0,1160.0
17551,600.0,880.0,640.0,960.0,1160.0,960.0
14648,110.0,110.0,55.0,55.0,55.0,55.0
3550,1925.0,1485.0,1515.0,1925.0,1540.0,3289.0


### Calculation of RMSE

In [None]:
pred_frame = pd.DataFrame(predict_dict)
pred_frame.index = X_test.index

In [None]:
full_test_array = pd.concat([X_test, y_test, pred_frame], axis=1)

In [None]:
full_test_array = pd.merge(full_test_array, id_arr, how='left', left_index=True, right_index=True)

In [None]:
sub_file = SubmissionFile(
    validation_data=full_test_array,
    type_of_data='validation'
).execute()

In [None]:
sub_file.shape

In [None]:
sub_file['SquaredError'] = np.square(sub_file['Target'] - sub_file['Prediction'])

In [None]:
rmse = np.sqrt(np.sum(sub_file['SquaredError'])/sub_file.shape[0])
print('Final RMSE --> ', rmse)