In [1]:
import os
import sys
from pathlib import Path

# Put the directory two levels above the current working directory on the
# import path (presumably so the `src.*` imports used by this notebook resolve).
mod_path = str(Path.cwd().parent.parent)
if sys.path.count(mod_path) == 0:
    # Only append once, so re-running the cell does not grow sys.path.
    sys.path.append(mod_path)

In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, StratifiedKFold

from src.data.pre_process import one_hot_encoding
from src.features.utils import *
from src.model.tree_based import ModelXgBoost

In [3]:
# Processed training data (transaction-related features).
# Author's note: expected shape is (28007, 33) -- NOTE(review): not verified in
# this cell; confirm against `train.shape`.
train = pd.read_csv(filepath_or_buffer='../../data/processed/train.csv')

In [4]:
# Render the loaded frame as this cell's output.
train

Unnamed: 0.1,Unnamed: 0,ID,Deposit,UpsellDate,AccessoryRate,PaymentMethod,rateTypeEntity,RatePerUnit,DaysOnDeposit,MainApplicantGender,...,m6,SplitPaymentsHistory,nb_payments,amount_paid,percent_amt_paid,mean_amt_paid,median_amt_paid,max_amt_paid,min_amt_paid,stddev_amt_paid
0,0,ID_MR53LEX,2500,,0.0,FINANCED,DAILY,55,3,Male,...,385.0,"[3600.0, 750.0, 350.0, 65.0, 95.0, 135.0, 85.0...",31,16035.0,0.484734,517.26,350.0,3600.0,55.0,652.49
1,1,ID_3D7NQUH,2500,,0.0,FINANCED,DAILY,55,3,Male,...,935.0,"[2940.0, 970.0, 380.0, 880.0, 385.0, 440.0, 11...",30,22136.0,0.669166,737.87,655.0,2940.0,380.0,452.03
2,2,ID_0IWQNPI,2400,,0.0,FINANCED,DAILY,50,3,Male,...,1200.0,"[2850.0, 1500.0, 1350.0, 610.0, 200.0, 250.0]",6,6760.0,0.469444,1126.67,980.0,2850.0,200.0,1005.32
3,3,ID_IY8SYB9,2000,,0.0,FINANCED,DAILY,40,7,Female,...,530.0,"[2200.0, 1420.0, 1180.0, 900.0, 1400.0, 780.0,...",10,11260.0,0.679952,1126.00,1140.0,2200.0,380.0,511.30
4,4,ID_9XHL7VZ,2000,,0.0,FINANCED,DAILY,40,7,Male,...,330.0,"[2640.0, 910.0, 480.0, 280.0, 200.0, 180.0, 33...",22,7779.0,0.469746,353.59,190.0,2640.0,40.0,559.24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28002,28002,ID_CDZ24L9,2500,,0.0,FINANCED,DAILY,55,3,Male,...,720.0,"[5235.0, 1270.0, 940.0, 1370.0, 2020.0, 1750.0...",14,24545.0,0.741989,1753.21,1635.0,5235.0,800.0,1069.37
28003,28003,ID_0XINELS,2400,,0.0,FINANCED,DAILY,50,3,Female,...,620.0,"[2980.0, 1000.0, 450.0, 1050.0, 850.0, 1250.0,...",8,9330.0,0.647917,1166.25,1025.0,2980.0,450.0,786.66
28004,28004,ID_PAU9JJU,2000,,0.0,FINANCED,DAILY,40,3,Male,...,401.0,"[2580.0, 1020.0, 540.0, 360.0, 200.0, 660.0, 1...",9,8570.0,0.669531,952.22,1020.0,2580.0,200.0,695.70
28005,28005,ID_K866QHS,2000,,0.0,FINANCED,DAILY,40,7,Female,...,800.0,"[2000.0, 1160.0, 1240.0, 1120.0, 1000.0, 1200....",8,9930.0,0.599638,1241.25,1180.0,2000.0,970.0,323.04


In [5]:
# Remove the 'Unnamed: 0' column (presumably an index column persisted in the CSV).
# Reassigning with errors='ignore' instead of inplace=True keeps this cell
# idempotent: re-running it no longer raises KeyError once the column is gone.
train = train.drop(columns='Unnamed: 0', errors='ignore')
train.columns

Index(['ID', 'Deposit', 'UpsellDate', 'AccessoryRate', 'PaymentMethod',
       'rateTypeEntity', 'RatePerUnit', 'DaysOnDeposit', 'MainApplicantGender',
       'Age', 'Region', 'Town', 'Occupation', 'SupplierName', 'Term',
       'TotalContractValue', 'ExpectedTermDate', 'FirstPaymentDate',
       'LastPaymentDate', 'TransactionDates', 'PaymentsHistory', 'm1', 'm2',
       'm3', 'm4', 'm5', 'm6', 'SplitPaymentsHistory', 'nb_payments',
       'amount_paid', 'percent_amt_paid', 'mean_amt_paid', 'median_amt_paid',
       'max_amt_paid', 'min_amt_paid', 'stddev_amt_paid'],
      dtype='object')

In [6]:
# Keep each ID alongside its SplitPaymentsHistory value for later use.
split_payment_history_df = train.loc[:, ["ID", "SplitPaymentsHistory"]]

# The six monthly columns m1..m6 are separated out as the targets.
target = train.loc[:, ['m1', 'm2', 'm3', 'm4', 'm5', 'm6']]

# Feature frame: everything except the target columns and the raw history column.
train_arr = train.drop(columns=[*target.columns, 'SplitPaymentsHistory'])

### Model training with initial hyperparameters

In [None]:
def feature_construction(df, predicted_col):
    """Fold a predicted payment into each history and refresh the payment statistics.

    For every row, the value in ``predicted_col`` is combined with that row's
    ``SplitPaymentsHistory`` through ``add_predicted_payment``. The aggregate
    columns ``nb_payments``, ``amount_paid``, ``percent_amt_paid``,
    ``mean_amt_paid``, ``median_amt_paid``, ``max_amt_paid``, ``min_amt_paid``
    and ``stddev_amt_paid`` are then recomputed from the updated history.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``SplitPaymentsHistory``, ``TotalContractValue`` and
        ``predicted_col``. It is modified in place.
    predicted_col : str
        Name of the column holding the predicted payment to add.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the columns above overwritten.

    Raises
    ------
    Exception
        With message "Column Missing" when a ``KeyError`` occurs while
        computing the features; the original ``KeyError`` is chained as the
        cause so the offending key is still visible in the traceback.
    """
    try:
        # Needs two columns per row, so this one stays a row-wise apply.
        df["SplitPaymentsHistory"] = df.apply(
            lambda row: add_predicted_payment(row["SplitPaymentsHistory"], row[predicted_col]),
            axis=1,
        )

        # All remaining statistics depend on the history column only, so apply
        # the helpers to that Series directly instead of building a row object
        # for every record.
        history = df["SplitPaymentsHistory"]
        df["nb_payments"] = history.apply(length_calc)
        df["amount_paid"] = history.apply(sum_calc)
        # Use the frame that was passed in; the previous code read an undefined
        # global (`train_merged`) here and failed with NameError.
        df["percent_amt_paid"] = df["amount_paid"] / df["TotalContractValue"]
        df["mean_amt_paid"] = history.apply(mean_calc)
        df["median_amt_paid"] = history.apply(median_calc)
        df["max_amt_paid"] = history.apply(max_calc)
        df["min_amt_paid"] = history.apply(min_calc)
        df["stddev_amt_paid"] = history.apply(std_dev_calc)
    except KeyError as e:
        raise Exception("Column Missing") from e

    return df

In [None]:
def approach_one_model(x_train, y_train, x_test):
    """Fit a ``ModelXgBoost`` on the training data and predict for ``x_test``.

    Parameters
    ----------
    x_train
        Training features, passed to ``ModelXgBoost`` as ``train_array``.
    y_train
        Training target, passed to ``ModelXgBoost`` as ``train_target``.
    x_test
        Features to predict on.

    Returns
    -------
    Whatever ``trained_model.predict(x_test)`` returns.
    """
    xgb_wrapper = ModelXgBoost(train_array=x_train, train_target=y_train)
    # train_model() is called with no arguments; per the original author's note
    # this means the default hyperparameters defined in ModelXgBoost.
    xgb_wrapper.train_model()
    return xgb_wrapper.trained_model.predict(x_test)

In [None]:
# Plain K-fold splitter. The targets are six numeric columns (m1..m6);
# StratifiedKFold.split only accepts binary/multiclass labels and raises
# ValueError for such a multi-output target, so stratification cannot be used here.
kfold = KFold(n_splits=2, shuffle=True, random_state=0)

In [None]:
# Container for fold/target predictions; defined up front so the final
# expression of the cell does not fail with NameError.
predictions = {}

for train_idx, test_idx in kfold.split(train_arr, target):
    # split() yields positional row indices, so rows must be selected with
    # .iloc (plain `frame[idx]` would be interpreted as column labels).
    # Features come from `train_arr`, which has the target columns removed.
    X_train, X_test = train_arr.iloc[train_idx], train_arr.iloc[test_idx]
    y_train, y_test = target.iloc[train_idx], target.iloc[test_idx]

    # Step 1 is to encode the cat cols
    # Step 2 to join the encoded cols to original dataframe
    # OR remove the encoding step outside the CV -- then will need to save the column sequence

    # NOTE(review): numbering starts at 1 so that `model_no == 1` refers to the
    # first target (m1); confirm this is the intended first-model branch.
    for model_no, target_col in enumerate(["m1", "m2", "m3", "m4", "m5", "m6"], start=1):
        if model_no == 1:
            # Use the fold's training rows only, so held-out rows are not seen.
            target_array = y_train[[target_col]]
            train_array = X_train

        # TODO: train the model for `target_col` and store its output in
        # `predictions` (this step is not implemented yet).

predictions