In [1]:
import os
import sys
from pathlib import Path

# Make the repository root (parent of the notebook directory) importable so
# that the `src.*` packages resolve.
mod_path = os.fspath(Path.cwd().parent)
if mod_path not in sys.path:  # avoid stacking duplicates on cell re-run
    sys.path.append(mod_path)

In [47]:
from typing import Union, Optional
from copy import deepcopy
import numpy as np

from src.data.ingestion import DataIngestion
from src.features.engineering import FeatureEngineering
from src.features.encoding import FeatureEncoding
from src.model.tree_based import ModelXgBoost
from src.data.validation_file import SubmissionFile

# Data Ingestion

In [3]:
# Always run this cell: it yields both the train and the test frame.
train, test = DataIngestion(mode="dev").execute()
for _frame in (train, test):
    # Discard rows whose Region is missing, then remove the Town column.
    _frame.dropna(subset=['Region'], how='all', inplace=True)
    _frame.drop(columns=['Town'], inplace=True)
# test = DataIngestion(mode="test").execute()

  exec(code_obj, self.user_global_ns, self.user_ns)


# Feature Engineering

In [4]:
# FEATURE_ID_MAPPING = {
#     "Occupation": 0,
#     "MainApplicantGender": 1
# }
train = FeatureEngineering().execute(train)
test = FeatureEngineering().execute(test)

# Remove the same set of columns from both frames, in place.
for _frame in (train, test):
    _frame.drop(
        columns=[
            'TransactionDates',
            'PaymentsHistory',
            'RegistrationDate',
            'UpsellDate',
            'SupplierName',
            'PaymentMethod',
            'ExpectedTermDate',
            'FirstPaymentDate',
            'LastPaymentDate',
            'SplitTransactionDates',
        ],
        inplace=True,
    )

# X_Train and Y_Train Formulation

In [5]:
# Show the column labels of `train` after feature engineering and the column drops above.
train.columns

Index(['ID', 'm1', 'm2', 'm3', 'm4', 'm5', 'm6', 'Deposit', 'AccessoryRate',
       'rateTypeEntity', 'RatePerUnit', 'DaysOnDeposit', 'MainApplicantGender',
       'Age', 'Region', 'Occupation', 'Term', 'TotalContractValue',
       'SplitPaymentsHistory', 'nb_payments', 'amount_paid',
       'percent_amt_paid', 'mean_amt_paid', 'median_amt_paid', 'max_amt_paid',
       'min_amt_paid', 'stddev_amt_paid', 'nb_skipped_months', 'b1', 'b2',
       'b3', 'b4', 'b5'],
      dtype='object')

In [6]:
# Prepare data for Train
import copy
import pandas as pd

new_payments_df = train[["m1", "m2", "m3", "m4", "m5", "m6"]]

# Targets in column-major order: every row's m1, then every row's m2, ... m6.
# This is the same block order in which x_train is stacked below (df_0 .. df_5).
# The frame is built straight from the values on a fresh RangeIndex rather than
# through an index-aligned concat with new_payments_df: `train` keeps its
# original (non-reset) index, so aligning on it could reorder or pad the target
# rows relative to x_train. NaN payments are kept (not silently dropped), so
# the target length always equals 6 * len(train).
y_train = pd.DataFrame({"y": new_payments_df.to_numpy().ravel(order="F")})

# df_0 holds the features with no target month revealed; each following frame
# adds the previous month's payment as `new_payment` and lets
# FeatureEngineering.get_updated_df refresh the frame.
all_x_dfs = {"df_0": train.drop(["m1", "m2", "m3", "m4", "m5", "m6"], axis=1)}
for i in range(1, 6):
    temp_df = copy.deepcopy(all_x_dfs[f"df_{i - 1}"])
    m_df = new_payments_df[[f"m{i}"]]
    temp_df = pd.concat([temp_df, m_df], axis=1)
    temp_df.rename(columns={f"m{i}": "new_payment"}, inplace=True)
    temp_df = FeatureEngineering().get_updated_df(base_df=temp_df)
    all_x_dfs[f"df_{i}"] = temp_df

# Stack the six feature frames on a fresh RangeIndex, matching y_train's index.
x_train = pd.concat([all_x_dfs[f"df_{i}"] for i in range(6)], ignore_index=True)
x_train.drop(columns=['SplitPaymentsHistory'], inplace=True)
del new_payments_df, temp_df, m_df

In [7]:
# The ID column is removed from the feature frames; only the test IDs are kept
# (as a one-column frame) for later use.
# x_train_id = x_train[['ID']]
del x_train['ID']

# pop() removes the column from `test` in place and hands it back as a Series.
test_id = test.pop('ID').to_frame()

In [8]:
# (rows, columns) of the stacked training matrix after dropping ID and SplitPaymentsHistory.
x_train.shape

(87570, 25)

# Encoding

In [9]:
def filter_categorical_features(encoded_frame,
                                numerical_frame,
                                categorical_variable: Optional[Union[list, str]] = None,
                                categories_to_filter: Optional[list] = None,
                                reference_frame=None):
    """
    Keep selected one-hot columns of ``encoded_frame`` and append them to ``numerical_frame``.

    The columns to keep are those whose label appears in ``categories_to_filter``
    or, when that is empty/None, in the unique values of ``categorical_variable``
    looked up in ``reference_frame``.

    Parameters
    ----------
    encoded_frame : pd.DataFrame
        Encoded frame whose column labels are compared against the category values.
    numerical_frame : pd.DataFrame
        Frame the kept columns are appended to; its index is restored on the result.
    categorical_variable : list or str, optional
        Column name(s) of ``reference_frame`` whose unique values select the columns.
    categories_to_filter : list, optional
        Explicit column labels to keep; takes precedence when non-empty.
    reference_frame : pd.DataFrame, optional
        Frame providing the category values. Defaults to the notebook-level
        ``train`` frame, which is what this function always used before.

    Returns
    -------
    pd.DataFrame
        ``numerical_frame`` columns followed by the kept encoded columns, joined
        by row position and indexed like ``numerical_frame``.

    Raises
    ------
    ValueError
        If neither ``categories_to_filter`` nor ``categorical_variable`` is given.
    """
    if categories_to_filter:
        wanted = categories_to_filter
    else:
        if categorical_variable is None:
            raise ValueError(
                "Provide either `categories_to_filter` or `categorical_variable`."
            )
        if reference_frame is None:
            # Backward-compatible default: the training frame defined in the notebook.
            reference_frame = train
        if isinstance(categorical_variable, list):
            variables = categorical_variable
        else:
            variables = [categorical_variable]
        wanted = []
        for _variable in variables:
            wanted.extend(reference_frame[_variable].unique().tolist())

    filtered_frame = encoded_frame.loc[:, encoded_frame.columns.isin(wanted)]

    # Join by row position, then put numerical_frame's original index back.
    final_df = pd.concat([numerical_frame.reset_index(drop=True),
                          filtered_frame.reset_index(drop=True)], axis=1)
    final_df.index = numerical_frame.index

    return final_df


def encode_and_drop(full_array, data_type, tr_encoder=None):
    """
    Split ``full_array`` into numeric and non-numeric columns and one-hot encode the latter.

    Parameters
    ----------
    full_array : pd.DataFrame
        Frame holding both numeric and non-numeric columns.
    data_type : str
        Forwarded to the encoder as ``type_of_data`` (the notebook passes
        "train" or "test").
    tr_encoder : optional
        Encoder object exposing ``one_hot_encoding``; a new ``FeatureEncoding``
        is created when this is falsy.

    Returns
    -------
    tuple
        ``(numerical_array, encoded_array, final_array, encoder)`` where
        ``final_array`` is the numeric columns followed by the encoded columns,
        joined by row position and indexed like ``numerical_array``.
    """
    non_numeric_cols = full_array.select_dtypes(exclude=['number']).columns
    categorical_array = full_array[non_numeric_cols]
    numerical_array = full_array.drop(columns=non_numeric_cols)

    encoder = tr_encoder or FeatureEncoding()
    encoded_array = encoder.one_hot_encoding(
        categorical_frame=categorical_array,
        type_of_data=data_type,
        conv=True,
        drop=None,
        handle_unknown="ignore",
    )

    # Positional join, then restore the original row labels.
    final_array = pd.concat(
        [part.reset_index(drop=True) for part in (numerical_array, encoded_array)],
        axis=1,
    )
    final_array.index = numerical_array.index

    return numerical_array, encoded_array, final_array, encoder

# For x_train: no encoder is supplied, so encode_and_drop creates one and
# returns it as `encoding_obj` for reuse on the test data.
numerical_df, encoded_categories, encoded_train, encoding_obj = encode_and_drop(
    full_array=x_train,
    data_type="train",
    tr_encoder=None,
)

filtered_df = filter_categorical_features(
    encoded_frame=encoded_categories,
    numerical_frame=numerical_df,
    categorical_variable=['Occupation', 'Region'],
)

In [10]:
# Feature Scaling

# Model Training

In [11]:
# Approach two: fit the XGBoost wrapper on the filtered training features.
model_two_obj = ModelXgBoost(train_array=filtered_df, train_target=y_train)

# No arguments are passed, so the wrapper's default hyper-parameters apply
# (see the ModelXgBoost source).
model_two_obj.train_model()

# Keep a direct handle on the fitted estimator for inference.
model_two = model_two_obj.trained_model

# Model Inference

In [25]:
# Held-out targets: the six monthly payment columns.
y_test = test.loc[:, ["m1", "m2", "m3", "m4", "m5", "m6"]]

# Test features: everything except the targets and the raw payment history.
x_test = test.drop(
    ["m1", "m2", "m3", "m4", "m5", "m6", "SplitPaymentsHistory"],
    axis=1,
)

In [30]:
# For x_test: iterative (month-by-month) inference.
# The payment history is kept aside because x_test was built without it, while
# get_updated_df is fed a frame that contains it (see the concat in the loop).
split_payment_history = test[["SplitPaymentsHistory"]]

# Reuse the encoder returned for the training data (encoding_obj).
numerical_df_test, encoded_categories_test, encoded_test, encoding_obj_t = encode_and_drop(x_test, "test", encoding_obj)
# NOTE: without an explicit category list, filter_categorical_features reads the
# Occupation/Region values from the notebook-level `train` frame, not from test.
filtered_df_test = filter_categorical_features(encoded_categories_test, numerical_df_test, categorical_variable=['Occupation', 'Region'])
# Original test row labels; re-applied after every positional concat below.
_indexes = filtered_df_test.index
# y_pred = pd.DataFrame()
# Maps "<month>_pred" -> array of predictions for that month.
predict_dict = dict()
for predict_col in y_test.columns:
    # Predict the current month from the current feature frame.
    predict_dict[f"{predict_col}_pred"] = model_two.predict(filtered_df_test)
    y_pred_df = pd.DataFrame(predict_dict[f"{predict_col}_pred"], columns=[f"{predict_col}_pred"])
    # Append the prediction and the payment history by row position.
    filtered_df_test = pd.concat([filtered_df_test.reset_index(drop=True),
                                  y_pred_df.reset_index(drop=True), 
                                  split_payment_history.reset_index(drop=True)], axis=1)
    filtered_df_test.index = _indexes  # Making sure indexes are maintained
    # The prediction is passed to get_updated_df under the name "new_payment",
    # the same column name used when building x_train.
    filtered_df_test.rename(columns={f"{predict_col}_pred": "new_payment"}, inplace=True)
    # presumably folds new_payment into the payment-derived features — TODO confirm against FeatureEngineering
    filtered_df_test = FeatureEngineering().get_updated_df(base_df=filtered_df_test)
    # Carry the returned history into the next iteration and remove it from the
    # frame that is fed to the model.
    split_payment_history = filtered_df_test[["SplitPaymentsHistory"]]
    filtered_df_test.drop(columns=["SplitPaymentsHistory"], inplace=True)

del split_payment_history

In [31]:
# Display the per-month prediction arrays collected by the inference loop.
predict_dict

{'m1_pred': array([1007.7985 ,  793.37146,  102.31387, ...,  430.7622 , 1517.0515 ,
        1210.1794 ], dtype=float32),
 'm2_pred': array([ 615.17944,  754.7656 ,  164.8769 , ...,  513.09296, 1329.1776 ,
        1092.514  ], dtype=float32),
 'm3_pred': array([ 874.8007 ,  703.97577,  133.0233 , ...,  495.88226, 1131.4154 ,
         953.39465], dtype=float32),
 'm4_pred': array([ 729.5285  ,  966.5423  ,  124.398766, ...,  426.3277  ,
        1336.6155  ,  937.6392  ], dtype=float32),
 'm5_pred': array([2133.902  , 1153.6567 ,  107.14194, ...,  530.14575, 1441.7386 ,
        1405.635  ], dtype=float32),
 'm6_pred': array([11463.94   ,  1255.5376 ,   125.16381, ...,   438.10602,
          453.28284,   215.36354], dtype=float32)}

# Calculation of Metric

In [32]:
# Display the actual m1..m6 values taken from `test`, for comparison with the predictions.
y_test

Unnamed: 0,m1,m2,m3,m4,m5,m6
27689,650.0,100.0,200.0,1400.0,940.0,110.0
7826,280.0,200.0,560.0,600.0,800.0,400.0
22604,40.0,90.0,5.0,40.0,50.0,40.0
3333,1595.0,1595.0,1540.0,1650.0,1595.0,275.0
23500,50.0,40.0,110.0,50.0,1050.0,750.0
...,...,...,...,...,...,...
187,1205.0,804.0,1251.0,1045.0,1210.0,1154.0
21736,120.0,120.0,40.0,230.0,200.0,80.0
19332,450.0,40.0,40.0,540.0,40.0,40.0
3478,2070.0,1158.0,1830.0,1370.0,1650.0,760.0


In [33]:
# One column per "<month>_pred" array, labelled with the test row index so the
# frame lines up with y_test and filtered_df_test.
pred_frame = pd.DataFrame(predict_dict, index=_indexes)

In [34]:
# Place features, actual targets and predictions side by side (aligned on the row index).
full_test_array = pd.concat(
    objs=[filtered_df_test, y_test, pred_frame],
    axis="columns",
)

In [40]:
# Re-attach the ID column saved earlier, matching rows on the index.
full_test_array = full_test_array.merge(
    test_id,
    how='left',
    left_index=True,
    right_index=True,
)

In [44]:
# Build the submission-style frame from the combined validation data.
sub_file = SubmissionFile(validation_data=full_test_array, type_of_data='validation').execute()

In [45]:
# (rows, columns) of the frame returned by SubmissionFile.
sub_file.shape

(71796, 3)

In [48]:
# Per-row squared difference between target and prediction.
sub_file['SquaredError'] = np.square(sub_file['Target'].sub(sub_file['Prediction']))

In [49]:
# Root of (sum of squared errors / number of rows in sub_file).
rmse = np.sqrt(sub_file['SquaredError'].sum() / len(sub_file))
print('Final RMSE --> ', rmse)

Final RMSE -->  1564.208212407409


In [None]:
## Inference: model, X  -> yPred

In [None]:
# Submission File, Calculation of Metric (RMSE)
# Input: y_test(n X 6), y_pred(n X 6)
# Output: RMSE



In [None]:
# Outlier detection (if any)

# TASKS
# Tasks
    - Feature selection  Aum
        - Automated selection based on Gini index

    - Pipeline setup  Nikhil
        - Pre-process
            - ingestion (DONE)
            - Features (DONE)
            - Encoding (Pending)
            - scaling (if applicable)
        - Modelling
            - Approach2
            - Sliding updates for features for Train and test

        - validation
            - Calculate RMSE

In [None]:
# Drop Useless Columns
def drop_cols(df, errors="ignore"):
    """
    Remove a fixed set of columns from ``df`` in place.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to modify; it is mutated and nothing is returned.
    errors : {"ignore", "raise"}, default "ignore"
        Forwarded to ``DataFrame.drop``. With "ignore", columns that are not
        present are skipped, so the call is safe to repeat and safe on frames
        where some of these columns were already removed by earlier cells
        (e.g. 'Town', 'ID'). Pass "raise" to get a KeyError on a missing column.

    Returns
    -------
    None
    """
    df.drop(
        columns=[
            "ID",
            "UpsellDate",
            "PaymentMethod",
            "TransactionDates",
            "PaymentsHistory",
            "SupplierName",
            "Town",
            "RegistrationDateParsed",
            "ExpectedTermDateParsed",
            "FirstPaymentDateParsed",
            "LastPaymentDateParsed",
        ],
        inplace=True,
        errors=errors,
    )


# Apply the column drop to both frames (each is modified in place).
drop_cols(train)
drop_cols(test)
# Only the last expression of a cell is rendered, so train.head() is evaluated
# but not displayed; test.head() is the cell's output.
train.head()
test.head()
