In [1]:
import os
import sys
from pathlib import Path

# Expose the parent of the current working directory on sys.path so that the
# `src.*` imports in the following cell can be resolved.
mod_path = os.fspath(Path.cwd().parent)
if sys.path.count(mod_path) == 0:
    sys.path.append(mod_path)

In [2]:
from typing import Union, Optional
from copy import deepcopy
import numpy as np

from src.data.ingestion import DataIngestion
from src.features.engineering import FeatureEngineering
from src.features.encoding import FeatureEncoding
from src.model.tree_based import ModelXgBoost
from src.data.validation_file import SubmissionFile

# Data Ingestion

In [3]:
# Should always be run: loads the train and test frames in "dev" mode.
train, test = DataIngestion(mode="dev").execute()
# Same in-place clean-up for both frames: discard rows whose Region is missing,
# then remove the Town column.
for _frame in (train, test):
    _frame.dropna(subset=['Region'], how='all', inplace=True)
    _frame.drop(columns=['Town'], inplace=True)
# test = DataIngestion(mode="test").execute()

  exec(code_obj, self.user_global_ns, self.user_ns)


# Feature Engineering

In [4]:
# FEATURE_ID_MAPPING = {
#     "Occupation": 0,
#     "MainApplicantGender": 1
# }
train = FeatureEngineering().execute(train)
test = FeatureEngineering().execute(test)

# Source columns removed from both frames once feature engineering has run.
# ('SplitTransactionDates' is currently NOT dropped.)
_raw_columns = [
    'TransactionDates',
    'PaymentsHistory',
    'RegistrationDate',
    'UpsellDate',
    'SupplierName',
    'PaymentMethod',
    'ExpectedTermDate',
    'FirstPaymentDate',
    'LastPaymentDate',
]
for _frame in (train, test):
    _frame.drop(columns=_raw_columns, inplace=True)

# X_Train and Y_Train Formulation

In [6]:
# Prepare data for Train
import pandas as pd

# m1..m6 are the six payment columns used as prediction targets.
new_payments_df = train[["m1", "m2", "m3", "m4", "m5", "m6"]]
# Build one long target column: transposing before stack() orders the values
# month-major (all rows' m1, then all rows' m2, ...), and reset_index() renumbers
# them from 0. The axis=1 concat aligns on the union of both indexes and only
# 'y' is kept afterwards.
# NOTE(review): stack() drops NaN entries by default in the legacy implementation,
# so this assumes m1..m6 contain no missing values — otherwise y_train and
# x_train would end up with different lengths. TODO confirm.
y_train = pd.concat([new_payments_df, new_payments_df.T.stack().reset_index(name='y')['y']], axis=1)[["y"]]

# df_0 is the feature frame with the six target columns removed.
all_x_dfs = {"df_0": train.drop(["m1", "m2", "m3", "m4", "m5", "m6"], axis=1)}
# df_i is df_{i-1} with the actual payment m{i} attached as 'new_payment' and
# then passed through FeatureEngineering.get_updated_df.
# NOTE(review): get_updated_df presumably rolls the payment-history features
# forward by one month — confirm in src.features.engineering.
for i in range(1, 6):
    temp_df = deepcopy(all_x_dfs[f"df_{i - 1}"])  # work on a copy of the previous step's frame
    m_df = new_payments_df[[f"m{i}"]]
    temp_df = pd.concat([temp_df, m_df], axis=1)
    temp_df.rename(columns={f"m{i}": "new_payment"}, inplace=True)
    temp_df = FeatureEngineering().get_updated_df(base_df=temp_df)
    all_x_dfs[f"df_{i}"] = temp_df

# Stack df_0..df_5 in the same month order as y_train (df_0 lines up with m1, ..., df_5 with m6).
x_train = pd.concat([all_x_dfs[f"df_{i}"] for i in range(6)], ignore_index=True)
x_train.drop(columns=['SplitPaymentsHistory'], inplace=True)
del new_payments_df, temp_df, m_df  # release the intermediates

In [7]:
# ID variable is saved separately
# x_train_id = x_train[['ID']]
# ID is removed from the training matrix (the training IDs are not kept).
x_train.drop(columns=['ID'], inplace=True)

# Capture the test IDs before the column is dropped; they are merged back onto
# the predictions by index in the metric section of this notebook.
test_id = test[['ID']]
test.drop(columns=['ID'], inplace=True)

In [8]:
x_train.shape

(87570, 16)

# Encoding

In [9]:
def filter_categorical_features(overall_df,
                                encoded_frame,
                                categorical_variable: Optional[Union[list, str]] = None,
                                categories_to_filter: Optional[list] = None):
    """
    Select, from an encoded frame, the columns that belong to the requested categories.

    Parameters
    ----------
    overall_df : pandas.DataFrame
        Frame holding the original (un-encoded) categorical columns. It is only
        read when ``categories_to_filter`` is empty or not given.
    encoded_frame : pandas.DataFrame
        Encoded frame whose column labels are category values.
    categorical_variable : list or str, optional
        Column name, or list of column names, in ``overall_df`` whose unique
        values identify the encoded columns to keep.
    categories_to_filter : list, optional
        Explicit category labels to keep. When non-empty it takes precedence
        over ``categorical_variable``.

    Returns
    -------
    pandas.DataFrame
        The columns of ``encoded_frame`` whose label is one of the requested
        categories, in their original column order.

    Raises
    ------
    ValueError
        If neither ``categories_to_filter`` nor ``categorical_variable`` is given.
    """
    if categories_to_filter:
        categories = categories_to_filter
    else:
        if categorical_variable is None:
            # Previously this surfaced as an opaque ``KeyError: None`` from the
            # column lookup on ``overall_df``.
            raise ValueError(
                "Provide either 'categories_to_filter' or 'categorical_variable'."
            )
        if isinstance(categorical_variable, list):
            variables = categorical_variable
        else:
            variables = [categorical_variable]
        categories = []
        for _variable in variables:
            categories.extend(overall_df[_variable].unique().tolist())

    keep_mask = encoded_frame.columns.isin(categories)
    filtered_frame = encoded_frame[encoded_frame.columns[keep_mask]]

    return filtered_frame


def encode_and_drop(full_array, data_type, tr_encoder=None):
    """
    Split a frame into numeric and non-numeric columns and one-hot encode the latter.

    Parameters
    ----------
    full_array : pandas.DataFrame
        Input frame containing both numeric and non-numeric columns.
    data_type : str
        Forwarded to the encoder as ``type_of_data`` (the notebook passes
        "train" or "test").
    tr_encoder : object, optional
        An existing encoder exposing ``one_hot_encoding``. When ``None`` a new
        ``FeatureEncoding`` instance is created.

    Returns
    -------
    tuple
        ``(numerical_array, encoded_array, final_array, encoder)`` where
        ``final_array`` is the numeric columns followed by the encoded columns,
        and both ``encoded_array`` and ``final_array`` carry the index of
        ``full_array``. ``encoder`` is the encoder that was used.
    """
    categorical_columns = full_array.select_dtypes(exclude=['number']).columns
    categorical_array = full_array[categorical_columns]
    numerical_array = full_array.drop(columns=categorical_columns)

    # Compare against None explicitly: a truthiness test would discard a valid
    # encoder object that happens to evaluate as falsy (e.g. one defining __len__).
    encoder = FeatureEncoding() if tr_encoder is None else tr_encoder

    # NOTE(review): one_hot_encoding is assumed to return a DataFrame with one row
    # per input row, in the same order — confirm in src.features.encoding.
    encoded_array = encoder.one_hot_encoding(
        categorical_frame=categorical_array,
        type_of_data=data_type,
        conv=True,
        drop=None,
        handle_unknown="ignore"
    )

    # Concatenate by position, then restore the caller's original index on both outputs.
    final_array = pd.concat([numerical_array.reset_index(drop=True),
                             encoded_array.reset_index(drop=True)], axis=1)
    final_array.index = numerical_array.index
    encoded_array.index = numerical_array.index

    return numerical_array, encoded_array, final_array, encoder


In [10]:
# For x_train
numerical_df, encoded_categories, encoded_train, encoding_obj = encode_and_drop(
    full_array=x_train, data_type="train", tr_encoder=None
)

# Keep only the encoded columns that correspond to Occupation / Region values.
filtered_df = filter_categorical_features(
    train, encoded_categories, ['Occupation', 'Region']
)

# Numeric features fed to the model. Candidates currently switched off:
# Deposit, AccessoryRate, RatePerUnit, DaysOnDeposit, Age, Term, TotalContractValue.
_numeric_columns = ['b1', 'b2', 'b3', 'b4', 'b5']
numerical_frame = numerical_df[_numeric_columns]  # Filtering required variables

# Join the two parts by position, then put the original index back.
_index = numerical_frame.index
filtered_df = pd.concat(
    [part.reset_index(drop=True) for part in (numerical_frame, filtered_df)],
    axis=1,
)
filtered_df.index = _index

In [11]:
filtered_df

Unnamed: 0,b1,b2,b3,b4,b5,Coast Region,Mount Kenya Region,Nairobi Region,North Rift,Nyanza,South Rift,Western,Business,Driver/Motorbike Rider,Farmer,Government Employee,Labourer,Other,Teacher
0,766.0,1000.0,1245.0,1195.0,840.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,200.0,2000.0,2000.0,3000.0,2500.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1110.0,1130.0,1230.0,990.0,1250.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1000.0,2000.0,1884.0,1500.0,3000.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1020.0,1240.0,1080.0,1020.0,860.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87565,1540.0,1320.0,540.0,843.0,855.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
87566,350.0,150.0,100.0,100.0,215.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
87567,2360.0,1260.0,1704.0,1435.0,1995.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
87568,800.0,800.0,760.0,600.0,600.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Feature Scaling

# Model Training

In [13]:
# Preparing the TRAIN data for approach two and fitting the model
# Features: filtered_df (b1..b5 plus the Occupation/Region encoded columns); target: y_train.

model_two_obj = ModelXgBoost(train_array=filtered_df,
                             train_target=y_train)
model_two_obj.train_model()  # Default hyper-parameters (see the ModelXgBoost code in src.model.tree_based)
model_two = model_two_obj.trained_model  # model object whose .predict is used in the inference cells

# Model Inference

In [14]:
# Hold out the six monthly targets; the remaining columns, minus the payment
# history column, form the test inputs.
_target_columns = [f"m{month}" for month in range(1, 7)]
y_test = test[_target_columns]
x_test = test.drop(columns=_target_columns + ["SplitPaymentsHistory"])

In [15]:
# For x_test
# The payment history is kept aside; the inference loop feeds it back in at every step.
split_payment_history = test[["SplitPaymentsHistory"]]

# Encode with the encoder object returned by the training call.
numerical_df_test, encoded_categories_test, encoded_test, encoding_obj_t = encode_and_drop(
    full_array=x_test, data_type="test", tr_encoder=encoding_obj
)
filtered_df_test = filter_categorical_features(
    x_test, encoded_categories_test, ['Occupation', 'Region']
)
_indexes = filtered_df_test.index

# Numeric features fed to the model. Candidates currently switched off:
# Deposit, AccessoryRate, RatePerUnit, DaysOnDeposit, Age, Term, TotalContractValue.
_numeric_columns = ['b1', 'b2', 'b3', 'b4', 'b5']
numerical_frame_test = numerical_df_test[_numeric_columns]  # Filtering required variables

# Join the two parts by position, then put the original index back.
_index = numerical_frame_test.index
filtered_df_test = pd.concat(
    [part.reset_index(drop=True) for part in (numerical_frame_test, filtered_df_test)],
    axis=1,
)
filtered_df_test.index = _index

In [16]:
# Inference on test data
# Recursive forecast: the same model is applied once per target month; after each
# step the prediction is fed back in as 'new_payment' to build the next step's features.
predict_dict = dict()
for predict_col in y_test.columns:
    predict_dict[f"{predict_col}_pred"] = model_two.predict(filtered_df_test)
    y_pred_df = pd.DataFrame(predict_dict[f"{predict_col}_pred"], columns=[f"{predict_col}_pred"])
    # Positional concat (all indexes reset) of features, the new prediction and the payment history.
    filtered_df_test = pd.concat([filtered_df_test.reset_index(drop=True),
                                  y_pred_df.reset_index(drop=True), 
                                  split_payment_history.reset_index(drop=True)], axis=1)
    filtered_df_test.index = _indexes  # Making sure indexes are maintained
    filtered_df_test.rename(columns={f"{predict_col}_pred": "new_payment"}, inplace=True)
    # NOTE(review): get_updated_df presumably consumes 'new_payment' and refreshes
    # b1..b5 / SplitPaymentsHistory — confirm in src.features.engineering.
    filtered_df_test = FeatureEngineering().get_updated_df(base_df=filtered_df_test)
    # The history is set aside again so the next predict() call only receives feature columns.
    split_payment_history = filtered_df_test[["SplitPaymentsHistory"]]
    filtered_df_test.drop(columns=["SplitPaymentsHistory"], inplace=True)

del split_payment_history

In [17]:
predict_dict

{'m1_pred': array([1310.7861 ,  702.4433 ,   66.70598, ...,  487.59427, 1531.1561 ,
        1318.1704 ], dtype=float32),
 'm2_pred': array([1382.9312 ,  781.2366 ,  145.05013, ...,  539.9375 ,  875.3562 ,
        1344.1925 ], dtype=float32),
 'm3_pred': array([1266.8241 ,  796.75244,  346.39737, ...,  543.747  , 1430.3486 ,
        1199.0573 ], dtype=float32),
 'm4_pred': array([1029.7507  ,  778.33704 ,    4.754924, ...,  352.84058 ,
        1097.7854  , 1513.1829  ], dtype=float32),
 'm5_pred': array([3380.2788  , 2559.3103  ,   22.711336, ..., 1729.3048  ,
        2379.9187  , 2122.7666  ], dtype=float32),
 'm6_pred': array([373.8821   ,  -3.6392024, 373.8821   , ...,  10.214052 ,
        373.8821   ,  10.214052 ], dtype=float32)}

# Calculation of Metric

In [18]:
# One column per predicted month (m1_pred..m6_pred); the test index is restored
# so the frame lines up with y_test when concatenated.
pred_frame = pd.DataFrame(predict_dict)
pred_frame.index = _indexes

In [19]:
# Side-by-side (index-aligned) frame of final features, true targets and predictions.
full_test_array = pd.concat([filtered_df_test, y_test, pred_frame], axis=1)

In [20]:
# Bring the ID column (saved before it was dropped from `test`) back by index.
full_test_array = pd.merge(full_test_array, test_id, how='left', left_index=True, right_index=True)

In [21]:
# NOTE(review): SubmissionFile presumably reshapes the wide frame into one row per
# ID/month carrying 'Target' and 'Prediction' columns (both are read in the
# following cells) — confirm in src.data.validation_file.
sub_file = SubmissionFile(
    validation_data=full_test_array,
    type_of_data='validation'
).execute()

In [22]:
sub_file.shape

(71796, 3)

In [23]:
# Per-row squared difference between the true target and the prediction.
sub_file['SquaredError'] = np.square(sub_file['Target'] - sub_file['Prediction'])

In [24]:
# Root-mean-squared error: square root of the summed squared errors divided by the row count.
rmse = np.sqrt(np.sum(sub_file['SquaredError'])/sub_file.shape[0])
print('Final RMSE --> ', rmse)

Final RMSE -->  1057.4261713619298


# Submission

In [25]:
# Data Ingestion
main_train = DataIngestion(mode="train").execute()
main_test = DataIngestion(mode="test").execute()

# Only the training frame loses rows with a missing Region; main_test keeps all its rows.
main_train.dropna(subset=['Region'], how='all', inplace=True)

# Town is removed from both frames.
for _frame in (main_train, main_test):
    _frame.drop(columns=['Town'], inplace=True)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [26]:
# Feature Engineering
main_train = FeatureEngineering().execute(main_train)
main_test = FeatureEngineering().execute(main_test)

# Source columns removed from both frames once feature engineering has run.
# ('SplitTransactionDates' is currently NOT dropped.)
_raw_columns = [
    'TransactionDates',
    'PaymentsHistory',
    'RegistrationDate',
    'UpsellDate',
    'SupplierName',
    'PaymentMethod',
    'ExpectedTermDate',
    'FirstPaymentDate',
    'LastPaymentDate',
]
for _frame in (main_train, main_test):
    _frame.drop(columns=_raw_columns, inplace=True)

In [27]:
# Prepare data for Train
# NOTE(review): `deepcopy` and `pd` are already imported by earlier cells; these
# cell-local imports are redundant when the notebook is run top to bottom.
import copy
import pandas as pd

# m1..m6 are the six payment columns used as prediction targets.
payments_df = main_train[["m1", "m2", "m3", "m4", "m5", "m6"]]
# Build one long target column: transposing before stack() orders the values
# month-major (all rows' m1, then all rows' m2, ...), and reset_index() renumbers
# them from 0. The axis=1 concat aligns on the union of both indexes and only
# 'y' is kept afterwards.
# NOTE(review): stack() drops NaN entries by default in the legacy implementation,
# so this assumes m1..m6 contain no missing values — TODO confirm.
main_y_train = pd.concat([payments_df, payments_df.T.stack().reset_index(name='y')['y']], axis=1)[["y"]]

# df_0 is the feature frame with the six target columns removed.
all_x_dfs = {"df_0": main_train.drop(["m1", "m2", "m3", "m4", "m5", "m6"], axis=1)}
# df_i is df_{i-1} with the actual payment m{i} attached as 'new_payment' and
# then passed through FeatureEngineering.get_updated_df.
# NOTE(review): get_updated_df presumably rolls the payment-history features
# forward by one month — confirm in src.features.engineering.
for i in range(1, 6):
    temp_df = copy.deepcopy(all_x_dfs[f"df_{i - 1}"])  # work on a copy of the previous step's frame
    m_df = payments_df[[f"m{i}"]]
    temp_df = pd.concat([temp_df, m_df], axis=1)
    temp_df.rename(columns={f"m{i}": "new_payment"}, inplace=True)
    temp_df = FeatureEngineering().get_updated_df(base_df=temp_df)
    all_x_dfs[f"df_{i}"] = temp_df

# Stack df_0..df_5 in the same month order as main_y_train (df_0 lines up with m1, ..., df_5 with m6).
x_train_overall = pd.concat([all_x_dfs[f"df_{i}"] for i in range(6)], ignore_index=True)
x_train_overall.drop(columns=['SplitPaymentsHistory'], inplace=True)
del payments_df, temp_df, m_df  # release the intermediates

# ID variable is saved separately
x_train_overall.drop(columns=['ID'], inplace=True)

# Capture the test IDs before the column is dropped; they are merged back by
# index when the submission frame is assembled.
main_test_id = main_test[['ID']]
main_test.drop(columns=['ID'], inplace=True)

In [28]:
# For x_train_overall
overall_numerical_df, overall_encoded_categories, overall_encoded_train, overall_encoding_obj = encode_and_drop(
    full_array=x_train_overall, data_type="train", tr_encoder=None
)

# Keep only the encoded columns that correspond to Occupation / Region values.
overall_filtered_df = filter_categorical_features(
    main_train, overall_encoded_categories, ['Occupation', 'Region']
)

# Numeric features fed to the model. Candidates currently switched off:
# Deposit, AccessoryRate, RatePerUnit, DaysOnDeposit, Age, Term, TotalContractValue.
_numeric_columns = ['b1', 'b2', 'b3', 'b4', 'b5']
numerical_frame_ov = overall_numerical_df[_numeric_columns]  # Filtering required variables

# Join the two parts by position, then put the original index back.
_index = numerical_frame_ov.index
overall_filtered_df = pd.concat(
    [part.reset_index(drop=True) for part in (numerical_frame_ov, overall_filtered_df)],
    axis=1,
)
overall_filtered_df.index = _index

In [29]:
# Preparing the TRAIN data for approach two and fitting the model
# Features: overall_filtered_df (b1..b5 plus the Occupation/Region encoded columns); target: main_y_train.

model_two_ov_obj = ModelXgBoost(train_array=overall_filtered_df,
                                train_target=main_y_train)
model_two_ov_obj.train_model()  # Default hyper-parameters (see the ModelXgBoost code in src.model.tree_based)
model_two_ov = model_two_ov_obj.trained_model  # model object whose .predict is used for the submission

In [31]:
# For x_test
x_test_submission = main_test.drop(columns=["SplitPaymentsHistory"])
# The payment history is kept aside; the inference loop feeds it back in at every step.
split_payment_history = main_test[["SplitPaymentsHistory"]]

# Encode with the encoder object returned by the full-training call.
sub_numerical_df_test, sub_encoded_categories_test, sub_encoded_test, sub_encoding_obj_t = encode_and_drop(
    full_array=x_test_submission, data_type="test", tr_encoder=overall_encoding_obj
)
sub_filtered_df_test = filter_categorical_features(
    main_test, sub_encoded_categories_test, ['Occupation', 'Region']
)
_indexes = sub_filtered_df_test.index

# Numeric features fed to the model. Candidates currently switched off:
# Deposit, AccessoryRate, RatePerUnit, DaysOnDeposit, Age, Term, TotalContractValue.
_numeric_columns = ['b1', 'b2', 'b3', 'b4', 'b5']
sub_numerical_frame = sub_numerical_df_test[_numeric_columns]  # Filtering required variables

# Join the two parts by position, then put the original index back.
_index = sub_numerical_frame.index
sub_filtered_df_test = pd.concat(
    [part.reset_index(drop=True) for part in (sub_numerical_frame, sub_filtered_df_test)],
    axis=1,
)
sub_filtered_df_test.index = _index

In [32]:
# Inference on submission data
# Recursive forecast: the same model is applied once per target month; after each
# step the prediction is fed back in as 'new_payment' to build the next step's features.
sub_predict_dict = dict()
for predict_col in ["m1", "m2", "m3", "m4", "m5", "m6"]:
    sub_predict_dict[f"{predict_col}_pred"] = model_two_ov.predict(sub_filtered_df_test)
    y_pred_df = pd.DataFrame(sub_predict_dict[f"{predict_col}_pred"], columns=[f"{predict_col}_pred"])
    # Positional concat (all indexes reset) of features, the new prediction and the payment history.
    sub_filtered_df_test = pd.concat([sub_filtered_df_test.reset_index(drop=True),
                                      y_pred_df.reset_index(drop=True), 
                                      split_payment_history.reset_index(drop=True)], axis=1)
    sub_filtered_df_test.index = _indexes  # Making sure indexes are maintained
    sub_filtered_df_test.rename(columns={f"{predict_col}_pred": "new_payment"}, inplace=True)
    # NOTE(review): get_updated_df presumably consumes 'new_payment' and refreshes
    # b1..b5 / SplitPaymentsHistory — confirm in src.features.engineering.
    sub_filtered_df_test = FeatureEngineering().get_updated_df(base_df=sub_filtered_df_test)
    # The history is set aside again so the next predict() call only receives feature columns.
    split_payment_history = sub_filtered_df_test[["SplitPaymentsHistory"]]
    sub_filtered_df_test.drop(columns=["SplitPaymentsHistory"], inplace=True)

del split_payment_history

In [33]:
# Submission
# One column per predicted month, re-indexed to match main_test.
sub_pred_frame_test = pd.DataFrame(sub_predict_dict)
sub_pred_frame_test.index = main_test.index

# Put the predictions next to the test columns, then bring the IDs back by index.
full_test_array_test = pd.concat([main_test, sub_pred_frame_test], axis=1)
full_test_array_test = pd.merge(full_test_array_test, main_test_id, how='left', left_index=True, right_index=True)

# NOTE(review): SubmissionFile presumably reshapes this frame into the submission
# layout — confirm in src.data.validation_file.
sub_file = SubmissionFile(
    validation_data=full_test_array_test,
    type_of_data='test'
).execute()
sub_file.reset_index(drop=True, inplace=True)

In [36]:
# NOTE(review): to_csv writes the (reset) integer index as an extra unnamed first
# column by default; pass index=False if the submission format expects only the
# data columns — TODO confirm against the competition's sample submission.
sub_file.to_csv('../submissions/submission_approach_2_occupation_region_b1_b5_only.csv')

In [None]:
# Outlier detection (if any)

# TASKS
# Tasks
    - Feature selection  Aum
        - Automated selection based on Gini index

    - Pipeline setup  Nikhil
        - Pre-process
            - ingestion (DONE)
            - Features (DONE)
            - Encoding (Pending)
            - scaling (if applicable)
        - Modelling
            - Approach2
            - Sliding updates for features for Train and test

        - validation
            - Calculate RMSE

In [None]:
# Drop Useless Columns
def drop_cols(df):
    """
    Remove a fixed set of columns from ``df`` in place.

    The frame is modified directly and nothing is returned. pandas raises
    ``KeyError`` if any of the listed columns is absent from ``df``.
    """
    unwanted_columns = (
        "ID",
        "UpsellDate",
        "PaymentMethod",
        "TransactionDates",
        "PaymentsHistory",
        "SupplierName",
        "Town",
        "RegistrationDateParsed",
        "ExpectedTermDateParsed",
        "FirstPaymentDateParsed",
        "LastPaymentDateParsed",
    )
    df.drop(columns=list(unwanted_columns), inplace=True)


# NOTE(review): earlier cells already drop several of these columns (e.g. 'Town',
# 'ID', 'TransactionDates') from train/test in place, so on a top-to-bottom run
# these calls likely raise KeyError — confirm before re-enabling this cell.
drop_cols(train)
drop_cols(test)
# Only the last expression of a cell is displayed, so train.head() produces no output here.
train.head()
test.head()
