In [2]:
import os
import sys
from pathlib import Path

# Put the parent of the current working directory on sys.path (once) so the
# project-local `src` package can be imported from this notebook.
mod_path = os.fspath(Path.cwd().parent)
if mod_path not in sys.path:
    sys.path.append(mod_path)

In [21]:
from typing import Union, Optional
from copy import deepcopy
import numpy as np

from src.data.ingestion import DataIngestion
from src.features.engineering import FeatureEngineering
from src.features.encoding import FeatureEncoding
from src.model.tree_based import ModelXgBoost
from src.data.validation_file import SubmissionFile

# Data Ingestion

In [22]:
# Should always be run: loads the dev-mode train and test frames.
train, test = DataIngestion(mode="dev").execute()

# Same in-place clean-up for both frames: discard rows without a 'Region'
# value, then remove the 'Town' column.
for _frame in (train, test):
    _frame.dropna(subset=['Region'], how='all', inplace=True)
    _frame.drop(columns=['Town'], inplace=True)
# test = DataIngestion(mode="test").execute()

  exec(code_obj, self.user_global_ns, self.user_ns)


# Feature Engineering

In [None]:
# FEATURE_ID_MAPPING = {
#     "Occupation": 0,
#     "MainApplicantGender": 1
# }
train = FeatureEngineering().execute(train)
test = FeatureEngineering().execute(test)

# Columns removed (in place) from both frames after FeatureEngineering().execute
# has run on them.
_columns_to_discard = [
    'TransactionDates',
    'PaymentsHistory',
    'RegistrationDate',
    'UpsellDate',
    'SupplierName',
    'PaymentMethod',
    'ExpectedTermDate',
    'FirstPaymentDate',
    'LastPaymentDate',
    'SplitTransactionDates',
]
for _frame in (train, test):
    _frame.drop(columns=_columns_to_discard, inplace=True)

# X_Train and Y_Train Formulation

In [None]:
# Display the column labels of `train` after feature engineering.
train.columns

In [None]:
# Prepare data for Train
import copy
import pandas as pd

# The six monthly payment columns are the prediction targets.
new_payments_df = train[["m1", "m2", "m3", "m4", "m5", "m6"]]

# Long-format target. Transposing before stacking orders the values as every
# row's m1, then every row's m2, ... — the same order in which df_0..df_5 are
# concatenated into x_train below — and reset_index gives a fresh 0..N-1 index
# that matches x_train's ignore_index=True.
# The stacked series is used directly: concatenating it column-wise with
# new_payments_df first (and then keeping only "y") outer-joins on train's own
# index labels, which can reorder rows or add NaN rows whenever that index is
# not exactly 0..n-1 (e.g. after rows were dropped during ingestion).
y_train = new_payments_df.T.stack().reset_index(name='y')[["y"]]

# df_0 holds the features with no target month revealed. df_i appends month
# i's payment to df_{i-1} as "new_payment" and passes the frame through
# FeatureEngineering().get_updated_df (presumably refreshing the
# payment-derived features — confirm in src.features.engineering).
all_x_dfs = {"df_0": train.drop(["m1", "m2", "m3", "m4", "m5", "m6"], axis=1)}
for i in range(1, 6):
    temp_df = copy.deepcopy(all_x_dfs[f"df_{i - 1}"])
    m_df = new_payments_df[[f"m{i}"]]
    temp_df = pd.concat([temp_df, m_df], axis=1)
    temp_df.rename(columns={f"m{i}": "new_payment"}, inplace=True)
    temp_df = FeatureEngineering().get_updated_df(base_df=temp_df)
    all_x_dfs[f"df_{i}"] = temp_df

# Stack the six feature snapshots row-wise; row block k pairs with target m(k+1).
x_train = pd.concat([all_x_dfs[f"df_{i}"] for i in range(6)], ignore_index=True)
x_train.drop(columns=['SplitPaymentsHistory'], inplace=True)
del new_payments_df, temp_df, m_df

In [None]:
# The ID column is kept out of the model inputs.
# x_train_id = x_train[['ID']]
x_train.drop(columns=['ID'], inplace=True)

# Detach the test IDs into their own single-column frame (same index as
# `test`); pop removes the column from `test` in place.
test_id = test.pop('ID').to_frame()

In [None]:
# Display (rows, columns) of the stacked training feature frame.
x_train.shape

# Encoding

In [11]:
def filter_categorical_features(overall_df,
                                encoded_frame,
                                numerical_frame,
                                categorical_variable: Optional[Union[list, str]] = None,
                                categories_to_filter: Optional[list] = None):
    """
    Keep selected one-hot columns of ``encoded_frame`` and join them to ``numerical_frame``.

    The columns to keep are either listed explicitly in ``categories_to_filter``
    or derived from the unique values that the variable(s) named in
    ``categorical_variable`` take in ``overall_df``. A non-empty
    ``categories_to_filter`` takes precedence.

    Parameters
    ----------
    overall_df : pandas.DataFrame
        Frame holding the raw (un-encoded) categorical variables. Only read
        when ``categories_to_filter`` is empty or None.
    encoded_frame : pandas.DataFrame
        Encoded frame whose column labels are matched against the categories.
    numerical_frame : pandas.DataFrame
        Frame placed first in the result; its index is carried over.
    categorical_variable : list or str, optional
        Name(s) of the column(s) in ``overall_df`` whose unique values select
        the encoded columns.
    categories_to_filter : list, optional
        Explicit encoded column labels to keep.

    Returns
    -------
    pandas.DataFrame
        ``numerical_frame`` columns followed by the kept encoded columns,
        joined by row position and indexed like ``numerical_frame``.

    Raises
    ------
    ValueError
        If neither ``categories_to_filter`` nor ``categorical_variable`` is given.
    """
    if categories_to_filter:
        wanted_labels = categories_to_filter
    elif categorical_variable is None:
        # Previously this fell through to overall_df[None] and surfaced as an
        # opaque KeyError.
        raise ValueError(
            "Provide either 'categories_to_filter' or 'categorical_variable'."
        )
    else:
        if isinstance(categorical_variable, list):
            variables = categorical_variable
        else:
            variables = [categorical_variable]
        wanted_labels = []
        for _variable in variables:
            wanted_labels.extend(overall_df[_variable].unique().tolist())

    filtered_frame = encoded_frame.loc[:, encoded_frame.columns.isin(wanted_labels)]

    # Join by row position, then restore the numerical frame's index.
    final_df = pd.concat([numerical_frame.reset_index(drop=True),
                          filtered_frame.reset_index(drop=True)], axis=1)
    final_df.index = numerical_frame.index

    return final_df


def encode_and_drop(full_array, data_type, tr_encoder=None):
    """
    Split a frame into numerical and non-numerical columns and one-hot encode the latter.

    Parameters
    ----------
    full_array : pandas.DataFrame
        Frame containing both numerical and non-numerical columns.
    data_type : str
        Forwarded to the encoder as ``type_of_data`` (this notebook passes
        "train" and "test").
    tr_encoder : object, optional
        Encoder exposing ``one_hot_encoding``. When None, a new
        ``FeatureEncoding`` instance is created.

    Returns
    -------
    tuple
        ``(numerical_array, encoded_array, final_array, encoder)`` where
        ``final_array`` is the numerical columns followed by the encoded
        columns (joined by row position, indexed like ``numerical_array``)
        and ``encoder`` is the encoder object that was used.
    """
    non_numeric_columns = full_array.select_dtypes(exclude=['number']).columns
    categorical_array = full_array[non_numeric_columns]
    numerical_array = full_array.drop(columns=non_numeric_columns)

    # Compare against None explicitly: a supplied encoder must be reused even
    # if the object happens to evaluate as falsy.
    encoder = FeatureEncoding() if tr_encoder is None else tr_encoder

    encoded_array = encoder.one_hot_encoding(
        categorical_frame=categorical_array,
        type_of_data=data_type,
        conv=True,
        drop=None,
        handle_unknown="ignore"
    )

    # Join by row position, then restore the original index.
    final_array = pd.concat([numerical_array.reset_index(drop=True),
                             encoded_array.reset_index(drop=True)], axis=1)
    final_array.index = numerical_array.index

    return numerical_array, encoded_array, final_array, encoder

# For x_train: encode with a fresh encoder and keep the encoder object for
# the test data.
numerical_df, encoded_categories, encoded_train, encoding_obj = encode_and_drop(
    full_array=x_train,
    data_type="train",
    tr_encoder=None,
)

# Keep only the encoded columns for the Occupation and Region values present
# in `train`.
filtered_df = filter_categorical_features(
    overall_df=train,
    encoded_frame=encoded_categories,
    numerical_frame=numerical_df,
    categorical_variable=['Occupation', 'Region'],
)

In [None]:
# Feature Scaling

# Model Training

In [None]:
# Preparing the TRAIN data for approach two and fitting the model

# Hand the filtered feature frame and the stacked target to the project's
# XGBoost wrapper.
model_two_obj = ModelXgBoost(train_array=filtered_df,
                             train_target=y_train)
model_two_obj.train_model()  # Default hyper-parameters (see ModelXgBoost in src.model.tree_based)
model_two = model_two_obj.trained_model  # Fitted model used by the inference cells

# Model Inference

In [None]:
# Held-out monthly targets m1..m6, and the model inputs (everything except the
# targets and the payment-history column).
y_test = test[[f"m{month}" for month in range(1, 7)]]
x_test = test.drop(columns=[*y_test.columns, "SplitPaymentsHistory"])

In [None]:
# For x_test
split_payment_history = test[["SplitPaymentsHistory"]]

# Encode the test features with the encoder object returned for x_train.
numerical_df_test, encoded_categories_test, encoded_test, encoding_obj_t = encode_and_drop(x_test, "test", encoding_obj)

# The raw frame must be passed as the first argument (overall_df) so the
# Occupation/Region categories can be looked up; without it the call fails
# with a TypeError for the missing `numerical_frame` argument.
# NOTE(review): categories are taken from `test`, mirroring the submission
# cell. If `test` lacks a category that `train` has, the feature columns would
# differ from the training matrix — confirm whether `train` should be used.
filtered_df_test = filter_categorical_features(test,
                                               encoded_categories_test,
                                               numerical_df_test,
                                               categorical_variable=['Occupation', 'Region'])
_indexes = filtered_df_test.index
# y_pred = pd.DataFrame()

# Predict the target months one at a time; each prediction is fed back into
# the features as "new_payment" before the next month is predicted.
predict_dict = dict()
for predict_col in y_test.columns:
    predict_dict[f"{predict_col}_pred"] = model_two.predict(filtered_df_test)
    y_pred_df = pd.DataFrame(predict_dict[f"{predict_col}_pred"], columns=[f"{predict_col}_pred"])
    # Join features, prediction and payment history by row position.
    filtered_df_test = pd.concat([filtered_df_test.reset_index(drop=True),
                                  y_pred_df.reset_index(drop=True),
                                  split_payment_history.reset_index(drop=True)], axis=1)
    filtered_df_test.index = _indexes  # Making sure indexes are maintained
    filtered_df_test.rename(columns={f"{predict_col}_pred": "new_payment"}, inplace=True)
    # presumably folds "new_payment" into the history and the payment-derived
    # features — confirm in src.features.engineering
    filtered_df_test = FeatureEngineering().get_updated_df(base_df=filtered_df_test)
    # Carry the history forward separately; it is not a model input.
    split_payment_history = filtered_df_test[["SplitPaymentsHistory"]]
    filtered_df_test.drop(columns=["SplitPaymentsHistory"], inplace=True)

del split_payment_history

In [None]:
# Display the per-month predictions collected by the inference loop.
predict_dict

# Calculation of Metric

In [None]:
# One column per predicted month, with rows labelled by the index captured
# before the inference loop.
pred_frame = pd.DataFrame(predict_dict).set_axis(_indexes, axis=0)

In [None]:
# Place the final feature state, the true monthly targets and the predictions
# side by side (pd.concat with axis=1 aligns rows on the index).
full_test_array = pd.concat([filtered_df_test, y_test, pred_frame], axis=1)

In [None]:
# Re-attach the IDs that were set aside earlier, matching rows on the index.
full_test_array = pd.merge(full_test_array, test_id, how='left', left_index=True, right_index=True)

In [None]:
# Build the validation output with the project's SubmissionFile helper.
# The metric cells below read its 'Target' and 'Prediction' columns.
sub_file = SubmissionFile(
    validation_data=full_test_array,
    type_of_data='validation'
).execute()

In [None]:
# Display (rows, columns) of the validation output.
sub_file.shape

In [None]:
# Per-row squared difference between the true value and the prediction.
sub_file['SquaredError'] = np.square(sub_file['Target'] - sub_file['Prediction'])

In [None]:
# Root-mean-squared error: total squared error divided by the number of rows,
# then square-rooted.
rmse = np.sqrt(sub_file['SquaredError'].sum() / len(sub_file))
print('Final RMSE --> ', rmse)

# Submission

In [4]:
# Data Ingestion
main_train = DataIngestion(mode="train").execute()
main_test = DataIngestion(mode="test").execute()

# Rows without a 'Region' are removed from the training frame only.
main_train.dropna(subset=['Region'], how='all', inplace=True)

# 'Town' is removed from both frames.
for _frame in (main_train, main_test):
    _frame.drop(columns=['Town'], inplace=True)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [6]:
# Feature Engineering
main_train = FeatureEngineering().execute(main_train)
main_test = FeatureEngineering().execute(main_test)

# Columns removed (in place) from both frames after FeatureEngineering().execute
# has run on them.
_columns_to_discard = [
    'TransactionDates',
    'PaymentsHistory',
    'RegistrationDate',
    'UpsellDate',
    'SupplierName',
    'PaymentMethod',
    'ExpectedTermDate',
    'FirstPaymentDate',
    'LastPaymentDate',
    'SplitTransactionDates',
]
for _frame in (main_train, main_test):
    _frame.drop(columns=_columns_to_discard, inplace=True)

In [7]:
# Prepare data for Train
import copy
import pandas as pd

# The six monthly payment columns are the prediction targets.
payments_df = main_train[["m1", "m2", "m3", "m4", "m5", "m6"]]

# Long-format target. Transposing before stacking orders the values as every
# row's m1, then every row's m2, ... — the same order in which df_0..df_5 are
# concatenated into x_train_overall below — and reset_index gives a fresh
# 0..N-1 index that matches ignore_index=True.
# The stacked series is used directly: concatenating it column-wise with
# payments_df first (and then keeping only "y") outer-joins on main_train's
# own index labels, which can reorder rows or add NaN rows whenever that index
# is not exactly 0..n-1 (e.g. after rows were dropped during ingestion).
main_y_train = payments_df.T.stack().reset_index(name='y')[["y"]]

# df_0 holds the features with no target month revealed. df_i appends month
# i's payment to df_{i-1} as "new_payment" and passes the frame through
# FeatureEngineering().get_updated_df (presumably refreshing the
# payment-derived features — confirm in src.features.engineering).
all_x_dfs = {"df_0": main_train.drop(["m1", "m2", "m3", "m4", "m5", "m6"], axis=1)}
for i in range(1, 6):
    temp_df = copy.deepcopy(all_x_dfs[f"df_{i - 1}"])
    m_df = payments_df[[f"m{i}"]]
    temp_df = pd.concat([temp_df, m_df], axis=1)
    temp_df.rename(columns={f"m{i}": "new_payment"}, inplace=True)
    temp_df = FeatureEngineering().get_updated_df(base_df=temp_df)
    all_x_dfs[f"df_{i}"] = temp_df

# Stack the six feature snapshots row-wise; row block k pairs with target m(k+1).
x_train_overall = pd.concat([all_x_dfs[f"df_{i}"] for i in range(6)], ignore_index=True)
x_train_overall.drop(columns=['SplitPaymentsHistory'], inplace=True)
del payments_df, temp_df, m_df

# ID variable is saved separately
x_train_overall.drop(columns=['ID'], inplace=True)

main_test_id = main_test[['ID']]
main_test.drop(columns=['ID'], inplace=True)

In [12]:
# For x_train_overall: encode with a fresh encoder and keep the encoder object
# for the submission test data.
overall_numerical_df, overall_encoded_categories, overall_encoded_train, overall_encoding_obj = encode_and_drop(
    full_array=x_train_overall,
    data_type="train",
    tr_encoder=None,
)

# Keep only the encoded columns for the Occupation and Region values present
# in `main_train`.
overall_filtered_df = filter_categorical_features(
    overall_df=main_train,
    encoded_frame=overall_encoded_categories,
    numerical_frame=overall_numerical_df,
    categorical_variable=['Occupation', 'Region'],
)

In [13]:
# Preparing the TRAIN data for approach two and fitting the model

# Hand the filtered full-training feature frame and its stacked target to the
# project's XGBoost wrapper.
model_two_ov_obj = ModelXgBoost(train_array=overall_filtered_df,
                                train_target=main_y_train)
model_two_ov_obj.train_model()  # Default hyper-parameters (see ModelXgBoost in src.model.tree_based)
model_two_ov = model_two_ov_obj.trained_model  # Fitted model used for the submission predictions

In [16]:
# For x_test
# Separate the payment-history column from the columns that get encoded.
x_test_submission = main_test.drop(columns=["SplitPaymentsHistory"])
split_payment_history = main_test[["SplitPaymentsHistory"]]

# Encode with the encoder object returned for the full training data.
sub_numerical_df_test, sub_encoded_categories_test, sub_encoded_test, sub_encoding_obj_t = encode_and_drop(x_test_submission, "test", overall_encoding_obj)
# Keep only the encoded columns for the Occupation/Region values found in main_test.
sub_filtered_df_test = filter_categorical_features(main_test,
                                                   sub_encoded_categories_test, 
                                                   sub_numerical_df_test, 
                                                   categorical_variable=['Occupation', 'Region'])
# Remember the row labels; they are re-applied after each positional concat below.
_indexes = sub_filtered_df_test.index

# Predict m1..m6 one month at a time; each prediction is fed back into the
# features as "new_payment" before the next month is predicted.
sub_predict_dict = dict()
for predict_col in ["m1", "m2", "m3", "m4", "m5", "m6"]:
    sub_predict_dict[f"{predict_col}_pred"] = model_two_ov.predict(sub_filtered_df_test)
    y_pred_df = pd.DataFrame(sub_predict_dict[f"{predict_col}_pred"], columns=[f"{predict_col}_pred"])
    # Join features, prediction and payment history by row position (indexes reset).
    sub_filtered_df_test = pd.concat([sub_filtered_df_test.reset_index(drop=True),
                                      y_pred_df.reset_index(drop=True), 
                                      split_payment_history.reset_index(drop=True)], axis=1)
    sub_filtered_df_test.index = _indexes  # Making sure indexes are maintained
    sub_filtered_df_test.rename(columns={f"{predict_col}_pred": "new_payment"}, inplace=True)
    # presumably folds "new_payment" into the history and the payment-derived
    # features — confirm in src.features.engineering
    sub_filtered_df_test = FeatureEngineering().get_updated_df(base_df=sub_filtered_df_test)
    # Carry the history forward separately; it is removed from the model inputs.
    split_payment_history = sub_filtered_df_test[["SplitPaymentsHistory"]]
    sub_filtered_df_test.drop(columns=["SplitPaymentsHistory"], inplace=True)

del split_payment_history

In [17]:
# Submission
# One column per predicted month, with rows labelled like main_test.
sub_pred_frame_test = pd.DataFrame(sub_predict_dict).set_axis(main_test.index, axis=0)

# Place the predictions next to the test features, then re-attach the IDs that
# were set aside earlier (both steps align rows on the index).
full_test_array_test = pd.concat([main_test, sub_pred_frame_test], axis=1)
full_test_array_test = full_test_array_test.merge(
    main_test_id,
    how='left',
    left_index=True,
    right_index=True,
)

# Build the submission output with the project's SubmissionFile helper and
# renumber its rows from zero.
sub_file = SubmissionFile(
    validation_data=full_test_array_test,
    type_of_data='test'
).execute()
sub_file.reset_index(drop=True, inplace=True)

In [18]:
# Display the feature columns left in the model input after the last
# prediction step.
sub_filtered_df_test.columns

Index(['Deposit', 'AccessoryRate', 'RatePerUnit', 'DaysOnDeposit', 'Age',
       'Term', 'TotalContractValue', 'nb_payments', 'amount_paid',
       'percent_amt_paid', 'mean_amt_paid', 'median_amt_paid', 'max_amt_paid',
       'min_amt_paid', 'stddev_amt_paid', 'nb_skipped_months', 'Coast Region',
       'Mount Kenya Region', 'Nairobi Region', 'North Rift', 'Nyanza',
       'South Rift', 'Western', 'Business', 'Driver/Motorbike Rider', 'Farmer',
       'Government Employee', 'Labourer', 'Other', 'Teacher', 'b5', 'b4', 'b3',
       'b2', 'b1'],
      dtype='object')

In [20]:
# Write the submission file.
# NOTE(review): DataFrame.to_csv writes the row index as an unnamed first
# column by default; pass index=False if the submission format does not
# expect it — confirm against the competition's required layout.
sub_file.to_csv('../submissions/submission_approach_2_occupation_region_filter_payment_features.csv')

In [None]:
# Outlier detection (if any)

# TASKS
# Tasks
    - Feature selection  Aum
        - Automated selection based on Gini index

    - Pipeline setup  Nikhil
        - Pre-process
            - ingestion (DONE)
            - Features (DONE)
            - Encoding (Pending)
            - scaling (if applicable)
        - Modelling
            - Approach2
            - Sliding updates for features for Train and test

        - validation
            - Calculate RMSE

In [None]:
# Drop Useless Columns
def drop_cols(df):
    """Drop a fixed list of columns from ``df`` in place; returns None."""
    unwanted_columns = [
        "ID",
        "UpsellDate",
        "PaymentMethod",
        "TransactionDates",
        "PaymentsHistory",
        "SupplierName",
        "Town",
        "RegistrationDateParsed",
        "ExpectedTermDateParsed",
        "FirstPaymentDateParsed",
        "LastPaymentDateParsed",
    ]
    df.drop(columns=unwanted_columns, inplace=True)


# NOTE(review): earlier cells already removed several of these columns from
# `train`/`test` in place (e.g. 'Town' and 'TransactionDates', plus 'ID' from
# `test`), and DataFrame.drop raises KeyError for missing labels by default.
# This cell therefore looks stale when the notebook is run top to bottom —
# confirm before keeping it.
drop_cols(train)
drop_cols(test)
# Only a cell's last expression is displayed, so train.head() shows nothing here.
train.head()
test.head()
