In [1]:
import os
import sys
from pathlib import Path

# Put the parent of the current working directory on sys.path (once);
# the `from src...` imports further down presumably rely on this.
mod_path = str(Path.cwd().parent)
if mod_path not in sys.path:
    sys.path.append(mod_path)

In [28]:
from typing import Union, Optional

from src.data.ingestion import DataIngestion
from src.features.engineering import FeatureEngineering
from src.features.encoding import FeatureEncoding

# Data Ingestion

In [3]:
# Should be run always [train and test]
# Both frames come from a single "dev"-mode ingestion run and then get the same
# clean-up: rows with no 'Region' value are removed and 'Town' is discarded.
train, test = DataIngestion(mode="dev").execute()
for _split in (train, test):
    _split.dropna(subset=['Region'], how='all', inplace=True)
    _split.drop(columns=['Town'], inplace=True)
del _split
# test = DataIngestion(mode="test").execute()

  exec(code_obj, self.user_global_ns, self.user_ns)


# Feature Engineering

In [4]:
# FEATURE_ID_MAPPING = {
#     "Occupation": 0,
#     "MainApplicantGender": 1
# }
# Columns removed from both frames once FeatureEngineering has been applied.
_dropped_after_engineering = [
    'TransactionDates',
    'PaymentsHistory',
    'RegistrationDate',
    'UpsellDate',
    'SupplierName',
    'PaymentMethod',
    'ExpectedTermDate',
    'FirstPaymentDate',
    'LastPaymentDate',
    'SplitTransactionDates',
]
train = FeatureEngineering().execute(train)
test = FeatureEngineering().execute(test)
for _split in (train, test):
    _split.drop(columns=_dropped_after_engineering, inplace=True)
del _split, _dropped_after_engineering

# X_Train and Y_Train Formulation

In [5]:
# Inspect which columns remain in `train` after the drops in the cells above.
train.columns

Index(['ID', 'm1', 'm2', 'm3', 'm4', 'm5', 'm6', 'Deposit', 'AccessoryRate',
       'rateTypeEntity', 'RatePerUnit', 'DaysOnDeposit', 'MainApplicantGender',
       'Age', 'Region', 'Occupation', 'Term', 'TotalContractValue',
       'SplitPaymentsHistory', 'nb_payments', 'amount_paid',
       'percent_amt_paid', 'mean_amt_paid', 'median_amt_paid', 'max_amt_paid',
       'min_amt_paid', 'stddev_amt_paid', 'nb_skipped_months', 'b1', 'b2',
       'b3', 'b4', 'b5'],
      dtype='object')

In [6]:
# Prepare data for Train
import copy
import pandas as pd

# Target: the six payment columns m1..m6, transposed and stacked into one 'y' column.
payment_cols = [f"m{k}" for k in range(1, 7)]
new_payments_df = train[payment_cols]
stacked_y = new_payments_df.T.stack().reset_index(name='y')['y']
y_train = pd.concat([new_payments_df, stacked_y], axis=1)[["y"]]

# Features: df_0 is `train` without the payment columns. Each following df_i starts
# from a deep copy of df_{i-1}, receives month i's payment as 'new_payment', and is
# then passed through FeatureEngineering.get_updated_df.
all_x_dfs = {"df_0": train.drop(payment_cols, axis=1)}
for i in range(1, 6):
    m_df = new_payments_df[[f"m{i}"]]
    temp_df = pd.concat([copy.deepcopy(all_x_dfs[f"df_{i - 1}"]), m_df], axis=1)
    temp_df = temp_df.rename(columns={f"m{i}": "new_payment"})
    temp_df = FeatureEngineering().get_updated_df(base_df=temp_df)
    all_x_dfs[f"df_{i}"] = temp_df

# Stack the six per-step frames row-wise with a fresh 0..N-1 index.
x_train = pd.concat(
    [all_x_dfs[f"df_{step}"] for step in range(6)], ignore_index=True
).drop(columns=['SplitPaymentsHistory'])
del new_payments_df, temp_df, m_df, payment_cols, stacked_y

In [7]:
# ID variable is saved separately
# (kept as a one-column frame, then removed from the feature matrix)
x_train_id = x_train.loc[:, ['ID']]
x_train = x_train.drop(columns=['ID'])

In [8]:
# (rows, columns) of the row-wise stacked training features.
x_train.shape

(87570, 25)

# Encoding

In [34]:
# Show the columns of `encoded_train` whose names appear in the region list below.
# NOTE(review): `encoded_train` is only assigned further down (by the encode_and_drop
# call), so this cell fails on a fresh top-to-bottom run.
# NOTE(review): 'Mount Kenyaa Region' matches none of the six columns in the recorded
# output — possibly a misspelling; confirm against the actual category labels.
encoded_train[encoded_train.columns[encoded_train.columns.isin(['Mount Kenyaa Region',
 'Coast Region',
 'Nyanza',
 'Western',
 'South Rift',
 'Nairobi Region',
 'North Rift'])]]

Unnamed: 0,Coast Region,Nairobi Region,North Rift,Nyanza,South Rift,Western
0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...
87565,0.0,0.0,1.0,0.0,0.0,0.0
87566,0.0,0.0,1.0,0.0,0.0,0.0
87567,1.0,0.0,0.0,0.0,0.0,0.0
87568,1.0,0.0,0.0,0.0,0.0,0.0


In [63]:
# Show every column of x_train whose dtype is not `object`.
x_train[x_train.select_dtypes(exclude=['object']).columns]

Unnamed: 0,Deposit,AccessoryRate,RatePerUnit,DaysOnDeposit,Age,Term,TotalContractValue,nb_payments,amount_paid,percent_amt_paid,...,median_amt_paid,max_amt_paid,min_amt_paid,stddev_amt_paid,nb_skipped_months,b1,b2,b3,b4,b5
0,2000,0.0,40,7,40.0,364,16560.0,10,12046.0,0.727415,...,1127.5,2570.0,766.0,519.96,0,766.0,1000.0,1245.0,1195.0,840.0
1,2999,40.0,55,3,64.0,568,56964.0,15,42215.0,0.741082,...,2165.0,8000.0,200.0,2130.94,2,200.0,2000.0,2000.0,3000.0,2500.0
2,2000,0.0,40,3,23.0,270,12800.0,7,9740.0,0.760938,...,1130.0,2960.0,990.0,697.43,0,1110.0,1130.0,1230.0,990.0,1250.0
3,2999,0.0,55,3,59.0,547,33084.0,13,24084.0,0.727965,...,1884.0,4200.0,1000.0,921.55,1,1000.0,2000.0,1884.0,1500.0,3000.0
4,2000,0.0,40,3,73.0,270,12800.0,6,8540.0,0.667188,...,1050.0,3320.0,860.0,937.18,0,1020.0,1240.0,1080.0,1020.0,860.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87565,2400,0.0,50,3,22.0,240,14400.0,10,13733.0,0.953681,...,1232.5,3550.0,540.0,823.26,0,1540.0,1320.0,540.0,843.0,855.0
87566,2400,0.0,50,3,,240,14400.0,14,6615.0,0.459375,...,270.0,2910.0,100.0,721.58,0,350.0,150.0,100.0,100.0,215.0
87567,2500,0.0,55,3,,556,33080.0,15,32514.0,0.982890,...,1704.0,7990.0,600.0,1733.01,4,2360.0,1260.0,1704.0,1435.0,1995.0
87568,2000,0.0,40,7,29.0,364,16560.0,16,15960.0,0.963768,...,880.0,2280.0,600.0,404.96,0,800.0,800.0,760.0,600.0,600.0


In [67]:
def filter_categorical_features(encoded_frame,
                                numerical_frame,
                                categorical_variable: Optional[Union[list, str]] = None,
                                categories_to_filter: Optional[list] = None,
                                reference_frame=None):
    """
    Keep selected category columns of an encoded frame and join them to the numerical frame.

    The columns to keep are taken from ``categories_to_filter`` when it is non-empty.
    Otherwise they are the distinct values of ``categorical_variable`` (one column
    name or a list of column names) looked up in ``reference_frame``.

    Parameters
    ----------
    encoded_frame : DataFrame
        Frame whose column names are compared against the selected category labels.
    numerical_frame : DataFrame
        Frame placed to the left of the selected columns; its index is restored on
        the result.
    categorical_variable : list or str, optional
        Column name(s) of ``reference_frame`` whose distinct values name the
        columns to keep. Used only when ``categories_to_filter`` is empty/None.
    categories_to_filter : list, optional
        Explicit column names to keep from ``encoded_frame``.
    reference_frame : DataFrame, optional
        Frame holding the raw categorical variables. When omitted, the
        notebook-level ``train`` frame is used (the original behaviour).

    Returns
    -------
    DataFrame
        ``numerical_frame`` columns followed by the selected ``encoded_frame``
        columns, joined by row position and indexed like ``numerical_frame``.

    Raises
    ------
    ValueError
        If neither ``categories_to_filter`` nor ``categorical_variable`` is given.
    """
    if categories_to_filter:
        wanted_categories = categories_to_filter
    else:
        if categorical_variable is None:
            raise ValueError(
                "Provide either 'categorical_variable' or a non-empty 'categories_to_filter'."
            )
        if reference_frame is None:
            # Backward-compatible fallback to the notebook global used originally.
            reference_frame = train
        if isinstance(categorical_variable, list):
            variables = categorical_variable
        else:
            variables = [categorical_variable]
        wanted_categories = []
        for _variable in variables:
            wanted_categories.extend(reference_frame[_variable].unique().tolist())

    filtered_frame = encoded_frame[encoded_frame.columns[encoded_frame.columns.isin(wanted_categories)]]

    # Join by row position (both indexes reset), then restore the numerical frame's index.
    final_df = pd.concat([numerical_frame.reset_index(drop=True),
                          filtered_frame.reset_index(drop=True)], axis=1)
    final_df.index = numerical_frame.index

    return final_df


def encode_and_drop(full_array, data_type, tr_encoder=None):
    """
    Split a frame into numeric and non-numeric columns and one-hot encode the latter.

    Parameters
    ----------
    full_array : DataFrame
        Frame containing both numeric and non-numeric columns.
    data_type : str
        Forwarded to the encoder as ``type_of_data`` (the notebook passes "train";
        a commented-out call passes "test").
    tr_encoder : optional
        Encoder to reuse. It must expose ``one_hot_encoding(categorical_frame=...,
        type_of_data=..., conv=..., drop=..., handle_unknown=...)``. When ``None``,
        a new ``FeatureEncoding`` instance is created.

    Returns
    -------
    tuple
        ``(numerical_array, encoded_array, final_array, encoder)`` where
        ``final_array`` is the numeric columns followed by the encoded columns,
        joined by row position and indexed like ``numerical_array``.
    """
    non_numeric_columns = full_array.select_dtypes(exclude=['number']).columns
    categorical_array = full_array[non_numeric_columns]
    numerical_array = full_array.drop(columns=non_numeric_columns)

    # Compare against None explicitly: a supplied encoder must be reused even if
    # it happens to evaluate as falsy (e.g. defines __len__ or __bool__).
    encoder = FeatureEncoding() if tr_encoder is None else tr_encoder

    encoded_array = encoder.one_hot_encoding(
        categorical_frame=categorical_array,
        type_of_data=data_type,
        conv=True,
        drop=None,
        handle_unknown="ignore"
    )
    # Join by row position, then restore the original index on the combined frame.
    final_array = pd.concat([numerical_array.reset_index(drop=True),
                             encoded_array.reset_index(drop=True)], axis=1)
    final_array.index = numerical_array.index

    return numerical_array, encoded_array, final_array, encoder

# For x_train: no encoder is passed, so encode_and_drop creates a new FeatureEncoding.
(numerical_df,
 encoded_categories,
 encoded_train,
 encoding_obj) = encode_and_drop(full_array=x_train, data_type="train", tr_encoder=None)
# For x_test
# NOTE: encode_and_drop returns four values; the commented-out call below unpacks only three.
# cat_features_test, encoded_test, encoding_obj = encode_and_drop(test, "test", encoding_obj)

In [71]:
# Numerical columns plus the encoded columns named after the distinct
# 'Occupation' and 'rateTypeEntity' values.
filtered_df = filter_categorical_features(
    encoded_frame=encoded_categories,
    numerical_frame=numerical_df,
    categorical_variable=['Occupation', 'rateTypeEntity'],
)

In [None]:
## Feature Scaling

In [None]:
## Model Training
# NOTE(review): `model` is not defined anywhere in the visible notebook; a fitted
# estimator exposing `.predict` must exist before this cell can run.
# NOTE(review): x_test is built from the raw `test` frame (no encoding applied here) —
# confirm it matches the feature layout the model was trained on.
payment_cols = ["m1", "m2", "m3", "m4", "m5", "m6"]
new_payments_df = test[payment_cols]
y_test = pd.concat([new_payments_df, new_payments_df.T.stack().reset_index(name='y')['y']], axis=1)[["y"]]

# Drop the payment *columns*: without `columns=`/`axis=1`, drop() treats the list as row labels.
x_test = test.drop(columns=payment_cols)
step_predictions = []
for _ in range(6):
    # TODO AP:
    # Predict once per step (the original called predict twice and kept the raw result).
    y_pred_np_arr = model.predict(x_test)
    # Convert y_pred_np_arr into y_pred_df, indexed like x_test so the axis=1 concat aligns.
    # Assumes predict returns one value per row (1-D or (n, 1)) — TODO confirm.
    y_pred_df = pd.DataFrame(y_pred_np_arr, index=x_test.index, columns=["y_pred"])
    step_predictions.append(y_pred_df)
    # Feed the prediction back in as the next step's 'new_payment' feature.
    x_test = pd.concat([x_test, y_pred_df], axis=1)
    x_test.rename(columns={"y_pred": "new_payment"}, inplace=True)
    x_test = FeatureEngineering().get_updated_df(base_df=x_test)
    # all_x_dfs[f"df_{i}"] = temp_df

# One concat after the loop instead of growing a DataFrame on every iteration.
y_pred = pd.concat(step_predictions, ignore_index=True)

In [None]:
## Inference: model, X  -> yPred

In [None]:
# Submission File, Calculation of Metric (RMSE)
# Input: y_test(n X 6), y_pred(n X 6)
# Output: RMSE



In [None]:
# Outlier detection (if any)

# TASKS
# Tasks
    - Feature selection  Aum
        - Automated selection based on Gini Index

    - Pipeline setup  Nikhil
        - Pre-process
            - ingestion (DONE)
            - Features (DONE)
            - Encoding (Pending)
            - scaling (if applicable)
        - Modelling
            - Approach2
            - Sliding updates for features for Train and test

        - validation
            - Calculate RMSE

In [None]:
# Drop Useless Columns
def drop_cols(df):
    """Remove a fixed set of columns from ``df`` in place.

    Nothing is returned; the frame passed in is modified directly. ``DataFrame.drop``
    is used with its default error handling, so a ``KeyError`` is raised when any of
    the listed columns is missing from ``df``.
    """
    unwanted = (
        "ID",
        "UpsellDate",
        "PaymentMethod",
        "TransactionDates",
        "PaymentsHistory",
        "SupplierName",
        "Town",
        "RegistrationDateParsed",
        "ExpectedTermDateParsed",
        "FirstPaymentDateParsed",
        "LastPaymentDateParsed",
    )
    df.drop(columns=list(unwanted), inplace=True)


# NOTE(review): earlier cells already remove 'Town', 'UpsellDate', 'PaymentMethod',
# 'TransactionDates', 'PaymentsHistory' and 'SupplierName' from train/test, so on a
# top-to-bottom run these calls would raise KeyError — this cell looks like a leftover.
drop_cols(train)
drop_cols(test)
# Only a cell's last expression is displayed, so train.head() shows nothing here.
train.head()
test.head()
