In [1]:
import os
import sys
from pathlib import Path

# Put the parent of the current working directory on the import path so the
# `src.*` packages imported by later cells can be resolved.
mod_path = os.fspath(Path.cwd().parent)
if sys.path.count(mod_path) == 0:
    sys.path.append(mod_path)

In [4]:
## Data Ingestion
from src.data.ingestion import DataIngestion
from src.features.encoding import FeatureEncoding
# Run the ingestion pipeline in "dev" mode and unpack the two objects it returns.
# NOTE(review): set_seed=0 presumably pins the random split -- confirm in DataIngestion.execute.
train, test = (
    DataIngestion(mode="dev")
    .execute(set_seed=0)
)

Int64Index([    0,     2,     4,     5,     7,     8,    15,    19,    21,
               23,
            ...
            27995, 27996, 27998, 28000, 28001, 28002, 28003, 28004, 28005,
            28006],
           dtype='int64', length=15398)


In [2]:
## Feature Engineering
from src.features.engineering import FeatureEngineering

# Integer ids for the two categorical features.
# NOTE(review): not referenced by any other visible cell -- confirm it is still needed.
FEATURE_ID_MAPPING = dict(Occupation=0, MainApplicantGender=1)

# Each split is processed by its own, freshly constructed FeatureEngineering instance.
train = FeatureEngineering().execute(train)
test = FeatureEngineering().execute(test)

In [3]:
## Feature Encoding
cat_features = ["Occupation", "MainApplicantGender"]
encoding_obj = FeatureEncoding()

# Options common to both splits, spelled once so train and test cannot drift apart.
shared_encoding_kwargs = dict(conv=True, drop="first", handle_unknown="error")

# Train split.
encoded_df = encoding_obj.one_hot_encoding(
    categorical_frame=train[cat_features],
    type_of_data="train",
    **shared_encoding_kwargs,
)
# Not the last expression of the cell, so this preview is evaluated but not rendered.
encoded_df.head()

# Test split, encoded through the same FeatureEncoding object.
encoded_test_df = encoding_obj.one_hot_encoding(
    categorical_frame=test[cat_features],
    type_of_data="test",
    **shared_encoding_kwargs,
)
encoded_test_df.head()


COLUMN-------
Occupation
['Teacher' 'Farmer' 'Business' 'Government Employee' 'Other' 'Labourer'
 'Driver/Motorbike Rider']
COLUMN-------
MainApplicantGender
['Male' 'Female']


Unnamed: 0,Driver/Motorbike Rider,Farmer,Government Employee,Labourer,Other,Teacher,Male
0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [4]:
## X_Train and Y_Train Formulation

import copy
import pandas as pd

# Monthly payment columns; each one is used as the prediction target in turn.
PAYMENT_COLS = ["m1", "m2", "m3", "m4", "m5", "m6"]
new_payments_df = train[PAYMENT_COLS]

# Targets: all rows of m1, then all rows of m2, ... stacked into one "y" column
# with a fresh 0..N-1 index.
# Built straight from the columns instead of an axis=1 concat with the original
# frame: that concat aligned on the union of train's index labels and the
# stacked RangeIndex, so the row count and order of y depended on what labels
# train happened to carry. `.stack()` also dropped NaN targets silently, which
# would shift every later target out of line with its feature row.
y_train = pd.concat(
    [new_payments_df[col] for col in PAYMENT_COLS],
    ignore_index=True,
).to_frame(name="y")

# Features: df_0 is the frame without any payment column; df_i is df_{i-1} with
# month i's payment appended as "new_payment" and passed through get_updated_df.
# NOTE(review): get_updated_df presumably folds "new_payment" into the running
# features -- confirm in FeatureEngineering.
all_x_dfs = {"df_0": train.drop(columns=PAYMENT_COLS)}
for i in range(1, len(PAYMENT_COLS)):
    # pd.concat already returns a new frame, so no explicit copy of df_{i-1} is needed.
    temp_df = pd.concat([all_x_dfs[f"df_{i-1}"], new_payments_df[[f"m{i}"]]], axis=1)
    temp_df = temp_df.rename(columns={f"m{i}": "new_payment"})
    temp_df = FeatureEngineering().get_updated_df(base_df=temp_df)
    all_x_dfs[f"df_{i}"] = temp_df

# Same block order as y_train: df_0 rows (target m1), df_1 rows (target m2), ...
x_train = pd.concat(
    [all_x_dfs[f"df_{i}"] for i in range(len(PAYMENT_COLS))],
    ignore_index=True,
)

# Row k of x_train must pair with row k of y_train.
assert len(x_train) == len(y_train), "x_train and y_train row counts differ"

# Release the intermediates; the per-step frames stay available in all_x_dfs.
del new_payments_df, temp_df


In [5]:
# Show both shapes, one per line, to check that features and targets have the same row count.
print(x_train.shape, y_train.shape, sep="\n")

(125886, 48)
(125886, 1)


In [14]:
## Feature Scaling

ValueError: too many values to unpack (expected 2)

In [None]:
## Test targets and month-by-month prediction on the test split
# NOTE(review): `model` is not defined in any visible cell; a fitted estimator
# exposing .predict() must exist before this cell can run.
PAYMENT_COLS = ["m1", "m2", "m3", "m4", "m5", "m6"]
new_payments_df = test[PAYMENT_COLS]

# Targets stacked month after month (all m1 rows, then all m2 rows, ...) into a
# single "y" column. Built directly from the columns so the result does not
# depend on how test's index labels align with the stacked RangeIndex.
y_test = pd.concat(
    [new_payments_df[col] for col in PAYMENT_COLS],
    ignore_index=True,
).to_frame(name="y")

# Drop the payment COLUMNS; without `columns=` (or axis=1) drop() looks for row
# labels named "m1".."m6" and raises KeyError.
x_test = test.drop(columns=PAYMENT_COLS)

step_predictions = []
for _ in range(len(PAYMENT_COLS)):
    # Wrap the predict() output in a one-column frame aligned to x_test's index,
    # so it can be concatenated both row-wise (y_pred) and column-wise (x_test).
    y_pred_df = pd.DataFrame(
        model.predict(x_test), index=x_test.index, columns=["y_pred"]
    )
    step_predictions.append(y_pred_df)

    # Feed this step's prediction back in as "new_payment" for the next step.
    x_test = pd.concat(
        [x_test, y_pred_df.rename(columns={"y_pred": "new_payment"})],
        axis=1,
    )
    x_test = FeatureEngineering().get_updated_df(base_df=x_test)

# One concat at the end: predictions stacked in the same month order as y_test.
y_pred = pd.concat(step_predictions, ignore_index=True)

In [None]:
## Inference: model, X  -> yPred

In [None]:
# Submission File, Calculation of Metric (RMSE)
# Input: y_test(n X 6), y_pred(n X 6)
# Output: RMSE



In [None]:
# Outlier detection (if any)

# TASKS
# Tasks
    - Feature selection  Aum
        - Automated selection based on Gini Index

    - Pipeline setup  Nikhil
        - Pre-process
            - ingestion (DONE)
            - Features (DONE)
            - Encoding (Pending)
            - scaling (if applicable)
        - Modelling
            - Approach2
            - Sliding updates for features for Train and test

        - validation
            - Calculate RMSE

In [None]:
# Drop Useless Columns
def drop_cols(df):
    """Remove a fixed list of columns from ``df`` in place.

    Args:
        df: DataFrame that contains every column named in the list below.

    Returns:
        None; ``df`` itself is modified.

    Raises:
        KeyError: raised by ``DataFrame.drop`` when a listed column is absent,
            for example when the function is called twice on the same frame.
    """
    unwanted_columns = [
        "ID",
        "UpsellDate",
        "PaymentMethod",
        "TransactionDates",
        "PaymentsHistory",
        "SupplierName",
        "Town",
        "RegistrationDateParsed",
        "ExpectedTermDateParsed",
        "FirstPaymentDateParsed",
        "LastPaymentDateParsed",
    ]
    df.drop(columns=unwanted_columns, inplace=True)
# Strip the listed columns from both splits (in place), train first, then test.
for _split in (train, test):
    drop_cols(_split)
# Only the final expression of a cell is rendered, so just the test preview is shown.
train.head()
test.head()
