In [1]:
import os
import sys
from pathlib import Path

# Put the parent of the current working directory on the import path
# (once) so the `src.*` packages imported below can be resolved.
mod_path = str(Path.cwd().parent)
if sys.path.count(mod_path) == 0:
    sys.path.append(mod_path)

In [2]:
from typing import Union, Optional
from copy import deepcopy
import random
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import RepeatedKFold

from src.data.ingestion import DataIngestion
from src.features.engineering import FeatureEngineering
from src.features.encoding import FeatureEncoding
from src.model.tree_based import ModelXgBoost
from src.data.validation_file import SubmissionFile

# Seed both global random number generators (NumPy and the stdlib) with the
# same value so that stochastic steps are repeatable.
SEED = 0
np.random.seed(SEED)
random.seed(SEED)

## Functions

In [3]:
def filter_categorical_features(overall_df,
                                encoded_frame,
                                categorical_variable: Optional[Union[list, str]] = None,
                                categories_to_filter: Optional[list] = None):
    """
    Keep only selected category columns of a one-hot encoded frame.

    Parameters
    ----------
    overall_df : pandas.DataFrame
        Un-encoded frame; only read when ``categories_to_filter`` is empty/None.
    encoded_frame : pandas.DataFrame
        Encoded frame whose column labels are category names.
    categorical_variable : list or str, optional
        Column name(s) of ``overall_df`` whose unique values define the
        categories to keep. Ignored when ``categories_to_filter`` is given.
    categories_to_filter : list, optional
        Explicit category names to keep; takes precedence when non-empty.

    Returns
    -------
    pandas.DataFrame
        The columns of ``encoded_frame`` whose label is one of the wanted
        categories, in their original column order.
    """
    if categories_to_filter:
        wanted = categories_to_filter
    else:
        # Treat a single variable name as a one-element list.
        if isinstance(categorical_variable, list):
            variables = categorical_variable
        else:
            variables = [categorical_variable]
        wanted = [
            category
            for variable in variables
            for category in overall_df[variable].unique().tolist()
        ]

    keep_mask = encoded_frame.columns.isin(wanted)
    return encoded_frame[encoded_frame.columns[keep_mask]]


def encode_and_drop(full_array, data_type, tr_encoder=None):
    """
    Split a frame into numeric and non-numeric columns and one-hot encode the latter.

    Parameters
    ----------
    full_array : pandas.DataFrame
        Frame holding both numeric and non-numeric columns.
    data_type : str
        Forwarded to the encoder as ``type_of_data`` (this notebook passes
        "train" or "test").
    tr_encoder : optional
        Encoder object to reuse. When ``None`` a new ``FeatureEncoding`` is
        created.

    Returns
    -------
    tuple
        ``(numerical_array, encoded_array, final_array, encoder)`` where
        ``final_array`` is the numeric columns followed by the encoded
        columns. ``encoded_array`` and ``final_array`` both carry the index of
        ``numerical_array``.
    """
    # Select the non-numeric columns once and reuse the result for both splits.
    non_numeric_columns = full_array.select_dtypes(exclude=['number']).columns
    categorical_array = full_array[non_numeric_columns]
    numerical_array = full_array.drop(columns=non_numeric_columns)

    # Compare with None explicitly: a truthiness test would discard a valid
    # encoder object that happens to evaluate as falsy (e.g. one that defines
    # __len__ or __bool__) and silently replace it with a fresh one.
    encoder = FeatureEncoding() if tr_encoder is None else tr_encoder

    encoded_array = encoder.one_hot_encoding(
        categorical_frame=categorical_array,
        type_of_data=data_type,
        conv=True,
        drop=None,
        handle_unknown="ignore"
    )

    # Concatenate by position (indexes reset), then restore the original
    # row labels on both the combined and the encoded frame.
    final_array = pd.concat([numerical_array.reset_index(drop=True),
                             encoded_array.reset_index(drop=True)], axis=1)
    final_array.index = numerical_array.index
    encoded_array.index = numerical_array.index

    return numerical_array, encoded_array, final_array, encoder

def make_train_data(payment_df, train_data):
    """
    Build the stacked training feature frame from six successive snapshots.

    The first snapshot is ``train_data`` without the ``m1``..``m6`` columns.
    Each of the next five snapshots takes the previous one, appends
    ``payment_df['m<i>']`` (i = 1..5) under the name ``new_payment`` and runs
    it through ``FeatureEngineering().get_updated_df``. The six snapshots are
    concatenated row-wise with a fresh index and the ``SplitPaymentsHistory``
    column is removed.

    Parameters
    ----------
    payment_df : pandas.DataFrame
        Frame providing the columns ``m1``..``m5``.
    train_data : pandas.DataFrame
        Engineered training frame containing ``m1``..``m6``.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the six snapshots.
    """
    snapshots = [train_data.drop(["m1", "m2", "m3", "m4", "m5", "m6"], axis=1)]

    for month in range(1, 6):
        payment_col = f"m{month}"
        stage = deepcopy(snapshots[-1])
        stage = pd.concat([stage, payment_df[[payment_col]]], axis=1)
        stage.rename(columns={payment_col: "new_payment"}, inplace=True)
        snapshots.append(FeatureEngineering().get_updated_df(base_df=stage))

    x_train = pd.concat(snapshots, ignore_index=True)
    x_train.drop(columns=['SplitPaymentsHistory'], inplace=True)

    return x_train

# Data Ingestion

In [4]:
# Should be run always [train and test]
train, test = DataIngestion(mode="dev").execute()
for _frame in (train, test):
    # Rows without a Region are discarded; the Town column is not used.
    _frame.dropna(subset=['Region'], how='all', inplace=True)
    _frame.drop(columns=['Town'], inplace=True)
# test = DataIngestion(mode="test").execute()

  exec(code_obj, self.user_global_ns, self.user_ns)


# Feature Engineering

In [5]:
# FEATURE_ID_MAPPING = {
#     "Occupation": 0,
#     "MainApplicantGender": 1
# }
# Run the same feature-engineering step on both splits, each with its own
# FeatureEngineering instance.
train, test = [FeatureEngineering().execute(_frame) for _frame in (train, test)]

# X_Train and Y_Train Formulation

In [6]:
# Prepare data for Train
import pandas as pd

new_payments_df = train[["m1", "m2", "m3", "m4", "m5", "m6"]]

# Target column: the payment columns stacked month by month (all rows of m1,
# then all rows of m2, ...), on a fresh 0..N-1 index.
# The stacked values are used directly. Concatenating them column-wise with
# `new_payments_df` first would align on index labels, and because `train`
# keeps its original labels that outer join can reorder or pad target rows.
y_train = new_payments_df.T.stack().reset_index(name='y')[["y"]]

x_train = make_train_data(new_payments_df, train)

# Features and targets are paired by position, so they must have equal length.
assert len(x_train) == len(y_train), (
    f"x_train has {len(x_train)} rows but y_train has {len(y_train)}"
)

In [7]:
# ID variable is saved separately
# x_train_id = x_train[['ID']]
x_train = x_train.drop(columns=['ID'])

# Keep the test IDs aside before removing the column from the features.
test_id = test.loc[:, ['ID']]
test = test.drop(columns=['ID'])

# Encoding

In [8]:
# [cols for cols, coef in zip(overall_filtered_df.columns, model_two_ov.coef_) if coef != 0.0]
# Features selected via any of the feature selection methods
_selected_category_levels = [
    'DAILY', 'MONTHLY', 'Female',
    'Coast Region', 'Nairobi Region', 'North Rift', 'Nyanza', 'South Rift', 'Western',
    'Business', 'Driver/Motorbike Rider', 'Government Employee', 'Labourer', 'Other', 'Teacher',
]
_selected_numeric_columns = [f"b{_lag}" for _lag in range(1, 6)]  # b1 .. b5
filter_cols = _selected_category_levels + _selected_numeric_columns

In [9]:
# For x_train
numerical_df, encoded_categories, encoded_train, encoding_obj = encode_and_drop(
    full_array=x_train, data_type="train", tr_encoder=None
)

# `filter_cols` is non-empty, so it decides which encoded columns are kept.
filtered_df = filter_categorical_features(
    overall_df=train,
    encoded_frame=encoded_categories,
    categorical_variable=['Occupation', 'Region', 'rateTypeEntity', 'MainApplicantGender'],
    categories_to_filter=filter_cols,
)

# Numerical columns used as model inputs. Currently left out:
# Deposit, AccessoryRate, RatePerUnit, DaysOnDeposit, Age, Term, TotalContractValue.
numerical_frame = numerical_df[['b1', 'b2', 'b3', 'b4', 'b5']]

# Concatenate by position, then restore the row labels of the numerical frame.
_index = numerical_frame.index
_positional_parts = [_part.reset_index(drop=True) for _part in (filtered_df, numerical_frame)]
filtered_df = pd.concat(_positional_parts, axis=1)
filtered_df.index = _index

In [10]:
# Feature Scaling

# Model Training

In [11]:
# Preparing the TRAIN data for approach two and fitting the model
# NOTE(review): the LassoCV cell further down rebinds `model_two`, so on a
# top-to-bottom run the model fitted here is not the one used for inference.

model_two_obj = ModelXgBoost(train_array=filtered_df,
                             train_target=y_train)
model_two_obj.train_model()  # Default hyper-parameters (see the ModelXgBoost code)
model_two = model_two_obj.trained_model

### Lasso

In [11]:
# Preparing the TRAIN data for approach two and fitting the model

# define model evaluation method
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# define model
# The alpha grid runs over 0.01 .. 0.99 and deliberately excludes 0: with
# alpha=0 the Lasso reduces to ordinary least squares, which the coordinate
# descent solver handles poorly (one convergence warning per fold).
model = linear_model.LassoCV(alphas=np.linspace(0.01, 0.99, 99), cv=cv, n_jobs=-1)

# linear_model_obj = linear_model.ElasticNet(l1_ratio=1., 
#                                            random_state=100,
#                                            normalize=False)
# `y_train` is a one-column frame; pass the column itself so the estimator
# receives the 1-D target it expects.
# NOTE(review): this rebinds `model_two`, replacing the XGBoost model fitted
# in the previous cell for all later inference cells.
model_two = model.fit(X=filtered_df, y=y_train["y"])

  return f(*args, **kwargs)
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

# Model Inference

In [12]:
# True payments are kept as the validation target; everything else except the
# raw split-payment history forms the feature frame.
_payment_cols = ["m1", "m2", "m3", "m4", "m5", "m6"]
y_test = test[_payment_cols]
x_test = test.drop(columns=_payment_cols + ["SplitPaymentsHistory"])

In [13]:
# For x_test
split_payment_history = test.loc[:, ["SplitPaymentsHistory"]]

# Reuse the encoder obtained from the training frame.
numerical_df_test, encoded_categories_test, encoded_test, encoding_obj_t = encode_and_drop(
    full_array=x_test, data_type="test", tr_encoder=encoding_obj
)
filtered_df_test = filter_categorical_features(
    overall_df=x_test,
    encoded_frame=encoded_categories_test,
    categorical_variable=['Occupation', 'Region', 'rateTypeEntity', 'MainApplicantGender'],
    categories_to_filter=filter_cols,
)
# Row labels of the test frame; reused by the inference cells below.
_indexes = filtered_df_test.index

# Numerical columns used as model inputs. Currently left out:
# Deposit, AccessoryRate, RatePerUnit, DaysOnDeposit, Age, Term, TotalContractValue.
numerical_frame_test = numerical_df_test[['b1', 'b2', 'b3', 'b4', 'b5']]

# Concatenate by position, then restore the row labels of the numerical frame.
_index = numerical_frame_test.index
_positional_parts = [_part.reset_index(drop=True) for _part in (filtered_df_test, numerical_frame_test)]
filtered_df_test = pd.concat(_positional_parts, axis=1)
filtered_df_test.index = _index

In [14]:
# Inference on test data
# One prediction per column of y_test. Each prediction is fed back as
# `new_payment` and passed through `get_updated_df`, and the returned columns
# replace b1..b5 in `filtered_df_test` before the next iteration.
# NOTE(review): presumably `get_updated_df` shifts the b1..b5 window using
# `new_payment` — confirm in FeatureEngineering.
# NOTE: `filtered_df_test` and `split_payment_history` are modified by this
# loop, so re-running this cell alone starts from the already-updated state.
predict_dict = dict()
for predict_col in y_test.columns:
    predict_dict[f"{predict_col}_pred"] = model_two.predict(filtered_df_test)
    y_pred_df = pd.DataFrame(predict_dict[f"{predict_col}_pred"], columns=[f"{predict_col}_pred"])
    # Take the current b1..b5 out of the feature frame ...
    payment_df = filtered_df_test[["b1", "b2", "b3", "b4", "b5"]]
    filtered_df_test.drop(columns=["b1", "b2", "b3", "b4", "b5"], inplace=True)

    # ... and combine them by position with the prediction and the history.
    temp_df = pd.concat([payment_df.reset_index(drop=True),
                         y_pred_df.reset_index(drop=True), 
                         split_payment_history.reset_index(drop=True)], axis=1)
    temp_df.index = _indexes  # Making sure indexes are maintained
    temp_df.rename(columns={f"{predict_col}_pred": "new_payment"}, inplace=True)
    temp_df = FeatureEngineering().get_updated_df(base_df=temp_df)
    # Carry the updated history over to the next iteration, then put the
    # remaining columns back into the feature frame.
    split_payment_history = temp_df[["SplitPaymentsHistory"]]
    temp_df.drop(columns=["SplitPaymentsHistory"], inplace=True)
    filtered_df_test = pd.concat([filtered_df_test, temp_df], axis=1)
    
del split_payment_history, temp_df

# Calculation of Metric

In [15]:
# One column per predicted month, labelled with the test frame's row labels.
pred_frame = pd.DataFrame(predict_dict, index=_indexes)

In [16]:
# Column-wise join (aligned on index) of the final feature frame, the true
# m1..m6 values and the predictions.
full_test_array = pd.concat([filtered_df_test, y_test, pred_frame], axis=1)

In [17]:
# Re-attach the ID column that was set aside earlier, matching rows on index.
full_test_array = pd.merge(full_test_array, test_id, how='left', left_index=True, right_index=True)

In [18]:
# Build the validation frame; the cells below read its `Target` and
# `Prediction` columns to compute the error metric.
sub_file = SubmissionFile(
    validation_data=full_test_array,
    type_of_data='validation'
).execute()

In [19]:
# Per-row squared difference between the true value and the prediction.
_residual = sub_file['Target'] - sub_file['Prediction']
sub_file['SquaredError'] = _residual * _residual

In [20]:
# Root of the summed squared errors divided by the number of rows.
_mean_squared_error = np.sum(sub_file['SquaredError']) / len(sub_file)
rmse = np.sqrt(_mean_squared_error)
print('Final RMSE --> ', rmse)

Final RMSE -->  823.4769540628521


# Submission

In [21]:
# Data Ingestion
main_train = DataIngestion(mode="train").execute()
main_test = DataIngestion(mode="test").execute()

# Only the training split loses rows without a Region; the test split keeps
# all of its rows.
main_train.dropna(subset=['Region'], how='all', inplace=True)

for _frame in (main_train, main_test):
    _frame.drop(columns=['Town'], inplace=True)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [22]:
# Feature Engineering
# Same step for both splits, each with its own FeatureEngineering instance.
main_train, main_test = [FeatureEngineering().execute(_frame) for _frame in (main_train, main_test)]

In [23]:
# Prepare data for Train
import copy
import pandas as pd

payments_df = main_train[["m1", "m2", "m3", "m4", "m5", "m6"]]

# Target column: the payment columns stacked month by month (all rows of m1,
# then all rows of m2, ...), on a fresh 0..N-1 index.
# The stacked values are used directly. Concatenating them column-wise with
# `payments_df` first would align on index labels, and because `main_train`
# keeps its original labels that outer join can reorder or pad target rows.
main_y_train = payments_df.T.stack().reset_index(name='y')[["y"]]

x_train_overall = make_train_data(payments_df, main_train)

# Features and targets are paired by position, so they must have equal length.
assert len(x_train_overall) == len(main_y_train), (
    f"x_train_overall has {len(x_train_overall)} rows but main_y_train has {len(main_y_train)}"
)

# ID variable is saved separately
x_train_overall.drop(columns=['ID'], inplace=True)

main_test_id = main_test[['ID']]
main_test.drop(columns=['ID'], inplace=True)

In [24]:
# For x_train_overall
overall_numerical_df, overall_encoded_categories, overall_encoded_train, overall_encoding_obj = encode_and_drop(
    full_array=x_train_overall, data_type="train", tr_encoder=None
)

# `filter_cols` is non-empty, so it decides which encoded columns are kept.
overall_filtered_df = filter_categorical_features(
    overall_df=main_train,
    encoded_frame=overall_encoded_categories,
    categorical_variable=['Occupation', 'Region', 'rateTypeEntity', 'MainApplicantGender'],
    categories_to_filter=filter_cols,
)

# Numerical columns used as model inputs. Currently left out:
# Deposit, AccessoryRate, RatePerUnit, DaysOnDeposit, Age, Term, TotalContractValue.
numerical_frame_ov = overall_numerical_df[['b1', 'b2', 'b3', 'b4', 'b5']]

# Concatenate by position, then restore the row labels of the numerical frame.
_index = numerical_frame_ov.index
_positional_parts = [_part.reset_index(drop=True) for _part in (overall_filtered_df, numerical_frame_ov)]
overall_filtered_df = pd.concat(_positional_parts, axis=1)
overall_filtered_df.index = _index

In [25]:
# Preparing the TRAIN data for approach two and fitting the model
# NOTE(review): the LassoCV cell that follows rebinds `model_two_ov`, so on a
# top-to-bottom run the model fitted here is not the one used for inference.

model_two_ov_obj = ModelXgBoost(train_array=overall_filtered_df,
                                train_target=main_y_train)
model_two_ov_obj.train_model()  # Default hyper-parameters (see the ModelXgBoost code)
model_two_ov = model_two_ov_obj.trained_model

In [36]:
# Preparing the TRAIN data for approach two and fitting the model using Lasso

# define model evaluation method
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# define model
# The alpha grid runs over 0.01 .. 0.99 and deliberately excludes 0: with
# alpha=0 the Lasso reduces to ordinary least squares, which the coordinate
# descent solver handles poorly (one convergence warning per fold).
model = linear_model.LassoCV(alphas=np.linspace(0.01, 0.99, 99), cv=cv, n_jobs=-1)

# linear_model_obj = linear_model.ElasticNet(l1_ratio=1., 
#                                            random_state=100,
#                                            normalize=False)
# `main_y_train` is a one-column frame; pass the column itself so the
# estimator receives the 1-D target it expects.
# NOTE(review): this rebinds `model_two_ov`, replacing the XGBoost model
# fitted in the previous cell for the submission inference below.
model_two_ov = model.fit(X=overall_filtered_df, y=main_y_train["y"])

  return f(*args, **kwargs)
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

In [26]:
# For x_test
split_payment_history = main_test.loc[:, ["SplitPaymentsHistory"]]
x_test_submission = main_test.drop(columns=["SplitPaymentsHistory"])

# Reuse the encoder obtained from the full training frame.
sub_numerical_df_test, sub_encoded_categories_test, sub_encoded_test, sub_encoding_obj_t = encode_and_drop(
    full_array=x_test_submission, data_type="test", tr_encoder=overall_encoding_obj
)
sub_filtered_df_test = filter_categorical_features(
    overall_df=main_test,
    encoded_frame=sub_encoded_categories_test,
    categorical_variable=['Occupation', 'Region', 'rateTypeEntity', 'MainApplicantGender'],
    categories_to_filter=filter_cols,
)
# Row labels of the submission frame; reused by the inference cell below.
_indexes = sub_filtered_df_test.index

# Numerical columns used as model inputs. Currently left out:
# Deposit, AccessoryRate, RatePerUnit, DaysOnDeposit, Age, Term, TotalContractValue.
sub_numerical_frame = sub_numerical_df_test[['b1', 'b2', 'b3', 'b4', 'b5']]

# Concatenate by position, then restore the row labels of the numerical frame.
_index = sub_numerical_frame.index
_positional_parts = [_part.reset_index(drop=True) for _part in (sub_filtered_df_test, sub_numerical_frame)]
sub_filtered_df_test = pd.concat(_positional_parts, axis=1)
sub_filtered_df_test.index = _index

In [27]:
# Inference on submission data
# One prediction per month m1..m6. Each prediction is fed back as
# `new_payment` and passed through `get_updated_df`, and the returned columns
# replace b1..b5 in `sub_filtered_df_test` before the next iteration.
# NOTE(review): presumably `get_updated_df` shifts the b1..b5 window using
# `new_payment` — confirm in FeatureEngineering.
# NOTE: `sub_filtered_df_test` and `split_payment_history` are modified by
# this loop, so re-running this cell alone starts from the updated state.
sub_predict_dict = dict()
for predict_col in ["m1", "m2", "m3", "m4", "m5", "m6"]:
    sub_predict_dict[f"{predict_col}_pred"] = model_two_ov.predict(sub_filtered_df_test)
    y_pred_df = pd.DataFrame(sub_predict_dict[f"{predict_col}_pred"], columns=[f"{predict_col}_pred"])

    # Take the current b1..b5 out of the feature frame ...
    test_payment_df = sub_filtered_df_test[["b1", "b2", "b3", "b4", "b5"]]
    sub_filtered_df_test.drop(columns=["b1", "b2", "b3", "b4", "b5"], inplace=True)
    
    # ... and combine them by position with the prediction and the history.
    temp_df_test = pd.concat([test_payment_df.reset_index(drop=True),
                              y_pred_df.reset_index(drop=True), 
                              split_payment_history.reset_index(drop=True)], axis=1)
    temp_df_test.index = _indexes  # Making sure indexes are maintained
    temp_df_test.rename(columns={f"{predict_col}_pred": "new_payment"}, inplace=True)
    temp_df_test = FeatureEngineering().get_updated_df(base_df=temp_df_test)
    # Carry the updated history over to the next iteration, then put the
    # remaining columns back into the feature frame.
    split_payment_history = temp_df_test[["SplitPaymentsHistory"]]
    temp_df_test.drop(columns=["SplitPaymentsHistory"], inplace=True)
    sub_filtered_df_test = pd.concat([sub_filtered_df_test, temp_df_test], axis=1)

del split_payment_history, temp_df_test

In [28]:
# Print every negative predicted value, month by month.
# (Clipping negatives to zero was tried and is left disabled:
#  predict_dict[k] = [0 if i < 0 else i for i in v])
for _column, _predictions in sub_predict_dict.items():
    for _value in _predictions:
        if not _value < 0:
            continue
        print(_value)
# predict_dict

-53.022404
-49.4473
-18.663033
-340.54553
-38.463596
-333.9259
-695.1669
-35.200977
-260.5055
-1.1518599


In [29]:
# Submission
# Predictions labelled with the rows of the submission test frame.
sub_pred_frame_test = pd.DataFrame(sub_predict_dict, index=main_test.index)

# Features + predictions side by side, then the saved IDs re-attached on index.
_features_and_predictions = pd.concat([main_test, sub_pred_frame_test], axis=1)
full_test_array_test = pd.merge(
    _features_and_predictions,
    main_test_id,
    how='left',
    left_index=True,
    right_index=True,
)

_submission_builder = SubmissionFile(validation_data=full_test_array_test, type_of_data='test')
sub_file = _submission_builder.execute()
sub_file.reset_index(drop=True, inplace=True)

In [30]:
# Preview of the submission frame (the rendered output is truncated by pandas).
sub_file

Unnamed: 0,ID,Target
0,ID_000RHRU x m1,189.322662
1,ID_000RHRU x m2,200.707245
2,ID_000RHRU x m3,236.926910
3,ID_000RHRU x m4,280.863251
4,ID_000RHRU x m5,349.235901
...,...,...
56011,ID_ZZOKWZJ x m2,905.588501
56012,ID_ZZOKWZJ x m3,860.071777
56013,ID_ZZOKWZJ x m4,819.686646
56014,ID_ZZOKWZJ x m5,821.556274


In [33]:
# Write the submission to disk with 11 decimal places.
# NOTE(review): `index` is left at its default, so the 0..N-1 row index is
# written as an unnamed first column — confirm the expected submission format
# (pass index=False if only ID/Target are wanted).
# NOTE(review): the file name mentions both XGBoost and LassoCV, and
# `model_two_ov` is bound by both of those cells — confirm which model
# produced the predictions being saved.
sub_file.to_csv('../submissions/submission_approach_2_xgboost_feature_selection_usingLassoCV.csv', float_format="%.11f")

In [None]:
# Outlier detection (if any)

In [None]:
# Drop Useless Columns
def drop_cols(df):
    """
    Remove the columns that are not used as features, in place.

    Columns from the list that are absent from ``df`` are skipped rather than
    raising ``KeyError``. Earlier cells of this notebook already drop some of
    them (for example ``Town`` and ``ID``), so the function can be applied to
    a frame at any stage and more than once.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.

    Returns
    -------
    None
    """
    df.drop(
        [
            "ID",
            "UpsellDate",
            "PaymentMethod",
            "TransactionDates",
            "PaymentsHistory",
            "SupplierName",
            "Town",
            "RegistrationDateParsed",
            "ExpectedTermDateParsed",
            "FirstPaymentDateParsed",
            "LastPaymentDateParsed"
        ],
        inplace=True,
        axis=1,
        errors="ignore"  # tolerate columns that were already removed
    )


# Apply the column clean-up to both splits, then preview them.
# Only the last expression of a cell is rendered, i.e. `test.head()`.
for _frame in (train, test):
    drop_cols(_frame)
train.head()
test.head()
