In [1]:
import os
import sys
from pathlib import Path

mod_path = os.path.join(Path.cwd().parent)
if mod_path not in sys.path:
    sys.path.append(mod_path)

In [2]:
from typing import Union, Optional
from copy import deepcopy
import random
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import RepeatedKFold

from src.data.ingestion import DataIngestion
from src.features.engineering import FeatureEngineering
from src.features.encoding import FeatureEncoding
from src.model.tree_based import ModelXgBoost, ModelLightGBM
from src.data.validation_file import SubmissionFile

np.random.seed(0)
random.seed(0)

### Best estimator 
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.05, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=20, reg_lambda=20, scale_pos_weight=1, subsample=1,
             tree_method='exact', use_label_encoder=False,
             validate_parameters=1, verbosity=0)

## Functions

In [3]:
def filter_categorical_features(overall_df,
                                encoded_frame,
                                categorical_variable: Optional[Union[list, str]] = None,
                                categories_to_filter: Optional[list] = None):
    """
    Filter category or categories in a specifc or multiple categorical variables from the encoded frame

    """    
    # Categories to filter
    if categories_to_filter:
        filtered_frame = encoded_frame[encoded_frame.columns[encoded_frame.columns.isin(categories_to_filter)]]
    else:
        categories = []
        if isinstance(categorical_variable, list):
            for _variable in categorical_variable:
                categories.extend(overall_df[_variable].unique().tolist())
        else:
            categories.extend(overall_df[categorical_variable].unique().tolist())
        filtered_frame = encoded_frame[encoded_frame.columns[encoded_frame.columns.isin(categories)]]
    
    return filtered_frame


def encode_and_drop(full_array, data_type, tr_encoder=None):
    categorical_array = full_array[full_array.select_dtypes(exclude=['number']).columns]
    numerical_array = full_array.drop(columns=full_array.select_dtypes(exclude=['number']).columns)
    if not tr_encoder:
        encoder = FeatureEncoding()
    else:
        encoder = tr_encoder
    encoded_array = encoder.one_hot_encoding(
    categorical_frame=categorical_array,
    type_of_data=data_type,
    conv=True,
    drop=None,
    handle_unknown="ignore"
)
    final_array = pd.concat([numerical_array.reset_index(drop=True),
                             encoded_array.reset_index(drop=True)], axis=1)
    final_array.index = numerical_array.index
    encoded_array.index = numerical_array.index

    return numerical_array, encoded_array, final_array, encoder

def make_train_data(payment_df, train_data):
    all_x_dfs = {"df_0": train_data.drop(["m1", "m2", "m3", "m4", "m5", "m6"], axis=1)}
    for i in range(1, 6):
        temp_df = deepcopy(all_x_dfs[f"df_{i - 1}"])
        m_df = payment_df[[f"m{i}"]]
        temp_df = pd.concat([temp_df, m_df], axis=1)
        temp_df.rename(columns={f"m{i}": "new_payment"}, inplace=True)
        temp_df = FeatureEngineering().get_updated_df(base_df=temp_df)
        all_x_dfs[f"df_{i}"] = temp_df

    x_train = pd.concat([all_x_dfs[f"df_{i}"] for i in range(6)], ignore_index=True)
    x_train.drop(columns=['SplitPaymentsHistory'], inplace=True)
    
    return x_train

# Data Ingestion

In [4]:
# Should be run always [train and test]
train, test = DataIngestion(mode="dev").execute()
train.dropna(subset=['Region'], how='all', inplace=True)
test.dropna(subset=['Region'], how='all', inplace=True)
train.drop(columns=['Town'], inplace=True)
test.drop(columns=['Town'], inplace=True)
# test = DataIngestion(mode="test").execute()
train['Age'] = train['Age'].fillna(train['Age'].mean())
test['Age'] = test['Age'].fillna(test['Age'].mean())

  metadata = self._read_file(self.meta_data_path)


# Feature Engineering

In [5]:
# FEATURE_ID_MAPPING = {
#     "Occupation": 0,
#     "MainApplicantGender": 1
# }
train = FeatureEngineering().execute(train)
test = FeatureEngineering().execute(test)

# X_Train and Y_Train Formulation

In [6]:
# Prepare data for Train
import pandas as pd

new_payments_df = train[["m1", "m2", "m3", "m4", "m5", "m6"]]
y_train = pd.concat([new_payments_df, new_payments_df.T.stack().reset_index(name='y')['y']], axis=1)[["y"]]

x_train = make_train_data(new_payments_df, train)

In [7]:
# ID variable is saved separately
# x_train_id = x_train[['ID']]
x_train.drop(columns=['ID'], inplace=True)

test_id = test[['ID']]
test.drop(columns=['ID'], inplace=True)

# Encoding

In [8]:
# filter_cols = [cols for cols, coef in zip(filtered_df.columns, model_two.coef_) if abs(coef) >= 0.05]
# Features selected via any of the feature selection methods
filter_cols = ['MONTHLY','WEEKLY','Female','Male','Coast Region','Mount Kenya Region','Nairobi Region',
               'North Rift','Nyanza','South Rift','Business','Driver/Motorbike Rider','Farmer',
               'Government Employee','Labourer','Other','Teacher']

In [8]:
# For x_train
numerical_df, encoded_categories, encoded_train, encoding_obj = encode_and_drop(x_train, "train", None)

filtered_df = filter_categorical_features(overall_df=train, 
                                          encoded_frame=encoded_categories, 
                                          categorical_variable=['Occupation', 'Region', 'rateTypeEntity', 'MainApplicantGender'], 
#                                           categories_to_filter=filter_cols
                                         )

# Concatenating with the Numerical frame 
numerical_frame = numerical_df[
    [
#         'Deposit', 
#         'AccessoryRate', 
#         'RatePerUnit', 
#         'DaysOnDeposit', 
#         'Age',
#         'Term', 
#         'TotalContractValue', 
        'b1', 
        'b2', 
        'b3', 
        'b4', 
        'b5'
    ]
]  # Filtering required variables

_index = numerical_frame.index
filtered_df = pd.concat([filtered_df.reset_index(drop=True), 
                         numerical_frame.reset_index(drop=True)], axis=1)
filtered_df.index = _index

In [9]:
# Feature Scaling

# Model Training

### Optimisation

In [10]:
param_grid = {
#     'n_estimators': [400, 700, 1000],
#     'colsample_bytree': [0.3, 0.4, 0.5, 0.7, 0.8],
#     'max_depth': [3,5,6,10, 12, 13, 15, 20, 25],
    'learning_rate': [0.05, 0.1],
    'reg_alpha': [1, 10, 15, 20],
    'reg_lambda': [20, 25, 30, 35, 40]
#     'subsample': [0.7, 0.8, 0.9]
}

# params = {
#     "learning_rate": [0.05, 0.1, 0.15, 0.2, 0.25, 0.3],
#     "max_depth": [3, 4, 5, 6, 8, 10, 12, 13],
#     "min_child_weight": [1, 3, 5, 7],
#     "gamma": [0.0, 0.1, 0.2, 0.3, 0.4],
#     "colsample_bytree": [0.3, 0.4, 0.5, 0.7]
# }

In [87]:
# Preparing the TRAIN data for approach two and fitting the model
optimise_model = False

model_two_obj = ModelXgBoost(train_array=filtered_df,
                             train_target=y_train)
if optimise_model:
    model_two = model_two_obj.train_model(optimise_model=optimise_model,
#                                           params={"learning_rate": 0.1,
#                                                   "reg_lambda": 30},
                                          params=param_grid,
                                          opt_method="grid_search")  # Default h.params (Checkout the code)
else:
    model_two_obj.train_model(**{'learning_rate': 0.05,
                                 'reg_alpha': 30,
                                 'reg_lambda': 10})  # Default h.params (Checkout the code)
    model_two = model_two_obj.trained_model

PARAMS-> {'objective': 'reg:squarederror', 'base_score': None, 'booster': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'gamma': None, 'gpu_id': None, 'importance_type': 'gain', 'interaction_constraints': None, 'learning_rate': 0.05, 'max_delta_step': None, 'max_depth': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'n_estimators': 100, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': 0, 'reg_alpha': 30, 'reg_lambda': 10, 'scale_pos_weight': None, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': 0, 'use_label_encoder': False}


In [None]:
best params {'learning_rate': 0.05, 'reg_alpha': 20, 'reg_lambda': 20}

In [13]:
model_two.best_estimator_

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.05, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=20, reg_lambda=20, scale_pos_weight=1, subsample=1,
             tree_method='exact', use_label_encoder=False,
             validate_parameters=1, verbosity=0)

### Lasso

In [10]:
# Preparing the TRAIN data for approach two and fitting the model

# define model evaluation method
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# define model
model = linear_model.ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1], alphas=np.arange(0, 1, 0.01),
                                  cv=cv, n_jobs=-1)

# model = linear_model.ElasticNet(l1_ratio=.5, 
#                                            random_state=100,
#                                            normalize=False)
model_two = model.fit(X=filtered_df, y=y_train)

  return f(*args, **kwargs)
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


In [15]:
len(filtered_df.columns)

31

In [12]:
model_two.coef_

array([-0.00000000e+00, -6.19826917e+01,  9.09901746e+01, -4.62486661e+00,
        3.72347017e+00,  1.18278167e+01, -2.14624210e+00,  2.02530797e+01,
       -1.63854488e+01, -1.92058714e+01,  4.20751624e-01,  0.00000000e+00,
       -5.12219618e+00,  2.52319876e+00, -1.07470423e+01,  5.62586180e+01,
       -5.56129542e+00, -6.24999902e-02,  6.15183287e+01, -8.47196303e-03,
        3.63680090e+00,  1.00926645e-01,  4.47286324e+00,  1.62408298e+00,
        2.67995198e-01,  3.35337618e-03,  1.61531764e-01,  1.85293519e-01,
        1.21651911e-01,  1.27613581e-01,  1.17930675e-01])

In [17]:
f =[cols for cols, coef in zip(filtered_df.columns, model_two.coef_) if abs(coef) >= 0.05]
len(f)

27

# Model Inference

In [88]:
y_test = test[["m1", "m2", "m3", "m4", "m5", "m6"]]
x_test = test.drop(columns=["m1", "m2", "m3", "m4", "m5", "m6", "SplitPaymentsHistory"])

In [89]:
# For x_test
split_payment_history = test[["SplitPaymentsHistory"]]

numerical_df_test, encoded_categories_test, encoded_test, encoding_obj_t = encode_and_drop(x_test, "test", encoding_obj)
filtered_df_test = filter_categorical_features(overall_df=x_test,
                                               encoded_frame=encoded_categories_test,
                                               categorical_variable=['Occupation', 'Region', 'rateTypeEntity', 'MainApplicantGender'], 
#                                                categories_to_filter=filter_cols
                                              )
_indexes = filtered_df_test.index

# Concatenating with the Numerical frame 
numerical_frame_test = numerical_df_test[
    [
#         'Deposit', 
#         'AccessoryRate', 
#         'RatePerUnit', 
#         'DaysOnDeposit', 
#         'Age',
#         'Term', 
#         'TotalContractValue', 
        'b1', 
        'b2', 
        'b3', 
        'b4', 
        'b5'
    ]
]  # Filtering required variables

_index = numerical_frame_test.index
filtered_df_test = pd.concat([filtered_df_test.reset_index(drop=True),
                              numerical_frame_test.reset_index(drop=True),], axis=1)
filtered_df_test.index = _index

In [90]:
# Inference on test data
predict_dict = dict()
for predict_col in y_test.columns:
    predict_dict[f"{predict_col}_pred"] = model_two.predict(filtered_df_test)
    y_pred_df = pd.DataFrame(predict_dict[f"{predict_col}_pred"], columns=[f"{predict_col}_pred"])
    payment_df = filtered_df_test[["b1", "b2", "b3", "b4", "b5"]]
    filtered_df_test.drop(columns=["b1", "b2", "b3", "b4", "b5"], inplace=True)

    temp_df = pd.concat([payment_df.reset_index(drop=True),
                         y_pred_df.reset_index(drop=True), 
                         split_payment_history.reset_index(drop=True)], axis=1)
    temp_df.index = _indexes  # Making sure indexes are maintained
    temp_df.rename(columns={f"{predict_col}_pred": "new_payment"}, inplace=True)
    temp_df = FeatureEngineering().get_updated_df(base_df=temp_df)
    split_payment_history = temp_df[["SplitPaymentsHistory"]]
    temp_df.drop(columns=["SplitPaymentsHistory"], inplace=True)
    filtered_df_test = pd.concat([filtered_df_test, temp_df], axis=1)
    
del split_payment_history, temp_df

# Calculation of Metric

In [91]:
pred_frame = pd.DataFrame(predict_dict)
pred_frame.index = _indexes

In [92]:
full_test_array = pd.concat([filtered_df_test, y_test, pred_frame], axis=1)

In [93]:
full_test_array = pd.merge(full_test_array, test_id, how='left', left_index=True, right_index=True)

In [94]:
sub_file = SubmissionFile(
    validation_data=full_test_array,
    type_of_data='validation'
).execute()

In [95]:
sub_file['SquaredError'] = np.square(sub_file['Target'] - sub_file['Prediction'])

In [96]:
rmse = np.sqrt(np.sum(sub_file['SquaredError'])/sub_file.shape[0])
print('Final RMSE --> ', rmse)

Final RMSE -->  774.3372977848034


# Submission

In [97]:
# Data Ingestion
main_train = DataIngestion(mode="train").execute()
main_test = DataIngestion(mode="test").execute()

main_train.dropna(subset=['Region'], how='all', inplace=True)

main_train.drop(columns=['Town'], inplace=True)
main_test.drop(columns=['Town'], inplace=True)

main_train['Age'] = main_train['Age'].fillna(main_train['Age'].mean())
main_test['Age'] = main_test['Age'].fillna(main_test['Age'].mean())

  metadata = self._read_file(self.meta_data_path)


In [98]:
# Feature Engineering
main_train = FeatureEngineering().execute(main_train)
main_test = FeatureEngineering().execute(main_test)

In [99]:
# Prepare data for Train
import copy
import pandas as pd

payments_df = main_train[["m1", "m2", "m3", "m4", "m5", "m6"]]
main_y_train = pd.concat([payments_df, payments_df.T.stack().reset_index(name='y')['y']], axis=1)[["y"]]

x_train_overall = make_train_data(payments_df, main_train)

# ID variable is saved separately
x_train_overall.drop(columns=['ID'], inplace=True)

main_test_id = main_test[['ID']]
main_test.drop(columns=['ID'], inplace=True)

In [100]:
# For x_train_overall
overall_numerical_df, overall_encoded_categories, overall_encoded_train, overall_encoding_obj = encode_and_drop(x_train_overall, "train", None)

overall_filtered_df = filter_categorical_features(overall_df=main_train,
                                                  encoded_frame=overall_encoded_categories, 
                                                  categorical_variable=['Occupation', 'Region', 'rateTypeEntity', 'MainApplicantGender'], 
#                                                   categories_to_filter=filter_cols
                                                 )

# Concatenating with the Numerical frame 
numerical_frame_ov = overall_numerical_df[
    [
#         'Deposit', 
#         'AccessoryRate', 
#         'RatePerUnit', 
#         'DaysOnDeposit', 
#         'Age',
#         'Term', 
#         'TotalContractValue', 
        'b1', 
        'b2', 
        'b3', 
        'b4', 
        'b5'
    ]
]  # Filtering required variables

_index = numerical_frame_ov.index
overall_filtered_df = pd.concat([overall_filtered_df.reset_index(drop=True),
                                 numerical_frame_ov.reset_index(drop=True)], axis=1)
overall_filtered_df.index = _index

In [101]:
# Preparing the TRAIN data for approach two and fitting the model

model_two_ov_obj = ModelXgBoost(train_array=overall_filtered_df,
                                train_target=main_y_train)
model_two_ov_obj.train_model(learning_rate=0.05,
                             reg_alpha=30,
                             reg_lambda=10)  # Default h.params (Checkout the code)
model_two_ov = model_two_ov_obj.trained_model

PARAMS-> {'objective': 'reg:squarederror', 'base_score': None, 'booster': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'gamma': None, 'gpu_id': None, 'importance_type': 'gain', 'interaction_constraints': None, 'learning_rate': 0.05, 'max_delta_step': None, 'max_depth': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'n_estimators': 100, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': 0, 'reg_alpha': 30, 'reg_lambda': 10, 'scale_pos_weight': None, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': 0, 'use_label_encoder': False}


In [70]:
# Preparing the TRAIN data for approach two and fitting the model using Lasso

# define model evaluation method
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# define model
# model = linear_model.LassoCV(alphas=np.arange(0, 1, 0.01), cv=cv, n_jobs=-1)

model = linear_model.ElasticNet(l1_ratio=0.99, 
                                random_state=100,
                                normalize=False,
                                alpha=0.06)
model_two_ov = model.fit(X=overall_filtered_df, y=main_y_train)

In [71]:
model_two_ov.coef_

array([-3.18901742e+00,  7.91841474e+00,  1.88835234e+01, -9.31092701e+00,
       -1.81774314e+01, -2.29409847e+00, -1.05230773e+01,  3.27709317e+01,
       -2.84309072e+00,  4.41632998e+01, -1.73475125e-02,  2.81644190e+00,
        7.15385662e-02,  3.93087784e+00,  1.32771723e+00,  2.45486127e-01,
        3.56889502e-03,  1.89872520e-01,  1.92111584e-01,  1.24919276e-01,
        1.24563108e-01,  1.03587363e-01])

In [102]:
# For x_test
x_test_submission = main_test.drop(columns=["SplitPaymentsHistory"])
split_payment_history = main_test[["SplitPaymentsHistory"]]

sub_numerical_df_test, sub_encoded_categories_test, sub_encoded_test, sub_encoding_obj_t = encode_and_drop(x_test_submission, "test", overall_encoding_obj)
sub_filtered_df_test = filter_categorical_features(overall_df=main_test,
                                                   encoded_frame=sub_encoded_categories_test, 
                                                   categorical_variable=['Occupation', 'Region', 'rateTypeEntity', 'MainApplicantGender'], 
#                                                    categories_to_filter=filter_cols
                                                  )
_indexes = sub_filtered_df_test.index

# Concatenating with the Numerical frame 
sub_numerical_frame = sub_numerical_df_test[
    [
#         'Deposit', 
#         'AccessoryRate', 
#         'RatePerUnit', 
#         'DaysOnDeposit', 
#         'Age',
#         'Term', 
#         'TotalContractValue', 
        'b1', 
        'b2', 
        'b3', 
        'b4', 
        'b5'
    ]
]  # Filtering required variables

_index = sub_numerical_frame.index
sub_filtered_df_test = pd.concat([sub_filtered_df_test.reset_index(drop=True),
                                  sub_numerical_frame.reset_index(drop=True)], axis=1)
sub_filtered_df_test.index = _index

In [103]:
# Inference on submission data
sub_predict_dict = dict()
for predict_col in ["m1", "m2", "m3", "m4", "m5", "m6"]:
    sub_predict_dict[f"{predict_col}_pred"] = model_two_ov.predict(sub_filtered_df_test)
    y_pred_df = pd.DataFrame(sub_predict_dict[f"{predict_col}_pred"], columns=[f"{predict_col}_pred"])

    test_payment_df = sub_filtered_df_test[["b1", "b2", "b3", "b4", "b5"]]
    sub_filtered_df_test.drop(columns=["b1", "b2", "b3", "b4", "b5"], inplace=True)
    
    temp_df_test = pd.concat([test_payment_df.reset_index(drop=True),
                              y_pred_df.reset_index(drop=True), 
                              split_payment_history.reset_index(drop=True)], axis=1)
    temp_df_test.index = _indexes  # Making sure indexes are maintained
    temp_df_test.rename(columns={f"{predict_col}_pred": "new_payment"}, inplace=True)
    temp_df_test = FeatureEngineering().get_updated_df(base_df=temp_df_test)
    split_payment_history = temp_df_test[["SplitPaymentsHistory"]]
    temp_df_test.drop(columns=["SplitPaymentsHistory"], inplace=True)
    sub_filtered_df_test = pd.concat([sub_filtered_df_test, temp_df_test], axis=1)

del split_payment_history, temp_df_test

In [104]:
for k, v in sub_predict_dict.items():
#     predict_dict[k] = [0 if i < 0 else i for i in v]
    for _v in v:
        if _v < 0:
            print(_v)
# predict_dict

In [105]:
# Submission
sub_pred_frame_test = pd.DataFrame(sub_predict_dict)
sub_pred_frame_test.index = main_test.index

full_test_array_test = pd.concat([main_test, sub_pred_frame_test], axis=1)
full_test_array_test = pd.merge(full_test_array_test, main_test_id, how='left', left_index=True, right_index=True)

sub_file = SubmissionFile(
    validation_data=full_test_array_test,
    type_of_data='test'
).execute()
sub_file.reset_index(drop=True, inplace=True)

In [106]:
sub_file

Unnamed: 0,ID,Target
0,ID_000RHRU x m1,243.034348
1,ID_000RHRU x m2,248.458420
2,ID_000RHRU x m3,264.703125
3,ID_000RHRU x m4,290.786224
4,ID_000RHRU x m5,319.940796
...,...,...
56011,ID_ZZOKWZJ x m2,882.039551
56012,ID_ZZOKWZJ x m3,840.050293
56013,ID_ZZOKWZJ x m4,814.560364
56014,ID_ZZOKWZJ x m5,800.599792


In [107]:
sub_file.to_csv('../submissions/submission_approach_2_xgboost_cat_all_optimised_005_3010.csv', float_format="%.11f", 
                index=False)

In [36]:
sub_filtered_df_test.columns

Index(['DAILY', 'MONTHLY', 'WEEKLY', 'Female', 'Male', 'Coast Region',
       'Mount Kenya Region', 'Nairobi Region', 'North Rift', 'Nyanza',
       'South Rift', 'Western', 'Business', 'Driver/Motorbike Rider', 'Farmer',
       'Government Employee', 'Labourer', 'Other', 'Teacher', 'b1', 'b2', 'b3',
       'b4', 'b5'],
      dtype='object')

In [37]:
model_two_ov.get_params

<bound method XGBModel.get_params of XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', use_label_encoder=False,
             validate_parameters=1, verbosity=0)>

In [None]:
# Outlier detection (if any)

In [None]:
# Drop Useless Columns
def drop_cols(df):
    df.drop(
        [
            "ID",
            "UpsellDate",
            "PaymentMethod",
            "TransactionDates",
            "PaymentsHistory",
            "SupplierName",
            "Town",
            "RegistrationDateParsed",
            "ExpectedTermDateParsed",
            "FirstPaymentDateParsed",
            "LastPaymentDateParsed"
        ],
        inplace=True,
        axis=1
    )


drop_cols(train)
drop_cols(test)
train.head()
test.head()
