In [1]:
import os
import sys
from pathlib import Path

mod_path = os.path.join(Path.cwd().parent.parent)
if mod_path not in sys.path:
    sys.path.append(mod_path)

In [2]:
import pandas as pd
import numpy as np
from copy import deepcopy

from typing import List, Tuple
from sklearn.model_selection import StratifiedKFold

from src.data import *
from src.features.utils import *
from src.model.tree_based import ModelXgBoost

In [None]:
# data with shape 28007, 33 [transaction related features]
train = pd.read_csv('../../data/processed/train.csv')
test_set = pd.read_csv('../../data/processed/test.csv')

In [59]:
# data with shape 28007, 33 [transaction related features]
train = pd.read_csv('../../data/processed/train2.csv')
test_set = pd.read_csv('../../data/processed/test2.csv')

In [60]:
train.drop(columns = 'Unnamed: 0', inplace=True)
test_set.drop(columns = 'Unnamed: 0', inplace=True)
# test_set.columns

In [61]:
train.shape

(28007, 27)

In [6]:
# Region has certain NaN values which might cause issues while encoding
# As total NaNs constitute ~5% of the data (1446) we remove it as of now
print(train['Region'].isna().sum() / train.shape[0] * 100)
train.dropna(subset=['Region'], how='all', inplace=True)


5.162994965544328


In [62]:
# Region has certain NaN values which might cause issues while encoding
# As total NaNs constitute ~5% of the data (1446) we remove it as of now
print(test_set['Region'].isna().sum() / test_set.shape[0] * 100)
# train.dropna(subset=['Region'], how='all', inplace=True)

# When attempting drop=first in OHE, the reverse transform throws an issue as it reads the NaN values as a separate
# category. So converting NaNs into strings
train['Region'] = train['Region'].fillna('Null')
test_set['Region'] = test_set['Region'].fillna('Null')

5.227077977720651


## Approach 2

s = pd.DataFrame(np.arange(0, len(train)), columns=['m1'])
df = train[['b1', 'b2', 'b3', 'b4', 'b5']]
df.drop(columns=['b5'], inplace=True)
df.rename(columns={'b1': 'b2', 'b2':'b3', 'b3': 'b4', 'b4': 'b5'}, inplace=True)
df.insert(loc=0, column='b1', value=s.values)
df

t = pd.DataFrame([[1]], columns=['a'])
q = pd.DataFrame([[1]], columns=['b'])
r = pd.DataFrame([['aum']], columns=['name'])

k = pd.concat([t, q])
kk = pd.merge(k, r, how='left', left_index=True, right_index=True)
kk

In [63]:
def slide_variable_window(
    predictor_array: pd.DataFrame,
    var_to_add: pd.DataFrame
) -> pd.DataFrame:
    predictor_array.drop(columns=['b5'], inplace=True)  # We drop the first payment
    predictor_array.rename(columns={'b1': 'b2', 'b2':'b3', 'b3': 'b4', 'b4': 'b5'}, inplace=True)
    predictor_array.insert(loc=0, column='b1', value=var_to_add.values)  # And add the new variable (mn)
    
    return predictor_array

In [64]:
# split_payment_history_df = train[["ID", "SplitPaymentsHistory"]]
# id_arr = train[["ID"]]

target = train[['m1', 'm2', 'm3', 'm4', 'm5', 'm6']]
train_arr = train.drop(columns=['m1', 'm2', 'm3', 'm4', 'm5', 'm6', 
                                'SplitPaymentsHistory',
                                'ExpectedTermDate', 
                                'FirstPaymentDate',
                                'LastPaymentDate'])
test_set.drop(columns=['SplitPaymentsHistory',
                       'ExpectedTermDate', 
                       'FirstPaymentDate',
                       'LastPaymentDate'], inplace=True)

In [65]:
def create_data_with_sliding_approach(data_without_target: pd.DataFrame,
                                      target_data: pd.DataFrame):
    frame = pd.DataFrame(None)
    new_df = deepcopy(data_without_target)
    target_df = pd.DataFrame(None)
    target_features = target_data.columns.tolist()
    for itr, col in enumerate(target_features):
        if itr == 0:
            target_df = pd.concat([target_df, target_data[[col]]])
            frame = pd.concat([frame, data_without_target])
        else:
            filter_df = new_df[['b1', 'b2', 'b3', 'b4', 'b5']]  # Intermediate df
            new_df.drop(columns=['b1', 'b2', 'b3', 'b4', 'b5'], inplace=True)
            concatinating_df = slide_variable_window(predictor_array=filter_df, 
                                                     var_to_add=target_data[[target_features[itr-1]]])
            new_df = pd.concat([new_df, concatinating_df], axis=1)  # We add the newly created columns
            target_df = pd.concat([target_df, target_data[[col]]])
            frame = pd.concat([frame, new_df])
#             print(new_df.shape)

    target_df = pd.DataFrame(target_df.sum(axis=1).astype(int), columns=['target'])
#     print(frame.shape)  # Should be 6 * original data's no. of rows
    
    frame.reset_index(drop=True, inplace=True)
    target_df.reset_index(drop=True, inplace=True)
    
    return frame, target_df

In [66]:
# split data into train and test sets
from sklearn.model_selection import train_test_split

seed = 10
X_train, X_test, y_train, y_test = train_test_split(train_arr, target, test_size=0.45, random_state=seed)

In [67]:
# X_train

### Model train on initial hp :: Approach 2

def approach_two_model(x_train, y_train, x_test):
    model = ModelXgBoost(train_array=x_train, train_target=y_train)
    model.train_model()  # Default h.params (Checkout the code)
    predict = model.trained_model.predict(x_test)
    
    return model, predict

In [134]:
def encode_and_drop(full_array, data_type, tr_encoder=None):
#     print(tr_encoder)
    categorical_array = full_array[full_array.select_dtypes(exclude=['number']).columns]
    numerical_array = full_array.drop(columns=full_array.select_dtypes(exclude=['number']).columns)
#     print(categorical_array.columns)
    encoded_array, encoder = one_hot_encoding(
        categorical_frame=categorical_array, 
        type_of_data=data_type,
        fitted_encoder=tr_encoder,
        conv=True,
        drop=None,
        handle_unknown="ignore"
    )
#     print(encoded_array.columns)
    final_array = pd.concat([numerical_array.reset_index(drop=True), 
                             encoded_array.reset_index(drop=True)], axis=1)
    final_array.index = numerical_array.index
#     print(final_array.shape)
    return numerical_array, final_array, encoder

In [14]:
def get_top_features(
    features,
    feature_scores,
    cut_off_score=0.8
) -> Tuple[List[List], List[List]]:
    frame = pd.DataFrame([feature_scores], columns=features, index=['gain']).T
    frame.sort_values(by=['gain'], ascending=False, inplace=True)
    frame['cum_gain'] = frame['gain'].cumsum()
    
    feature_list = list(frame[frame['cum_gain'] <= cut_off_score].index)
    feature_scores_list = frame[frame['cum_gain'] <= cut_off_score]['gain'].tolist()
    
    return feature_list, feature_scores_list

In [56]:
def get_important_feature_scores(
    feature_corpus: List[List],
    score_corpus: List[List]
) -> pd.DataFrame:
    imp_column = dict()
    imp_col = dict()
    for feature_list, score_list in zip(feature_corpus, score_corpus):
        for feature_names, scores in zip(feature_list, score_list):
            if feature_names not in imp_column.keys():
                imp_column[feature_names] = 1
#                 imp_col[feature_names] = [scores]
            else:
                imp_column[feature_names] += 1
#                 imp_col[feature_names].append(scores)
    
    important_features_df = pd.DataFrame(imp_column, index=['frequency']).T
#     important_scores = pd.DataFrame(imp_col)
    important_features_df['appearance_ratio'] = important_features_df['frequency'] / 100
    important_features_df.sort_values(by=['frequency'], ascending=False, inplace=True)
#     important_features_df['cum_score'] = important_features_df['score'].cumsum()
    
    return important_features_df

In [57]:
# Preparing the TRAIN data for approach two and fitting the model
train_data, target_frame = create_data_with_sliding_approach(data_without_target=X_train, 
                                                             target_data=y_train)
id_array = train_data[["ID"]]
train_data.drop(columns=["ID"], inplace=True)
og_frame, encoded_train, encoder_model = encode_and_drop(train_data, "train", None)

# encoded_train = pd.concat([encoded_train, train_data[['b1', 'b2', 'b3', 'b4', 'b5']]], axis=1)
print('Columns ->', encoded_train.columns)
print('Shape ->', encoded_train.shape)


np.random.seed(0)
r = np.random.randint(1, 100, 100)
f_i_list = []
f_i_features = []
for i, _seed in enumerate(r):
    sample= encoded_train.sample(n=20000, replace=False, random_state=_seed)
    target_f = target_frame[target_frame.index.isin(sample.index)]
    model_two_obj = ModelXgBoost(train_array=sample, 
                                 train_target=target_f)
    model_two_obj.train_model()  # Default h.params (Checkout the code)
    model_two = model_two_obj.trained_model
    f_i = model_two.feature_importances_
    f_i_cols = sample.columns
    variables, variable_scores = get_top_features(features=f_i_cols, feature_scores=f_i, cut_off_score=0.8)
    f_i_list.append(variable_scores)
    f_i_features.append(variables)

important_features_df = get_important_feature_scores(feature_corpus=f_i_features, score_corpus=f_i_list)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Columns -> Index(['Deposit', 'AccessoryRate', 'RatePerUnit', 'DaysOnDeposit', 'Age',
       'Term', 'TotalContractValue', 'b1', 'b2', 'b3', 'b4', 'b5', 'DAILY',
       'MONTHLY', 'WEEKLY', 'Female', 'Male', 'Coast Region',
       'Mount Kenya Region', 'Nairobi Region', 'North Rift', 'Nyanza',
       'South Rift', 'Western', 'Business', 'Driver/Motorbike Rider', 'Farmer',
       'Government Employee', 'Labourer', 'Other', 'Teacher'],
      dtype='object')
Shape -> (87648, 31)


In [113]:
temp_dict = {}
for feat, score in zip(f_i_features, f_i_list):
    for _feat, _score in zip(feat, score):
        if _feat not in temp_dict.keys():
            temp_dict[_feat] = [_score]
        else:
            temp_dict[_feat].append(_score)
print(temp_dict)
l = pd.DataFrame(temp_dict)
l

{'Nairobi Region': [0.38523614406585693, 0.14079660177230835, 0.02883549965918064, 0.026819294318556786, 0.0826120525598526, 0.0826120525598526, 0.09849465638399124, 0.15555430948734283, 0.12031208723783493, 0.0826120525598526, 0.02741350419819355, 0.20779338479042053, 0.028953567147254944, 0.14079660177230835, 0.043928634375333786, 0.144096240401268, 0.05252514407038689, 0.0826120525598526, 0.03276171162724495, 0.03715214133262634, 0.04166126251220703, 0.04166126251220703, 0.036588892340660095, 0.033932652324438095, 0.15555430948734283, 0.14079660177230835, 0.033932652324438095, 0.11714600771665573, 0.026819294318556786, 0.02792060375213623, 0.2757297456264496, 0.06372655928134918, 0.15555430948734283, 0.12031208723783493, 0.05252514407038689, 0.036588892340660095, 0.02792060375213623, 0.09849465638399124, 0.06372655928134918, 0.08937519788742065, 0.039397865533828735, 0.02914314530789852, 0.03223245218396187, 0.043928634375333786, 0.14973634481430054, 0.03276171162724495], 'Teacher':

ValueError: arrays must all be same length

In [116]:
frame = pd.DataFrame([variable_scores], columns=variables, index=['Gain']).T
frame.sort_values(by=['Gain'], ascending=False, inplace=True)
frame['cumulative_gain'] = frame['Gain'].cumsum()
frame
# frame.to_csv('cumulative_gain_og_features.csv')

Unnamed: 0,Gain,cumulative_gain
North Rift,0.187414,0.187414
Mount Kenya Region,0.122067,0.309482
Term,0.063913,0.373395
Nyanza,0.05437,0.427764
TotalContractValue,0.044457,0.472221
Coast Region,0.041402,0.513623
Deposit,0.034454,0.548077
b1,0.034039,0.582116
Labourer,0.033588,0.615704
Other,0.033209,0.648913


In [131]:
important_features_df.index

Index(['b1', 'Age', 'b4', 'DaysOnDeposit', 'b2', 'Deposit', 'b3', 'North Rift',
       'b5', 'South Rift', 'Term', 'Other', 'Driver/Motorbike Rider',
       'Coast Region', 'Nairobi Region', 'Western', 'Labourer',
       'TotalContractValue', 'Teacher', 'Mount Kenya Region', 'Farmer',
       'Nyanza', 'Government Employee', 'Female', 'Business', 'RatePerUnit',
       'AccessoryRate', 'WEEKLY'],
      dtype='object')

In [None]:
# important_features_df.to_csv('../../submissions/important_features_total_gain_with_80_cutoff.csv')

In [None]:
from matplotlib import pyplot

pyplot.figure(figsize=(15,10))
pyplot.bar(frame.index, frame['cumulative_gain'])
# xgb.plot_importance(model_two)
pyplot.show()

In [69]:
# When train has Null regions removed
# feature_selection = list(important_features_df[important_features_df['appearance_ratio'] >=0.3].index)
feature_selection = ['North Rift',
 'Other',
 'Term',
 'Driver/Motorbike Rider',
 'Labourer',
 'South Rift',
 'TotalContractValue',
 'Nyanza',
 'Nairobi Region',
 'Western',
 'Teacher',
 'Mount Kenya Region',
 'Government Employee',
 'Farmer',
 'Male',
 'AccessoryRate',
 'RatePerUnit',
 'WEEKLY']

In [142]:
# When train has Null regions as a category
# feature_selection = list(important_features_df[important_features_df['appearance_ratio'] <=0.5].index)
feature_selection = ['b1', 'Age', 'b4', 'DaysOnDeposit', 'b2', 'Deposit', 'b3', 'North Rift',
       'b5', 'South Rift', 'Term', 'Other', 'Driver/Motorbike Rider',
       'Coast Region', 'Nairobi Region', 'Western', 'Labourer', 'Teacher', 'Mount Kenya Region', 'Farmer',
       'Nyanza', 'Government Employee', 'Business']
# feature_selection = ['Driver/Motorbike Rider',
# #  'Coast Region',
#  'Nairobi Region',
#  'Western',
#  'Labourer',
#  'TotalContractValue',
#  'Teacher',
#  'Mount Kenya Region',
#  'Farmer',
#  'Nyanza',
#  'Government Employee',
# #  'Female',
# #  'Business',
#  'RatePerUnit',
#  'AccessoryRate',
#  'WEEKLY']

In [144]:
encoded_train

Unnamed: 0,b1,Age,b4,DaysOnDeposit,b2,Deposit,b3,North Rift,b5,South Rift,...,Coast Region,Nairobi Region,Western,Labourer,Teacher,Mount Kenya Region,Farmer,Nyanza,Government Employee,Business
0,766.0,40.0,1195.0,7,1000.0,2000,1245.0,0.0,840.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,200.0,64.0,3000.0,3,2000.0,2999,2000.0,0.0,2500.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,40.0,35.0,360.0,7,40.0,2000,40.0,0.0,35.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1110.0,23.0,990.0,3,1130.0,2000,1230.0,0.0,1250.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,1000.0,59.0,1500.0,3,2000.0,2999,1884.0,0.0,3000.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92413,1540.0,22.0,843.0,3,1320.0,2400,540.0,1.0,855.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
92414,350.0,,100.0,3,150.0,2400,100.0,1.0,215.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
92415,2360.0,,1435.0,3,1260.0,2500,1704.0,0.0,1995.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
92416,800.0,29.0,600.0,7,800.0,2000,760.0,0.0,600.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [143]:
# Preparing the TRAIN data for approach two and fitting the model
train_data, target_frame = create_data_with_sliding_approach(data_without_target=X_train, 
                                                             target_data=y_train)
id_array = train_data[["ID"]]
train_data.drop(columns=["ID"], inplace=True)
og_frame, encoded_train, encoder_model = encode_and_drop(train_data, "train", None)
encoded_train = encoded_train[feature_selection]
# encoded_train = pd.concat([encoded_train, train_data[['b1', 'b2', 'b3', 'b4', 'b5']]], axis=1)
# print(encoded_train.shape)

model_two_obj = ModelXgBoost(train_array=encoded_train,
                             train_target=target_frame)
model_two_obj.train_model()  # Default h.params (Checkout the code)
model_two = model_two_obj.trained_model


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


### Prediction using Model :: Approach 2

In [86]:
test_id_array = X_test[["ID"]]
X_test.drop(columns=["ID"], inplace=True)

KeyError: "None of [Index(['ID'], dtype='object')] are in the [columns]"

In [154]:
encoded_test.columns

Index(['Age', 'DaysOnDeposit', 'Deposit', 'North Rift', 'South Rift', 'Term',
       'Other', 'Driver/Motorbike Rider', 'Coast Region', 'Nairobi Region',
       'Western', 'Labourer', 'Teacher', 'Mount Kenya Region', 'Farmer',
       'Nyanza', 'Government Employee', 'Business', 'b1', 'b2', 'b3', 'b4',
       'b5'],
      dtype='object')

In [145]:
# Encoding and re-attaching using train encoding model
og_frame_test, encoded_test, encoder_model = encode_and_drop(X_test, "test", encoder_model)
encoded_test = encoded_test[feature_selection]
# encoded_test = pd.concat([encoded_test, X_test[['b1', 'b2', 'b3', 'b4', 'b5']]], axis=1)

predict_dict = dict()
for col_no, predict_col in enumerate(y_test.columns):
#     print(encoded_test)
    predict_dict[f"{predict_col}_pred"] = model_two.predict(encoded_test)
    print(predict_dict[f"{predict_col}_pred"])
    int_df = encoded_test[['b1', 'b2', 'b3', 'b4', 'b5']]
    encoded_test.drop(columns=['b1', 'b2', 'b3', 'b4', 'b5'], inplace=True)
    concatinating_df = slide_variable_window(predictor_array=deepcopy(int_df), 
                                             var_to_add=pd.DataFrame(deepcopy(predict_dict[f"{predict_col}_pred"])))
    encoded_test = pd.concat([encoded_test, concatinating_df], axis=1)  # We add the newly created columns
#     print(encoded_test[['b1', 'b2', 'b3', 'b4', 'b5']])
    
predict_dict

[1000.634    766.4636    64.32562 ... 1540.5897  1322.3157  1155.1216 ]
[ 638.91846 1508.1882  1487.4618  ... 1090.6755   877.505    573.1279 ]
[ 638.91846 1508.1882  1487.4618  ... 1090.6755   877.505    573.1279 ]
[ 638.91846 1508.1882  1487.4618  ... 1090.6755   877.505    573.1279 ]
[ 638.91846 1508.1882  1487.4618  ... 1090.6755   877.505    573.1279 ]
[ 638.91846 1508.1882  1487.4618  ... 1090.6755   877.505    573.1279 ]


{'m1_pred': array([1000.634  ,  766.4636 ,   64.32562, ..., 1540.5897 , 1322.3157 ,
        1155.1216 ], dtype=float32),
 'm2_pred': array([ 638.91846, 1508.1882 , 1487.4618 , ..., 1090.6755 ,  877.505  ,
         573.1279 ], dtype=float32),
 'm3_pred': array([ 638.91846, 1508.1882 , 1487.4618 , ..., 1090.6755 ,  877.505  ,
         573.1279 ], dtype=float32),
 'm4_pred': array([ 638.91846, 1508.1882 , 1487.4618 , ..., 1090.6755 ,  877.505  ,
         573.1279 ], dtype=float32),
 'm5_pred': array([ 638.91846, 1508.1882 , 1487.4618 , ..., 1090.6755 ,  877.505  ,
         573.1279 ], dtype=float32),
 'm6_pred': array([ 638.91846, 1508.1882 , 1487.4618 , ..., 1090.6755 ,  877.505  ,
         573.1279 ], dtype=float32)}

In [146]:
for k, v in predict_dict.items():
#     predict_dict[k] = [0 if i < 0 else i for i in v]
    for _v in v:
        if _v < 0:
            print(_v)
# predict_dict

-161.63087
-99.43228
-2.3683004
-45.21029
-120.02978
-59.173008
-609.9874
-2.09069
-52.325043
-67.45725


### Calculation of RMSE

In [147]:
pred_frame = pd.DataFrame(predict_dict)
pred_frame.index = X_test.index

In [148]:
full_test_array = pd.concat([X_test, y_test, pred_frame], axis=1)

In [149]:
full_test_array = pd.merge(full_test_array, test_id_array, how='left', left_index=True, right_index=True)

In [150]:
sub_file = SubmissionFile(
    validation_data=full_test_array,
    type_of_data='validation'
).execute()

In [151]:
sub_file.shape

(75624, 3)

In [152]:
sub_file['SquaredError'] = np.square(sub_file['Target'] - sub_file['Prediction'])

In [153]:
rmse = np.sqrt(np.sum(sub_file['SquaredError'])/sub_file.shape[0])
print('Final RMSE --> ', rmse)

Final RMSE -->  1697.605475466868


### Preparing Submission

In [98]:
# Preparing the Entire TRAIN data for approach two and fitting the model
train_data, target_frame = create_data_with_sliding_approach(data_without_target=train_arr, 
                                                             target_data=target)
print(train_data.columns)
id_array = train_data[["ID"]]
train_data.drop(columns=["ID"], inplace=True)
og_frame, encoded_train, encoder_model = encode_and_drop(train_data, "train", None)
print(encoded_train.columns)
encoded_train = encoded_train[feature_selection]
print(encoded_train.columns)
encoded_train = pd.concat([encoded_train, train_data[['b1', 'b2', 'b3', 'b4', 'b5']]], axis=1)

model_two_obj = ModelXgBoost(train_array=encoded_train, 
                             train_target=target_frame)
model_two_obj.train_model()  # Default h.params (Checkout the code)
model_two = model_two_obj.trained_model

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Index(['ID', 'Deposit', 'AccessoryRate', 'rateTypeEntity', 'RatePerUnit',
       'DaysOnDeposit', 'MainApplicantGender', 'Age', 'Region', 'Occupation',
       'Term', 'TotalContractValue', 'b1', 'b2', 'b3', 'b4', 'b5'],
      dtype='object')
Index(['Deposit', 'AccessoryRate', 'RatePerUnit', 'DaysOnDeposit', 'Age',
       'Term', 'TotalContractValue', 'b1', 'b2', 'b3', 'b4', 'b5', 'MONTHLY',
       'WEEKLY', 'Male', 'Mount Kenya Region', 'Nairobi Region', 'North Rift',
       'Null', 'Nyanza', 'South Rift', 'Western', 'Driver/Motorbike Rider',
       'Farmer', 'Government Employee', 'Labourer', 'Other', 'Teacher'],
      dtype='object')
Index(['Driver/Motorbike Rider', 'Nairobi Region', 'Western', 'Labourer',
       'TotalContractValue', 'Teacher', 'Mount Kenya Region', 'Farmer',
       'Nyanza', 'Government Employee', 'RatePerUnit', 'AccessoryRate',
       'WEEKLY'],
      dtype='object')


# Strategy to impute missing regions | did not work
most_common_region = test_set['Region'].value_counts().index[0]
test_set.loc[test_set['Region'].isna(), 'Region'] = most_common_region

In [99]:
test_id = test_set[["ID"]]
test_set.drop(columns=["ID"], inplace=True)

# Encoding and re-attaching using train encoding model
og_frame_test, encoded_test, encoder_model = encode_and_drop(test_set, "test", encoder_model)
encoded_test = encoded_test[feature_selection]
encoded_test = pd.concat([encoded_test, test_set[['b1', 'b2', 'b3', 'b4', 'b5']].reset_index(drop=True)], axis=1)

predict_dict = dict()

for col_no, predict_col in enumerate(['m1', 'm2', 'm3', 'm4', 'm5', 'm6']):
    predict_dict[f"m{col_no+1}_pred"] = model_two.predict(encoded_test)
    int_df = encoded_test[['b1', 'b2', 'b3', 'b4', 'b5']]
    encoded_test.drop(columns=['b1', 'b2', 'b3', 'b4', 'b5'], inplace=True)
    concatinating_df = slide_variable_window(predictor_array=int_df, 
                                             var_to_add=pd.DataFrame(predict_dict[f"m{col_no+1}_pred"]))
    encoded_test = pd.concat([encoded_test, concatinating_df], axis=1)  # We add the newly created columns
    
# predict_dict

['MONTHLY', 'WEEKLY', 'Male', 'Mount Kenya Region', 'Nairobi Region', 'North Rift', 'Null', 'Nyanza', 'South Rift', 'Western', 'Driver/Motorbike Rider', 'Farmer', 'Government Employee', 'Labourer', 'Other', 'Teacher']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [106]:
for ke, va in predict_dict.items():
    predict_dict[ke] = [0 if i < 0 else i for i in va]
    for _va in va:
        if _va < 0:
            print(_va)

-78.6378
-61.088306
-12.687273
-32.302616
-15.902531
-337.79108
-189.86002
-2.6176991
-396.0998


In [107]:
pred_frame_test = pd.DataFrame(predict_dict)
pred_frame_test.index = test_set.index

In [108]:
full_test_array_test = pd.concat([test_set, pred_frame_test], axis=1)
full_test_array_test = pd.merge(full_test_array_test, test_id, how='left', left_index=True, right_index=True)

In [109]:
sub_file = SubmissionFile(
    validation_data=full_test_array_test,
    type_of_data='test'
).execute()
sub_file.reset_index(drop=True, inplace=True)

In [110]:
sub_file.to_csv('../../submissions/submission_approach_2_bottom_feature_selection_null_region_category_neg_predictions_removed.csv')

In [111]:
sub_file

Unnamed: 0,ID,Target
0,ID_000RHRU x m1,208.234680
1,ID_000RHRU x m2,206.233871
2,ID_000RHRU x m3,233.535461
3,ID_000RHRU x m4,257.574280
4,ID_000RHRU x m5,283.720520
...,...,...
56011,ID_ZZOKWZJ x m2,956.071472
56012,ID_ZZOKWZJ x m3,918.225769
56013,ID_ZZOKWZJ x m4,946.485901
56014,ID_ZZOKWZJ x m5,952.303284
