# Import External Dependencies

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import random

from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.metrics import confusion_matrix, classification_report, auc, roc_curve, roc_auc_score

from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, OrdinalEncoder, MinMaxScaler, RobustScaler 

from sklearn.linear_model import Lasso
from sklearn.svm import SVC

from imblearn.over_sampling import SMOTE, SVMSMOTE

pd.set_option('display.max_columns', 1000)

# Data Manipulation Function Definitions

In [2]:
def data_loader(directory):
    df = pd.read_csv(filepath_or_buffer=directory)
    return df

def prop_split_df(df, props, seed):
    random.seed(seed)
    df = df.copy(deep=True)
    indices = list(range(0,df.shape[0],1))
    index_sets = list()
    for iterator, prop in enumerate(props):
        if iterator+1 == len(props):
            index_sets.append(
                indices if int(round(prop*df.shape[0])) > len(indices)
                else set(random.sample(indices, int(round(prop*df.shape[0]))))
            )
            pass
        else:
            index_sets.append(
                set(random.sample(indices, int(round(prop*df.shape[0]))))
            )
            indices = list(set(indices) - index_sets[-1])
            pass
        pass
    dfs = list()
    for index_set in index_sets:
        dfs.append(
            df[
                df.index.isin(index_set)
            ].sample(
                frac=1,
                replace=False,
                random_state=seed,
                ignore_index=True
            ).reset_index(drop=True)
        )
        pass
    return dfs

def stratified_split(df, col, props, seed):
    df = df.copy(deep=True)
    col_values = list(set(df[col].values))
    dfs = list()
    for col_value in col_values:
        dfs.append(
            prop_split_df(
                df = df[df[col]==col_value].reset_index(drop=True),
                props = props,
                seed = seed
            )
        )
        pass
    dfs = np.array(dfs, dtype=object).T.tolist()
    dfs = [
        pd.concat(
            [df.astype(object) for df in df_tuple],
            axis=0
        ).reset_index(drop=True)
        for df_tuple in dfs
    ]
    return dfs

def oversampled_split(df, col, props, seed, algorithm, sampling_strategy, k_neighbors=5, m_neighbors=10):
    df = df.copy(deep=True)
    if algorithm.upper()=="SMOTE":
        smote_algorithm = SMOTE(sampling_strategy=sampling_strategy, random_state=seed, k_neighbors=k_neighbors, n_jobs=-1)
        pass
    elif algorithm.upper()=="BORDERLINESMOTE":
        smote_algorithm = BorderlineSMOTE(sampling_strategy=sampling_strategy, random_state=seed, k_neighbors=k_neighbors, n_jobs=-1, m_neighbors=m_neighbors)
        pass
    elif algorithm.upper()=="SVMSMOTE":
        smote_algorithm = SVMSMOTE(sampling_strategy=sampling_strategy, random_state=seed, k_neighbors=k_neighbors, n_jobs=-1, m_neighbors=m_neighbors)
        pass
    resampled_features, resampled_targets = smote_algorithm.fit_resample(df.drop([col], axis=1), df[col])
    df = pd.concat([resampled_features, resampled_targets], axis=1).reset_index(drop=True)
    dfs = prop_split_df(df, props, seed)
    return dfs

def undersampled_split(df, col, props, seed):
    df = df.copy(deep=True)
    value_counts = df.groupby(col).count().to_dict()
    value_counts = {key:min(value.values()) for key,value in value_counts.items()}
    critical_value = max(value_counts.values())
    global_prop = min([sum(props), 1.0])
    df = pd.concat(
        [
            df[
                df[col] == col_value
            ].sample(
                int(round(global_prop*critical_value)),
                replace=False,
                random_state=seed,
                ignore_index=True
            ) for col_value in set(df[col].values)
        ],
        axis=0
    ).reset_index(drop=True)
    dfs = prop_split_df(df, props, seed)
    return dfs


# Preprocessor Function Definitions

In [3]:
def encoder_ohe(df, cols):
    df = df.copy(deep=True)
    
    for col in cols:
        encoder = OneHotEncoder(categories="auto", dtype=int)
        original_cols = list(df.columns)
        encoded_vals = encoder.fit_transform(df[[col]]).toarray().T
        encoded_cols = list(encoder.get_feature_names_out())
        index = original_cols.index(col)
        new_cols = original_cols[:index] + encoded_cols + original_cols[index+1:]
        for iterator, encoded_col in enumerate(encoded_cols):
            df[encoded_col] = encoded_vals[iterator]
            pass
        df = df[new_cols]
        pass
    return df

def decoder_ohe(df, cols):
    df = df.copy(deep=True)
    
    def arg_max(arr):
        arr = list(arr)
        return arr.index(1)
    
    for col in cols:
        original_cols = list(df.columns)
        encoded_cols = [original_col for original_col in original_cols if original_col.startswith(col)]
        encoded_vals = df[encoded_cols].values
        decoded_vals = ["_".join(encoded_col.split("_")[1:]) for encoded_col in encoded_cols]
        first_index = original_cols.index(col+"_"+decoded_vals[0])
        last_index = original_cols.index(col+"_"+decoded_vals[-1])
        new_cols = original_cols[:first_index] + [col] + original_cols[last_index+1:]
        encoded_vals = np.apply_along_axis(arg_max, 1, encoded_vals)
        df[col] = encoded_vals
        df[col] = df[col].apply(lambda arg : decoded_vals[arg])
        df = df[new_cols]
        pass
    return df

def encoder_ord(df, cols):
    df = df.copy(deep=True)
    mapper = dict()
    
    for col in cols:
        encoder = OrdinalEncoder(categories="auto", dtype=int)
        encoded_vals = encoder.fit_transform(df[[col]])
        df[col] = encoded_vals
        original_vals = list(encoder.categories_[0])
        mapper[col] = original_vals
        pass
    return df, mapper

def decoder_ord(df, cols, mapper):
    df = df.copy(deep=True)
    
    for col in cols:
        df[col] = df[col].apply(lambda arg : mapper[col][arg])
        pass
    return df

# Parameters

In [4]:
PARAM_TRAIN_DIR = 'bible_train.csv'
PARAM_BIBLE_TEST_DIR = 'bible_test.csv'
PARAM_COMP_TEST_DIR = './robi-datathon-2-pre-assessment/test.csv'
PARENT_SEED = 1995

In [12]:
train_df = data_loader(PARAM_TRAIN_DIR)
test_df = data_loader(PARAM_BIBLE_TEST_DIR)
comp_test_df = pd.read_csv(PARAM_COMP_TEST_DIR)

# DROP These columns having 95%~100% missing values
train_df.drop(columns=['s56', 's57', 's54', 's55', 's59'], axis=0, inplace=True)
test_df.drop(columns=['s56', 's57', 's54', 's55', 's59'], axis=0, inplace=True) 
comp_test_df.drop(columns=['s56', 's57', 's54', 's55', 's59'], axis=0, inplace=True) 

categorical_columns = [col for col in train_df.columns if col.startswith('s') or col=='gender']
print(categorical_columns)

train_df = encoder_ohe(train_df, categorical_columns)
test_df = encoder_ohe(test_df, categorical_columns)
comp_test_df = encoder_ohe(comp_test_df, categorical_columns)
train_df

['gender', 's11', 's12', 's13', 's16', 's17', 's18', 's48', 's52', 's53', 's58', 's69', 's70', 's71']


Unnamed: 0,id,gender_F,gender_M,s11_N,s11_Y,s12_N,s12_Y,s13_0,s13_1,s16_A,s16_B,s16_C,s16_D,s17_A,s17_B,s17_C,s17_D,s18_A,s18_B,s18_C,s18_D,s48_0,s48_1,s52_0,s52_1,s52_l,s52_o,s53_,s53_.1,s58_A,s58_B,s69_0,s69_C`,s69_x,s69_~1,s70_op: A,s70_op: B,s70_op: C,s70_op: D,s71_a,s71_b,s71_c,s71_d,n1,n2,n3,n4,n5,n6,n7,n8,n9,n10,n11,n12,n13,n14,n15,label
0,b'gAAAAABinOicS09vrmgh0_JyEHihI13ptO0rCyHP7l76...,0,1,0,1,1,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,1,0,0,16.144666,1.989441,2,2.318385,-32.839277,0.017176,-9.126056,1.732291,3.698504,4.804517,1.544484,0,0,0.631220,5,0
1,b'gAAAAABinOiWGC1WhR6WYP0DA5ssGv9rIekrWUwCdJ8F...,0,1,0,1,0,1,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,1,0,0,7.144558,0.844866,3,6.197768,-32.576597,0.013857,-9.098287,1.505885,6.791357,6.110416,1.712354,0,0,0.392746,3,1
2,b'gAAAAABinOig-g3-Q1ggjlMhfUSdn21Aj5yVVeVvXbis...,1,0,0,1,0,1,0,1,0,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,1,0,0,5.749840,0.781439,2,8.256767,-32.398679,0.010387,-9.378025,1.485863,7.265876,4.559419,1.537645,0,0,0.154409,4,0
3,b'gAAAAABinOiXdoaNUzihOSbyY1tjWtd5EgMaXkkvH6SV...,0,1,1,0,0,1,0,1,0,1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,1,14.771959,1.248188,3,2.300011,-32.396746,0.016289,-9.261962,1.619210,3.737647,4.052003,1.637831,0,1,0.737560,1,0
4,b'gAAAAABinOiWbgAxe8Uy9tboiJGZEYK7zcGy6fv8_5Ao...,0,1,0,1,0,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,1,0,0,11.533397,2.062749,9,2.732090,-32.865595,0.008230,-8.885964,1.845862,10.660651,11.704121,1.568647,0,0,0.687640,6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22653,b'gAAAAABinOib4xjKX0bEvicZ7B2DV5826EWDGMRZHgLp...,0,1,0,1,0,1,0,1,0,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,1,0,0,16.110661,2.459900,8,8.008481,-33.064535,0.007564,-8.681313,1.804592,9.183880,11.659464,1.605332,0,0,0.359127,5,0
22654,b'gAAAAABinOib1tpwtda0l7kZyxpta3GVunCLgA7qVfcn...,1,0,0,1,1,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,1,0,0,11.149699,1.578276,8,4.141254,-32.576965,0.006357,-8.900328,1.923674,5.608561,11.013365,1.703918,0,0,0.245611,0,0
22655,b'gAAAAABinOicscC37w1W9uiIlYL1U5D_mUtmgdUg8QTl...,1,0,0,1,0,1,0,1,0,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,1,0,0,13.429797,1.800562,3,7.763386,-32.575975,0.017782,-9.233722,1.455911,2.900249,6.058681,1.627479,0,1,0.826587,1,0
22656,b'gAAAAABinOib-JZ8iQDxjSRsa1dPT1TqeSQ_si4mAw5f...,0,1,0,1,0,1,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,16.094622,2.438280,8,6.942766,-32.577563,0.007324,-8.748941,2.074394,10.673259,11.387646,1.560391,0,0,0.103523,5,0


In [6]:
%%time
if 'id' in train_df.columns:
    train_df = train_df.drop('id', axis=1)
    
train_df = oversampled_split(train_df, col='label', 
                             props=[1], seed=PARENT_SEED, 
                             algorithm='SVMSMOTE', 
                             sampling_strategy='minority', 
                             k_neighbors=5, m_neighbors=10)[0]

train_df['label'].value_counts()

CPU times: total: 23.8 s
Wall time: 21.7 s


1    18850
0    18850
Name: label, dtype: int64

# Data Split

In [7]:
%%time
features = [col for col in train_df.columns if col!='id' and col!='label']
print(features)

x = train_df[features].to_numpy()
y = train_df['label'].to_numpy()

x_train, x_val, y_train, y_val = train_test_split(x, y, train_size = 0.8, shuffle=True, random_state=PARENT_SEED)

print(x_train.shape, y_train.shape)
print(x_val.shape, y_val.shape)

x_test = test_df[features].to_numpy()
y_test = test_df['label'].to_numpy()
print(x_test.shape, y_test.shape)

['gender_F', 'gender_M', 's11_N', 's11_Y', 's12_N', 's12_Y', 's13_0', 's13_1', 's16_A', 's16_B', 's16_C', 's16_D', 's17_A', 's17_B', 's17_C', 's17_D', 's18_A', 's18_B', 's18_C', 's18_D', 's48_0', 's48_1', 's52_0', 's52_1', 's52_l', 's52_o', 's53_ ', 's53_  ', 's58_A', 's58_B', 's69_0', 's69_C`', 's69_x', 's69_~1', 's70_op: A', 's70_op: B', 's70_op: C', 's70_op: D', 's71_a', 's71_b', 's71_c', 's71_d', 'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9', 'n10', 'n11', 'n12', 'n13', 'n14', 'n15']
(30160, 57) (30160,)
(7540, 57) (7540,)
(5664, 57) (5664,)
CPU times: total: 62.5 ms
Wall time: 47.7 ms


# STACKING CLASSIFIER

In [8]:
%%time

estimators = [
    ('cat', CatBoostClassifier(
                        iterations=150,
                        learning_rate=0.5,
                        verbose=False, random_state=PARENT_SEED)
    ),
     
    ('xgb' , XGBClassifier(learning_rate=0.01, n_estimators=1000,
                       max_depth=5, min_child_weight=0,
                       gamma=0, subsample=0.7,
                       colsample_bytree=0.7, nthread=-1,
                       reg_alpha=0.00006, random_state=PARENT_SEED)
    ),
     
    ('lgbm' , LGBMClassifier(num_leaves=100,
                      learning_rate=0.01, n_estimators=1500,
                      max_bin = 200, bagging_fraction = 0.8,
                      bagging_freq = 5, feature_fraction = 0.2319,
                      feature_fraction_seed=9, bagging_seed=42,
                      min_child_samples =20, random_state=PARENT_SEED)
    ),
    
    ('svc', SVC(C=50, 
                kernel='rbf',
                max_iter=1000,
                random_state=PARENT_SEED)
    )
    ]

stacking_clf = StackingClassifier(estimators=estimators, 
                                  
                                  final_estimator=RandomForestClassifier(criterion='gini', 
                                            n_estimators=1000,
                                            max_depth=4,
                                            min_samples_split=4,
                                            min_samples_leaf=7,
                                            oob_score=True,
                                            random_state=PARENT_SEED),
                                  
                                  verbose=2,
                                  n_jobs=-1,
                                  stack_method='predict'
                                )

stacking_clf.fit(x_train, y_train)

y_pred = stacking_clf.predict(x_val)

cm = confusion_matrix(y_val, y_pred)
print(cm)
print(classification_report(y_val, y_pred))

fpr, tpr, thresholds = roc_curve(y_val, y_pred)
print(roc_auc_score(y_val, y_pred))

[[3515  252]
 [ 370 3403]]
              precision    recall  f1-score   support

           0       0.90      0.93      0.92      3767
           1       0.93      0.90      0.92      3773

    accuracy                           0.92      7540
   macro avg       0.92      0.92      0.92      7540
weighted avg       0.92      0.92      0.92      7540

0.917519032545877
CPU times: total: 17.2 s
Wall time: 7min 36s


In [9]:
%%time

# y_pred = stacking_clf.predict(x_test)

y_pred = stacking_clf.predict_proba(x_test)[:,1]
y_pred = [1 if value>=0.20 else 0 for value in list(y_pred)]

cm = confusion_matrix(y_test, y_pred)
print(cm)
print(classification_report(y_test, y_pred))

fpr, tpr, thresholds = roc_curve(y_test, y_pred)
print(roc_auc_score(y_test, y_pred))

[[4170  542]
 [ 333  619]]
              precision    recall  f1-score   support

           0       0.93      0.88      0.91      4712
           1       0.53      0.65      0.59       952

    accuracy                           0.85      5664
   macro avg       0.73      0.77      0.75      5664
weighted avg       0.86      0.85      0.85      5664

0.7675923085702872
CPU times: total: 6.52 s
Wall time: 3.67 s


# SUBMISSION

In [10]:
sub = data_loader("./robi-datathon-2-pre-assessment/sample_submission.csv")
sub

Unnamed: 0,id,label
0,b'gAAAAABinOicS09vrmgh0_JyEHihI13ptO0rCyHP7l76...,0
1,b'gAAAAABinOiWGC1WhR6WYP0DA5ssGv9rIekrWUwCdJ8F...,0
2,b'gAAAAABinOibTcOBFIVeA4nVF3FuFz_QX3ZlPPFc21gS...,0
3,b'gAAAAABinOig-g3-Q1ggjlMhfUSdn21Aj5yVVeVvXbis...,0
4,b'gAAAAABinOiXdoaNUzihOSbyY1tjWtd5EgMaXkkvH6SV...,1
...,...,...
95,b'gAAAAABinOid6vaEfdTvKtD-hsURGJC10Gjki5LDii2Q...,0
96,b'gAAAAABinOieP4SKyYWsHDIrIdYo-BBxX_y2wt7vpw__...,0
97,b'gAAAAABinOiWy1eGULwigSWr6ROaR4E6USudLSr87Vl7...,1
98,b'gAAAAABinOia4rgUkxLfMKBltq_5DeJSRRDbuB2z1rFj...,0


In [None]:
%%time

comp_x_test = comp_test_df[features].to_numpy()

comp_y_pred = stacking_clf.predict_proba(comp_x_test)[:,1]
comp_y_pred = [1 if value>=0.20 else 0 for value in list(comp_y_pred)]

submission_df = pd.DataFrame({'id':list(comp_test_df['id']), 'label':comp_y_pred})
submission_df