# Prepare Environment

# Import External Dependencies

In [29]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import SVMSMOTE

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from mlxtend.classifier import StackingClassifier
from mlxtend.classifier import EnsembleVoteClassifier

from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

# Data Manipulation Function Definitions

In [30]:
def data_loader(directory):
    df = pd.read_csv(filepath_or_buffer=directory)
    return df

def prop_split_df(df, props, seed):
    random.seed(seed)
    df = df.copy(deep=True)
    indices = list(range(0,df.shape[0],1))
    index_sets = list()
    for iterator, prop in enumerate(props):
        if iterator+1 == len(props):
            index_sets.append(
                indices if int(round(prop*df.shape[0])) > len(indices)
                else set(random.sample(indices, int(round(prop*df.shape[0]))))
            )
            pass
        else:
            index_sets.append(
                set(random.sample(indices, int(round(prop*df.shape[0]))))
            )
            indices = list(set(indices) - index_sets[-1])
            pass
        pass
    dfs = list()
    for index_set in index_sets:
        dfs.append(
            df[
                df.index.isin(index_set)
            ].sample(
                frac=1,
                replace=False,
                random_state=seed,
                ignore_index=True
            ).reset_index(drop=True)
        )
        pass
    return dfs

def stratified_split(df, col, props, seed):
    df = df.copy(deep=True)
    col_values = list(set(df[col].values))
    dfs = list()
    for col_value in col_values:
        dfs.append(
            prop_split_df(
                df = df[df[col]==col_value].reset_index(drop=True),
                props = props,
                seed = seed
            )
        )
        pass
    dfs = np.array(dfs, dtype=object).T.tolist()
    dfs = [
        pd.concat(
            [df.astype(object) for df in df_tuple],
            axis=0
        ).reset_index(drop=True)
        for df_tuple in dfs
    ]
    return dfs

def oversampled_split(df, col, props, seed, algorithm, sampling_strategy, k_neighbors=5, m_neighbors=10):
    df = df.copy(deep=True)
    if algorithm.upper()=="SMOTE":
        smote_algorithm = SMOTE(sampling_strategy=sampling_strategy, random_state=seed, k_neighbors=k_neighbors, n_jobs=-1)
        pass
    elif algorithm.upper()=="BORDERLINESMOTE":
        smote_algorithm = BorderlineSMOTE(sampling_strategy=sampling_strategy, random_state=seed, k_neighbors=k_neighbors, n_jobs=-1, m_neighbors=m_neighbors)
        pass
    elif algorithm.upper()=="SVMSMOTE":
        smote_algorithm = SVMSMOTE(sampling_strategy=sampling_strategy, random_state=seed, k_neighbors=k_neighbors, n_jobs=-1, m_neighbors=m_neighbors)
        pass
    resampled_features, resampled_targets = smote_algorithm.fit_resample(df.drop([col], axis=1), df[col])
    df = pd.concat([resampled_features, resampled_targets], axis=1).reset_index(drop=True)
    dfs = prop_split_df(df, props, seed)
    return dfs

def undersampled_split(df, col, props, seed):
    df = df.copy(deep=True)
    value_counts = df.groupby(col).count().to_dict()
    value_counts = {key:min(value.values()) for key,value in value_counts.items()}
    critical_value = max(value_counts.values())
    global_prop = min([sum(props), 1.0])
    df = pd.concat(
        [
            df[
                df[col] == col_value
            ].sample(
                int(round(global_prop*critical_value)),
                replace=False,
                random_state=seed,
                ignore_index=True
            ) for col_value in set(df[col].values)
        ],
        axis=0
    ).reset_index(drop=True)
    dfs = prop_split_df(df, props, seed)
    return dfs


# Preprocessor Function Definitions

In [31]:
def encoder_ohe(df, cols):
    df = df.copy(deep=True)
    
    for col in cols:
        encoder = OneHotEncoder(categories="auto", dtype=int)
        original_cols = list(df.columns)
        encoded_vals = encoder.fit_transform(df[[col]]).toarray().T
        encoded_cols = list(encoder.get_feature_names_out())
        index = original_cols.index(col)
        new_cols = original_cols[:index] + encoded_cols + original_cols[index+1:]
        for iterator, encoded_col in enumerate(encoded_cols):
            df[encoded_col] = encoded_vals[iterator]
            pass
        df = df[new_cols]
        pass
    return df

def decoder_ohe(df, cols):
    df = df.copy(deep=True)
    
    def arg_max(arr):
        arr = list(arr)
        return arr.index(1)
    
    for col in cols:
        original_cols = list(df.columns)
        encoded_cols = [original_col for original_col in original_cols if original_col.startswith(col)]
        encoded_vals = df[encoded_cols].values
        decoded_vals = ["_".join(encoded_col.split("_")[1:]) for encoded_col in encoded_cols]
        first_index = original_cols.index(col+"_"+decoded_vals[0])
        last_index = original_cols.index(col+"_"+decoded_vals[-1])
        new_cols = original_cols[:first_index] + [col] + original_cols[last_index+1:]
        encoded_vals = np.apply_along_axis(arg_max, 1, encoded_vals)
        df[col] = encoded_vals
        df[col] = df[col].apply(lambda arg : decoded_vals[arg])
        df = df[new_cols]
        pass
    return df

def encoder_ord(df, cols):
    df = df.copy(deep=True)
    mapper = dict()
    
    for col in cols:
        encoder = OrdinalEncoder(categories="auto", dtype=int)
        encoded_vals = encoder.fit_transform(df[[col]])
        df[col] = encoded_vals
        original_vals = list(encoder.categories_[0])
        mapper[col] = original_vals
        pass
    return df, mapper

def decoder_ord(df, cols, mapper):
    df = df.copy(deep=True)
    
    for col in cols:
        df[col] = df[col].apply(lambda arg : mapper[col][arg])
        pass
    return df

# Runtime

In [32]:
PARAM_SEED = 1995
PARAM_TRAIN_DATA_DIR = "./data/train.csv"
PARAM_VALIDATION_DATA_DIR = "./data/bible_test.csv"
PARAM_TEST_DATA_DIR = "./data/test.csv"
PARAM_OHE_COLS = ["n12", "n13", "gender", "s48", "s53", "s58", "s69", "s11", "s12", "s13", "s16", "s17", "s18", "s52", "s70", "s71", "n15"]

### Dataset Preparation

In [33]:
df_train = data_loader(PARAM_TRAIN_DATA_DIR)
df_train = df_train.drop(["s54", "s55", "s56", "s57", "s59", "id"], axis=1)
df_train = encoder_ohe(df_train, PARAM_OHE_COLS)
df_train = oversampled_split(
                                df=df_train, 
                                col='label', 
                                props=[1], 
                                seed=PARAM_SEED, 
                                algorithm='SVMSMOTE', 
                                sampling_strategy='minority', 
                                k_neighbors=5, 
                                m_neighbors=10
                            )

df_val = data_loader(PARAM_VALIDATION_DATA_DIR)
df_val = df_val.drop(["s54", "s55", "s56", "s57", "s59", "id"], axis=1)
df_val = encoder_ohe(df_val, PARAM_OHE_COLS)

df_test = data_loader(PARAM_TEST_DATA_DIR)
comp_ids = list(df_test['id'])
df_test = df_test.drop(["s54", "s55", "s56", "s57", "s59", "id"], axis=1)
df_test = encoder_ohe(df_test, PARAM_OHE_COLS)

In [34]:
X_train = df_train[0].drop(['label'], axis=1)
X_test = df_val.drop(['label'], axis=1)

y_train = df_train[0]['label']
y_test = df_val['label']

### Model Training

In [35]:

clf1 = RandomForestClassifier(random_state=PARAM_SEED, n_estimators=300)
clf2 = XGBClassifier(learning_rate=0.01, n_estimators=1500,
                       max_depth=5, min_child_weight=0,
                       gamma=0, subsample=0.7,
                       colsample_bytree=0.7, nthread=-1,
                       reg_alpha=0.00006, random_state=PARAM_SEED)

eclf = EnsembleVoteClassifier(clfs=[clf1, clf2], voting='soft')


In [36]:
# # Cross Validation. Ignore for future training

# for clf, label in zip([clf1, clf2, eclf], 
#                       ['Random Forest', 
#                        'XGBoost', 
#                        'EnsembleVoteClassifier']):

#     scores = cross_val_score(clf, X_train, y_train, cv=3, scoring='roc_auc')
#     print("ROC_AUC: %0.2f [%s]" % (scores.mean(), label))

In [37]:
%%time
eclf_fit = eclf.fit(X_train.values, y_train.values)

CPU times: user 3min 21s, sys: 196 ms, total: 3min 21s
Wall time: 52 s


### Predict Test Set

In [38]:
y_pred = eclf_fit.predict_proba(X_test.values)[:,1]
y_score = [1 if value>=0.25 else 0 for value in list(y_pred)]

In [39]:
print('ROCAUC score:', roc_auc_score(y_test, y_score))
cm = confusion_matrix(y_test, y_score)
print(cm)
print(classification_report(y_test, y_score))
print(roc_auc_score(y_test, y_score))

ROCAUC score: 0.9431239388794568
[[4176  536]
 [   0  952]]
              precision    recall  f1-score   support

           0       1.00      0.89      0.94      4712
           1       0.64      1.00      0.78       952

    accuracy                           0.91      5664
   macro avg       0.82      0.94      0.86      5664
weighted avg       0.94      0.91      0.91      5664

0.9431239388794568


In [40]:
features = [col for col in df_test.columns if col != 'id']
y_pred_comp = eclf_fit.predict_proba(df_test[features].values)[:,1]
y_score_comp = [1 if value>=0.25 else 0 for value in list(y_comp)]

In [44]:
comp_dict = {"id": comp_ids, "label": y_score_comp}
submission_df = pd.DataFrame(comp_dict)

In [45]:
submission_df

Unnamed: 0,id,label
0,b'gAAAAABinOi328DZcweGB4_nOyHA3Dy6o1YKYKyf3COx...,1
1,b'gAAAAABinOikutEIBjkUXl9lYTg4RI6jc4NfiMUCcVsn...,1
2,b'gAAAAABinOjBM70jBXOroAlUSq5lNXMd_oP0PU7jLQE5...,1
3,b'gAAAAABinOimitAnqlgOcqnD_LeNL3WEbXNGvjd3QVPi...,0
4,b'gAAAAABinOi3W9p3Oka5MV_dc2TeorZUcIWOnnODSx7E...,0
...,...,...
85060,b'gAAAAABinOjbnJVk2-nOVQsYB9p4DK26fTLLik_UR2H0...,1
85061,b'gAAAAABinOi7ixyXrlKYlx8D9i0-TIPD5elP2k-vuekn...,1
85062,b'gAAAAABinOi31zWSlD0OMhbBd3_weh7Kq6aPeO4yYqns...,1
85063,b'gAAAAABinOjIe7jFVk9k7jiH8Y3rdpUHDTZG2T2isunp...,1


In [43]:
#submission_df.to_csv("eclf_v4_svmsmoat.csv", index=False)