# Prepare Environment

# Import External Dependencies

In [11]:
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import SVMSMOTE
import numpy as np
import pandas as pd
import random
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

# Data Manipulation Function Definitions

In [60]:
def data_loader(directory):
    """Read a CSV source into a pandas DataFrame.

    Args:
        directory: Path or buffer handed straight to ``pandas.read_csv``.

    Returns:
        The DataFrame parsed by ``pandas.read_csv`` with default options.
    """
    return pd.read_csv(directory)

def prop_split_df(df, props, seed):
    """Split a DataFrame into disjoint, shuffled pieces of given proportions.

    Rows are drawn without replacement by position, so the index labels of
    ``df`` are irrelevant (a non-default or duplicated index is fine).

    Args:
        df: DataFrame to split. It is not modified.
        props: Iterable of fractions of ``len(df)``; one output per entry.
            Each piece holds ``round(prop * len(df))`` rows, clamped to the
            number of rows still unassigned so rounding can never request
            more rows than remain.
        seed: Seed used both for choosing rows (``random`` module) and for
            the final per-piece shuffle (``DataFrame.sample``).

    Returns:
        A list of DataFrames, one per entry in ``props``, each with a fresh
        0..n-1 index.
    """
    # The global ``random`` generator is seeded, as before, so the drawn
    # positions are reproducible for a given seed.
    random.seed(seed)
    n_rows = df.shape[0]
    remaining = list(range(n_rows))
    position_sets = []
    for prop in props:
        # Clamp every request (not only the last one) to what is left.
        take = min(int(round(prop * n_rows)), len(remaining))
        chosen = set(random.sample(remaining, take))
        position_sets.append(chosen)
        remaining = list(set(remaining) - chosen)

    return [
        # ``iloc`` selects by position; sorting keeps the pre-shuffle row
        # order identical to the order in ``df``.
        df.iloc[sorted(chosen)].sample(
            frac=1,
            replace=False,
            random_state=seed,
            ignore_index=True,
        ).reset_index(drop=True)
        for chosen in position_sets
    ]

def stratified_split(df, col, props, seed):
    """Split ``df`` so every piece keeps the class proportions of ``col``.

    Each distinct value of ``col`` is split independently with
    ``prop_split_df`` and the i-th pieces of all classes are concatenated.

    Args:
        df: DataFrame to split. It is not modified.
        col: Name of the column holding the class labels.
        props: Proportions forwarded to ``prop_split_df`` for every class.
        seed: Seed forwarded to ``prop_split_df`` for every class.

    Returns:
        A list with one DataFrame per entry in ``props``. All columns are
        cast to ``object`` dtype before concatenation and the index is reset.
    """
    labels = list(pd.unique(df[col]))
    try:
        # A sorted order makes the row order of the result reproducible;
        # iterating a ``set`` of strings varies between interpreter runs.
        labels = sorted(labels)
    except TypeError:
        # Labels that cannot be compared keep their first-appearance order.
        pass

    per_class_splits = [
        prop_split_df(
            df=df[df[col] == label].reset_index(drop=True),
            props=props,
            seed=seed,
        )
        for label in labels
    ]

    # Transpose with zip: building an object ndarray from DataFrames can
    # unpack equally-shaped frames into extra array dimensions.
    return [
        pd.concat(
            [part.astype(object) for part in parts],
            axis=0,
        ).reset_index(drop=True)
        for parts in zip(*per_class_splits)
    ]

def oversampled_split(df, col, props, seed, algorithm, sampling_strategy, k_neighbors=5, m_neighbors=10):
    """Oversample ``df`` with a SMOTE variant, then split it by proportion.

    Args:
        df: DataFrame holding features and the target column. Not modified.
        col: Name of the target column passed to the sampler as ``y``.
        props: Proportions forwarded to ``prop_split_df``.
        seed: Seed for the sampler and for ``prop_split_df``.
        algorithm: Case-insensitive sampler name: ``"SMOTE"``,
            ``"BorderlineSMOTE"`` or ``"SVMSMOTE"``.
        sampling_strategy: Forwarded to the sampler.
        k_neighbors: Forwarded to the sampler.
        m_neighbors: Forwarded to ``BorderlineSMOTE`` / ``SVMSMOTE`` only.

    Returns:
        The list of DataFrames produced by ``prop_split_df`` on the
        resampled data (features first, target column last).

    Raises:
        ValueError: If ``algorithm`` is not one of the supported names.
    """
    # NOTE(review): resampling happens before the split, so synthetic rows
    # can share neighbours across pieces - confirm this is intended.
    # NOTE(review): ``n_jobs`` may be rejected by recent imbalanced-learn
    # releases - verify against the installed version.
    algorithm_name = algorithm.upper()
    if algorithm_name == "SMOTE":
        smote_algorithm = SMOTE(
            sampling_strategy=sampling_strategy,
            random_state=seed,
            k_neighbors=k_neighbors,
            n_jobs=-1,
        )
    elif algorithm_name == "BORDERLINESMOTE":
        smote_algorithm = BorderlineSMOTE(
            sampling_strategy=sampling_strategy,
            random_state=seed,
            k_neighbors=k_neighbors,
            n_jobs=-1,
            m_neighbors=m_neighbors,
        )
    elif algorithm_name == "SVMSMOTE":
        smote_algorithm = SVMSMOTE(
            sampling_strategy=sampling_strategy,
            random_state=seed,
            k_neighbors=k_neighbors,
            n_jobs=-1,
            m_neighbors=m_neighbors,
        )
    else:
        # Previously an unknown name surfaced later as an UnboundLocalError.
        raise ValueError(
            f"Unsupported algorithm {algorithm!r}; expected 'SMOTE', "
            "'BorderlineSMOTE' or 'SVMSMOTE'."
        )

    df = df.copy(deep=True)
    resampled_features, resampled_targets = smote_algorithm.fit_resample(
        df.drop([col], axis=1), df[col]
    )
    df = pd.concat([resampled_features, resampled_targets], axis=1).reset_index(drop=True)
    return prop_split_df(df, props, seed)

def undersampled_split(df, col, props, seed):
    """Balance classes by random undersampling, then split by proportion.

    Every class in ``col`` is reduced to the same number of rows, namely
    ``round(min(sum(props), 1.0) * minority_class_size)``, and the balanced
    frame is handed to ``prop_split_df``.

    Args:
        df: DataFrame holding features and the label column. Not modified.
        col: Name of the label column. Rows whose label is missing are
            ignored.
        props: Proportions forwarded to ``prop_split_df``.
        seed: Seed for the per-class sampling and for ``prop_split_df``.

    Returns:
        The list of DataFrames produced by ``prop_split_df``.

    Raises:
        ValueError: If ``col`` contains no non-missing labels.
    """
    # Count rows per class directly; counting non-null feature cells would
    # shrink the result when features contain NaN and fail outright when the
    # label is the only column.
    class_sizes = df[col].value_counts()
    if class_sizes.empty:
        raise ValueError(f"Column {col!r} contains no labels to balance.")
    minority_size = int(class_sizes.min())

    # NOTE(review): ``props`` scales the per-class sample here and is applied
    # again inside prop_split_df - confirm the double scaling is intended.
    global_prop = min(sum(props), 1.0)
    rows_per_class = int(round(global_prop * minority_size))

    labels = list(class_sizes.index)
    try:
        # A fixed class order keeps the concatenated row order, and therefore
        # the rows picked by prop_split_df, reproducible for a given seed.
        labels = sorted(labels)
    except TypeError:
        # Labels that cannot be compared keep the value_counts order.
        pass

    balanced = pd.concat(
        [
            df[df[col] == label].sample(
                rows_per_class,
                replace=False,
                random_state=seed,
                ignore_index=True,
            )
            for label in labels
        ],
        axis=0,
    ).reset_index(drop=True)
    return prop_split_df(balanced, props, seed)


Unnamed: 0_level_0,id,gender,s11,s12,s13,s16,s17,s18,s48,s52,...,n6,n7,n8,n9,n10,n11,n12,n13,n14,n15
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,3808,3808,3808,3808,3808,3808,3808,3808,3808,3808,...,3808,3808,3808,3808,3808,3808,3808,3808,3808,3808
1,3808,3808,3808,3808,3808,3808,3808,3808,3808,3808,...,3808,3808,3808,3808,3808,3808,3808,3808,3808,3808


# Preprocessor Function Definitions

In [None]:
def encoder_ohe(df, cols):
    """One-hot encode the given columns, keeping each one's position.

    Args:
        df: Input DataFrame; a deep copy is encoded, the input is untouched.
        cols: Names of the columns to encode, processed in order.

    Returns:
        A DataFrame where every listed column is replaced, in place, by the
        integer indicator columns named by the fitted ``OneHotEncoder``.
    """
    result = df.copy(deep=True)

    for target in cols:
        ohe = OneHotEncoder(categories="auto", dtype=int)
        columns_before = list(result.columns)
        indicator_matrix = ohe.fit_transform(result[[target]]).toarray()
        indicator_names = list(ohe.get_feature_names_out())

        # One new column per category, taken column-wise from the matrix.
        for position, indicator_name in enumerate(indicator_names):
            result[indicator_name] = indicator_matrix[:, position]

        # Splice the indicator columns in where the source column was.
        where = columns_before.index(target)
        ordered = columns_before[:where] + indicator_names + columns_before[where + 1:]
        result = result[ordered]

    return result

def decoder_ohe(df, cols):
    """Collapse one-hot indicator columns back into single label columns.

    For each name in ``cols`` the columns called ``"<name>_<category>"`` are
    replaced by one column ``<name>`` holding, per row, the category whose
    indicator is largest. Categories are recovered from the column names and
    are therefore always strings.

    Args:
        df: Input DataFrame; a deep copy is decoded, the input is untouched.
        cols: Original column names to restore, processed in order. The
            indicator columns of each name are expected to be adjacent.

    Returns:
        A DataFrame with each indicator group replaced by its decoded
        column, placed where the group started.

    Raises:
        ValueError: If a name has no indicator columns, or a row has no
            positive indicator in a group.
    """
    df = df.copy(deep=True)

    for col in cols:
        # Match on "<col>_" so that e.g. "n1" does not swallow "n10_*".
        prefix = f"{col}_"
        original_cols = list(df.columns)
        encoded_cols = [
            name for name in original_cols
            if isinstance(name, str) and name.startswith(prefix)
        ]
        if not encoded_cols:
            raise ValueError(f"No one-hot columns found for {col!r}.")

        # Strip the prefix by length so underscores inside the column name
        # or inside the category survive.
        categories = np.array(
            [name[len(prefix):] for name in encoded_cols], dtype=object
        )

        indicators = df[encoded_cols].to_numpy(dtype=float)
        if (indicators.max(axis=1) <= 0).any():
            raise ValueError(f"Some rows have no active category for {col!r}.")

        first_index = original_cols.index(encoded_cols[0])
        last_index = original_cols.index(encoded_cols[-1])
        new_cols = original_cols[:first_index] + [col] + original_cols[last_index + 1:]

        # argmax also copes with non-binary indicator values.
        df[col] = categories[indicators.argmax(axis=1)]
        df = df[new_cols]

    return df

def encoder_ord(df, cols):
    """Ordinal-encode the given columns and record the category order.

    Args:
        df: Input DataFrame; a deep copy is encoded, the input is untouched.
        cols: Names of the columns to encode, each with its own encoder.

    Returns:
        A ``(encoded_df, mapper)`` tuple. ``mapper`` maps every encoded
        column name to the list of its categories, where the list position
        of a category is the integer code written into ``encoded_df``.
    """
    encoded = df.copy(deep=True)
    mapper = {}

    for column in cols:
        ordinal = OrdinalEncoder(categories="auto", dtype=int)
        encoded[column] = ordinal.fit_transform(encoded[[column]])
        # A single column was fitted, so its categories are at position 0.
        mapper[column] = list(ordinal.categories_[0])

    return encoded, mapper

def decoder_ord(df, cols, mapper):
    """Turn ordinal codes back into their original category values.

    Args:
        df: Input DataFrame; a deep copy is decoded, the input is untouched.
        cols: Names of the columns to decode.
        mapper: Mapping from column name to a sequence of categories; each
            cell value is used as an index into that sequence.

    Returns:
        A DataFrame with the listed columns replaced by category values.
    """
    decoded = df.copy(deep=True)

    def _category_for(code, column):
        # Looked up per cell, exactly like the indexing it replaces.
        return mapper[column][code]

    for column in cols:
        decoded[column] = decoded[column].apply(_category_for, args=(column,))

    return decoded