In [1]:
from function.validation import *
from function.rsmote import *
from function.SMOTERounding import *

In [2]:
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GridSearchCV, RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
from imblearn.pipeline import make_pipeline

from imblearn.over_sampling import SMOTE, SMOTEN, SMOTENC, SVMSMOTE, KMeansSMOTE, RandomOverSampler,BorderlineSMOTE, ADASYN
from imblearn.under_sampling import TomekLinks, EditedNearestNeighbours, NearMiss, RepeatedEditedNearestNeighbours
from imblearn.combine import SMOTEENN, SMOTETomek 
from imblearn.metrics import geometric_mean_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
# GN: antimicrobial names for the GN dataset.
ans_gn_name = [
    'amikacin',
    'amoxicillin/clavulanic acid',
    'cefalexin',
    'cefovecin',
    'doxycycline',
    'enrofloxacin',
    'gentamicin',
    'imipenem',
    'marbofloxacin',
    'nitrofurantoin',
    'trimethoprim/sulfamethoxazole',
]
# Target column names: each antimicrobial name prefixed with "ans_".
gn_ans_col_name = [f"ans_{drug}" for drug in ans_gn_name]

# GP: antimicrobial names for the GP dataset.
ans_gp_name = [
    'amikacin',
    'amoxicillin/clavulanic acid',
    'cefalexin',
    'cefovecin',
    'clindamycin',
    'doxycycline',
    'enrofloxacin',
    'marbofloxacin',
    'nitrofurantoin',
    'trimethoprim/sulfamethoxazole',
    'vancomycin',
]
# Target column names: each antimicrobial name prefixed with "ans_".
gp_ans_col_name = [f"ans_{drug}" for drug in ans_gp_name]

Count : Majority / Minority

In [3]:
# Count the majority / minority class sizes of every antimicrobial label,
# using the Train and Test splits together.
vitek_id = "GP"
ans_col_name = gn_ans_col_name if vitek_id == "GN" else gp_ans_col_name

# Collect one record per antimicrobial and build the frame once at the end
# (DataFrame.append was removed in pandas 2.0 and was quadratic in a loop).
count_records = []
for ans_col in ans_col_name:
    # File names use "_" where the antimicrobial name contains "/".
    anti_name = ans_col.replace("ans_", "").replace("/", "_")
    df_train = pd.read_csv(f"./Dataset/{vitek_id}/Train/Train_{vitek_id}_{anti_name}.csv")
    df_test = pd.read_csv(f"./Dataset/{vitek_id}/Test/Test_{vitek_id}_{anti_name}.csv")
    df_all = pd.concat([df_train, df_test])

    # value_counts() sorts by descending frequency, so position 0 is the
    # majority class and position 1 the minority class. Positional access
    # avoids the label-vs-position ambiguity of value_count[0] / [1].
    value_count = df_all[ans_col].value_counts()
    majority = int(value_count.iloc[0])
    minority = int(value_count.iloc[1])
    count_records.append({
        "antimicrobial": anti_name.replace("_", "/"),
        "majority": majority,
        "minority": minority,
        "ratio": np.round(majority / minority, 2),
    })

df_count = pd.DataFrame(count_records, columns=["antimicrobial", "majority", "minority", "ratio"])
df_count["majority"] = df_count["majority"].astype(int)
df_count["minority"] = df_count["minority"].astype(int)

In [4]:
# Display the per-antimicrobial majority/minority counts and their ratio.
df_count

Unnamed: 0,antimicrobial,majority,minority,ratio
0,amikacin,2932,190,15.43
1,amoxicillin/clavulanic acid,1711,1411,1.21
2,cefalexin,2218,904,2.45
3,cefovecin,2159,963,2.24
4,clindamycin,2296,826,2.78
5,doxycycline,2946,176,16.74
6,enrofloxacin,3007,115,26.15
7,marbofloxacin,2960,162,18.27
8,nitrofurantoin,2921,201,14.53
9,trimethoprim/sulfamethoxazole,2094,1028,2.04


Read original datasets (Excel)

In [6]:
# Load the original GN and GP workbooks. In this notebook they are passed to
# crossSMOTE, which reads only their column names starting with "S/I/R_".
df_gn = pd.read_excel("./Dataset/Origin/trained_GN_Dataset_CS.xlsx")
df_gp = pd.read_excel("./Dataset/Origin/trained_GP_Dataset_CS.xlsx")

Cross Validation

In [7]:
def training(X_train, y_train):
    """Cross-validate XGBoost under several imbalance-handling samplers.

    For each sampler (including "None" for no resampling) this calls the
    project helper ``cross_validation`` with a 10-fold stratified splitter,
    a one-hot-encoding preprocessing step, and an optional resampling step.

    Args:
        X_train: Feature DataFrame; it is one-hot encoded with
            ``pd.get_dummies`` inside the preprocessing step.
        y_train: Label Series. It is cast to bool for training and its
            ``name`` (with the "ans_" prefix removed) labels the result.

    Returns:
        DataFrame with the results of every sampler stacked, indexed by
        ("Antimicrobial", "Algorithm", "Imblanace Handlering").
    """
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
    xgb = XGBClassifier(eval_metric=f1_score, verbosity=0, random_state=0, tree_method='gpu_hist', gpu_id=1,)
    smotes = {
        "None": None,
        "SMOTE": SMOTE(random_state=0, n_jobs=-1),
        "ADASYN": ADASYN(random_state=0, n_jobs=-1),
        "BorderlineSMOTE": BorderlineSMOTE(random_state=0, n_jobs=-1),
        "SVMSMOTE": SVMSMOTE(random_state=0, n_jobs=-1),
        "RSMOTE": RSmoteKClasses(random_state=0),
    }

    def _one_hot_encode(_X_train, _X_test, _y_train, _y_test):
        # One-hot encode the training fold, then let the project helper build
        # the test fold from the training dummies. get_dummies is evaluated
        # separately for the helper so it receives its own frame.
        return (
            pd.get_dummies(_X_train),
            get_dummies_dataframe_columns(pd.get_dummies(_X_train), _X_test),
            _y_train,
            _y_test,
        )

    def _make_resampler(sampler):
        # Bind the sampler explicitly instead of closing over the loop
        # variable, so the callable stays correct even if it is invoked later.
        def _resample(_X, _y):
            return SMOTERounding(sampler).fit_resample(_X, _y)
        return _resample

    # Gather per-sampler results and concatenate once; DataFrame.append was
    # removed in pandas 2.0.
    sampler_results = []
    for sampler_name, sampler in smotes.items():
        resample_steps = [] if sampler is None else [_make_resampler(sampler)]
        a = cross_validation(X_train, y_train.astype(bool), skf, {"XGB": xgb},
                             [_one_hot_encode],
                             resample_steps
                             )

        a.index.name = "Algorithm"
        # NOTE: the misspelled column label is kept as-is because it ends up
        # in the exported CSV and downstream consumers may rely on it.
        a["Imblanace Handlering"] = sampler_name
        sampler_results.append(a)

    df_cross = pd.concat(sampler_results)
    df_cross["Antimicrobial"] = y_train.name.replace("ans_", "")
    df_cross = df_cross.reset_index().set_index(["Antimicrobial", "Algorithm", "Imblanace Handlering"])
    return df_cross

In [9]:
def crossSMOTE(vitek_id, df_report):
    """Run ``training`` for every antimicrobial of one dataset and stack the results.

    Args:
        vitek_id: "GN" selects ``gn_ans_col_name``; any other value selects
            ``gp_ans_col_name``. It is also used to build the training CSV paths.
        df_report: DataFrame whose columns starting with "S/I/R_" are used
            as feature columns (only its column names are read).

    Returns:
        DataFrame with the per-antimicrobial results of ``training``
        concatenated in the order of the label-column list.
    """
    sir_col_name = df_report.columns[df_report.columns.str.startswith("S/I/R_")]
    ans_col_name = gn_ans_col_name if vitek_id == "GN" else gp_ans_col_name
    feature_cols = ["species", "bacteria_genus", "submitted_sample_category"] + list(sir_col_name)

    # Collect the per-antimicrobial frames and concatenate once;
    # DataFrame.append was removed in pandas 2.0.
    cross_evals = []
    # Iterate over the label columns themselves rather than a hard-coded count.
    for ans_col in ans_col_name:
        # File names use "_" where the antimicrobial name contains "/".
        anti_name = ans_col.replace("ans_", "").replace("/", "_")
        anti_train = pd.read_csv(f"./Dataset/{vitek_id}/Train/Train_{vitek_id}_{anti_name}.csv")
        X_train = anti_train[feature_cols]
        y_train = anti_train[ans_col]
        cross_evals.append(training(X_train, y_train))

    # pd.concat rejects an empty list; keep the empty-frame result in that case.
    return pd.concat(cross_evals) if cross_evals else pd.DataFrame()

In [10]:
# Run the sampler comparison on the GN training files, using the S/I/R columns of df_gn.
CrossGN = crossSMOTE("GN", df_gn)

In [None]:
# Run the sampler comparison on the GP training files. vitek_id must be "GP"
# here: it selects both the GP label columns and the GP training CSV paths.
CrossGP = crossSMOTE("GP", df_gp)

In [12]:
# Save the GN comparison results as CSV.
CrossGN.to_csv("./Evaluation Result/GN_10_Fold_Cross_Validation_Compare_SMOTE.csv")

In [None]:
# Save the GP comparison results as CSV.
CrossGP.to_csv("./Evaluation Result/GP_10_Fold_Cross_Validation_Compare_SMOTE.csv")