In [1]:
from function.validation import *
from function.rsmote import *
from function.SMOTERounding import *

import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GridSearchCV, RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
from imblearn.pipeline import make_pipeline

from imblearn.over_sampling import SMOTE, SMOTEN, SMOTENC, SVMSMOTE, KMeansSMOTE, RandomOverSampler,BorderlineSMOTE, ADASYN
from imblearn.under_sampling import TomekLinks, EditedNearestNeighbours, NearMiss, RepeatedEditedNearestNeighbours
from imblearn.combine import SMOTEENN, SMOTETomek 
from imblearn.metrics import geometric_mean_score

from IPython.display import Markdown, display
import os

import warnings
warnings.filterwarnings('ignore')

In [None]:
pip list

In [2]:
# GN
ans_gn_name = ['amikacin',
'amoxicillin/clavulanic acid',
'cefalexin',
'cefovecin',
'doxycycline',
'enrofloxacin',
'gentamicin',
'imipenem',
'marbofloxacin',
'nitrofurantoin',
'trimethoprim/sulfamethoxazole']
gn_ans_col_name = ["ans_" + e for e in ans_gn_name]

# GP
ans_gp_name = ['amikacin',
 'amoxicillin/clavulanic acid',
 'cefalexin',
 'cefovecin',
 'clindamycin',
 'doxycycline',
 'enrofloxacin',
 'marbofloxacin',
 'nitrofurantoin',
 'trimethoprim/sulfamethoxazole',
 'vancomycin']
gp_ans_col_name = ["ans_" + e for e in ans_gp_name]

กำหนด SMOTE Alogorithm ตามที่เลือกไว้จากการเปรียบเทียบในไฟล์ 2_cross_validation_compare_smote.ipynb

In [3]:
GN_SMOTE = [
    RSmoteKClasses(random_state=0), #amikancin 
    SVMSMOTE(random_state=0, n_jobs=-1), #amoxicillin/clavulanic acid
    RSmoteKClasses(random_state=0), #cefalexin
    ADASYN(random_state=0, n_jobs=-1), #cefovecin
    BorderlineSMOTE(random_state=0, n_jobs=-1), #doxycycline
    RSmoteKClasses(random_state=0), #enrofloxacin
    RSmoteKClasses(random_state=0), #gentamicin
    SVMSMOTE(random_state=0 ,n_jobs=-1), #imipenem
    RSmoteKClasses(random_state=0), #marbofloxacin
    SVMSMOTE(random_state=0, n_jobs=-1), #nitrofurantoin
    SVMSMOTE(random_state=0, n_jobs=-1), #trimethoprim/sulfamethoxazole
]

GP_SMOTE = [
    BorderlineSMOTE(random_state=0, n_jobs=-1), #amikancin 
    BorderlineSMOTE(random_state=0, n_jobs=-1), #amoxicillin/clavulanic acid
    SMOTE(random_state=0, n_jobs=-1), #cefalexin
    SVMSMOTE(random_state=0, n_jobs=-1), #cefovecin
    SVMSMOTE(random_state=0, n_jobs=-1), #clindamycin
    RSmoteKClasses(random_state=0), #doxycycline
    RSmoteKClasses(random_state=0), #enrofloxacin
    BorderlineSMOTE(random_state=0, n_jobs=-1), #marbofloxacin
    RSmoteKClasses(random_state=0), #nitrofurantoin
    SVMSMOTE(random_state=0, n_jobs=-1), #trimethoprim/sulfamethoxazole
    SMOTE(random_state=0, n_jobs=-1) #vancomycin
]

In [4]:
gn_smote_name = [
    "R-SMOTE", #amikancin 
    "SVM-SMOTE", #amoxicillin/clavulanic acid
    "R-SMOTE", #cefalexin
    "ADASYN", #cefovecin
    "Borderline-SMOTE", #doxycycline
    "R-SMOTE", #enrofloxacin
    "R-SMOTE", #gentamicin
    "SVM-SMOTE", #imipenem
    "R-SMOTE", #marbofloxacin
    "SVM-SMOTE", #nitrofurantoin
    "SVM-SMOTE", #trimethoprim/sulfamethoxazole
]

gp_smote_name = [
    "Borderline-SMOTE", #amikancin 
    "Borderline-SMOTE", #amoxicillin/clavulanic acid
    "SMOTE", #cefalexin
    "SVM-SMOTE", #cefovecin
    "SVM-SMOTE", #clindamycin
    "R-SMOTE", #doxycycline
    "R-SMOTE", #enrofloxacin
    "Borderline-SMOTE", #marbofloxacin
    "R-SMOTE", #nitrofurantoin
    "SVM-SMOTE", #trimethoprim/sulfamethoxazole
    "SMOTE" #vancomycin
]

In [5]:
def getData(vitek_id: str, i : int):
    ans_col_name = gn_ans_col_name if vitek_id == "GN" else gp_ans_col_name
    ans_name = ans_col_name[i].replace("ans_", "").replace("/", "_")
    df_train = pd.read_csv(f"./Dataset/{vitek_id}/Train/Train_{vitek_id}_{ans_name}.csv")

    X_train = df_train[["species","submitted_sample_category","bacteria_genus"] + list(df_train.columns[df_train.columns.str.startswith("S/I/R")])]
    y_train = df_train[ans_col_name[i]]
    return X_train , y_train 

In [29]:
def getModel(vitek_id: str, anti_name: str): 
    #anti_name = anti_name.replace("/", "_") 
    gs = joblib.load(f"Grid_Search_CV/{vitek_id}/{anti_name}.joblib")
    print("/***********************")
    print(vitek_id,anti_name)
    params = gs.get_params()
    print(params)
    xgb = XGBClassifier(eval_metric=f1_score, verbosity=0, use_label_encoder=False,
                        random_state=0, tree_method='gpu_hist', gpu_id=1,
                        colsample_bytree=params["colsample_bytree"],
                        gamma=params["gamma"],
                        learning_rate=params["learning_rate"],
                        max_depth=params["max_depth"],
                        n_estimators=params["n_estimators"],
                        subsample=params["subsample"])
    return xgb

In [None]:
getModel

In [7]:
def crossValidation(vitek_id: str):
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
    ans_col_name = gn_ans_col_name if vitek_id == "GN" else gp_ans_col_name
    smote = GN_SMOTE if vitek_id == "GN" else GP_SMOTE
    
    eval_df = pd.DataFrame()
    for i in range(11): 
        anti_name = ans_col_name[i].replace("ans_", "").replace("/", "_")
        df_train = pd.read_csv(f"./Dataset/{vitek_id}/Train/Train_{vitek_id}_{anti_name}.csv")
        sir_col_name = df_train.columns[df_train.columns.str.startswith("S/I/R_")]
        X_train = df_train[["species", "bacteria_genus", "submitted_sample_category"]+ list(sir_col_name)]
        y_train = df_train[ans_col_name[i]]
        
        # xgb_model = getModel(vitek_id, anti_name)
        xgb_model = XGBClassifier(eval_metric=f1_score, verbosity=0, use_label_encoder=False,
                        random_state=0, tree_method='gpu_hist', gpu_id=1)
        # print(anti_name)
        cross_df =  cross_validation(X_train, y_train, skf,
                    {anti_name.replace("_", "/"): xgb_model},
                    [lambda _X_train, _X_test, _y_train, _y_test: (pd.get_dummies(_X_train),
                    get_dummies_dataframe_columns(pd.get_dummies(_X_train), _X_test), _y_train, _y_test)],
                    [lambda _X, _y: SMOTERounding(smote[i]).fit_resample(_X, _y)]
                    )
        eval_df = eval_df.append(cross_df)
    return eval_df

In [8]:
# ใช้พารามิเตอร์ที่จูนด้วย grid_search (ใช้ Test set วัดประสิทธิภาพ)
from tqdm import tnrange
def eval_test_paramerter_tuning(vitek_id: str):
    ans_col_name = gn_ans_col_name if vitek_id == "GN" else gp_ans_col_name
    smote = GN_SMOTE if vitek_id == "GN" else GP_SMOTE
    eval_df = pd.DataFrame()
    schema = {}
    smote_name = gn_smote_name if vitek_id == "GN" else gp_smote_name
    for i in tnrange(11, desc='tnrange'): 
        anti_name = ans_col_name[i].replace("ans_", "").replace("/", "_")
        df_train = pd.read_csv(f"./Dataset/{vitek_id}/Train/Train_{vitek_id}_{anti_name}.csv")
        df_test = pd.read_csv(f"./Dataset/{vitek_id}/Test/Test_{vitek_id}_{anti_name}.csv")
        sir_col_name = df_train.columns[df_train.columns.str.startswith("S/I/R_")]
        X_train = df_train[["species", "bacteria_genus", "submitted_sample_category"]+ list(sir_col_name)]
        X_test = df_test[["species", "bacteria_genus", "submitted_sample_category"]+ list(df_test.columns[df_test.columns.str.startswith("S/I/R_")])]
        y_train = df_train[ans_col_name[i]]
        y_test = df_test[ans_col_name[i]]
        
        xgb_model = getModel(vitek_id, anti_name) # XGB Model
        X_train_dummies = pd.get_dummies(X_train) # One-Hot Train
        X_test_dummies = get_dummies_dataframe_columns(X_train_dummies, X_test) # One-Hot Test
        X_train_res, y_train_res = SMOTERounding(smote[i]).fit_resample(X_train_dummies, y_train) # SMOTE
        
        xgb_model.fit(X_train_res, y_train_res)
        
        joblib.dump(xgb_model, f"./Model/{vitek_id}/{anti_name}.joblib") # dump model
        schema[anti_name] = list(X_train_res.columns) # schema
        
        # result = evaluation(X_test_dummies, y_test.astype(bool), {"Before": xgb_default, "After": xgb_model})
        result = evaluation(X_test_dummies, y_test.astype(bool), {"After": xgb_model})
        result.index.name = "Before/After"
        result.insert(0, "Antimicrobial", ans_col_name[i].replace("ans_", ""))
        eval_df = eval_df.append(result)
        
    # export schema
    schema_path = f"./Model/{vitek_id}/{vitek_id}_schema.txt"
    schema_file = open(schema_path ,'w') if os.path.exists(schema_path) else open(schema_path ,'x')
    schema_file.write(str(schema))
    schema_file.close()
        
    return eval_df

In [14]:
# ใช้ defult parameter (ใช้ Test set วัดประสิทธิภาพ)
from tqdm import tnrange
def eval_test_default_parameter(vitek_id: str):
    ans_col_name = gn_ans_col_name if vitek_id == "GN" else gp_ans_col_name
    smote = GN_SMOTE if vitek_id == "GN" else GP_SMOTE
    eval_df = pd.DataFrame()
    schema = {}
    smote_name = gn_smote_name if vitek_id == "GN" else gp_smote_name
    for i in tnrange(11, desc='tnrange'):
        anti_name = ans_col_name[i].replace("ans_", "").replace("/", "_")
        df_train = pd.read_csv(f"./Dataset/{vitek_id}/Train/Train_{vitek_id}_{anti_name}.csv")
        df_test = pd.read_csv(f"./Dataset/{vitek_id}/Test/Test_{vitek_id}_{anti_name}.csv")
        sir_col_name = df_train.columns[df_train.columns.str.startswith("S/I/R_")]
        X_train = df_train[["species", "bacteria_genus", "submitted_sample_category"]+ list(sir_col_name)]
        X_test = df_test[["species", "bacteria_genus", "submitted_sample_category"]+ list(df_test.columns[df_test.columns.str.startswith("S/I/R_")])]
        y_train = df_train[ans_col_name[i]]
        y_test = df_test[ans_col_name[i]]
        
        xgb_default = XGBClassifier(eval_metric=f1_score, verbosity=0, random_state=0, tree_method='gpu_hist', gpu_id=1, use_label_encoder=False)
        X_train_dummies = pd.get_dummies(X_train) # One-Hot Train
        X_test_dummies = get_dummies_dataframe_columns(X_train_dummies, X_test) # One-Hot Test
        X_train_res, y_train_res = SMOTERounding(smote[i]).fit_resample(X_train_dummies, y_train) # SMOTE
        
        xgb_default.fit(X_train_res, y_train_res)
        
        joblib.dump(xgb_default, f"./Model/{vitek_id}/{anti_name}.joblib") # dump model
        schema[anti_name] = list(X_train_res.columns) # schema
        
        result = evaluation(X_test_dummies, y_test.astype(bool), {"Before": xgb_default})
        result.index.name = "Before/After"
        result.insert(0, "Antimicrobial", ans_col_name[i].replace("ans_", ""))
        eval_df = eval_df.append(result)
    
    # Export Schema (Schema ชื่อ columns ของข้อมูลสำหรับนำไปเรียกใช้งานโมเดล)
    schema_path = f"./Model/{vitek_id}/{vitek_id}_schema.txt"
    schema_file = open(schema_path ,'w') if os.path.exists(schema_path) else open(schema_path ,'x')
    schema_file.write(str(schema))
    schema_file.close()
        
    return eval_df

In [None]:
eval_test_default_parameter("GN")

In [None]:
eval_test_default_parameter("GP")

In [None]:
crossValidation("GN")

In [None]:
crossValidation("GP")

In [30]:
eval_test_paramerter_tuning("GN")

tnrange:   0%|          | 0/11 [00:00<?, ?it/s]

/***********************
GN amikacin
{'cv': StratifiedKFold(n_splits=10, random_state=0, shuffle=True), 'error_score': nan, 'estimator__memory': None, 'estimator__steps': (('sam', <function.SMOTERounding.SMOTERounding object at 0x00000215DDFB1BE0>), ('clf', XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False,
              eval_metric=<function f1_score at 0x00000215DBB9C5E0>,
              feature_types=None, gamma=None, gpu_id=1, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, n_estimators=100, n_jobs=None,
              num_pa

TypeError: 'dict' object is not callable

In [22]:
eval_test_paramerter_tuning("GP")

tnrange:   0%|          | 0/11 [00:00<?, ?it/s]

{'cv': StratifiedKFold(n_splits=10, random_state=0, shuffle=True), 'error_score': nan, 'estimator__memory': None, 'estimator__steps': (('sam', <function.SMOTERounding.SMOTERounding object at 0x00000215DD78FD30>), ('clf', XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False,
              eval_metric=<function f1_score at 0x00000215DBB9C5E0>,
              feature_types=None, gamma=None, gpu_id=1, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, n_estimators=100, n_jobs=None,
              num_parallel_tree=None, predictor=None, ran

KeyError: 'colsample_bytree'