In [1]:
from function.validation import *
from function.rsmote import *
from function.SMOTERounding import *

import os
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GridSearchCV, RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer
from sklearn.decomposition import PCA
from xgboost import XGBClassifier, plot_importance
from imblearn.pipeline import Pipeline

from imblearn.over_sampling import SMOTE, SMOTEN, SMOTENC, SVMSMOTE, KMeansSMOTE, RandomOverSampler,BorderlineSMOTE, ADASYN
from imblearn.under_sampling import TomekLinks, EditedNearestNeighbours, NearMiss, RepeatedEditedNearestNeighbours
from imblearn.combine import SMOTEENN, SMOTETomek 
from imblearn.metrics import geometric_mean_score

from IPython.display import Markdown, display
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [2]:
from time import sleep

from tqdm.notebook import tnrange, tqdm

# Runs one tqdm bar and one tnrange bar, 10 steps of 0.5 s each --
# presumably a check that the notebook progress widgets render.
for _ in tqdm(range(10), desc='tqdm'):
    sleep(0.5)

for _ in tnrange(10, desc='tnrange'):
    sleep(0.5)

tqdm:   0%|          | 0/10 [00:00<?, ?it/s]

tnrange:   0%|          | 0/10 [00:00<?, ?it/s]

กำหนด SMOTE Algorithm ตามที่เลือกไว้จากการเปรียบเทียบในไฟล์ 2_cross_validation_compare_smote.ipynb

In [3]:
# SMOTE algorithm per drug, listed in drug-name order (drug names are sorted
# alphabetically). The tuning loops in this notebook pick an entry by position
# (smote[i]), so each list must stay aligned with its drug-name list.
GN_SMOTE = [
    RSmoteKClasses(random_state=0),  # amikacin
    SVMSMOTE(random_state=0, n_jobs=-1),  # amoxicillin/clavulanic acid
    RSmoteKClasses(random_state=0),  # cefalexin
    ADASYN(random_state=0, n_jobs=-1),  # cefovecin
    BorderlineSMOTE(random_state=0, n_jobs=-1),  # doxycycline
    RSmoteKClasses(random_state=0),  # enrofloxacin
    RSmoteKClasses(random_state=0),  # gentamicin
    SVMSMOTE(random_state=0 ,n_jobs=-1),  # imipenem
    RSmoteKClasses(random_state=0),  # marbofloxacin
    SVMSMOTE(random_state=0, n_jobs=-1),  # nitrofurantoin
    SVMSMOTE(random_state=0, n_jobs=-1),  # trimethoprim/sulfamethoxazole
]

# Same layout for the GP panel (note: clindamycin and vancomycin appear here,
# gentamicin and imipenem do not).
GP_SMOTE = [
    BorderlineSMOTE(random_state=0, n_jobs=-1),  # amikacin
    BorderlineSMOTE(random_state=0, n_jobs=-1),  # amoxicillin/clavulanic acid
    SMOTE(random_state=0, n_jobs=-1),  # cefalexin
    SVMSMOTE(random_state=0, n_jobs=-1),  # cefovecin
    SVMSMOTE(random_state=0, n_jobs=-1),  # clindamycin
    RSmoteKClasses(random_state=0),  # doxycycline
    RSmoteKClasses(random_state=0),  # enrofloxacin
    BorderlineSMOTE(random_state=0, n_jobs=-1),  # marbofloxacin
    RSmoteKClasses(random_state=0),  # nitrofurantoin
    SVMSMOTE(random_state=0, n_jobs=-1),  # trimethoprim/sulfamethoxazole
    SMOTE(random_state=0, n_jobs=-1)  # vancomycin
]

In [4]:
# GN
# Drug names for the GN panel, in alphabetical order.
ans_gn_name = [
    "amikacin",
    "amoxicillin/clavulanic acid",
    "cefalexin",
    "cefovecin",
    "doxycycline",
    "enrofloxacin",
    "gentamicin",
    "imipenem",
    "marbofloxacin",
    "nitrofurantoin",
    "trimethoprim/sulfamethoxazole",
]
# Target-column names: every drug name prefixed with "ans_".
gn_ans_col_name = [f"ans_{drug}" for drug in ans_gn_name]

# Drug names for the GP panel, in alphabetical order.
ans_gp_name = [
    "amikacin",
    "amoxicillin/clavulanic acid",
    "cefalexin",
    "cefovecin",
    "clindamycin",
    "doxycycline",
    "enrofloxacin",
    "marbofloxacin",
    "nitrofurantoin",
    "trimethoprim/sulfamethoxazole",
    "vancomycin",
]
# Target-column names: every drug name prefixed with "ans_".
gp_ans_col_name = [f"ans_{drug}" for drug in ans_gp_name]

In [5]:
def getData(vitek_id: str, i : int):
    """Load the training CSV for one drug and split it into features and target.

    Parameters
    ----------
    vitek_id : str
        "GN" selects ``gn_ans_col_name``; any other value selects
        ``gp_ans_col_name``. The value is also interpolated into the CSV path.
    i : int
        Position of the target column inside the selected list.

    Returns
    -------
    tuple
        ``(X_train, y_train)`` where ``X_train`` holds the "species",
        "submitted_sample_category" and "bacteria_genus" columns followed by
        every column whose name starts with "S/I/R", and ``y_train`` is the
        selected target column.
    """
    target_columns = gn_ans_col_name if vitek_id == "GN" else gp_ans_col_name
    target = target_columns[i]

    # File names use the drug name without the "ans_" prefix and with "/" -> "_".
    drug_slug = target.replace("ans_", "").replace("/", "_")
    csv_path = f"./Dataset/{vitek_id}/Train/Train_{vitek_id}_{drug_slug}.csv"
    train_frame = pd.read_csv(csv_path)

    sir_mask = train_frame.columns.str.startswith("S/I/R")
    feature_columns = ["species", "submitted_sample_category", "bacteria_genus"]
    feature_columns += list(train_frame.columns[sir_mask])

    return train_frame[feature_columns], train_frame[target]

In [6]:
# Make sure the output folders for the fitted grid searches exist.
# Each sub-folder is created independently with exist_ok=True, so a run that
# finds "./Grid_Search_CV" already present but missing "GP" or "GN" still ends
# up with both folders (makedirs also creates the parent when needed).
os.makedirs("./Grid_Search_CV/GP", exist_ok=True)
os.makedirs("./Grid_Search_CV/GN", exist_ok=True)

In [7]:
# Candidate XGBoost hyper-parameter values explored by the grid search:
# 2 * 3 * 3 * 3 * 3 * 2 = 324 combinations.
params_xgb = dict(
    n_estimators=[100, 1000],
    gamma=[0.5, 1, 2],
    max_depth=[5, 6, 7],
    subsample=[0.5, 0.7, 1],
    colsample_bytree=[0.5, 0.7, 1],
    learning_rate=[0.3, 0.1],
)

def grid_search_export(path: str, smote: SMOTERounding, X: pd.DataFrame, y: pd.Series):
    """Grid-search XGBoost behind an over-sampler and save the fitted search.

    A two-step imblearn pipeline (sampler "sam" -> classifier "clf") is tuned
    over every combination in ``params_xgb`` with 10-fold stratified,
    shuffled cross-validation (``random_state=0``), scored with ``'f1'``.
    The whole fitted ``GridSearchCV`` object is written to ``path`` with
    ``joblib.dump``.

    Parameters
    ----------
    path : str
        Destination file for the dumped ``GridSearchCV`` object.
    smote : SMOTERounding
        Sampler used as the first pipeline step.
    X : pd.DataFrame
        Feature matrix passed to ``GridSearchCV.fit``.
    y : pd.Series
        Target labels passed to ``GridSearchCV.fit``.

    Returns
    -------
    GridSearchCV
        The fitted search (the same object that was dumped), so callers can
        inspect ``best_params_`` / ``best_score_`` without reloading the file.
    """
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
    # NOTE(review): no eval_set is passed to fit() below -- confirm whether
    # eval_metric has any effect here. gpu_id=1 pins training to the second
    # GPU; adjust for machines with a different device layout.
    xgb = XGBClassifier(eval_metric=f1_score, verbosity=0,
                        random_state=0, tree_method='gpu_hist', gpu_id=1)
    # Pipeline documents `steps` as a list of (name, estimator) tuples.
    pip = Pipeline([
        ("sam", smote),
        ("clf", xgb),
    ])
    # Prefix each key with the step name so it is routed to the classifier.
    param_grid = {f"clf__{name}": values for name, values in params_xgb.items()}
    gs = GridSearchCV(pip, param_grid, scoring='f1', cv=skf, n_jobs=5)
    gs.fit(X, y)
    joblib.dump(gs, path)
    return gs

#### ปรับจูนพารามิเตอร์ของโมเดล GN (แต่ละโมเดลรันนานมาก แนะนำว่าควรแบ่งรันทีละ 1-2 โมเดล)

In [8]:
#COPY รัน 2,8
# vitek_id = "GN"
# ans_col_name = gn_ans_col_name if vitek_id == "GN" else gp_ans_col_name
# smote = GN_SMOTE if vitek_id == "GN" else GP_SMOTE
# for i in range(11):
#     X_train, y_train = getData(vitek_id, i)
#     ans_name = ans_col_name[i].replace("ans_", "").replace("/", "_")
#     grid_search_export(f"./Grid_Search_CV/{vitek_id}/{ans_name}.joblib",
#                        SMOTERounding(smote[i]), pd.get_dummies(X_train), y_train)

In [9]:
# tqdm.tqdm_notebook is a deprecated alias; tqdm.notebook.tqdm is the supported name.
from tqdm.notebook import tqdm

# Tune and export one model per GN target column.
vitek_id = "GN"
ans_col_name = gn_ans_col_name if vitek_id == "GN" else gp_ans_col_name
smote = GN_SMOTE if vitek_id == "GN" else GP_SMOTE
# Samplers are matched to target columns by position, so the lists must line up.
assert len(smote) == len(ans_col_name), "one sampler is required per target column"
for i in tqdm(range(len(ans_col_name)), desc='tqdm'):
    X_train, y_train = getData(vitek_id, i)
    # Output file name: drug name without the "ans_" prefix, "/" replaced by "_".
    ans_name = ans_col_name[i].replace("ans_", "").replace("/", "_")
    grid_search_export(f"./Grid_Search_CV/{vitek_id}/{ans_name}.joblib",
                       SMOTERounding(smote[i]), pd.get_dummies(X_train), y_train)

tqdm:   0%|          | 0/11 [00:00<?, ?it/s]

#### ปรับจูนพารามิเตอร์ของโมเดล GP

In [10]:
# tqdm.tnrange is a deprecated alias; tqdm.notebook.tnrange is the supported name.
from tqdm.notebook import tnrange

# Tune and export one model per GP target column.
vitek_id = "GP"
ans_col_name = gn_ans_col_name if vitek_id == "GN" else gp_ans_col_name
smote = GN_SMOTE if vitek_id == "GN" else GP_SMOTE
# Samplers are matched to target columns by position, so the lists must line up.
assert len(smote) == len(ans_col_name), "one sampler is required per target column"
for i in tnrange(len(ans_col_name), desc='tnrange'):
    X_train, y_train = getData(vitek_id, i)
    # Output file name: drug name without the "ans_" prefix, "/" replaced by "_".
    ans_name = ans_col_name[i].replace("ans_", "").replace("/", "_")
    grid_search_export(f"./Grid_Search_CV/{vitek_id}/{ans_name}.joblib",
                       SMOTERounding(smote[i]), pd.get_dummies(X_train), y_train)

tnrange:   0%|          | 0/11 [00:00<?, ?it/s]