In [1]:
from function.validation import *
from function.rsmote import *
from function.SMOTERounding import *

import os
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GridSearchCV, RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer
from sklearn.decomposition import PCA
from xgboost import XGBClassifier, plot_importance
from imblearn.pipeline import Pipeline

from imblearn.over_sampling import SMOTE, SMOTEN, SMOTENC, SVMSMOTE, KMeansSMOTE, RandomOverSampler,BorderlineSMOTE, ADASYN
from imblearn.under_sampling import TomekLinks, EditedNearestNeighbours, NearMiss, RepeatedEditedNearestNeighbours
from imblearn.combine import SMOTEENN, SMOTETomek 
from imblearn.metrics import geometric_mean_score

from IPython.display import Markdown, display
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [2]:
# GN panel: antibiotic names, listed in the order used for the answer columns.
ans_gn_name = [
    'amikacin',
    'amoxicillin/clavulanic acid',
    'cefalexin',
    'cefovecin',
    'doxycycline',
    'enrofloxacin',
    'gentamicin',
    'imipenem',
    'marbofloxacin',
    'nitrofurantoin',
    'trimethoprim/sulfamethoxazole',
]
# Answer (label) column for each GN antibiotic: "ans_" + antibiotic name.
gn_ans_col_name = [f"ans_{drug}" for drug in ans_gn_name]

# GP panel: antibiotic names, listed in the order used for the answer columns.
ans_gp_name = [
    'amikacin',
    'amoxicillin/clavulanic acid',
    'cefalexin',
    'cefovecin',
    'clindamycin',
    'doxycycline',
    'enrofloxacin',
    'marbofloxacin',
    'nitrofurantoin',
    'trimethoprim/sulfamethoxazole',
    'vancomycin',
]
# Answer (label) column for each GP antibiotic: "ans_" + antibiotic name.
gp_ans_col_name = [f"ans_{drug}" for drug in ans_gp_name]

กำหนด SMOTE Algorithm ตามที่เลือกไว้จากการเปรียบเทียบในไฟล์ 2_cross_validation_compare_smote.ipynb

In [3]:
# One over-sampler instance per GN antibiotic. Entries are positional: the
# trailing comment on each line names the antibiotic, in the same order as
# ans_gn_name. Every sampler is created with random_state=0.
# NOTE(review): in the code visible in this notebook these lists are only bound
# to a variable called `smote` that is never read afterwards — confirm whether
# they are still needed here.
GN_SMOTE = [
    RSmoteKClasses(random_state=0), #amikacin
    SVMSMOTE(random_state=0, n_jobs=-1), #amoxicillin/clavulanic acid
    RSmoteKClasses(random_state=0), #cefalexin
    ADASYN(random_state=0, n_jobs=-1), #cefovecin
    BorderlineSMOTE(random_state=0, n_jobs=-1), #doxycycline
    RSmoteKClasses(random_state=0), #enrofloxacin
    RSmoteKClasses(random_state=0), #gentamicin
    SVMSMOTE(random_state=0 ,n_jobs=-1), #imipenem
    RSmoteKClasses(random_state=0), #marbofloxacin
    SVMSMOTE(random_state=0, n_jobs=-1), #nitrofurantoin
    SVMSMOTE(random_state=0, n_jobs=-1), #trimethoprim/sulfamethoxazole
]

# One over-sampler instance per GP antibiotic, in the same order as ans_gp_name.
GP_SMOTE = [
    BorderlineSMOTE(random_state=0, n_jobs=-1), #amikacin
    BorderlineSMOTE(random_state=0, n_jobs=-1), #amoxicillin/clavulanic acid
    SMOTE(random_state=0, n_jobs=-1), #cefalexin
    SVMSMOTE(random_state=0, n_jobs=-1), #cefovecin
    SVMSMOTE(random_state=0, n_jobs=-1), #clindamycin
    RSmoteKClasses(random_state=0), #doxycycline
    RSmoteKClasses(random_state=0), #enrofloxacin
    BorderlineSMOTE(random_state=0, n_jobs=-1), #marbofloxacin
    RSmoteKClasses(random_state=0), #nitrofurantoin
    SVMSMOTE(random_state=0, n_jobs=-1), #trimethoprim/sulfamethoxazole
    SMOTE(random_state=0, n_jobs=-1) #vancomycin
]

In [4]:
from tqdm import tqdm_notebook as tqdm
# from time import sleep
# for _ in tqdm(range(10), desc='tqdm'):
#     sleep(0.5)
def evaluate_by_case(ans_df: pd.DataFrame, predict_df: pd.DataFrame):
    """Score predictions one case (row) at a time and average over all cases.

    Rows of ``ans_df`` and ``predict_df`` are paired by position, not by index
    label. For every case, accuracy / precision / recall / F1 are computed
    across that row's columns and the per-case values are averaged.

    Scoring rules per case:
      * Truth row has no positive label ("no recommendation" case): precision,
        recall and F1 are all 1 when the prediction is also all-negative,
        otherwise all 0. Accuracy is the ordinary fraction of matching columns.
      * Truth row has at least one positive: ordinary binary metrics. A
        prediction with no positives contributes 0 to precision, recall and F1.
    A case is counted as "all correct" when every column matches; that count is
    printed as ``all correct : N``.

    Parameters
    ----------
    ans_df : pd.DataFrame
        Ground-truth labels, one row per case and one column per label.
    predict_df : pd.DataFrame
        Predicted labels with the same number of rows and the same column order.

    Returns
    -------
    dict
        Keys ``"accuracy"``, ``"precision"``, ``"recall"``, ``"f1"``, each the
        mean of the per-case value.

    Raises
    ------
    ValueError
        If there are no cases to score, or the two frames have a different
        number of rows (previously: ZeroDivisionError, or rows silently
        ignored / IndexError part-way through).
    """
    n_cases = len(ans_df)
    if n_cases == 0:
        raise ValueError("evaluate_by_case needs at least one case to score")
    if len(predict_df) != n_cases:
        raise ValueError(
            "ans_df and predict_df must have the same number of rows "
            f"(got {n_cases} and {len(predict_df)})"
        )

    # Convert once instead of calling .iloc[i] repeatedly; rows are matched by
    # position exactly as the .iloc-based lookup did.
    truth_rows = ans_df.to_numpy()
    pred_rows = predict_df.to_numpy()

    acc_sum = prec_sum = rec_sum = f1_sum = 0
    count_all_correct = 0
    for truth, pred in zip(truth_rows, pred_rows):
        case_acc = accuracy_score(truth, pred)  # computed once per case
        acc_sum += case_acc

        if not truth.any():  # No recommend case
            score = 0 if pred.any() else 1
            prec_sum += score
            rec_sum += score
            f1_sum += score
            count_all_correct += score
        else:  # Normal case
            if pred.any():
                prec_sum += precision_score(truth, pred)
                rec_sum += recall_score(truth, pred)
                f1_sum += f1_score(truth, pred)
            # else: nothing was predicted, so there are no true positives and
            # precision, recall and F1 are all 0 for this case — nothing to
            # add, and the ill-defined sklearn calls are skipped.
            if case_acc == 1:
                count_all_correct += 1

    print("all correct :", count_all_correct)
    return {
        "accuracy": acc_sum/n_cases,
        "precision": prec_sum/n_cases,
        "recall": rec_sum/n_cases,
        "f1": f1_sum/n_cases
    }

In [5]:
# For each panel: read the Excel workbook, then pass "submitted_sample_category"
# through binning_less_than with threshold 5 and replacement label "xxrare"
# (per the original note, this merges rare categories into a single "xxrare" group).
# NOTE(review): the original note said "fewer than 10" but the value passed is 5 —
# confirm which threshold is intended.
df_gn = binning_less_than(
    pd.read_excel("./Dataset/Origin/trained_GN_Dataset_CS.xlsx"),
    "submitted_sample_category",
    5,
    "xxrare",
)
df_gp = binning_less_than(
    pd.read_excel("./Dataset/Origin/trained_GP_Dataset_CS.xlsx"),
    "submitted_sample_category",
    5,
    "xxrare",
)

# Keep only the rows marked type == "test" for the test-by-case evaluation.
df_gn_test = df_gn.loc[df_gn["type"] == "test"]
df_gp_test = df_gp.loc[df_gp["type"] == "test"]

In [6]:
# Inspect the current GN test frame (the rendered table elides the middle rows and columns).
display(df_gn_test)

Unnamed: 0,hn,date_of_submission,species,submitted_sample,submitted_sample_category,vitek_id,bacteria_genus,report_issued_date,type,ans_amikacin,...,S/I/R_neomycin,S/I/R_nitrofurantoin,S/I/R_piperacillin,S/I/R_polymyxin b,S/I/R_pradofloxacin,S/I/R_rifampicin,S/I/R_tetracycline,S/I/R_tobramycin,S/I/R_trimethoprim/sulfamethoxazole,id
3130,5910814,2021-01-07,dog,ub mucosa,urinary tract,GN,escherichia,2021-01-12,test,False,...,S,S,,,I,,R,,R,3131
3131,6400789,2021-01-20,dog,lung,xxrare,GN,escherichia,2021-01-25,test,True,...,S,R,,,R,,R,,R,3132
3132,6315628,2021-01-06,dog,prostate,urogenital system,GN,pseudomonas,2021-01-08,test,False,...,,,,,,,,,,3133
3133,6400547,2021-01-14,dog,urine,urinary tract,GN,escherichia,2021-01-20,test,True,...,S,S,,,I,,R,,S,3134
3134,6007003,2021-01-12,dog,urine,urinary tract,GN,klebsiella,2021-01-19,test,False,...,S,I,,,S,,S,,S,3135
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3421,6003184,2019-12-21,cat,urine,urinary tract,GN,enterobacter,2019-12-26,test,False,...,S,S,,,I,,S,,R,3422
3422,6216107,2019-12-17,cat,ear swab,ear,GN,burkholderia,2019-12-13,test,False,...,,,,,,,R,,S,3423
3423,6215839,2019-12-07,cat,urine,urinary tract,GN,pseudomonas,2019-12-13,test,True,...,,,,,,,,,,3424
3424,6216056,2019-12-12,cat,ub mucosa,urinary tract,GN,escherichia,2019-12-13,test,False,...,S,S,,,S,,S,,S,3425


In [7]:
from tqdm import tnrange
def test_by_case(vitek_id: str):
    df_test = df_gn_test if vitek_id == "GN" else df_gp_test
    ans_col_name = gn_ans_col_name if vitek_id == "GN" else gp_ans_col_name
    smote = GN_SMOTE if vitek_id == "GN" else GP_SMOTE
    
    with open(f"./Model/{vitek_id}/{vitek_id}_schema.txt",'r') as schema_file :
        schema = eval(schema_file.read())

    X_test = df_test[["species","submitted_sample_category","bacteria_genus"] + list(df_test.columns[df_test.columns.str.startswith("S/I/R")])]
    Y_test = df_test[ans_col_name]
    df_predict = pd.DataFrame()
    for i in range(11):
        anti_name = ans_col_name[i].replace("ans_", "").replace("/", "_")
        model = joblib.load(f"./Model/{vitek_id}/{anti_name}.joblib")
        df_schema = pd.DataFrame(columns=schema[anti_name])  # create schema
        X_dummies = get_dummies_dataframe_columns(df_schema, pd.get_dummies(X_test))  # one-hot
        df_predict[ans_col_name[i]] = model.predict(X_dummies) # predict
       

    return evaluate_by_case(Y_test, df_predict)

In [8]:
# Score the GP models on df_gp_test (the rows with type == "test" when cells are run in order).
test_by_case("GP")

all correct : 147


{'accuracy': 0.932941583131696,
 'precision': 0.7861216730038024,
 'recall': 0.826299112801014,
 'f1': 0.7892521082635152}

In [9]:
# Score the GN models on df_gn_test (the rows with type == "test" when cells are run in order).
test_by_case("GN")

all correct : 172


{'accuracy': 0.9367321867321861,
 'precision': 0.8190315315315314,
 'recall': 0.8859234234234231,
 'f1': 0.8333386958386962}

In [14]:
# --- 100-case test set ---
# Read the GN and GP workbooks from the ./100case folder.
df_gn_togo = pd.read_excel("./100case/GN_togo.xlsx")
df_gp_togo = pd.read_excel("./100case/GP_togo.xlsx")

# Use the 100-case frames as the test-by-case data. This rebinds the
# df_gn_test / df_gp_test globals, so everything that reads them after this
# cell (including test_by_case) works on the 100-case data instead of the
# type == "test" rows selected earlier.
df_gn_test, df_gp_test = df_gn_togo, df_gp_togo


In [15]:
# Manual per-case run for one panel: set vitek_id to "GN" or "GP".
# Unlike test_by_case, this cell also displays the predictions and saves them to CSV.
vitek_id = "GP"
if vitek_id not in ("GN", "GP"):
    # Any other value used to fall through silently to the GP data.
    raise ValueError(f'vitek_id must be "GN" or "GP", got {vitek_id!r}')
df_test = df_gn_test if vitek_id == "GN" else df_gp_test
ans_col_name = gn_ans_col_name if vitek_id == "GN" else gp_ans_col_name

# SECURITY NOTE(review): eval() executes whatever the schema file contains.
# If the file is a plain Python literal, ast.literal_eval would be safer —
# confirm the file format.
with open(f"./Model/{vitek_id}/{vitek_id}_schema.txt", 'r') as schema_file:
    schema = eval(schema_file.read())

# Features: three categorical columns plus every "S/I/R..." column.
X_test = df_test[["species", "submitted_sample_category", "bacteria_genus"]
                 + list(df_test.columns[df_test.columns.str.startswith("S/I/R")])]
Y_test = df_test[ans_col_name]

df_predict = pd.DataFrame()
for ans_col in ans_col_name:  # one saved model per answer column
    # Model files are named after the antibiotic with "/" replaced by "_".
    anti_name = ans_col.replace("ans_", "").replace("/", "_")
    model = joblib.load(f"./Model/{vitek_id}/{anti_name}.joblib")
    df_schema = pd.DataFrame(columns=schema[anti_name])  # create schema
    X_dummies = get_dummies_dataframe_columns(df_schema, pd.get_dummies(X_test))  # one-hot
    df_predict[ans_col] = model.predict(X_dummies)  # predict
display(df_predict)

# Save the predictions into the 100case folder. The file name carries the
# selected panel: it used to be hard-coded to "GN", so a GP run was written to
# (and overwrote) predict_GN_100case.csv.
df_predict.to_csv(f'./100case/predict_{vitek_id}_100case.csv', index=False)

# Per-case scoring is delegated to evaluate_by_case (which also prints the
# "all correct" count) instead of repeating its loop inline; the printed
# layout below is the same as before.
metrics = evaluate_by_case(Y_test, df_predict)
print(
    "accuracy", metrics["accuracy"], "\n"
    "precision", metrics["precision"], "\n"
    "recall", metrics["recall"], "\n"
    "f1", metrics["f1"]
)

Unnamed: 0,ans_amikacin,ans_amoxicillin/clavulanic acid,ans_cefalexin,ans_cefovecin,ans_clindamycin,ans_doxycycline,ans_enrofloxacin,ans_marbofloxacin,ans_nitrofurantoin,ans_trimethoprim/sulfamethoxazole,ans_vancomycin
0,0,1,0,0,0,0,1,1,1,0,0
1,0,1,1,1,1,1,0,0,0,0,0
2,0,1,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,1,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,0
5,0,1,0,1,1,0,0,0,0,1,0
6,0,1,0,1,1,1,0,0,0,1,0
7,0,1,0,1,1,0,0,0,0,1,0
8,0,1,1,1,1,1,0,0,0,1,0
9,0,0,0,0,0,0,0,0,0,0,0


all correct : 13
accuracy 0.8363636363636362 
precision 0.667202380952381 
recall 0.8320833333333335 
f1 0.7251948051948054


In [16]:
# Re-run the GN evaluation: when cells are run in order, df_gn_test now refers to the 100-case frame loaded above.
test_by_case("GN")

all correct : 17


{'accuracy': 0.9012539184952976,
 'precision': 0.760919540229885,
 'recall': 0.8031609195402301,
 'f1': 0.7562944718117129}

In [17]:
# Re-run the GP evaluation: when cells are run in order, df_gp_test now refers to the 100-case frame loaded above.
test_by_case("GP")

all correct : 13


{'accuracy': 0.8363636363636362,
 'precision': 0.667202380952381,
 'recall': 0.8320833333333335,
 'f1': 0.7251948051948054}