In [32]:
# Colab-only setup: mount Google Drive at /content/drive so the input files
# under /content/drive/MyDrive (see base_dir) can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [33]:
from datetime import datetime
import pandas as pd
import numpy as np
import xgboost as xgb
from tqdm import tqdm
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours, TomekLinks
from imblearn.combine import SMOTEENN, SMOTETomek
from sklearn.model_selection import StratifiedKFold, KFold
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [43]:
# ---------------------------------------------------------------------------
# Experiment configuration
# ---------------------------------------------------------------------------

# Root input folder. main() reads, for every entry of dataFolders,
#   <base_dir>/<folder>/train/{X,y}_train_pre.csv and
#   <base_dir>/<folder>/test/{X,y}_test_pre.csv
base_dir = '/content/drive/MyDrive/研究所/雅涵實驗室/給學弟妹/嘉基計畫 For 致鈞&芳瑗/input/1213/'
# NOTE(review): result_dir is not referenced by any cell visible here.
result_dir = '/content/'

# One seed shared by every stochastic component (samplers and estimators) so
# that a Restart & Run All reproduces the same numbers.
RANDOM_STATE = 42

# Feature-set folders to evaluate; each name is a sub-folder of base_dir.
dataFolders = [
    "struct",
    "unigram(triage_cc)",
    "unigram(PI)",
    "bigram(PI)",
    "CUI(PI)",
    "struct + unigram(triage_cc)",
    "struct + unigram(PI)",
    "struct + bigram(PI)",
    "struct + CUI(PI)",
    "struct + unigram(triage_cc) + unigram(PI)",
    "struct + unigram(triage_cc) + bigram(PI)",
    "struct + unigram(triage_cc) + CUI(PI)"
]

# Class-rebalancing strategies, keyed by the label stored in the result tables.
sampling_methods = {
    'SMOTETomek': SMOTETomek(random_state=RANDOM_STATE),
    'SMOTE': SMOTE(random_state=RANDOM_STATE),
    'RUS': RandomUnderSampler(random_state=RANDOM_STATE),
    'ROS': RandomOverSampler(random_state=RANDOM_STATE),
}

# Classifiers with their GridSearchCV candidate hyper-parameters.
# 'model' is the base estimator, 'params' the grid handed to GridSearchCV.
classifiers = {
    'XGB': {
        'model': XGBClassifier(random_state=RANDOM_STATE),
        'params': {
            'learning_rate': [0.01, 0.05, 0.1],
            'n_estimators': [100, 200, 300],
            'max_depth': [3, 5, 7],
            'subsample': [0.5, 0.7, 1],
            'colsample_bytree': [0.7, 0.8, 0.9],
        },
    },
    'RF': {
        'model': RandomForestClassifier(random_state=RANDOM_STATE),
        'params': {
            'n_estimators': [100, 200, 300],
            'max_depth': [5, 10, 15],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }
    },
    'DecisionTree': {
        'model': DecisionTreeClassifier(random_state=RANDOM_STATE),
        'params': {
            'max_depth': [3, 5, 10, 15],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }
    },
    'LogisticRegression': {
        'model': LogisticRegression(random_state=RANDOM_STATE),
        'params': {
            'C': [0.01, 0.1, 1, 10],
            'penalty': ['l1', 'l2'],
            # liblinear is listed because it supports both l1 and l2 penalties.
            'solver': ['liblinear']
        }
    },
}

In [44]:
def compute_metrics(y_true, y_pred):
    """Compute sensitivity, specificity and confusion-matrix counts for binary labels.

    Labels are expected to be encoded as 0 (negative) and 1 (positive).

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        Tuple ``(sensitivity, specificity, tn, fp, fn, tp)``. ``sensitivity``
        (resp. ``specificity``) is NaN when ``y_true`` contains no positive
        (resp. negative) sample, because the ratio is undefined in that case.
    """
    # Pin the label order so the matrix is always 2x2, even when only one
    # class is present in this slice of data; otherwise the 4-way unpack of
    # ravel() would raise.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    positives = tp + fn
    negatives = tn + fp
    # Guard the empty-class case explicitly instead of dividing by zero.
    sensitivity = tp / positives if positives else float('nan')
    specificity = tn / negatives if negatives else float('nan')
    return sensitivity, specificity, tn, fp, fn, tp

def get_results_on_test(clf, X_test, y_test, sampling_method):
    """Evaluate a fitted classifier on the hold-out test set.

    Args:
        clf: Fitted classifier exposing ``predict_proba``.
        X_test: Test features.
        y_test: Test labels.
        sampling_method: Unused here; kept so existing call sites stay valid.

    Returns:
        Tuple ``(test_auc, test_sensitivity, test_specificity, tn, fp, fn, tp,
        y_test, y_pred_proba)``. If evaluation fails, the error is printed and
        a tuple of nine ``None`` values is returned instead.
    """
    try:
        # Probability of the positive class (second column of predict_proba).
        y_pred_proba = clf.predict_proba(X_test)[:, 1]
        # Threshold at 0.5 to obtain hard predictions.
        y_pred = (y_pred_proba > 0.5).astype(int)

        # compute_metrics returns six values; unpacking only two (as before)
        # always raised ValueError, which the except below silently turned
        # into an all-None result.
        test_sensitivity, test_specificity, tn, fp, fn, tp = compute_metrics(y_test, y_pred)
        test_auc = roc_auc_score(y_test, y_pred_proba)

        return test_auc, test_sensitivity, test_specificity, tn, fp, fn, tp, y_test, y_pred_proba

    except Exception as e:
        print(f"Error in get_results_on_test: {str(e)}")
        return (None,) * 9

In [45]:
def main():
    """Run every (feature set, classifier, sampler) combination and collect metrics.

    For each folder in ``dataFolders`` the pre-split train/test CSVs are
    loaded. For each classifier in ``classifiers`` and each sampler in
    ``sampling_methods``:

    1. the training set is resampled,
    2. hyper-parameters are tuned with a 5-fold ``GridSearchCV`` (ROC AUC),
    3. the tuned configuration is scored with stratified 10-fold CV on the
       resampled training data,
    4. the tuned model, fitted on the full resampled training data, is
       scored on the untouched test set.

    Returns:
        Tuple ``(train_cv_results_list, test_results_list)``, two lists of
        dicts with one entry per combination. If an exception occurs, it is
        printed and the results collected so far are returned, so the caller
        can always unpack two lists.
    """
    # Function-scope import: `clone` is not among the notebook's top-level imports.
    from sklearn.base import clone

    # Created outside the try block so partial results survive a failure.
    train_cv_results_list = []
    test_results_list = []

    try:
        for dataFolder in tqdm(dataFolders, desc="處理數據文件夾"):
            X_train = pd.read_csv(base_dir + dataFolder + '/train/X_train_pre.csv')
            X_test = pd.read_csv(base_dir + dataFolder + '/test/X_test_pre.csv')
            y_train = pd.read_csv(base_dir + dataFolder + '/train/y_train_pre.csv').squeeze()
            y_test = pd.read_csv(base_dir + dataFolder + '/test/y_test_pre.csv').squeeze()

            for name, classifier in tqdm(classifiers.items(), desc="進行網格搜索", leave=False):
                for sampling_method, sampler in sampling_methods.items():
                    # NOTE(review): resampling happens before both cross-validations,
                    # so validation folds can contain resampled points derived from
                    # training rows; CV scores are likely optimistic. Moving the
                    # sampler inside the CV loop would change the experiment design,
                    # so it is only flagged here.
                    X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)

                    grid_search = GridSearchCV(classifier['model'], classifier['params'], cv=5, scoring='roc_auc')
                    grid_search.fit(X_resampled, y_resampled)
                    # With the default refit=True, best_estimator_ is already fitted
                    # on the full resampled training set; no extra fit is needed.
                    best_model = grid_search.best_estimator_

                    # 10-fold CV of the tuned configuration.
                    skf = StratifiedKFold(n_splits=10)
                    cv_results = {'sensitivity': [], 'specificity': [], 'auc': [], 'tn': 0, 'fp': 0, 'fn': 0, 'tp': 0}
                    for train_index, val_index in skf.split(X_resampled, y_resampled):
                        X_train_cv, X_val_cv = X_resampled.iloc[train_index], X_resampled.iloc[val_index]
                        y_train_cv, y_val_cv = y_resampled.iloc[train_index], y_resampled.iloc[val_index]

                        # Fit a fresh copy per fold. Re-fitting best_model in place
                        # would leave it trained on the last fold's subset only, and
                        # the test evaluation below would then use that weaker model.
                        fold_model = clone(best_model)
                        fold_model.fit(X_train_cv, y_train_cv)
                        y_pred = fold_model.predict(X_val_cv)
                        y_pred_proba = fold_model.predict_proba(X_val_cv)[:, 1]

                        sens, spec, tn, fp, fn, tp = compute_metrics(y_val_cv, y_pred)
                        cv_results['sensitivity'].append(sens)
                        cv_results['specificity'].append(spec)
                        cv_results['auc'].append(roc_auc_score(y_val_cv, y_pred_proba))
                        cv_results['tn'] += tn
                        cv_results['fp'] += fp
                        cv_results['fn'] += fn
                        cv_results['tp'] += tp

                    # Average the per-fold scores; confusion counts are summed over folds.
                    train_cv_results_list.append({
                        'DataFolder': dataFolder,
                        'Classifier': name,
                        'SamplingMethod': sampling_method,
                        'Best Params': grid_search.best_params_,
                        'AUC': np.mean(cv_results['auc']),
                        'Sensitivity': np.mean(cv_results['sensitivity']),
                        'Specificity': np.mean(cv_results['specificity']),
                        'TN': cv_results['tn'],
                        'FP': cv_results['fp'],
                        'FN': cv_results['fn'],
                        'TP': cv_results['tp'],
                    })

                    # Hold-out evaluation with the model fitted on all resampled training data.
                    y_pred_proba = best_model.predict_proba(X_test)[:, 1]
                    y_pred = (y_pred_proba > 0.5).astype(int)
                    test_sensitivity, test_specificity, tn, fp, fn, tp = compute_metrics(y_test, y_pred)
                    test_auc = roc_auc_score(y_test, y_pred_proba)

                    test_results_list.append({
                        'DataFolder': dataFolder,
                        'Classifier': name,
                        'SamplingMethod': sampling_method,
                        'Best Params': grid_search.best_params_,
                        'Test AUC': test_auc,
                        'Test Sensitivity': test_sensitivity,
                        'Test Specificity': test_specificity,
                        'TN': tn,
                        'FP': fp,
                        'FN': fn,
                        'TP': tp,
                    })

    except Exception as e:
        # Report the failure but fall through to the return below, so the
        # combinations that already finished are not lost and the caller's
        # two-value unpack still works.
        print(f"發生錯誤: {e}")

    return train_cv_results_list, test_results_list

In [None]:
# Run the full experiment grid. On success main() returns the CV results list
# followed by the hold-out test results list.
train_result, test_results = main()

In [None]:
# Cross-validation results, one row per (DataFolder, Classifier, SamplingMethod).
pd.DataFrame(train_result)

In [None]:
# Hold-out test results, one row per (DataFolder, Classifier, SamplingMethod).
pd.DataFrame(test_results)