#### Combination Test
#### Steps
1. Execute the prediction combination function.
2. Input the final selected gene file.
3. Create training/testing set labels.
4. Select one gene from each group for combination.
5. Create training/testing datasets.
6. Adjust model parameters.
7. Perform model prediction.
8. Output and save the final prediction results.
#### Important notes
* The input file must include `ID` and `cluster`.
* Ensure that all selected genes are present in each dataset (the notebook filters each dataset down to the selected features so that you can verify they are all present).
* Adjust parameters manually until no overfitting occurs (you can remove unsuitable models).
* Other than the file path, `cluster_num` and `param_grids`, no further changes are necessary.
* The final combination test results should not be used for further filtering or selection.

In [2]:
import os
import pandas as pd

from itertools import combinations
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

from sklearn.ensemble import RandomForestClassifier

from sklearn.svm import SVC

from sklearn.linear_model import LogisticRegression

from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import (
    confusion_matrix,
    precision_score,
    accuracy_score,
    matthews_corrcoef,
    f1_score,
)

In [3]:
def is_unique(lst):
    """Return True when no value occurs more than once in ``lst``.

    ``lst`` must be a sized collection of hashable values (a list or a
    pandas Series, for example). An empty collection counts as unique.
    """
    total_count = len(lst)
    distinct_count = len(set(lst))
    return distinct_count == total_count


def prepare_data(data, combination, label, cluster_num):
    """Build a sample-by-feature matrix for the selected features.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose first column holds the feature IDs and whose remaining
        columns hold one value per sample.
    combination : sequence
        Feature IDs to extract; entries ``0 .. cluster_num - 1`` are used.
    label : sequence
        Sample labels. Only its length is used: it decides how many samples
        (rows) are returned. Sample columns beyond ``len(label)`` are ignored.
    cluster_num : int
        Number of features to take from ``combination``.

    Returns
    -------
    list of list
        ``len(label)`` rows; row ``i`` holds the value of each selected
        feature for sample column ``i``, in ``combination`` order.

    Raises
    ------
    ValueError
        If a feature ID is not found in the first column of ``data``, or if
        ``data`` has fewer sample columns than there are labels.
    """
    id_column = data.columns[0]
    n_samples = len(label)

    cluster_values = []
    for i in range(cluster_num):
        feature_id = combination[i]
        rows = data.loc[data[id_column].isin([feature_id])]
        if rows.empty:
            # Fail here with a clear message instead of an IndexError later on.
            raise ValueError(
                f"Feature {feature_id!r} was not found in column {id_column!r}."
            )
        # Use only the first matching row so a duplicated ID cannot leak the
        # values of a second row into the sample list.
        gene_values = rows.iloc[:1, 1:].values.flatten().tolist()
        if len(gene_values) < n_samples:
            raise ValueError(
                f"Feature {feature_id!r} has {len(gene_values)} sample values "
                f"but {n_samples} labels were supplied."
            )
        cluster_values.append(gene_values)

    return [[values[i] for values in cluster_values] for i in range(n_samples)]

In [4]:
def predict_comb(
    train_data,
    train_label,
    test_data,
    test_label,
    cluster_num,
    param_grids,
    final_gene,
    cv=5,
    overfit_threshold=0.1,
):
    """Tune and evaluate each classifier on every one-feature-per-cluster combination.

    Every ``cluster_num``-sized combination of ``final_gene["ID"]`` whose
    members all belong to different clusters is evaluated. For each model a
    grid search is fitted on the training data, the refitted best estimator
    predicts the training and test data, and test metrics are recorded unless
    the train/test accuracy gap exceeds ``overfit_threshold``.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Passed to ``prepare_data``: first column holds feature IDs, remaining
        columns hold one value per sample.
    train_label, test_label : sequence
        Class labels for the sample columns. The metrics assume binary 0/1
        labels with 1 as the positive class.
    cluster_num : int
        Number of features per combination.
    param_grids : dict
        Maps model name to its ``GridSearchCV`` parameter grid. Models with no
        entry are skipped, so a model can be dropped by removing its grid.
    final_gene : pandas.DataFrame
        Must contain the columns ``ID`` and ``cluster``.
    cv : int, optional
        Number of cross-validation folds for the grid search (default 5).
    overfit_threshold : float, optional
        Largest accepted absolute gap between train and test accuracy
        (default 0.1); results with a larger gap are not recorded.

    Returns
    -------
    pandas.DataFrame
        One row per accepted (model, combination) pair with the columns
        ``Model``, ``ID1..ID<cluster_num>`` (ordered by cluster), ``accuracy``,
        ``sensitivity``, ``specificity``, ``precision``, ``f1_score``, ``mcc``,
        all metrics rounded to two decimals.
    """
    models = {
        "XGBoost": xgb.XGBClassifier(random_state=42),
        "RandomForest": RandomForestClassifier(random_state=42),
        "SVM": SVC(random_state=42),
        "Logistic Regression": LogisticRegression(random_state=42),
        "Decision Tree": DecisionTreeClassifier(random_state=42),
    }

    # Resolve each ID's cluster once; the first occurrence wins, which matches
    # the `.iloc[0]` lookup this replaces.
    cluster_of = {}
    for gene_id, cluster in zip(final_gene["ID"], final_gene["cluster"]):
        cluster_of.setdefault(gene_id, int(cluster))

    # Keep only combinations whose members come from different clusters, and
    # remember the cluster-ordered IDs used for reporting. This does not
    # depend on the model, so it is done once up front.
    valid_combinations = []
    for combination in combinations(final_gene["ID"], cluster_num):
        clusters = [cluster_of[gene_id] for gene_id in combination]
        if is_unique(clusters):
            sorted_combination = [x for _, x in sorted(zip(clusters, combination))]
            valid_combinations.append((combination, sorted_combination))

    result = []
    for model_name, base_model in models.items():
        if model_name not in param_grids:
            print(f"Skipping {model_name}: no entry in param_grids.")
            continue
        param_grid = param_grids[model_name]

        for combination, sorted_combination in valid_combinations:
            # Prepare the training / test matrices
            X_test = prepare_data(test_data, combination, test_label, cluster_num)
            X_train = prepare_data(train_data, combination, train_label, cluster_num)

            # Hyper-parameter tuning. Always start from the pristine base
            # estimator; the fitted best estimator gets its own name so it
            # never feeds into the next combination's search.
            grid_search = GridSearchCV(
                estimator=base_model, param_grid=param_grid, cv=cv, n_jobs=-1
            )
            grid_search.fit(X_train, train_label)
            print("Best Parameters:", grid_search.best_params_)
            print("Best Score:", grid_search.best_score_)
            best_model = grid_search.best_estimator_

            # Predict
            y_pred_train = best_model.predict(X_train)
            accuracy_train = accuracy_score(train_label, y_pred_train)
            print("Train accuracy: ", round(accuracy_train, 2))
            y_pred = best_model.predict(X_test)

            accuracy = accuracy_score(test_label, y_pred)
            print("Test accuracy: ", round(accuracy, 2))

            # Overfitting check
            if abs(accuracy_train - accuracy) > overfit_threshold:
                print(f"========={model_name} overfitting =========\n")
                continue

            # Record the results. Fixing `labels` keeps the matrix 2x2 even
            # when only one class shows up in the labels and predictions.
            tn, fp, fn, tp = confusion_matrix(
                test_label, y_pred, labels=[0, 1]
            ).ravel()
            sensitivity = tp / (tp + fn) if (tp + fn) else float("nan")
            specificity = tn / (tn + fp) if (tn + fp) else float("nan")
            precision = precision_score(test_label, y_pred)
            f1 = f1_score(test_label, y_pred)
            mcc = matthews_corrcoef(test_label, y_pred)

            result.append(
                [model_name]
                + sorted_combination
                + [
                    round(accuracy, 2),
                    round(sensitivity, 2),
                    round(specificity, 2),
                    round(precision, 2),
                    round(f1, 2),
                    round(mcc, 2),
                ]
            )

    id_columns = [f"ID{i+1}" for i in range(cluster_num)]
    result = pd.DataFrame(
        result,
        columns=["Model"]
        + id_columns
        + ["accuracy", "sensitivity", "specificity", "precision", "f1_score", "mcc"],
    )

    return result

#### 輸入最終挑選出的特徵資料

In [5]:
cluster_num = 4  # total number of clusters
# The table must contain the ID and cluster columns.
final_gene = pd.DataFrame(
    {
        "ID": ["cg03623835", "cg13995230", "cg08101264", "cg26097573"],
        "cluster": [1, 2, 3, 4],
    }
)
# Each listed feature is expected to come from a different cluster.
if not is_unique(final_gene["cluster"]):
    print("Need unique cluster. Please modify input data.")
final_gene

Unnamed: 0,ID,cluster
0,cg03623835,1
1,cg13995230,2
2,cg08101264,3
3,cg26097573,4


#### 組織測試

#### GSE199057 80% 訓練資料 (850K)

In [16]:
# Path to the GSE199057 training split (80%) of the normalized beta values.
beta_normalized_199057_train = "../result/GSE199057/train80/all_beta_normalized_train.csv"

# Load the full table; the "Unnamed: 0" column is matched against the feature IDs
# further down, and the remaining columns are treated as one column per sample.
data_199057_train = pd.read_csv(beta_normalized_199057_train)
data_199057_train

Unnamed: 0.1,Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,1.51,1.52,1.53,1.54,1.55,1.56,1.57,1.58,1.59,1.60
0,cg07881041,0.949957,0.952146,0.904001,0.954637,0.928165,0.932608,0.924301,0.923104,0.927477,...,0.749822,0.940172,0.805832,0.756693,0.945328,0.923933,0.860106,0.928845,0.960072,0.069006
1,cg03513874,0.937071,0.889363,0.843358,0.945049,0.913755,0.857649,0.919933,0.909622,0.838915,...,0.773122,0.759051,0.809406,0.947666,0.897360,0.591877,0.960830,0.820967,0.902224,0.062067
2,cg05451842,0.016502,0.006419,0.016312,0.022076,0.011215,0.015107,0.049540,0.006533,0.016118,...,0.015439,0.012183,0.028709,0.011260,0.029149,0.013135,0.006605,0.018643,0.013003,0.006081
3,cg14797042,0.962613,0.985027,0.964741,0.958186,0.967807,0.982290,0.965621,0.974083,0.958924,...,0.793072,0.965707,0.777952,0.882774,0.934462,0.324543,0.977903,0.966190,0.971721,0.956601
4,cg09838562,0.006557,0.007492,0.009818,0.002810,0.011285,0.003184,0.014299,0.005508,0.009941,...,0.005610,0.034703,0.028405,0.012297,0.011081,0.005421,0.008825,0.004700,0.011022,0.003067
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
711697,cg19812938,0.894460,0.878976,0.881958,0.896522,0.890799,0.872061,0.901984,0.908596,0.889975,...,0.910865,0.877215,0.886959,0.901002,0.894252,0.885177,0.909333,0.902665,0.908721,0.837224
711698,cg06272054,0.008978,0.004395,0.004080,0.004744,0.005768,0.008339,0.006556,0.005311,0.012396,...,0.007236,0.008636,0.005029,0.005122,0.009000,0.004814,0.004500,0.008904,0.003809,0.013886
711699,cg07255356,0.011516,0.009376,0.014099,0.016191,0.008926,0.013146,0.014820,0.003522,0.004258,...,0.009114,0.012956,0.016731,0.012227,0.010212,0.015228,0.003466,0.005697,0.001893,0.023454
711700,cg24220897,0.936086,0.921546,0.929182,0.897121,0.935147,0.924128,0.920541,0.937140,0.926174,...,0.942277,0.940429,0.927275,0.957908,0.930148,0.878596,0.825669,0.917967,0.944465,0.953320


In [17]:
# Keep only the rows of the selected features, then verify that every selected
# feature is actually present in this dataset.
data_199057_train = data_199057_train[data_199057_train["Unnamed: 0"].isin(final_gene["ID"])]
missing_199057_train = sorted(set(final_gene["ID"]) - set(data_199057_train["Unnamed: 0"]))
assert not missing_199057_train, (
    f"Selected features missing from the GSE199057 training data: {missing_199057_train}"
)
data_199057_train

Unnamed: 0.1,Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,1.51,1.52,1.53,1.54,1.55,1.56,1.57,1.58,1.59,1.60
123799,cg03623835,0.299345,0.210423,0.222154,0.210874,0.657466,0.347075,0.508225,0.388284,0.209645,...,0.295068,0.302622,0.181142,0.056862,0.19463,0.084341,0.051982,0.277137,0.663117,0.082687
432345,cg08101264,0.159984,0.066207,0.049443,0.060118,0.108439,0.08592,0.100863,0.074768,0.052619,...,0.024081,0.232909,0.03248,0.037434,0.275907,0.013082,0.388563,0.026935,0.081823,0.130442
543530,cg26097573,0.162422,0.105967,0.067082,0.077357,0.140334,0.101735,0.128838,0.056522,0.066291,...,0.037059,0.180647,0.093951,0.019172,0.033443,0.014354,0.006722,0.048161,0.169262,0.009473
579074,cg13995230,0.282255,0.143951,0.199722,0.17757,0.307255,0.111306,0.227454,0.087053,0.100732,...,0.168607,0.359197,0.193143,0.056402,0.176429,0.042079,0.054471,0.270213,0.457187,0.003946


In [18]:
# One label per column after the first: positions 0-121 get class 0,
# every later position gets class 1.
# NOTE(review): 122 is hard-coded and presumably equals the number of class-0
# samples in this training file - confirm whenever the input data changes.
y_train_199057 = [int(idx >= 122) for idx in range(data_199057_train.shape[1] - 1)]
len(y_train_199057)

183

#### GSE199057 20% 測試資料 (850K)

In [8]:
# Path to the GSE199057 test split (20%) of the normalized beta values.
beta_normalized_199057_test = "../result/GSE199057/test20/all_beta_normalized_test.csv"

# Load the full table; the "Unnamed: 0" column is matched against the feature IDs
# further down, and the remaining columns are treated as one column per sample.
data_199057_test = pd.read_csv(beta_normalized_199057_test)
data_199057_test

Unnamed: 0.1,Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,1.6,1.7,1.8,1.9,1.10,1.11,1.12,1.13,1.14,1.15
0,cg07881041,0.927472,0.941936,0.947982,0.928381,0.931247,0.925848,0.932257,0.950240,0.930940,...,0.944924,0.928416,0.947826,0.927517,0.977813,0.950989,0.958686,0.741392,0.944875,0.912059
1,cg03513874,0.859518,0.953629,0.905202,0.912974,0.935897,0.916896,0.905473,0.967117,0.894788,...,0.900727,0.926264,0.889790,0.536257,0.940197,0.921652,0.916094,0.741500,0.943346,0.954594
2,cg05451842,0.018657,0.016980,0.025264,0.018310,0.015852,0.018687,0.017270,0.031023,0.023883,...,0.013569,0.009514,0.019578,0.010586,0.013800,0.016905,0.028205,0.019087,0.013242,0.015173
3,cg14797042,0.935490,0.983900,0.975419,0.976533,0.931274,0.972876,0.967565,0.966997,0.961535,...,0.974522,0.974677,0.927847,0.967679,0.945040,0.659536,0.954493,0.774994,0.756900,0.645753
4,cg09838562,0.006943,0.005521,0.007438,0.004382,0.027526,0.001536,0.015347,0.016933,0.011488,...,0.008291,0.008548,0.020874,0.002554,0.010194,0.009511,0.018963,0.004257,0.005500,0.013169
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
711697,cg19812938,0.891069,0.901822,0.887696,0.904106,0.873141,0.894906,0.889711,0.910268,0.887800,...,0.899579,0.912525,0.893304,0.901636,0.909357,0.898196,0.931023,0.917458,0.896928,0.901329
711698,cg06272054,0.005869,0.001670,0.001734,0.009249,0.004774,0.001222,0.007410,0.006697,0.006768,...,0.006737,0.006133,0.002398,0.009910,0.014088,0.009391,0.018384,0.002608,0.003067,0.007895
711699,cg07255356,0.017197,0.004223,0.015026,0.013784,0.009908,0.015376,0.004641,0.016061,0.008395,...,0.003118,0.008475,0.014726,0.013802,0.024885,0.009163,0.019117,0.018980,0.010364,0.017652
711700,cg24220897,0.933207,0.936893,0.906153,0.934489,0.939342,0.941347,0.910692,0.924511,0.943070,...,0.950181,0.964364,0.934486,0.959507,0.936499,0.946392,0.939365,0.952978,0.954326,0.828659


In [9]:
# Keep only the rows of the selected features, then verify that every selected
# feature is actually present in this dataset.
data_199057_test = data_199057_test[data_199057_test["Unnamed: 0"].isin(final_gene["ID"])]
missing_199057_test = sorted(set(final_gene["ID"]) - set(data_199057_test["Unnamed: 0"]))
assert not missing_199057_test, (
    f"Selected features missing from the GSE199057 test data: {missing_199057_test}"
)
data_199057_test

Unnamed: 0.1,Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,1.6,1.7,1.8,1.9,1.10,1.11,1.12,1.13,1.14,1.15
123799,cg03623835,0.491715,0.45556,0.340337,0.292442,0.236854,0.4139,0.255038,0.365404,0.278428,...,0.321605,0.522381,0.248469,0.118895,0.042008,0.250222,0.692962,0.262893,0.153405,0.05656
432345,cg08101264,0.074369,0.107521,0.104277,0.114271,0.110898,0.071492,0.079707,0.087159,0.055928,...,0.165281,0.186447,0.041711,0.015423,0.020249,0.025273,0.031048,0.024328,0.026131,0.02181
543530,cg26097573,0.111557,0.141136,0.088451,0.116954,0.13272,0.110031,0.096083,0.124432,0.095772,...,0.073397,0.144586,0.093588,0.011572,0.018049,0.34642,0.171971,0.024992,0.07119,0.137579
579074,cg13995230,0.190327,0.358098,0.235945,0.141291,0.168178,0.08549,0.15205,0.279345,0.173955,...,0.23336,0.257717,0.187647,0.083456,0.052027,0.206719,0.421129,0.181645,0.535593,0.083687


In [10]:
# One label per column after the first: positions 0-29 get class 0,
# every later position gets class 1.
# NOTE(review): 30 is hard-coded and presumably equals the number of class-0
# samples in this test file - confirm whenever the input data changes.
y_test_199057 = [int(idx >= 30) for idx in range(data_199057_test.shape[1] - 1)]
len(y_test_199057)

46

In [11]:
# Hyper-parameter search space for the GSE199057 run, keyed by the model names
# used inside predict_comb. Each value is handed to GridSearchCV as `param_grid`,
# so every combination of the listed values is tried for that model.
param_grids_199057 = {
    "SVM": {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"]},
    "Logistic Regression": {"C": [0.1, 1, 10], "solver": ["liblinear"]},
    "Decision Tree": {"max_depth": [3, 5, 7], "min_samples_split": [2, 5, 10]},
    "RandomForest": {
        "n_estimators": [50, 100, 200],
        "max_depth": [3, 5, 7],
        "min_samples_split": [2, 5, 10],
    },
    "XGBoost": {
        "n_estimators": [50, 100, 200],
        "max_depth": [3, 5, 7],
        "learning_rate": [0.01, 0.1, 0.2],
    },
}

In [19]:
# Tune and evaluate every model on the GSE199057 train/test split.
result_199057 = predict_comb(
    train_data=data_199057_train,
    train_label=y_train_199057,
    test_data=data_199057_test,
    test_label=y_test_199057,
    cluster_num=cluster_num,
    param_grids=param_grids_199057,
    final_gene=final_gene,
)
result_199057

Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
Best Score: 0.895945945945946
Train accuracy:  0.96
Test accuracy:  0.85

Best Parameters: {'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 100}
Best Score: 0.8797297297297298
Train accuracy:  0.92
Test accuracy:  0.91
Best Parameters: {'C': 10, 'kernel': 'rbf'}
Best Score: 0.8142642642642641
Train accuracy:  0.9
Test accuracy:  0.87
Best Parameters: {'C': 10, 'solver': 'liblinear'}
Best Score: 0.7815315315315315
Train accuracy:  0.77
Test accuracy:  0.78
Best Parameters: {'max_depth': 3, 'min_samples_split': 2}
Best Score: 0.8689189189189189
Train accuracy:  0.91
Test accuracy:  0.87


Unnamed: 0,Model,ID1,ID2,ID3,ID4,accuracy,sensitivity,specificity,precision,f1_score,mcc
0,RandomForest,cg03623835,cg13995230,cg08101264,cg26097573,0.91,0.75,1.0,1.0,0.86,0.81
1,SVM,cg03623835,cg13995230,cg08101264,cg26097573,0.87,0.62,1.0,1.0,0.77,0.72
2,Logistic Regression,cg03623835,cg13995230,cg08101264,cg26097573,0.78,0.38,1.0,1.0,0.55,0.53
3,Decision Tree,cg03623835,cg13995230,cg08101264,cg26097573,0.87,0.62,1.0,1.0,0.77,0.72


In [20]:
# Save the GSE199057 combination results; the directory is named once so the
# created folder and the output path cannot drift apart.
result_dir_199057 = "../result/GSE199057"
os.makedirs(result_dir_199057, exist_ok=True)
result_199057.to_csv(f"{result_dir_199057}/result_combination.csv", index=False)

#### 液態測試

#### GSE240324液態 80% 訓練資料 (850K)

In [6]:
# Path to the GSE240324 training split (80%) of the normalized beta values.
beta_normalized_240324_train = "../result/GSE240324_nc/train80/all_beta_normalized_train.csv"

# Load the full table; the "Unnamed: 0" column is matched against the feature IDs
# further down, and the remaining columns are treated as one column per sample.
data_240324_train = pd.read_csv(beta_normalized_240324_train)
data_240324_train

Unnamed: 0.1,Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,1.32,1.33,1.34,1.35,1.36,1.37,1.38,1.39,1.40,1.41
0,cg00000029,0.685549,0.791333,0.696890,0.623362,0.698149,0.710291,0.665143,0.696180,0.709970,...,0.743897,0.761919,0.627261,0.652275,0.597384,0.555608,0.591851,0.600982,0.692012,0.726591
1,cg00000109,0.964831,0.975244,0.961848,0.962704,0.975189,0.963382,0.944664,0.965548,0.960843,...,0.972724,0.948947,0.960394,0.970841,0.963756,0.952863,0.951014,0.975419,0.977375,0.971696
2,cg00000155,0.986728,0.987037,0.983501,0.981999,0.982945,0.983214,0.992226,0.984224,0.985806,...,0.982742,0.989455,0.981693,0.986307,0.987674,0.981094,0.985012,0.985369,0.987768,0.978974
3,cg00000158,0.977880,0.986635,0.988113,0.992131,0.988678,0.984668,0.985334,0.989487,0.980792,...,0.982665,0.982576,0.980850,0.989491,0.984987,0.981756,0.975262,0.992268,0.978997,0.993653
4,cg00000165,0.184111,0.155440,0.177690,0.170676,0.308850,0.227477,0.132275,0.127454,0.116430,...,0.122764,0.163392,0.111906,0.168365,0.131222,0.130385,0.106051,0.155438,0.170329,0.171089
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
728305,cg27666046,0.378769,0.360621,0.400223,0.496096,0.388571,0.421055,0.403705,0.419459,0.334062,...,0.389395,0.441076,0.504492,0.346711,0.487945,0.499030,0.513211,0.445986,0.434811,0.381477
728306,cg27666049,0.729204,0.795024,0.516987,0.669781,0.695430,0.558212,0.781682,0.653849,0.881698,...,0.826163,0.740019,0.748155,0.614330,0.753090,0.728242,0.803610,0.742983,0.875664,0.814265
728307,cg27666060,0.836714,0.870768,0.740984,0.869529,0.827582,0.701222,0.791696,0.834377,0.837241,...,0.913279,0.835574,0.825491,0.762190,0.859499,0.860822,0.903881,0.844045,0.922338,0.786324
728308,cg27666108,0.303649,0.362043,0.154818,0.222826,0.197095,0.166896,0.295587,0.283316,0.303922,...,0.335193,0.275901,0.243597,0.239433,0.150168,0.214529,0.205401,0.289420,0.294389,0.214488


In [7]:
# Keep only the rows of the selected features, then verify that every selected
# feature is actually present in this dataset.
data_240324_train = data_240324_train[data_240324_train["Unnamed: 0"].isin(final_gene["ID"])]
missing_240324_train = sorted(set(final_gene["ID"]) - set(data_240324_train["Unnamed: 0"]))
assert not missing_240324_train, (
    f"Selected features missing from the GSE240324 training data: {missing_240324_train}"
)
data_240324_train

Unnamed: 0.1,Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,1.32,1.33,1.34,1.35,1.36,1.37,1.38,1.39,1.40,1.41
105626,cg03623835,0.370895,0.31986,0.29664,0.291543,0.36997,0.311375,0.312679,0.361943,0.43911,...,0.35951,0.401454,0.244828,0.236527,0.247491,0.239009,0.274906,0.292672,0.320534,0.271081
231597,cg08101264,0.032554,0.035738,0.023305,0.042017,0.029255,0.042161,0.035924,0.034583,0.05169,...,0.034778,0.045061,0.032018,0.030279,0.03158,0.035683,0.021305,0.052358,0.037799,0.043049
391737,cg13995230,0.404293,0.489294,0.403566,0.535726,0.347451,0.348778,0.521183,0.49739,0.452616,...,0.570974,0.445742,0.571194,0.366522,0.533514,0.549144,0.596597,0.554268,0.556299,0.440487
689541,cg26097573,0.03758,0.037289,0.02043,0.038462,0.021363,0.035386,0.030876,0.027795,0.033444,...,0.020215,0.066623,0.038984,0.041139,0.029013,0.040614,0.044823,0.038323,0.040918,0.040734


In [8]:
# One label per column after the first: positions 0-37 get class 0,
# every later position gets class 1.
# NOTE(review): 38 is hard-coded and presumably equals the number of class-0
# samples in this training file - confirm whenever the input data changes.
y_train_240324 = [int(idx >= 38) for idx in range(data_240324_train.shape[1] - 1)]
len(y_train_240324)

80

#### GSE240324液態 20% 測試資料 (850K)

In [9]:
# Path to the GSE240324 test split (20%) of the normalized beta values.
beta_normalized_240324_test = "../result/GSE240324_nc/test20/all_beta_normalized_test.csv"

# Load the full table; the "Unnamed: 0" column is matched against the feature IDs
# further down, and the remaining columns are treated as one column per sample.
data_240324_test = pd.read_csv(beta_normalized_240324_test)
data_240324_test

Unnamed: 0.1,Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,0.10,0.11,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7
0,cg00000029,0.663961,0.689604,0.657223,0.756809,0.667762,0.681050,0.644549,0.646586,0.675877,...,0.668034,0.664365,0.686456,0.706067,0.717829,0.679072,0.670575,0.715247,0.601729,0.651416
1,cg00000109,0.959536,0.953428,0.958271,0.950245,0.971841,0.950838,0.949728,0.949216,0.944622,...,0.956577,0.962348,0.949653,0.960551,0.950503,0.947781,0.953919,0.942962,0.956306,0.956899
2,cg00000155,0.981583,0.981265,0.984640,0.977571,0.980719,0.983515,0.983618,0.989271,0.982877,...,0.987713,0.980653,0.982854,0.981789,0.979222,0.986797,0.983355,0.983787,0.973578,0.986480
3,cg00000158,0.988611,0.984117,0.989010,0.982621,0.983334,0.982908,0.978728,0.989605,0.986994,...,0.985124,0.988296,0.986540,0.988150,0.982234,0.989604,0.986094,0.984442,0.989132,0.986756
4,cg00000165,0.164008,0.141185,0.192280,0.125364,0.153074,0.190820,0.149277,0.167187,0.129192,...,0.226501,0.158189,0.148461,0.197371,0.163333,0.138543,0.162918,0.166732,0.123371,0.197489
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
728305,cg27666046,0.436225,0.464260,0.467948,0.428961,0.363105,0.377027,0.428200,0.438684,0.458647,...,0.447234,0.490138,0.426661,0.377143,0.420370,0.350351,0.424130,0.451162,0.521353,0.440192
728306,cg27666049,0.811259,0.724441,0.762142,0.629818,0.762055,0.659826,0.624905,0.731080,0.654847,...,0.700464,0.746769,0.782406,0.634363,0.875526,0.756453,0.608945,0.767045,0.782483,0.764768
728307,cg27666060,0.818305,0.822389,0.840120,0.874163,0.700941,0.656756,0.791746,0.781479,0.859643,...,0.834385,0.858053,0.845742,0.826431,0.758388,0.772279,0.774734,0.827344,0.798959,0.763503
728308,cg27666108,0.198684,0.221159,0.202805,0.297718,0.175799,0.208276,0.273477,0.245154,0.321153,...,0.247357,0.226269,0.217433,0.216504,0.216583,0.204552,0.281274,0.255560,0.158266,0.177962


In [10]:
# Keep only the rows of the selected features, then verify that every selected
# feature is actually present in this dataset.
data_240324_test = data_240324_test[data_240324_test["Unnamed: 0"].isin(final_gene["ID"])]
missing_240324_test = sorted(set(final_gene["ID"]) - set(data_240324_test["Unnamed: 0"]))
assert not missing_240324_test, (
    f"Selected features missing from the GSE240324 test data: {missing_240324_test}"
)
data_240324_test

Unnamed: 0.1,Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,0.10,0.11,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7
105626,cg03623835,0.360222,0.34107,0.296715,0.27131,0.245992,0.330654,0.32076,0.242544,0.272909,...,0.338938,0.211037,0.234338,0.330334,0.306278,0.316415,0.339036,0.374875,0.21836,0.283877
231597,cg08101264,0.030446,0.048229,0.043321,0.031409,0.029284,0.029959,0.033195,0.038463,0.037202,...,0.043704,0.042231,0.032609,0.04147,0.02464,0.039595,0.034229,0.043539,0.026009,0.046613
391737,cg13995230,0.477889,0.504938,0.475898,0.468646,0.418004,0.3733,0.511281,0.412403,0.508408,...,0.490517,0.49893,0.477379,0.541546,0.415257,0.419089,0.467744,0.468622,0.533099,0.414611
689541,cg26097573,0.021453,0.021875,0.02815,0.036762,0.026071,0.032829,0.023171,0.02221,0.040175,...,0.039479,0.029456,0.029315,0.030053,0.029216,0.046388,0.036742,0.038404,0.040206,0.024433


In [11]:
# One label per column after the first: positions 0-11 get class 0,
# every later position gets class 1.
# NOTE(review): 12 is hard-coded and presumably equals the number of class-0
# samples in this test file - confirm whenever the input data changes.
y_test_240324 = [int(idx >= 12) for idx in range(data_240324_test.shape[1] - 1)]
len(y_test_240324)

20

In [12]:
# Hyper-parameter search space for the GSE240324 run, keyed by the model names
# used inside predict_comb. Each value is handed to GridSearchCV as `param_grid`,
# so every combination of the listed values is tried for that model.
param_grids_240324 = {
    "SVM": {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"]},
    "Logistic Regression": {"C": [0.1, 1, 10], "solver": ["liblinear"]},
    "Decision Tree": {"max_depth": [3, 5, 7], "min_samples_split": [2, 5, 10]},
    "RandomForest": {
        "n_estimators": [50, 100, 200],
        "max_depth": [3, 5, 7],
        "min_samples_split": [2, 5, 10],
    },
    "XGBoost": {
        "n_estimators": [50, 100, 200],
        "max_depth": [3, 5, 7],
        "learning_rate": [0.01, 0.1, 0.2],
    },
}

In [13]:
# Tune and evaluate every model on the GSE240324 train/test split.
result_240324 = predict_comb(
    train_data=data_240324_train,
    train_label=y_train_240324,
    test_data=data_240324_test,
    test_label=y_test_240324,
    cluster_num=cluster_num,
    param_grids=param_grids_240324,
    final_gene=final_gene,
)
result_240324

Best Parameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200}
Best Score: 0.5875
Train accuracy:  1.0
Test accuracy:  0.5

Best Parameters: {'max_depth': 7, 'min_samples_split': 5, 'n_estimators': 200}
Best Score: 0.6375
Train accuracy:  1.0
Test accuracy:  0.3

Best Parameters: {'C': 10, 'kernel': 'rbf'}
Best Score: 0.6
Train accuracy:  0.61
Test accuracy:  0.6
Best Parameters: {'C': 0.1, 'solver': 'liblinear'}
Best Score: 0.525
Train accuracy:  0.52
Test accuracy:  0.4

Best Parameters: {'max_depth': 7, 'min_samples_split': 5}
Best Score: 0.55
Train accuracy:  0.95
Test accuracy:  0.3



Unnamed: 0,Model,ID1,ID2,ID3,ID4,accuracy,sensitivity,specificity,precision,f1_score,mcc
0,SVM,cg03623835,cg13995230,cg08101264,cg26097573,0.6,0.38,0.75,0.5,0.43,0.13


In [14]:
# Save the GSE240324 combination results; the directory is named once so the
# created folder and the output path cannot drift apart.
result_dir_240324 = "../result/GSE240324_nc"
os.makedirs(result_dir_240324, exist_ok=True)
result_240324.to_csv(f"{result_dir_240324}/result_combination.csv", index=False)