#### Combination Test
#### Steps
1. Execute the prediction combination function.
2. Input the final selected gene file.
3. Create training/testing set labels.
4. Select one gene from each group for combination.
5. Create training/testing datasets.
6. Adjust model parameters.
7. Perform model prediction.
8. Output and save the final prediction results.
#### Important notes
* The input file must include `ID` and `cluster`.
* Ensure that all selected genes are present in every dataset (each dataset is filtered down to the selected features after loading, so you can confirm that none is missing).
* Adjust parameters manually until no overfitting occurs (you can remove unsuitable models).
* Other than the file paths, the selected feature IDs, `cluster_num`, the class-boundary index used to build each label list, and `param_grids`, no further changes are necessary.
* The final combination test results should not be used for further filtering or selection.

In [1]:
import os
import pandas as pd

from itertools import combinations
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

from sklearn.ensemble import RandomForestClassifier

from sklearn.svm import SVC

from sklearn.linear_model import LogisticRegression

from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import (
    confusion_matrix,
    precision_score,
    accuracy_score,
    matthews_corrcoef,
    f1_score,
)

In [2]:
def is_unique(lst):
    """Return True when no value occurs more than once in ``lst``.

    ``lst`` must support ``len()`` and hold hashable items, because the
    comparison is made against the size of ``set(lst)``.
    """
    distinct_values = set(lst)
    return len(distinct_values) == len(lst)


def prepare_data(data, combination, label, cluster_num):
    """Build the sample-by-feature matrix for one feature combination.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per feature. The first column holds the feature ID and every
        remaining column holds that feature's value for one sample.
    combination : sequence
        Feature IDs to extract; the first ``cluster_num`` entries are used.
    label : sequence
        Sample labels. Only its length is used: it decides how many samples
        (value columns, taken left to right) are emitted.
    cluster_num : int
        Number of features placed in each sample.

    Returns
    -------
    list of list
        ``len(label)`` rows, each holding ``cluster_num`` values in the same
        order as ``combination``.

    Raises
    ------
    ValueError
        If a requested feature ID does not occur in the first column of
        ``data``, or if it provides fewer values than there are labels.
    """
    id_column = data.columns[0]
    n_samples = len(label)

    per_feature_values = []
    for position in range(cluster_num):
        feature_id = combination[position]
        matched_rows = data.loc[data[id_column].isin([feature_id])]
        # Fail early with a readable message instead of an IndexError deep
        # inside the sample-building step below.
        if len(matched_rows) == 0:
            raise ValueError(
                f"Feature {feature_id!r} was not found in column "
                f"{id_column!r} of the data."
            )
        values = matched_rows.iloc[:, 1:].values.flatten().tolist()
        if len(values) < n_samples:
            raise ValueError(
                f"Feature {feature_id!r} has {len(values)} values but "
                f"{n_samples} labels were supplied."
            )
        per_feature_values.append(values)

    # Transpose feature-major value lists into one row per sample.
    return [
        [values[sample_idx] for values in per_feature_values]
        for sample_idx in range(n_samples)
    ]

In [3]:
def predict_comb(
    train_data,
    train_label,
    test_data,
    test_label,
    cluster_num,
    param_grids,
    final_gene,
    overfit_threshold=0.1,
    cv=5,
):
    """Grid-search several classifiers on every cross-cluster feature combination.

    For each model and each combination of ``cluster_num`` features whose
    clusters are all different, the model is tuned with ``GridSearchCV`` on
    the training data, then scored on the training and test data. A row is
    reported only when the train/test accuracy gap is within
    ``overfit_threshold``.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Feature tables in the layout expected by ``prepare_data`` (feature ID
        in the first column, one value column per sample).
    train_label, test_label : sequence
        Class labels, one per sample column. ``precision_score`` and
        ``f1_score`` are called with their defaults, so the positive class
        must be labelled ``1``.
    cluster_num : int
        Number of features per combination.
    param_grids : dict
        Maps a model name (``"XGBoost"``, ``"RandomForest"``, ``"SVM"``,
        ``"Logistic Regression"``, ``"Decision Tree"``) to its
        ``GridSearchCV`` parameter grid. A model without an entry is skipped,
        so a model can be excluded by deleting its grid.
    final_gene : pandas.DataFrame
        Must provide the columns ``ID`` and ``cluster``.
    overfit_threshold : float, optional
        Largest accepted absolute difference between train and test accuracy.
    cv : int, optional
        Number of cross-validation folds passed to ``GridSearchCV``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Model``, ``ID1`` .. ``ID<cluster_num>`` (features ordered by
        cluster), then ``accuracy``, ``sensitivity``, ``specificity``,
        ``precision``, ``f1_score`` and ``mcc`` on the test data, each rounded
        to two decimals.
    """
    # Cluster of every feature; the first occurrence wins if an ID is repeated.
    cluster_of = {}
    for feature_id, cluster in zip(final_gene["ID"], final_gene["cluster"]):
        cluster_of.setdefault(feature_id, int(cluster))

    # The valid combinations do not depend on the model, so select them once:
    # keep only those that take each feature from a different cluster.
    candidates = []
    for combination in combinations(final_gene["ID"], cluster_num):
        clusters = [cluster_of[feature_id] for feature_id in combination]
        if is_unique(clusters):
            candidates.append((combination, clusters))

    models = {
        "XGBoost": xgb.XGBClassifier(random_state=42),
        "RandomForest": RandomForestClassifier(random_state=42),
        "SVM": SVC(random_state=42),
        "Logistic Regression": LogisticRegression(random_state=42),
        "Decision Tree": DecisionTreeClassifier(random_state=42),
    }

    # Fix the confusion-matrix layout up front so it stays 2x2 even when the
    # test labels and predictions happen to contain a single class.
    class_labels = sorted(set(train_label) | set(test_label))

    rows = []
    for model_name, base_model in models.items():
        if model_name not in param_grids:
            continue
        param_grid = param_grids[model_name]

        for combination, clusters in candidates:
            # Build the train / test matrices for this combination.
            X_test = prepare_data(test_data, combination, test_label, cluster_num)
            X_train = prepare_data(train_data, combination, train_label, cluster_num)

            # Hyper-parameter tuning. The untouched base estimator is used for
            # every combination; the fitted winner gets its own name so it
            # never leaks into the next search.
            grid_search = GridSearchCV(
                estimator=base_model, param_grid=param_grid, cv=cv, n_jobs=-1
            )
            grid_search.fit(X_train, train_label)
            print("Best Parameters:", grid_search.best_params_)
            print("Best Score:", grid_search.best_score_)
            best_model = grid_search.best_estimator_

            # Prediction on both splits.
            y_pred_train = best_model.predict(X_train)
            accuracy_train = accuracy_score(train_label, y_pred_train)
            print("Train accuracy: ", round(accuracy_train, 2))
            y_pred = best_model.predict(X_test)
            accuracy = accuracy_score(test_label, y_pred)
            print("Test accuracy: ", round(accuracy, 2))

            # Overfitting check. The gap is absolute, so a model whose test
            # accuracy exceeds its train accuracy by more than the threshold
            # is rejected as well.
            # NOTE(review): this filter uses test-set accuracy, so the
            # reported rows are selected with knowledge of the test data.
            if abs(accuracy_train - accuracy) > overfit_threshold:
                print(f"========={model_name} overfitting =========\n")
                continue

            # Test-set metrics for the accepted model.
            tn, fp, fn, tp = confusion_matrix(
                test_label, y_pred, labels=class_labels
            ).ravel()
            positives = tp + fn
            negatives = tn + fp
            sensitivity = tp / positives if positives else float("nan")
            specificity = tn / negatives if negatives else float("nan")
            precision = precision_score(test_label, y_pred)
            f1 = f1_score(test_label, y_pred)
            mcc = matthews_corrcoef(test_label, y_pred)

            # Report the feature IDs ordered by their cluster number.
            sorted_combination = [x for _, x in sorted(zip(clusters, combination))]
            rows.append(
                [model_name]
                + sorted_combination
                + [
                    round(accuracy, 2),
                    round(sensitivity, 2),
                    round(specificity, 2),
                    round(precision, 2),
                    round(f1, 2),
                    round(mcc, 2),
                ]
            )

    id_columns = [f"ID{i+1}" for i in range(cluster_num)]
    result = pd.DataFrame(
        rows,
        columns=["Model"]
        + id_columns
        + ["accuracy", "sensitivity", "specificity", "precision", "f1_score", "mcc"],
    )

    return result

#### 輸入最終挑選出的特徵資料

In [4]:
cluster_num = 4  # total number of clusters
# The table must contain the `ID` and `cluster` columns.
final_gene = pd.DataFrame(
    {
        "ID": ["cg12315311", "cg26097573", "cg06530558", "cg08101264"],
        "cluster": [1, 2, 3, 4],
    }
)
# Warn when two selected features share a cluster.
if not is_unique(final_gene["cluster"]):
    print("Need unique cluster. Please modify input data.")
final_gene

Unnamed: 0,ID,cluster
0,cg12315311,1
1,cg26097573,2
2,cg06530558,3
3,cg08101264,4


#### 組織測試

#### GSE199057 80% 訓練資料 (850K)

In [16]:
# GSE199057 training split (80%).
# NOTE(review): the file name suggests these samples were SMOTE-oversampled
# before being saved; if so, the cross-validation folds used later can share
# synthetic neighbours of the same original sample - confirm how the file
# was produced before trusting the CV scores.
beta_normalized_199057_train = "../result/GSE199057/train80/all_beta_normalized_train_oversample_smote.csv"

data_199057_train = pd.read_csv(filepath_or_buffer=beta_normalized_199057_train)
data_199057_train

Unnamed: 0.1,Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,1.112,1.113,1.114,1.115,1.116,1.117,1.118,1.119,1.120,1.121
0,cg07881041,0.949957,0.952146,0.904001,0.954637,0.928165,0.932608,0.924301,0.923104,0.927477,...,0.644990,0.793087,0.932898,0.939335,0.764933,0.785068,0.785617,0.243787,0.797899,0.876849
1,cg03513874,0.937071,0.889363,0.843358,0.945049,0.913755,0.857649,0.919933,0.909622,0.838915,...,0.565574,0.786516,0.931714,0.779345,0.847313,0.581098,0.804814,0.255060,0.936677,0.622096
2,cg05451842,0.016502,0.006419,0.016312,0.022076,0.011215,0.015107,0.049540,0.006533,0.016118,...,0.021037,0.021203,0.017413,0.014292,0.021391,0.011055,0.028076,0.009630,0.015168,0.007096
3,cg14797042,0.962613,0.985027,0.964741,0.958186,0.967807,0.982290,0.965621,0.974083,0.958924,...,0.953071,0.973963,0.982536,0.963410,0.638995,0.767950,0.795198,0.969475,0.894064,0.877406
4,cg09838562,0.006557,0.007492,0.009818,0.002810,0.011285,0.003184,0.014299,0.005508,0.009941,...,0.014648,0.015983,0.017604,0.031771,0.017019,0.005599,0.026906,0.009751,0.012031,0.008260
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
711697,cg19812938,0.894460,0.878976,0.881958,0.896522,0.890799,0.872061,0.901984,0.908596,0.889975,...,0.897392,0.902431,0.895908,0.878582,0.920996,0.873753,0.888982,0.867385,0.899528,0.862359
711698,cg06272054,0.008978,0.004395,0.004080,0.004744,0.005768,0.008339,0.006556,0.005311,0.012396,...,0.005247,0.004645,0.005950,0.008489,0.006246,0.005086,0.005252,0.010990,0.005969,0.004662
711699,cg07255356,0.011516,0.009376,0.014099,0.016191,0.008926,0.013146,0.014820,0.003522,0.004258,...,0.020595,0.014122,0.006085,0.012686,0.014574,0.014800,0.016009,0.024115,0.011786,0.018081
711700,cg24220897,0.936086,0.921546,0.929182,0.897121,0.935147,0.924128,0.920541,0.937140,0.926174,...,0.924846,0.956301,0.895904,0.940107,0.916043,0.859563,0.929373,0.946021,0.951844,0.907334


In [17]:
# Keep only the selected features, then verify that every one of them is
# actually present (the filter alone would silently drop missing IDs).
data_199057_train = data_199057_train[
    data_199057_train["Unnamed: 0"].isin(final_gene["ID"])
]
assert final_gene["ID"].isin(data_199057_train["Unnamed: 0"]).all(), (
    "Selected features missing from the GSE199057 training data: "
    f"{sorted(set(final_gene['ID']) - set(data_199057_train['Unnamed: 0']))}"
)
data_199057_train

Unnamed: 0.1,Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,1.112,1.113,1.114,1.115,1.116,1.117,1.118,1.119,1.120,1.121
138667,cg06530558,0.386288,0.396658,0.380326,0.336618,0.464026,0.214128,0.386809,0.194599,0.237053,...,0.394169,0.408872,0.345343,0.494975,0.172656,0.062785,0.194023,0.034377,0.147064,0.220417
165778,cg12315311,0.129935,0.086388,0.125819,0.209502,0.210359,0.099936,0.181646,0.068144,0.075993,...,0.117944,0.165122,0.19507,0.226828,0.079362,0.106432,0.085633,0.039659,0.062289,0.227089
432345,cg08101264,0.159984,0.066207,0.049443,0.060118,0.108439,0.08592,0.100863,0.074768,0.052619,...,0.065286,0.142415,0.124386,0.209362,0.038702,0.033672,0.031859,0.355507,0.089526,0.052285
543530,cg26097573,0.162422,0.105967,0.067082,0.077357,0.140334,0.101735,0.128838,0.056522,0.066291,...,0.110156,0.074263,0.127126,0.172707,0.283223,0.015567,0.09139,0.013756,0.02229,0.117496


In [18]:
# One label per sample column (the ID column is not counted): the first 122
# samples get class 0 and all later ones class 1.
# NOTE(review): 122 is hard-coded; presumably it matches the number of
# class-0 sample columns in the file - verify when the input changes.
y_train_199057 = [
    int(sample_idx >= 122)
    for sample_idx in range(len(data_199057_train.columns) - 1)
]
len(y_train_199057)

244

#### GSE199057 20% 測試資料 (850K)

In [19]:
# GSE199057 test split (20%).
beta_normalized_199057_test = "../result/GSE199057/test20/all_beta_normalized_test.csv"

data_199057_test = pd.read_csv(filepath_or_buffer=beta_normalized_199057_test)
data_199057_test

Unnamed: 0.1,Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,1.6,1.7,1.8,1.9,1.10,1.11,1.12,1.13,1.14,1.15
0,cg07881041,0.927472,0.941936,0.947982,0.928381,0.931247,0.925848,0.932257,0.950240,0.930940,...,0.944924,0.928416,0.947826,0.927517,0.977813,0.950989,0.958686,0.741392,0.944875,0.912059
1,cg03513874,0.859518,0.953629,0.905202,0.912974,0.935897,0.916896,0.905473,0.967117,0.894788,...,0.900727,0.926264,0.889790,0.536257,0.940197,0.921652,0.916094,0.741500,0.943346,0.954594
2,cg05451842,0.018657,0.016980,0.025264,0.018310,0.015852,0.018687,0.017270,0.031023,0.023883,...,0.013569,0.009514,0.019578,0.010586,0.013800,0.016905,0.028205,0.019087,0.013242,0.015173
3,cg14797042,0.935490,0.983900,0.975419,0.976533,0.931274,0.972876,0.967565,0.966997,0.961535,...,0.974522,0.974677,0.927847,0.967679,0.945040,0.659536,0.954493,0.774994,0.756900,0.645753
4,cg09838562,0.006943,0.005521,0.007438,0.004382,0.027526,0.001536,0.015347,0.016933,0.011488,...,0.008291,0.008548,0.020874,0.002554,0.010194,0.009511,0.018963,0.004257,0.005500,0.013169
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
711697,cg19812938,0.891069,0.901822,0.887696,0.904106,0.873141,0.894906,0.889711,0.910268,0.887800,...,0.899579,0.912525,0.893304,0.901636,0.909357,0.898196,0.931023,0.917458,0.896928,0.901329
711698,cg06272054,0.005869,0.001670,0.001734,0.009249,0.004774,0.001222,0.007410,0.006697,0.006768,...,0.006737,0.006133,0.002398,0.009910,0.014088,0.009391,0.018384,0.002608,0.003067,0.007895
711699,cg07255356,0.017197,0.004223,0.015026,0.013784,0.009908,0.015376,0.004641,0.016061,0.008395,...,0.003118,0.008475,0.014726,0.013802,0.024885,0.009163,0.019117,0.018980,0.010364,0.017652
711700,cg24220897,0.933207,0.936893,0.906153,0.934489,0.939342,0.941347,0.910692,0.924511,0.943070,...,0.950181,0.964364,0.934486,0.959507,0.936499,0.946392,0.939365,0.952978,0.954326,0.828659


In [20]:
# Keep only the selected features, then verify that every one of them is
# actually present (the filter alone would silently drop missing IDs).
data_199057_test = data_199057_test[
    data_199057_test["Unnamed: 0"].isin(final_gene["ID"])
]
assert final_gene["ID"].isin(data_199057_test["Unnamed: 0"]).all(), (
    "Selected features missing from the GSE199057 test data: "
    f"{sorted(set(final_gene['ID']) - set(data_199057_test['Unnamed: 0']))}"
)
data_199057_test

Unnamed: 0.1,Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,1.6,1.7,1.8,1.9,1.10,1.11,1.12,1.13,1.14,1.15
138667,cg06530558,0.291172,0.526976,0.378724,0.199932,0.269773,0.355752,0.294835,0.484614,0.364713,...,0.333743,0.369395,0.252607,0.19114,0.048337,0.307253,0.846696,0.277073,0.497118,0.192412
165778,cg12315311,0.110378,0.222257,0.145078,0.087647,0.088052,0.08557,0.109986,0.191118,0.114152,...,0.16341,0.225691,0.105013,0.048842,0.031222,0.138042,0.504924,0.130736,0.171442,0.043431
432345,cg08101264,0.074369,0.107521,0.104277,0.114271,0.110898,0.071492,0.079707,0.087159,0.055928,...,0.165281,0.186447,0.041711,0.015423,0.020249,0.025273,0.031048,0.024328,0.026131,0.02181
543530,cg26097573,0.111557,0.141136,0.088451,0.116954,0.13272,0.110031,0.096083,0.124432,0.095772,...,0.073397,0.144586,0.093588,0.011572,0.018049,0.34642,0.171971,0.024992,0.07119,0.137579


In [21]:
# One label per sample column (the ID column is not counted): the first 30
# samples get class 0 and all later ones class 1.
# NOTE(review): 30 is hard-coded; presumably it matches the number of
# class-0 sample columns in the file - verify when the input changes.
y_test_199057 = [
    int(sample_idx >= 30)
    for sample_idx in range(len(data_199057_test.columns) - 1)
]
len(y_test_199057)

46

In [22]:
# Hyper-parameter search space per model for GSE199057; the keys must match
# the model names used by `predict_comb`.
param_grids_199057 = {
    "SVM": dict(C=[0.1, 1, 10], kernel=["linear", "rbf"]),
    "Logistic Regression": dict(C=[0.1, 1, 10], solver=["liblinear"]),
    "Decision Tree": dict(max_depth=[3, 5, 7], min_samples_split=[2, 5, 10]),
    "RandomForest": dict(
        n_estimators=[50, 100, 200],
        max_depth=[3, 5, 7],
        min_samples_split=[2, 5, 10],
    ),
    "XGBoost": dict(
        n_estimators=[50, 100, 200],
        max_depth=[3, 5, 7],
        learning_rate=[0.01, 0.1, 0.2],
    ),
}

In [23]:
# Tune and evaluate every model on the GSE199057 train/test split.
result_199057 = predict_comb(
    train_data=data_199057_train,
    train_label=y_train_199057,
    test_data=data_199057_test,
    test_label=y_test_199057,
    cluster_num=cluster_num,
    param_grids=param_grids_199057,
    final_gene=final_gene,
)
result_199057

Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
Best Score: 0.8482142857142858
Train accuracy:  0.96
Test accuracy:  0.89
Best Parameters: {'max_depth': 7, 'min_samples_split': 5, 'n_estimators': 100}
Best Score: 0.835969387755102
Train accuracy:  0.96
Test accuracy:  0.91
Best Parameters: {'C': 10, 'kernel': 'rbf'}
Best Score: 0.7950680272108843
Train accuracy:  0.86
Test accuracy:  0.91
Best Parameters: {'C': 10, 'solver': 'liblinear'}
Best Score: 0.7418367346938775
Train accuracy:  0.73
Test accuracy:  0.67
Best Parameters: {'max_depth': 7, 'min_samples_split': 5}
Best Score: 0.8115646258503402
Train accuracy:  0.93
Test accuracy:  0.87


Unnamed: 0,Model,ID1,ID2,ID3,ID4,accuracy,sensitivity,specificity,precision,f1_score,mcc
0,XGBoost,cg12315311,cg26097573,cg06530558,cg08101264,0.89,0.75,0.97,0.92,0.83,0.76
1,RandomForest,cg12315311,cg26097573,cg06530558,cg08101264,0.91,0.81,0.97,0.93,0.87,0.81
2,SVM,cg12315311,cg26097573,cg06530558,cg08101264,0.91,0.81,0.97,0.93,0.87,0.81
3,Logistic Regression,cg12315311,cg26097573,cg06530558,cg08101264,0.67,0.75,0.63,0.52,0.62,0.37
4,Decision Tree,cg12315311,cg26097573,cg06530558,cg08101264,0.87,0.81,0.9,0.81,0.81,0.71


In [24]:
# Make sure the output folder exists, then write the result table without
# the DataFrame index.
os.makedirs("../result/GSE199057", exist_ok=True)
result_199057.to_csv(
    path_or_buf="../result/GSE199057/result_combination_oversample.csv",
    index=False,
)

#### 液態測試

#### GSE240324液態 80% 訓練資料 (850K)

In [25]:
# GSE240324 training split (80%).
beta_normalized_240324_train = "../result/GSE240324_nc/train80/all_beta_normalized_train.csv"

data_240324_train = pd.read_csv(filepath_or_buffer=beta_normalized_240324_train)
data_240324_train

Unnamed: 0.1,Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,1.32,1.33,1.34,1.35,1.36,1.37,1.38,1.39,1.40,1.41
0,cg00000029,0.685549,0.791333,0.696890,0.623362,0.698149,0.710291,0.665143,0.696180,0.709970,...,0.743897,0.761919,0.627261,0.652275,0.597384,0.555608,0.591851,0.600982,0.692012,0.726591
1,cg00000109,0.964831,0.975244,0.961848,0.962704,0.975189,0.963382,0.944664,0.965548,0.960843,...,0.972724,0.948947,0.960394,0.970841,0.963756,0.952863,0.951014,0.975419,0.977375,0.971696
2,cg00000155,0.986728,0.987037,0.983501,0.981999,0.982945,0.983214,0.992226,0.984224,0.985806,...,0.982742,0.989455,0.981693,0.986307,0.987674,0.981094,0.985012,0.985369,0.987768,0.978974
3,cg00000158,0.977880,0.986635,0.988113,0.992131,0.988678,0.984668,0.985334,0.989487,0.980792,...,0.982665,0.982576,0.980850,0.989491,0.984987,0.981756,0.975262,0.992268,0.978997,0.993653
4,cg00000165,0.184111,0.155440,0.177690,0.170676,0.308850,0.227477,0.132275,0.127454,0.116430,...,0.122764,0.163392,0.111906,0.168365,0.131222,0.130385,0.106051,0.155438,0.170329,0.171089
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
728305,cg27666046,0.378769,0.360621,0.400223,0.496096,0.388571,0.421055,0.403705,0.419459,0.334062,...,0.389395,0.441076,0.504492,0.346711,0.487945,0.499030,0.513211,0.445986,0.434811,0.381477
728306,cg27666049,0.729204,0.795024,0.516987,0.669781,0.695430,0.558212,0.781682,0.653849,0.881698,...,0.826163,0.740019,0.748155,0.614330,0.753090,0.728242,0.803610,0.742983,0.875664,0.814265
728307,cg27666060,0.836714,0.870768,0.740984,0.869529,0.827582,0.701222,0.791696,0.834377,0.837241,...,0.913279,0.835574,0.825491,0.762190,0.859499,0.860822,0.903881,0.844045,0.922338,0.786324
728308,cg27666108,0.303649,0.362043,0.154818,0.222826,0.197095,0.166896,0.295587,0.283316,0.303922,...,0.335193,0.275901,0.243597,0.239433,0.150168,0.214529,0.205401,0.289420,0.294389,0.214488


In [26]:
# Keep only the selected features, then verify that every one of them is
# actually present (the filter alone would silently drop missing IDs).
data_240324_train = data_240324_train[
    data_240324_train["Unnamed: 0"].isin(final_gene["ID"])
]
assert final_gene["ID"].isin(data_240324_train["Unnamed: 0"]).all(), (
    "Selected features missing from the GSE240324 training data: "
    f"{sorted(set(final_gene['ID']) - set(data_240324_train['Unnamed: 0']))}"
)
data_240324_train

Unnamed: 0.1,Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,1.32,1.33,1.34,1.35,1.36,1.37,1.38,1.39,1.40,1.41
186994,cg06530558,0.608686,0.629394,0.518108,0.631536,0.484308,0.419623,0.700088,0.618104,0.615828,...,0.716053,0.634906,0.648515,0.517477,0.649127,0.710549,0.674254,0.690381,0.703316,0.556394
231597,cg08101264,0.032554,0.035738,0.023305,0.042017,0.029255,0.042161,0.035924,0.034583,0.05169,...,0.034778,0.045061,0.032018,0.030279,0.03158,0.035683,0.021305,0.052358,0.037799,0.043049
346002,cg12315311,0.345222,0.355672,0.285053,0.364924,0.273513,0.241729,0.373539,0.374984,0.401725,...,0.365973,0.316987,0.261359,0.312258,0.213167,0.255125,0.238624,0.284561,0.359547,0.320355
689541,cg26097573,0.03758,0.037289,0.02043,0.038462,0.021363,0.035386,0.030876,0.027795,0.033444,...,0.020215,0.066623,0.038984,0.041139,0.029013,0.040614,0.044823,0.038323,0.040918,0.040734


In [27]:
# One label per sample column (the ID column is not counted): the first 38
# samples get class 0 and all later ones class 1.
# NOTE(review): 38 is hard-coded; presumably it matches the number of
# class-0 sample columns in the file - verify when the input changes.
y_train_240324 = [
    int(sample_idx >= 38)
    for sample_idx in range(len(data_240324_train.columns) - 1)
]
len(y_train_240324)

80

#### GSE240324液態 20% 測試資料 (850K)

In [28]:
# GSE240324 test split (20%).
beta_normalized_240324_test = "../result/GSE240324_nc/test20/all_beta_normalized_test.csv"

data_240324_test = pd.read_csv(filepath_or_buffer=beta_normalized_240324_test)
data_240324_test

Unnamed: 0.1,Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,0.10,0.11,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7
0,cg00000029,0.663961,0.689604,0.657223,0.756809,0.667762,0.681050,0.644549,0.646586,0.675877,...,0.668034,0.664365,0.686456,0.706067,0.717829,0.679072,0.670575,0.715247,0.601729,0.651416
1,cg00000109,0.959536,0.953428,0.958271,0.950245,0.971841,0.950838,0.949728,0.949216,0.944622,...,0.956577,0.962348,0.949653,0.960551,0.950503,0.947781,0.953919,0.942962,0.956306,0.956899
2,cg00000155,0.981583,0.981265,0.984640,0.977571,0.980719,0.983515,0.983618,0.989271,0.982877,...,0.987713,0.980653,0.982854,0.981789,0.979222,0.986797,0.983355,0.983787,0.973578,0.986480
3,cg00000158,0.988611,0.984117,0.989010,0.982621,0.983334,0.982908,0.978728,0.989605,0.986994,...,0.985124,0.988296,0.986540,0.988150,0.982234,0.989604,0.986094,0.984442,0.989132,0.986756
4,cg00000165,0.164008,0.141185,0.192280,0.125364,0.153074,0.190820,0.149277,0.167187,0.129192,...,0.226501,0.158189,0.148461,0.197371,0.163333,0.138543,0.162918,0.166732,0.123371,0.197489
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
728305,cg27666046,0.436225,0.464260,0.467948,0.428961,0.363105,0.377027,0.428200,0.438684,0.458647,...,0.447234,0.490138,0.426661,0.377143,0.420370,0.350351,0.424130,0.451162,0.521353,0.440192
728306,cg27666049,0.811259,0.724441,0.762142,0.629818,0.762055,0.659826,0.624905,0.731080,0.654847,...,0.700464,0.746769,0.782406,0.634363,0.875526,0.756453,0.608945,0.767045,0.782483,0.764768
728307,cg27666060,0.818305,0.822389,0.840120,0.874163,0.700941,0.656756,0.791746,0.781479,0.859643,...,0.834385,0.858053,0.845742,0.826431,0.758388,0.772279,0.774734,0.827344,0.798959,0.763503
728308,cg27666108,0.198684,0.221159,0.202805,0.297718,0.175799,0.208276,0.273477,0.245154,0.321153,...,0.247357,0.226269,0.217433,0.216504,0.216583,0.204552,0.281274,0.255560,0.158266,0.177962


In [29]:
# Keep only the selected features, then verify that every one of them is
# actually present (the filter alone would silently drop missing IDs).
data_240324_test = data_240324_test[
    data_240324_test["Unnamed: 0"].isin(final_gene["ID"])
]
assert final_gene["ID"].isin(data_240324_test["Unnamed: 0"]).all(), (
    "Selected features missing from the GSE240324 test data: "
    f"{sorted(set(final_gene['ID']) - set(data_240324_test['Unnamed: 0']))}"
)
data_240324_test

Unnamed: 0.1,Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,0.10,0.11,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7
186994,cg06530558,0.597841,0.615926,0.579736,0.598944,0.519175,0.484041,0.607999,0.611006,0.650881,...,0.61624,0.700546,0.599495,0.613499,0.540692,0.551357,0.549129,0.615291,0.562778,0.562826
231597,cg08101264,0.030446,0.048229,0.043321,0.031409,0.029284,0.029959,0.033195,0.038463,0.037202,...,0.043704,0.042231,0.032609,0.04147,0.02464,0.039595,0.034229,0.043539,0.026009,0.046613
346002,cg12315311,0.340467,0.309845,0.324651,0.350272,0.282502,0.239374,0.305675,0.260462,0.274462,...,0.292627,0.272918,0.265581,0.269943,0.261753,0.304908,0.281928,0.312278,0.186867,0.236092
689541,cg26097573,0.021453,0.021875,0.02815,0.036762,0.026071,0.032829,0.023171,0.02221,0.040175,...,0.039479,0.029456,0.029315,0.030053,0.029216,0.046388,0.036742,0.038404,0.040206,0.024433


In [30]:
# One label per sample column (the ID column is not counted): the first 12
# samples get class 0 and all later ones class 1.
# NOTE(review): 12 is hard-coded; presumably it matches the number of
# class-0 sample columns in the file - verify when the input changes.
y_test_240324 = [
    int(sample_idx >= 12)
    for sample_idx in range(len(data_240324_test.columns) - 1)
]
len(y_test_240324)

20

In [31]:
# Hyper-parameter search space per model for GSE240324; the keys must match
# the model names used by `predict_comb`.
param_grids_240324 = {
    "SVM": dict(C=[0.1, 1, 10], kernel=["linear", "rbf"]),
    "Logistic Regression": dict(C=[0.1, 1, 10], solver=["liblinear"]),
    "Decision Tree": dict(max_depth=[3, 5, 7], min_samples_split=[2, 5, 10]),
    "RandomForest": dict(
        n_estimators=[50, 100, 200],
        max_depth=[3, 5, 7],
        min_samples_split=[2, 5, 10],
    ),
    "XGBoost": dict(
        n_estimators=[50, 100, 200],
        max_depth=[3, 5, 7],
        learning_rate=[0.01, 0.1, 0.2],
    ),
}

In [32]:
# Tune and evaluate every model on the GSE240324 train/test split.
result_240324 = predict_comb(
    train_data=data_240324_train,
    train_label=y_train_240324,
    test_data=data_240324_test,
    test_label=y_test_240324,
    cluster_num=cluster_num,
    param_grids=param_grids_240324,
    final_gene=final_gene,
)
result_240324

Best Parameters: {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 50}
Best Score: 0.6625
Train accuracy:  1.0
Test accuracy:  0.6

Best Parameters: {'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 50}
Best Score: 0.7125
Train accuracy:  0.99
Test accuracy:  0.5

Best Parameters: {'C': 10, 'kernel': 'rbf'}
Best Score: 0.6375
Train accuracy:  0.65
Test accuracy:  0.6
Best Parameters: {'C': 10, 'solver': 'liblinear'}
Best Score: 0.625
Train accuracy:  0.66
Test accuracy:  0.6
Best Parameters: {'max_depth': 3, 'min_samples_split': 2}
Best Score: 0.625
Train accuracy:  0.8
Test accuracy:  0.45



Unnamed: 0,Model,ID1,ID2,ID3,ID4,accuracy,sensitivity,specificity,precision,f1_score,mcc
0,SVM,cg12315311,cg26097573,cg06530558,cg08101264,0.6,0.62,0.58,0.5,0.56,0.2
1,Logistic Regression,cg12315311,cg26097573,cg06530558,cg08101264,0.6,0.62,0.58,0.5,0.56,0.2


In [33]:
# Make sure the output folder exists, then write the result table without
# the DataFrame index.
os.makedirs("../result/GSE240324_nc", exist_ok=True)
result_240324.to_csv(
    path_or_buf="../result/GSE240324_nc/result_combination.csv",
    index=False,
)