#### Combination Testing
#### Step
1. Execute the prediction combination function.
2. Input the final selected gene file.
3. Create training/testing set labels.
4. Select one gene from each group for combination.
5. Create training/testing datasets.
6. Adjust model parameters.
7. Perform model prediction.
8. Output and save the final prediction results.
#### Importance
* The input file must include `ID` and `cluster`.
* Ensure that all genes are present in each dataset (code include a check to verify that the selected features are present).
* Adjust parameters manually until no overfitting occurs (you can remove unsuitable models).
* Other than the file path, `cluster_num` and `param_grids`, no further changes are necessary.
* The final combination test results should not be used for further filtering or selection.

In [28]:
import os
import pandas as pd

from itertools import combinations
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

from sklearn.ensemble import RandomForestClassifier

from sklearn.svm import SVC

from sklearn.linear_model import LogisticRegression

from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import (
    confusion_matrix,
    precision_score,
    accuracy_score,
    matthews_corrcoef,
    f1_score,
)

In [29]:
def is_unique(lst):
    return len(lst) == len(set(lst))


def prepare_data(data, combination, label, cluster_num):
    X = []
    cluster_values = []
    for i in range(cluster_num):

        gene_values = data.loc[data[data.columns[0]].isin([combination[i]])]
        gene_values = gene_values.iloc[:, 1::].values.flatten().tolist()
        cluster_values.append(gene_values)

    for i in range(len(label)):
        sample = [cluster_values[j][i] for j in range(cluster_num)]
        X.append(sample)
    return X

In [30]:
def predict_comb(
    train_data, train_label, test_data, test_label, cluster_num, param_grids, final_gene
):
    combinations_list = list(combinations(final_gene["ID"], cluster_num))
    result = []
    models = {
        "XGBoost": xgb.XGBClassifier(),
        "RandomForest": RandomForestClassifier(),
        "SVM": SVC(),
        "Logistic Regression": LogisticRegression(),
        "Decision Tree": DecisionTreeClassifier(),
    }

    for model_name, model in models.items():
        # 取出所有可能的組合
        for combination in combinations_list:
            lst = []
            for i in range(cluster_num):
                lst.append(
                    int(
                        final_gene.loc[final_gene["ID"] == combination[i]][
                            "cluster"
                        ].iloc[0]
                    )
                )
            # 檢查是否在不同群
            if is_unique(lst):
                # 準備訓練 / 測試資料
                X_test = prepare_data(test_data, combination, test_label, cluster_num)
                X_train = prepare_data(
                    train_data, combination, train_label, cluster_num
                )

                # 模型參數調整
                param_grid = param_grids[model_name]
                grid_search = GridSearchCV(
                    estimator=model, param_grid=param_grid, cv=5, n_jobs=-1
                )
                grid_search.fit(X_train, train_label)
                print("Best Parameters:", grid_search.best_params_)
                print("Best Score:", grid_search.best_score_)
                model = grid_search.best_estimator_

                # 預測
                y_pred_train = model.predict(X_train)
                accuracy_train = accuracy_score(train_label, y_pred_train)
                # print("Train accuracy: ", round(accuracy_train, 2))
                y_pred = model.predict(X_test)
                accuracy = accuracy_score(test_label, y_pred)
                # print("Test accuracy: ", round(accuracy, 2))

                # 檢查是否過擬合
                if abs(accuracy_train - accuracy) > 0.05:
                    print("Train accuracy: ", round(accuracy_train, 2))
                    print("Test accuracy: ", round(accuracy, 2))
                    print(f"========={model_name} overfitting =========\n")

                # 輸出結果
                tn, fp, fn, tp = confusion_matrix(test_label, y_pred).ravel()
                sensitivity = tp / (tp + fn)
                specificity = tn / (tn + fp)
                precision = precision_score(test_label, y_pred)
                f1 = f1_score(test_label, y_pred)
                mcc = matthews_corrcoef(test_label, y_pred)

                sorted_combination = [x for _, x in sorted(zip(lst, combination))]
                result.append(
                    [model_name]
                    + sorted_combination
                    + [
                        round(accuracy, 2),
                        round(sensitivity, 2),
                        round(specificity, 2),
                        round(precision, 2),
                        round(f1, 2),
                        round(mcc, 2),
                    ]
                )
    id_columns = [f"ID{i+1}" for i in range(cluster_num)]
    result = pd.DataFrame(
        result,
        columns=["Model"]
        + id_columns
        + ["accuracy", "sensitivity", "specificity", "precision", "f1_score", "mcc"],
    )

    return result

#### 輸入最終挑選出的特徵資料

In [60]:
cluster_num = 2  # 總共群數
# 須包含ID和cluster欄位
input_path = "../result/GSE243529/train80/dbeta_TSS_0.01_cluster_final.csv"  # example
final_gene = pd.read_csv(input_path)
if (is_unique(final_gene["cluster"])) == False:
    print("Need unique cluster. Please modify input data.")
final_gene

Need unique cluster. Please modify input data.


Unnamed: 0,ID,cluster
0,cg02676175,2
1,cg13379236,3
2,cg19538614,3
3,cg15744637,3
4,cg01305745,3


#### 液態測試

#### GSE243529 訓練資料

In [32]:
beta_normalized_243529_0 = "../champ_result/GSE243529/all_beta_normalized_0.csv"
beta_normalized_243529_1 = "../champ_result/GSE243529/all_beta_normalized_1.csv"

data_243529_0 = pd.read_csv(beta_normalized_243529_0)
data_243529_1 = pd.read_csv(beta_normalized_243529_1)

data_243529_0 = data_243529_0.iloc[:, ::2]
data_243529_1 = data_243529_1.iloc[:, ::2]
data_243529 = pd.merge(data_243529_0, data_243529_1, on="Unnamed: 0")
data_243529

Unnamed: 0.1,Unnamed: 0,2_x,4_x,6_x,8_x,10_x,12_x,14_x,16_x,18_x,...,506_y,508_y,510_y,512_y,514_y,516_y,518_y,520_y,522_y,524_y
0,cg07881041,0.891637,0.932066,0.940202,0.945641,0.940250,0.932014,0.926817,0.947249,0.928761,...,0.980547,0.931209,0.912459,0.947734,0.941821,0.946169,0.930692,0.944137,0.932917,0.939890
1,cg03513874,0.942312,0.935187,0.964009,0.966716,0.945237,0.943195,0.941343,0.939910,0.957141,...,0.980271,0.945366,0.957127,0.955288,0.963101,0.951368,0.950666,0.942166,0.939584,0.945780
2,cg05451842,0.029975,0.022880,0.017531,0.026281,0.034589,0.030383,0.044970,0.033864,0.044376,...,0.037951,0.041580,0.035622,0.031457,0.044509,0.031586,0.050621,0.045672,0.054436,0.044102
3,cg14797042,0.983277,0.989621,0.987711,0.960712,0.966604,0.966022,0.973924,0.974524,0.963827,...,0.962688,0.975152,0.963661,0.977086,0.954387,0.980980,0.975220,0.985862,0.965568,0.965593
4,cg09838562,0.009447,0.009020,0.007164,0.007786,0.024895,0.032956,0.021911,0.022853,0.004058,...,0.022291,0.030099,0.024947,0.025726,0.016086,0.019250,0.011791,0.021468,0.016349,0.027341
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
730294,cg19812938,0.867959,0.903661,0.890041,0.896655,0.859622,0.875650,0.868883,0.890319,0.879433,...,0.834810,0.856512,0.862220,0.873557,0.867777,0.889017,0.879561,0.884909,0.872281,0.883036
730295,cg06272054,0.008151,0.011305,0.011354,0.009927,0.008303,0.017703,0.021295,0.021937,0.017683,...,0.013489,0.020654,0.021604,0.027031,0.017665,0.010472,0.015969,0.018077,0.015717,0.014337
730296,cg07255356,0.017649,0.016949,0.022004,0.019642,0.027163,0.032100,0.047964,0.032659,0.030648,...,0.030143,0.043287,0.026179,0.042548,0.036811,0.021260,0.024679,0.029346,0.027983,0.035189
730297,cg24220897,0.932095,0.921312,0.946663,0.965407,0.949585,0.953402,0.920959,0.944034,0.931178,...,0.817780,0.911173,0.905885,0.915353,0.924352,0.944823,0.900550,0.938926,0.903242,0.943392


In [33]:
# 檢查挑選出的特徵是否都有出現
data_243529 = data_243529[data_243529["Unnamed: 0"].isin(final_gene["ID"])]
data_243529

Unnamed: 0.1,Unnamed: 0,2_x,4_x,6_x,8_x,10_x,12_x,14_x,16_x,18_x,...,506_y,508_y,510_y,512_y,514_y,516_y,518_y,520_y,522_y,524_y
493997,cg13379236,0.69246,0.695963,0.685261,0.614583,0.573895,0.657834,0.701592,0.584539,0.616351,...,0.727405,0.537415,0.591056,0.557628,0.662244,0.623344,0.552558,0.596685,0.677702,0.625546
621922,cg19538614,0.260053,0.299338,0.207135,0.177675,0.271244,0.257977,0.24688,0.250327,0.226402,...,0.22822,0.241566,0.226783,0.255958,0.210242,0.244206,0.276625,0.235694,0.304671,0.264651
639967,cg02676175,0.484996,0.530123,0.624995,0.559408,0.423893,0.539501,0.430136,0.535847,0.424976,...,0.3373,0.409056,0.513829,0.365483,0.455773,0.543678,0.476182,0.1825,0.562093,0.499381
647949,cg15744637,0.098838,0.151711,0.128842,0.100453,0.1235,0.134434,0.139649,0.163241,0.14995,...,0.114481,0.125738,0.130422,0.158126,0.137657,0.150671,0.208612,0.13129,0.135042,0.109045
700613,cg01305745,0.776325,0.693077,0.670646,0.698656,0.722584,0.689515,0.710228,0.701426,0.699386,...,0.729499,0.578589,0.67982,0.679149,0.642883,0.707826,0.733224,0.676659,0.722411,0.689949


In [34]:
y_0 = [(1 if i < 128 else 0) for i in range((data_243529_0.shape[1] - 1))]
y_1 = [(1 if i < 134 else 0) for i in range((data_243529_1.shape[1] - 1))]
y_train_243529 = y_0 + y_1

#### GSE89093雙胞胎 測試資料

In [35]:
beta_normalized_89093 = "../champ_result/GSE89093_nc/all_beta_normalized.csv"
label_89093 = "../champ_result/GSE89093_nc/phenotype.csv"

data_89093 = pd.read_csv(beta_normalized_89093)
data_89093_label = pd.read_csv(label_89093)
data_89093

Unnamed: 0,ID_REF,0,1,2,3,4,5,6,7,8,...,82,83,84,85,86,87,88,89,90,91
0,cg00000029,0.486644,0.472790,0.440157,0.401009,0.403449,0.453738,0.505430,0.525945,0.539579,...,0.586559,0.557733,0.515352,0.570047,0.586289,0.428182,0.410767,0.535594,0.561125,0.534982
1,cg00000108,0.989213,0.993538,0.991012,0.995878,0.996242,0.992823,0.990628,0.992809,0.999285,...,0.980657,0.994747,0.987606,0.994711,0.982437,0.982954,0.994045,0.987665,0.993932,0.991112
2,cg00000109,0.984875,0.951471,0.948615,0.949571,0.963231,0.972137,0.972501,0.969343,0.950113,...,0.966235,0.960497,0.955022,0.955616,0.936133,0.955798,0.937965,0.965013,0.942853,0.945216
3,cg00000165,0.102033,0.168589,0.158073,0.141595,0.124284,0.179538,0.181141,0.198009,0.171876,...,0.164425,0.146259,0.162503,0.170724,0.151290,0.113963,0.176928,0.133602,0.147707,0.166224
4,cg00000236,0.798909,0.720664,0.745585,0.751120,0.738786,0.777767,0.776491,0.839313,0.830973,...,0.795153,0.799779,0.796590,0.831788,0.794952,0.783985,0.720944,0.756649,0.848983,0.820339
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
453622,ch.9.98463211R,0.000802,0.017480,0.013947,0.011970,0.010956,0.011151,0.009306,0.008481,0.001029,...,0.007897,0.004232,0.011624,0.011549,0.007974,0.015673,0.013178,0.014985,0.016102,0.007668
453623,ch.9.98937537R,0.003578,0.016064,0.022522,0.009246,0.004903,0.017005,0.011053,0.009776,0.004001,...,0.010352,0.012666,0.015397,0.008783,0.011411,0.018158,0.014805,0.012377,0.012630,0.017469
453624,ch.9.98957343R,0.001104,0.025162,0.030792,0.011537,0.020435,0.014164,0.012570,0.012651,0.001225,...,0.014115,0.024711,0.018897,0.023771,0.023488,0.024783,0.018400,0.021078,0.032479,0.032993
453625,ch.9.98959675F,0.051028,0.062762,0.086426,0.094496,0.059940,0.091989,0.098019,0.132013,0.001345,...,0.135202,0.047782,0.079773,0.072900,0.141466,0.141630,0.130800,0.076828,0.080214,0.182504


In [36]:
data_89093_label = pd.DataFrame(data_89093_label["cancer_status"])

# 檢查挑選出的特徵是否都有出現
data_89093 = data_89093[data_89093["ID_REF"].isin(final_gene["ID"])]
data_89093

Unnamed: 0,ID_REF,0,1,2,3,4,5,6,7,8,...,82,83,84,85,86,87,88,89,90,91
25471,cg01305745,0.638166,0.680956,0.681443,0.621832,0.67206,0.656032,0.652598,0.715919,0.739194,...,0.660671,0.680028,0.555772,0.612494,0.657143,0.660234,0.653082,0.64017,0.731687,0.647158
50982,cg02676175,0.549307,0.437833,0.494183,0.720193,0.787223,0.421922,0.4094,0.647108,0.552143,...,0.491332,0.622798,0.499606,0.536625,0.437171,0.162358,0.484223,0.360982,0.354604,0.412771
230122,cg13379236,0.694362,0.637849,0.632367,0.651982,0.719588,0.718063,0.713096,0.706998,0.782798,...,0.775504,0.7031,0.742238,0.777616,0.690528,0.670476,0.693118,0.763106,0.676052,0.685293
271498,cg15744637,0.028924,0.047274,0.044786,0.038761,0.039348,0.041374,0.044651,0.043858,0.022129,...,0.034486,0.026328,0.029617,0.028564,0.040906,0.039448,0.034916,0.045758,0.040178,0.038868
329121,cg19538614,0.073263,0.175869,0.157871,0.181264,0.141273,0.134463,0.131353,0.275797,0.117453,...,0.168827,0.128677,0.100663,0.089679,0.143254,0.14356,0.140922,0.119762,0.173738,0.138592


In [37]:
y_test_89093 = [
    0 if data_89093_label.iloc[i, 0] == "healthy" else 1
    for i in range(data_89093_label.shape[0])
]

In [38]:
param_grids_243529_89093 = {
    "SVM": {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"]},
    "Logistic Regression": {"C": [0.1, 1, 10], "solver": ["liblinear"]},
    "Decision Tree": {"max_depth": [3, 5, 7], "min_samples_split": [2, 5, 10]},
    "RandomForest": {
        "n_estimators": [50, 100, 200],
        "max_depth": [3, 5, 7],
        "min_samples_split": [2, 5, 10],
    },
    "XGBoost": {
        "n_estimators": [50, 100, 200],
        "max_depth": [3, 5, 7],
        "learning_rate": [0.01, 0.1, 0.2],
    },
}

In [39]:
result_243529_89093 = predict_comb(
    data_243529,
    y_train_243529,
    data_89093,
    y_test_89093,
    cluster_num,
    param_grids_243529_89093,
    final_gene,
)
result_243529_89093

Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}
Best Score: 0.6615934065934066
Train accuracy:  0.71
Test accuracy:  0.53

Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}
Best Score: 0.6365750915750916
Train accuracy:  0.7
Test accuracy:  0.5

Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
Best Score: 0.6728754578754578
Train accuracy:  0.76
Test accuracy:  0.54

Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
Best Score: 0.6634981684981686
Train accuracy:  0.75
Test accuracy:  0.52

Best Parameters: {'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 200}
Best Score: 0.6845604395604395
Train accuracy:  0.72
Test accuracy:  0.51

Best Parameters: {'max_depth': 3, 'min_samples_split': 5, 'n_estimators': 200}
Best Score: 0.6385347985347986
Train accuracy:  0.69
Test accuracy:  0.54

Best Parameters: {'max_depth': 3, 'min_samples_split': 5, 'n_estimators': 100}
Best Sc

  _data = np.array(data, dtype=dtype, copy=copy,


Unnamed: 0,Model,ID1,ID2,accuracy,sensitivity,specificity,precision,f1_score,mcc
0,XGBoost,cg02676175,cg13379236,0.53,0.54,0.52,0.53,0.54,0.07
1,XGBoost,cg02676175,cg19538614,0.5,0.3,0.7,0.5,0.38,0.0
2,XGBoost,cg02676175,cg15744637,0.54,0.17,0.91,0.67,0.28,0.13
3,XGBoost,cg02676175,cg01305745,0.52,0.33,0.72,0.54,0.41,0.05
4,RandomForest,cg02676175,cg13379236,0.51,0.76,0.26,0.51,0.61,0.03
5,RandomForest,cg02676175,cg19538614,0.54,0.2,0.89,0.64,0.3,0.12
6,RandomForest,cg02676175,cg15744637,0.54,0.17,0.91,0.67,0.28,0.13
7,RandomForest,cg02676175,cg01305745,0.52,0.28,0.76,0.54,0.37,0.05
8,SVM,cg02676175,cg13379236,0.51,0.7,0.33,0.51,0.59,0.02
9,SVM,cg02676175,cg19538614,0.52,0.35,0.7,0.53,0.42,0.05


In [40]:
os.makedirs(f"../result/GSE243529_GSE89093", exist_ok=True)
result_243529_89093.to_csv(
    "../result/GSE243529_GSE89093/predict_combination.csv", index=False
)

#### GSE148663烏拉圭 測試資料

In [41]:
beta_normalized_148663 = "../champ_result/GSE148663/all_beta_normalized.csv" #使用無oversampling之原始資料
data_148663 = pd.read_csv(beta_normalized_148663)
data_148663

Unnamed: 0.1,Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,1.12,1.13,1.14,1.15,1.16,1.17,1.18,1.19,1.20,1.21
0,cg00000957,0.860452,0.861816,0.828862,0.854593,0.834130,0.825702,0.854593,0.861816,0.828862,...,0.863146,0.839264,0.838154,0.833727,0.820273,0.850236,0.853823,0.813853,0.829853,0.856274
1,cg00001349,0.651237,0.689401,0.717976,0.759530,0.650520,0.707983,0.759530,0.689401,0.717976,...,0.762429,0.711834,0.677838,0.607658,0.724661,0.587281,0.677297,0.781787,0.743716,0.770865
2,cg00001583,0.105578,0.145586,0.151494,0.154697,0.137455,0.114030,0.154697,0.145586,0.151494,...,0.129753,0.155541,0.100487,0.098573,0.131851,0.081984,0.130274,0.126544,0.149378,0.101541
3,cg00002028,0.089119,0.081822,0.091761,0.076214,0.092523,0.076705,0.076214,0.081822,0.091761,...,0.101627,0.104116,0.102976,0.077940,0.096559,0.058941,0.102959,0.086677,0.115429,0.086052
4,cg00002719,0.055126,0.066180,0.081197,0.060538,0.070888,0.081131,0.060538,0.066180,0.081197,...,0.042413,0.061513,0.046460,0.059188,0.080918,0.077113,0.069398,0.049803,0.052978,0.057628
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
410742,cg27656573,0.944012,0.943306,0.950748,0.956639,0.936954,0.935400,0.956639,0.943306,0.950748,...,0.952644,0.936031,0.944811,0.948703,0.946242,0.940713,0.941930,0.946173,0.939021,0.953477
410743,cg27657363,0.879266,0.900482,0.897924,0.912973,0.878780,0.903373,0.912973,0.900482,0.897924,...,0.878098,0.872009,0.884012,0.874592,0.874513,0.887568,0.905831,0.896101,0.872920,0.871998
410744,cg27657537,0.081713,0.098497,0.087162,0.137535,0.103004,0.140770,0.137535,0.098497,0.087162,...,0.077534,0.117473,0.137702,0.077860,0.118235,0.081581,0.183504,0.134606,0.111502,0.065309
410745,cg27662611,0.094778,0.085894,0.069684,0.076433,0.097572,0.117694,0.076433,0.085894,0.069684,...,0.066020,0.092328,0.089029,0.105949,0.082550,0.122752,0.067702,0.064762,0.061859,0.061657


In [42]:
y_test_148663 = [(0 if i < 10 else 1) for i in range(32)]

# 檢查挑選出的特徵是否都有出現
data_148663 = data_148663[data_148663["Unnamed: 0"].isin(final_gene["ID"])]
data_148663

Unnamed: 0.1,Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,1.12,1.13,1.14,1.15,1.16,1.17,1.18,1.19,1.20,1.21
65410,cg15744637,0.092601,0.078883,0.070746,0.064469,0.150053,0.075991,0.064469,0.078883,0.070746,...,0.07458,0.080159,0.083957,0.071556,0.083098,0.069817,0.073028,0.073735,0.091964,0.061401
84664,cg01305745,0.652978,0.742951,0.749625,0.770792,0.612471,0.751017,0.770792,0.742951,0.749625,...,0.593302,0.646993,0.724909,0.71337,0.725306,0.720394,0.734664,0.737786,0.71434,0.753672
90677,cg02676175,0.361315,0.521305,0.616833,0.533898,0.638511,0.720535,0.533898,0.521305,0.616833,...,0.732635,0.590756,0.776856,0.657576,0.331018,0.523803,0.373197,0.431849,0.49297,0.62732
104368,cg19538614,0.191818,0.147849,0.398092,0.194692,0.190211,0.224114,0.194692,0.147849,0.398092,...,0.273539,0.222596,0.227264,0.188412,0.209681,0.163641,0.228998,0.167897,0.19297,0.182801
189019,cg13379236,0.806629,0.802891,0.844101,0.846237,0.803951,0.844067,0.846237,0.802891,0.844101,...,0.792853,0.792865,0.819751,0.807945,0.829341,0.800345,0.797902,0.828199,0.814068,0.787952


In [43]:
param_grids_243529_148663 = {
    "SVM": {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"]},
    "Logistic Regression": {"C": [0.1, 1, 10], "solver": ["liblinear"]},
    "Decision Tree": {"max_depth": [3, 5, 7], "min_samples_split": [2, 5, 10]},
    "RandomForest": {
        "n_estimators": [50, 100, 200],
        "max_depth": [3, 5, 7],
        "min_samples_split": [2, 5, 10],
    },
    "XGBoost": {
        "n_estimators": [50, 100, 200],
        "max_depth": [3, 5, 7],
        "learning_rate": [0.01, 0.1, 0.2],
    },
}

In [44]:
result_243529_148663 = predict_comb(
    data_243529,
    y_train_243529,
    data_148663,
    y_test_148663,
    cluster_num,
    param_grids_243529_148663,
    final_gene,
)
result_243529_148663

Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}
Best Score: 0.6615934065934066
Train accuracy:  0.71
Test accuracy:  0.43

Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}
Best Score: 0.6365750915750916
Train accuracy:  0.7
Test accuracy:  0.5

Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
Best Score: 0.6728754578754578
Train accuracy:  0.76
Test accuracy:  0.41

Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
Best Score: 0.6634981684981686
Train accuracy:  0.75
Test accuracy:  0.36

Best Parameters: {'max_depth': 3, 'min_samples_split': 10, 'n_estimators': 50}
Best Score: 0.6864102564102564
Train accuracy:  0.71
Test accuracy:  0.48

Best Parameters: {'max_depth': 3, 'min_samples_split': 5, 'n_estimators': 100}
Best Score: 0.6346703296703298
Train accuracy:  0.69
Test accuracy:  0.52

Best Parameters: {'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 50}
Best Sco

  _data = np.array(data, dtype=dtype, copy=copy,


Best Parameters: {'max_depth': 3, 'min_samples_split': 10}
Best Score: 0.6729304029304028
Train accuracy:  0.71
Test accuracy:  0.39



Unnamed: 0,Model,ID1,ID2,accuracy,sensitivity,specificity,precision,f1_score,mcc
0,XGBoost,cg02676175,cg13379236,0.43,0.77,0.09,0.46,0.58,-0.19
1,XGBoost,cg02676175,cg19538614,0.5,0.5,0.5,0.5,0.5,0.0
2,XGBoost,cg02676175,cg15744637,0.41,0.14,0.68,0.3,0.19,-0.22
3,XGBoost,cg02676175,cg01305745,0.36,0.59,0.14,0.41,0.48,-0.31
4,RandomForest,cg02676175,cg13379236,0.48,0.91,0.05,0.49,0.63,-0.09
5,RandomForest,cg02676175,cg19538614,0.52,0.55,0.5,0.52,0.53,0.05
6,RandomForest,cg02676175,cg15744637,0.41,0.14,0.68,0.3,0.19,-0.22
7,RandomForest,cg02676175,cg01305745,0.36,0.59,0.14,0.41,0.48,-0.31
8,SVM,cg02676175,cg13379236,0.48,0.95,0.0,0.49,0.65,-0.15
9,SVM,cg02676175,cg19538614,0.45,0.64,0.27,0.47,0.54,-0.1


In [45]:
os.makedirs(f"../result/GSE243529_GSE148663", exist_ok=True)
result_243529_148663.to_csv(
    "../result/GSE243529_GSE148663/predict_combination.csv", index=False
)

#### 組織測試

#### 450K 訓練資料

In [46]:
beta_normalized_450K_0 = "../champ_result/GDC_breast_tissue_450k/all_beta_normalized_0.csv"
beta_normalized_450K_1 = "../champ_result/GDC_breast_tissue_450k/all_beta_normalized_1.csv"

data_450K_0 = pd.read_csv(beta_normalized_450K_0)
data_450K_1 = pd.read_csv(beta_normalized_450K_1)

data_450K_0 = data_450K_0.iloc[:, ::2]
data_450K_1 = data_450K_1.iloc[:, ::2]
data_450K = pd.merge(data_450K_0, data_450K_1, on="Unnamed: 0")
data_450K

Unnamed: 0.1,Unnamed: 0,2_x,4_x,6_x,8_x,10_x,12_x,14_x,16_x,18_x,...,882_y,884_y,886_y,888_y,890_y,892,894,896,898,900
0,cg00000957,0.825079,0.836188,0.855953,0.856379,0.833668,0.836432,0.827324,0.879485,0.859840,...,0.873098,0.903595,0.866140,0.850389,0.817217,0.890706,0.858225,0.856440,0.860701,0.843183
1,cg00001349,0.690023,0.802989,0.744400,0.826541,0.683470,0.822242,0.638978,0.743948,0.799018,...,0.857966,0.887382,0.866150,0.794813,0.796713,0.870688,0.861127,0.822202,0.711103,0.825332
2,cg00001583,0.095879,0.030527,0.058828,0.103293,0.054348,0.102374,0.084485,0.123488,0.271919,...,0.689587,0.851582,0.737801,0.489076,0.637176,0.662679,0.578332,0.183629,0.566236,0.554456
3,cg00002028,0.037414,0.028130,0.036667,0.026973,0.032372,0.027366,0.026462,0.051002,0.040978,...,0.061764,0.056748,0.033700,0.031019,0.047260,0.044413,0.065193,0.040534,0.034305,0.032082
4,cg00002837,0.393330,0.278496,0.354795,0.371494,0.372948,0.424968,0.423032,0.391704,0.458908,...,0.645243,0.753940,0.191435,0.462196,0.410381,0.804834,0.626200,0.455885,0.219933,0.434699
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
341408,cg27656573,0.950538,0.950720,0.955993,0.956690,0.947600,0.948253,0.958505,0.966815,0.976706,...,0.953580,0.979109,0.963358,0.968999,0.966742,0.972799,0.969889,0.960676,0.960480,0.946009
341409,cg27657363,0.959770,0.935633,0.956899,0.947483,0.956285,0.941275,0.938936,0.951144,0.963082,...,0.939027,0.947163,0.899868,0.955757,0.955844,0.965984,0.935275,0.935006,0.880587,0.893125
341410,cg27657537,0.071075,0.051070,0.052173,0.082928,0.097815,0.144147,0.104969,0.141579,0.192042,...,0.693521,0.116499,0.071309,0.280354,0.070012,0.701376,0.163864,0.069314,0.085330,0.095376
341411,cg27662611,0.044393,0.038378,0.051068,0.061392,0.050559,0.047999,0.036908,0.039893,0.043394,...,0.043874,0.044881,0.058513,0.057060,0.073408,0.075407,0.048024,0.042711,0.040815,0.059447


In [47]:
y_0 = [(0 if i < 47 else 1) for i in range(445)]  # 398 tumor/47 normal =445
y_1 = [(0 if i < 50 else 1) for i in range(450)]  # 400 tumor/50 normal =450
y_train_450K = y_0 + y_1

# 檢查挑選出的特徵是否都有出現
data_450K = data_450K[data_450K["Unnamed: 0"].isin(final_gene["ID"])]
data_450K

Unnamed: 0.1,Unnamed: 0,2_x,4_x,6_x,8_x,10_x,12_x,14_x,16_x,18_x,...,882_y,884_y,886_y,888_y,890_y,892,894,896,898,900
56425,cg15744637,0.047868,0.031748,0.043978,0.054314,0.034021,0.057449,0.037729,0.049467,0.038572,...,0.033289,0.034189,0.011312,0.022018,0.01968,0.022243,0.039181,0.032738,0.031085,0.067296
72978,cg01305745,0.695101,0.67343,0.662219,0.733781,0.798592,0.796947,0.792315,0.780187,0.8124,...,0.850521,0.913364,0.106866,0.886185,0.567833,0.872566,0.88464,0.901377,0.850846,0.368562
78297,cg02676175,0.262729,0.090647,0.188114,0.09829,0.12344,0.106426,0.133582,0.19034,0.096213,...,0.15562,0.096188,0.090891,0.077723,0.187307,0.180139,0.09254,0.099415,0.091777,0.342642
90467,cg19538614,0.069397,0.046278,0.103853,0.080952,0.083748,0.061636,0.075952,0.072473,0.098416,...,0.077161,0.059008,0.166385,0.38246,0.473965,0.184197,0.06528,0.081902,0.071313,0.309062
159536,cg13379236,0.856664,0.794824,0.877258,0.817793,0.856344,0.863313,0.802324,0.818501,0.8162,...,0.562665,0.281121,0.721915,0.92626,0.890527,0.61537,0.48504,0.505763,0.729556,0.849225


#### 850K 測試資料

In [48]:
beta_normalized_850K = "../champ_result/GDC_breast_tissue_850k/all_beta_normalized.csv"

data_850K = pd.read_csv(beta_normalized_850K)
data_850K = data_850K.iloc[:, ::2]
data_850K

Unnamed: 0.1,Unnamed: 0,2,4,6,8,10,12,14,16,18,...,142,144,146,148,150,152,154,156,158,160
0,cg07881041,0.943035,0.928695,0.941790,0.954223,0.964751,0.952338,0.957722,0.929608,0.936385,...,0.937143,0.946295,0.877926,0.929869,0.895322,0.936065,0.913382,0.938807,0.902528,0.928855
1,cg03513874,0.972519,0.959357,0.947300,0.969835,0.976845,0.973046,0.975435,0.951467,0.959029,...,0.956534,0.960032,0.910107,0.937854,0.950362,0.956762,0.962674,0.951583,0.956751,0.944323
2,cg05451842,0.021666,0.021658,0.014463,0.010813,0.018068,0.013690,0.019758,0.011594,0.023432,...,0.032426,0.027482,0.027400,0.025836,0.022738,0.016141,0.028114,0.014931,0.016914,0.037160
3,cg14797042,0.935528,0.913968,0.928262,0.902112,0.907727,0.908210,0.933233,0.906875,0.886385,...,0.926348,0.925875,0.911936,0.923218,0.885068,0.905364,0.922666,0.931447,0.911759,0.923486
4,cg09838562,0.044716,0.043590,0.029264,0.043756,0.064916,0.030081,0.036418,0.045762,0.012505,...,0.036724,0.041850,0.044523,0.039296,0.014619,0.038566,0.042800,0.041302,0.023060,0.037469
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
722691,cg19812938,0.912999,0.891436,0.916324,0.908014,0.909659,0.916272,0.920529,0.904213,0.900462,...,0.903546,0.893882,0.898353,0.909622,0.897880,0.898839,0.913186,0.897586,0.902874,0.904917
722692,cg06272054,0.007895,0.015901,0.005838,0.003103,0.011144,0.004788,0.007561,0.005181,0.008294,...,0.007622,0.015032,0.004769,0.007554,0.002489,0.009814,0.009527,0.006864,0.010792,0.010832
722693,cg07255356,0.022079,0.003488,0.006069,0.007304,0.013703,0.014209,0.010002,0.010563,0.018904,...,0.011926,0.008654,0.022689,0.015903,0.000041,0.007952,0.013918,0.023236,0.021588,0.019415
722694,cg24220897,0.957847,0.941408,0.948018,0.963731,0.979047,0.960126,0.973814,0.938672,0.950388,...,0.967152,0.967088,0.965381,0.962175,0.963680,0.948178,0.952919,0.928063,0.991733,0.952388


In [49]:
y_test_850K = [(1 if i < 50 else 0) for i in range(80)]

# 檢查挑選出的特徵是否都有出現
data_850K = data_850K[data_850K["Unnamed: 0"].isin(final_gene["ID"])]
data_850K

Unnamed: 0.1,Unnamed: 0,2,4,6,8,10,12,14,16,18,...,142,144,146,148,150,152,154,156,158,160
499535,cg13379236,0.763281,0.818032,0.760685,0.718002,0.744761,0.788005,0.845586,0.714417,0.783895,...,0.823742,0.754253,0.776048,0.839867,0.701125,0.734326,0.766877,0.752042,0.72326,0.721929
628048,cg19538614,0.276054,0.318521,0.270917,0.183618,0.20297,0.171557,0.248207,0.285501,0.224947,...,0.24175,0.268999,0.23434,0.292398,0.286223,0.293218,0.293265,0.259124,0.213234,0.382358
643600,cg02676175,0.533065,0.524492,0.392884,0.599335,0.831313,0.680869,0.603723,0.536548,0.685403,...,0.553698,0.605633,0.500519,0.618077,0.408883,0.573791,0.535702,0.726381,0.667751,0.684275
650493,cg15744637,0.053368,0.049018,0.040947,0.04314,0.035678,0.056246,0.05064,0.050145,0.046909,...,0.047454,0.059431,0.044109,0.032564,0.034852,0.063712,0.048973,0.051649,0.058925,0.054024
696210,cg01305745,0.799745,0.824651,0.791297,0.795163,0.799044,0.753664,0.810954,0.743851,0.729085,...,0.837155,0.793545,0.556523,0.734532,0.715745,0.768198,0.70931,0.725467,0.733396,0.737386


In [50]:
param_grids_450K_850K = {
    "SVM": {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"]},
    "Logistic Regression": {"C": [0.1, 1, 10], "solver": ["liblinear"]},
    "Decision Tree": {"max_depth": [3, 5, 7], "min_samples_split": [2, 5, 10]},
    "RandomForest": {
        "n_estimators": [50, 100, 200],
        "max_depth": [3, 5, 7],
        "min_samples_split": [2, 5, 10],
    },
    "XGBoost": {
        "n_estimators": [50, 100, 200],
        "max_depth": [3, 5, 7],
        "learning_rate": [0.01, 0.1, 0.2],
    },
}

In [51]:
result_450K_850K = predict_comb(
    data_450K,
    y_train_450K,
    data_850K,
    y_test_850K,
    cluster_num,
    param_grids_450K_850K,
    final_gene,
)
result_450K_850K

Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50}
Best Score: 0.8916201117318435
Train accuracy:  0.89
Test accuracy:  0.62

Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50}
Best Score: 0.8916201117318435
Train accuracy:  0.89
Test accuracy:  0.62

Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50}
Best Score: 0.8916201117318435
Train accuracy:  0.89
Test accuracy:  0.62

Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
Best Score: 0.8983240223463687
Train accuracy:  0.97
Test accuracy:  0.62

Best Parameters: {'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 50}
Best Score: 0.8916201117318435
Train accuracy:  0.89
Test accuracy:  0.62

Best Parameters: {'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 50}
Best Score: 0.8916201117318435
Train accuracy:  0.89
Test accuracy:  0.62

Best Parameters: {'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 50}
Best Sco

  _data = np.array(data, dtype=dtype, copy=copy,


Best Parameters: {'max_depth': 5, 'min_samples_split': 2}
Best Score: 0.8793296089385475
Train accuracy:  0.92
Test accuracy:  0.62



Unnamed: 0,Model,ID1,ID2,accuracy,sensitivity,specificity,precision,f1_score,mcc
0,XGBoost,cg02676175,cg13379236,0.62,1.0,0.0,0.62,0.77,0.0
1,XGBoost,cg02676175,cg19538614,0.62,1.0,0.0,0.62,0.77,0.0
2,XGBoost,cg02676175,cg15744637,0.62,1.0,0.0,0.62,0.77,0.0
3,XGBoost,cg02676175,cg01305745,0.62,1.0,0.0,0.62,0.77,0.0
4,RandomForest,cg02676175,cg13379236,0.62,1.0,0.0,0.62,0.77,0.0
5,RandomForest,cg02676175,cg19538614,0.62,1.0,0.0,0.62,0.77,0.0
6,RandomForest,cg02676175,cg15744637,0.62,1.0,0.0,0.62,0.77,0.0
7,RandomForest,cg02676175,cg01305745,0.62,1.0,0.0,0.62,0.77,0.0
8,SVM,cg02676175,cg13379236,0.62,1.0,0.0,0.62,0.77,0.0
9,SVM,cg02676175,cg19538614,0.62,1.0,0.0,0.62,0.77,0.0


In [52]:
os.makedirs(f"../result/GDC_breast_tissue_450K_850K", exist_ok=True)
result_450K_850K.to_csv(
    "../result/GDC_breast_tissue_450K_850K/predict_combination.csv", index=False
)