#### Combination Validation
#### Step
1. Execute the prediction combination function.
2. Input the final selected gene file.
3. Create training/testing set labels.
4. Select one gene from each group for combination.
5. Create training/testing datasets.
6. Adjust model parameters.
7. Perform model prediction.
8. Output and save the final prediction results.
#### Importance
* The input file must include `ID` and `cluster`.
* Ensure that all genes are present in each dataset (code include a check to verify that the selected features are present).
* Adjust parameters manually until no overfitting occurs (you can remove unsuitable models).
* Other than the file path, `cluster_num` and `param_grids`, no further changes are necessary.
* The final combination test results should not be used for further filtering or selection.

In [2]:
import os
import pandas as pd

from itertools import combinations
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

from sklearn.ensemble import RandomForestClassifier

from sklearn.svm import SVC

from sklearn.linear_model import LogisticRegression

from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import (
    confusion_matrix,
    precision_score,
    accuracy_score,
    matthews_corrcoef,
    f1_score,
)

In [3]:
def is_unique(lst):
    return len(lst) == len(set(lst))


def prepare_data(data, combination, label, cluster_num):
    X = []
    cluster_values = []
    for i in range(cluster_num):

        gene_values = data.loc[data[data.columns[0]].isin([combination[i]])]
        gene_values = gene_values.iloc[:, 1::].values.flatten().tolist()
        cluster_values.append(gene_values)

    for i in range(len(label)):
        sample = [cluster_values[j][i] for j in range(cluster_num)]
        X.append(sample)
    return X

In [4]:
def predict_comb(
    train_data, train_label, test_data, test_label, cluster_num, param_grids, final_gene
):
    combinations_list = list(combinations(final_gene["ID"], cluster_num))
    result = []
    models = {
        "XGBoost": xgb.XGBClassifier(random_state=42),
        "RandomForest": RandomForestClassifier(random_state=42),
        "SVM": SVC(random_state=42),
        "Logistic Regression": LogisticRegression(random_state=42),
        "Decision Tree": DecisionTreeClassifier(random_state=42),
    }

    for model_name, model in models.items():
        # 取出所有可能的組合
        for combination in combinations_list:
            lst = []
            for i in range(cluster_num):
                lst.append(
                    int(
                        final_gene.loc[final_gene["ID"] == combination[i]][
                            "cluster"
                        ].iloc[0]
                    )
                )
            # 檢查是否在不同群
            if is_unique(lst):
                # 準備訓練 / 測試資料
                X_test = prepare_data(test_data, combination, test_label, cluster_num)
                X_train = prepare_data(
                    train_data, combination, train_label, cluster_num
                )

                # 模型參數調整
                param_grid = param_grids[model_name]
                grid_search = GridSearchCV(
                    estimator=model, param_grid=param_grid, cv=5, n_jobs=-1
                )
                grid_search.fit(X_train, train_label)
                print("Best Parameters:", grid_search.best_params_)
                print("Best Score:", grid_search.best_score_)
                model = grid_search.best_estimator_

                # 預測
                y_pred_train = model.predict(X_train)
                accuracy_train = accuracy_score(train_label, y_pred_train)
                # print("Train accuracy: ", round(accuracy_train, 2))
                y_pred = model.predict(X_test)
                
                accuracy = accuracy_score(test_label, y_pred)
                # print("Test accuracy: ", round(accuracy, 2))

                # 檢查是否過擬合
                if abs(accuracy_train - accuracy) > 0.1:
                    print("Train accuracy: ", round(accuracy_train, 2))
                    print("Test accuracy: ", round(accuracy, 2))
                    print(f"========={model_name} overfitting =========\n")
                else:
                    # 輸出結果
                    tn, fp, fn, tp = confusion_matrix(test_label, y_pred).ravel()
                    sensitivity = tp / (tp + fn)
                    specificity = tn / (tn + fp)
                    precision = precision_score(test_label, y_pred)
                    f1 = f1_score(test_label, y_pred)
                    mcc = matthews_corrcoef(test_label, y_pred)

                    sorted_combination = [x for _, x in sorted(zip(lst, combination))]
                    result.append(
                        [model_name]
                        + sorted_combination
                        + [
                            round(accuracy, 2),
                            round(sensitivity, 2),
                            round(specificity, 2),
                            round(precision, 2),
                            round(f1, 2),
                            round(mcc, 2),
                        ]
                    )
    id_columns = [f"ID{i+1}" for i in range(cluster_num)]
    result = pd.DataFrame(
        result,
        columns=["Model"]
        + id_columns
        + ["accuracy", "sensitivity", "specificity", "precision", "f1_score", "mcc"],
    )

    return result

#### 輸入最終挑選出的特徵資料

In [5]:
cluster_num = 4  # 總共群數
# 須包含ID和cluster欄位
input_path = "../result/GDC_rectal_tissue_450k/train80/dbeta_TSS_0.1_final_consensus.csv"  # example
final_gene = pd.read_csv(input_path)
if (is_unique(final_gene["cluster"])) == False:
    print("Need unique cluster. Please modify input data.")
final_gene

Need unique cluster. Please modify input data.


Unnamed: 0,gene,dbeta,feature,ID,cluster
0,ESM1,-0.11561,TSS1500,cg16462183,2
1,INHBE,-0.110601,TSS1500,cg23998391,1
2,MIR654,-0.102229,TSS1500,cg13995230,2
3,ADNP2,-0.117257,TSS200,cg24959938,4
4,CLDN1,-0.117225,TSS1500,cg03623835,3
5,UNC13C,-0.114714,TSS1500,cg06530558,3
6,VSTM1,-0.111184,TSS1500,cg12315311,1
7,ACOT8,-0.103003,TSS1500,cg08101264,4
8,PRNT,-0.115671,TSS200,cg26097573,2


#### 組織驗證挑選最佳組合

#### GDC rectal 80% 訓練資料 (450K)

In [6]:
beta_normalized_450k_train = "../result/GDC_rectal_tissue_450k/train80/all_beta_normalized_train_oversample_smote.csv"

data_450k_train = pd.read_csv(beta_normalized_450k_train)
data_450k_train

Unnamed: 0.1,Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,1.320,1.321,1.322,1.323,1.324,1.325,1.326,1.327,1.328,1.329
0,cg00000957,0.856113,0.839813,0.836635,0.839600,0.872039,0.827300,0.867226,0.856248,0.844131,...,0.888416,0.837523,0.884360,0.875896,0.859817,0.845340,0.885423,0.805013,0.740653,0.824682
1,cg00001349,0.804604,0.819941,0.846237,0.840729,0.873266,0.785063,0.836406,0.826497,0.811564,...,0.905431,0.816566,0.915958,0.831712,0.862241,0.805669,0.853217,0.868710,0.755751,0.867710
2,cg00002719,0.185599,0.100295,0.095473,0.241155,0.081563,0.074698,0.085243,0.095008,0.074505,...,0.315933,0.025216,0.678708,0.512912,0.456226,0.825931,0.707738,0.300958,0.737305,0.481995
3,cg00002837,0.522515,0.489466,0.523515,0.450116,0.455411,0.539920,0.507077,0.486059,0.485272,...,0.432962,0.509807,0.604417,0.438432,0.282946,0.159257,0.691693,0.283691,0.216872,0.658773
4,cg00003287,0.170130,0.250159,0.257916,0.176988,0.135848,0.194919,0.151219,0.182795,0.184079,...,0.109457,0.083747,0.111750,0.112370,0.181156,0.083998,0.105068,0.089882,0.113225,0.124865
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364222,cg27656573,0.964895,0.947315,0.947070,0.952066,0.952888,0.947812,0.959611,0.964744,0.953798,...,0.964225,0.968654,0.947149,0.952817,0.967555,0.935477,0.943424,0.961357,0.960380,0.964356
364223,cg27657363,0.928772,0.932969,0.930216,0.928250,0.943854,0.905401,0.950299,0.949012,0.936415,...,0.957469,0.685197,0.963930,0.930816,0.946185,0.904104,0.759986,0.742072,0.891878,0.946043
364224,cg27657537,0.104119,0.084282,0.074635,0.072809,0.114334,0.118694,0.088020,0.057756,0.095004,...,0.068791,0.071161,0.059028,0.081142,0.084324,0.070557,0.040480,0.049996,0.062273,0.070444
364225,cg27662611,0.048974,0.065580,0.069607,0.055919,0.060528,0.055811,0.038455,0.036974,0.042911,...,0.020892,0.046774,0.064979,0.038843,0.043956,0.064004,0.026731,0.016975,0.051021,0.039027


In [7]:
# 檢查挑選出的特徵是否都有出現
data_450k_train = data_450k_train[data_450k_train["Unnamed: 0"].isin(final_gene["ID"])]
data_450k_train

Unnamed: 0.1,Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,1.320,1.321,1.322,1.323,1.324,1.325,1.326,1.327,1.328,1.329
145131,cg03623835,0.633356,0.637032,0.731575,0.594957,0.36312,0.707712,0.39921,0.606651,0.387067,...,0.568532,0.17406,0.232095,0.389215,0.269779,0.150418,0.134825,0.195584,0.164666,0.349648
177522,cg16462183,0.435937,0.48247,0.536316,0.428038,0.497578,0.494944,0.465784,0.514611,0.435076,...,0.225787,0.069921,0.593644,0.246944,0.252701,0.158914,0.087351,0.219676,0.129321,0.223183
281342,cg23998391,0.475583,0.530824,0.547876,0.467997,0.479369,0.444251,0.459765,0.509861,0.439025,...,0.356923,0.202771,0.346374,0.295459,0.385034,0.375122,0.269976,0.324312,0.244065,0.34248
294750,cg13995230,0.476425,0.618211,0.717254,0.412525,0.604492,0.634726,0.613637,0.656775,0.588639,...,0.130321,0.10891,0.324795,0.281015,0.293501,0.119166,0.24383,0.344406,0.171896,0.317968
300809,cg06530558,0.587413,0.676972,0.783795,0.505649,0.604029,0.645609,0.626039,0.655508,0.576665,...,0.131181,0.179,0.336692,0.318616,0.3743,0.112205,0.207078,0.400902,0.233969,0.377705
337504,cg24959938,0.90888,0.895216,0.901242,0.896473,0.90349,0.885423,0.917571,0.933263,0.866184,...,0.326525,0.368189,0.601603,0.547588,0.548921,0.192581,0.419778,0.651593,0.356803,0.662557
344040,cg12315311,0.214797,0.402377,0.472497,0.245297,0.288065,0.396734,0.301748,0.409767,0.321223,...,0.100564,0.084747,0.24424,0.188337,0.16242,0.069068,0.116133,0.128946,0.080326,0.165383
353336,cg08101264,0.275559,0.223555,0.229948,0.185686,0.172816,0.261271,0.196021,0.230712,0.203701,...,0.451667,0.029976,0.05463,0.288316,0.025461,0.04944,0.035049,0.034043,0.014856,0.11332
356892,cg26097573,0.345549,0.357183,0.370347,0.346646,0.355444,0.314372,0.33794,0.352961,0.247051,...,0.052799,0.039361,0.602425,0.361529,0.102892,0.052456,0.036606,0.064699,0.531414,0.30781


In [8]:
y_train_450k = [(0 if i < 330 else 1) for i in range((data_450k_train.shape[1] - 1))]
len(y_train_450k)

660

#### GDC rectal 20% 驗證資料 (450K)

In [9]:
beta_normalized_450k_test = "../result/GDC_rectal_tissue_450k/test20/all_beta_normalized_test.csv"

data_450k_test = pd.read_csv(beta_normalized_450k_test)
data_450k_test

Unnamed: 0.1,Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,1.70,1.71,1.72,1.73,1.74,1.75,1.76,1.77,1.78,1.79
0,cg00000957,0.841645,0.878402,0.824603,0.870629,0.819618,0.861250,0.854611,0.874880,0.876507,...,0.853425,0.888920,0.880878,0.897998,0.880955,0.848965,0.586306,0.736696,0.836194,0.880422
1,cg00001349,0.781077,0.827815,0.851704,0.849607,0.810340,0.844776,0.842892,0.802335,0.796125,...,0.856622,0.825568,0.879687,0.887105,0.864475,0.822985,0.403195,0.922766,0.703989,0.783747
2,cg00002719,0.168058,0.148233,0.160577,0.187961,0.177772,0.155861,0.182414,0.070352,0.091167,...,0.399348,0.855122,0.735879,0.381706,0.652804,0.312329,0.019925,0.684634,0.279649,0.645581
3,cg00002837,0.541533,0.448796,0.561836,0.437426,0.415671,0.384319,0.490708,0.491981,0.514943,...,0.216837,0.601028,0.539407,0.449504,0.167965,0.649064,0.163995,0.186060,0.574544,0.217697
4,cg00003287,0.211042,0.172519,0.155959,0.167675,0.229391,0.137551,0.171420,0.163743,0.194323,...,0.122832,0.086341,0.116527,0.090576,0.182057,0.225298,0.071859,0.116868,0.158122,0.153059
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364222,cg27656573,0.954560,0.960501,0.946738,0.949828,0.954480,0.958009,0.962520,0.961466,0.980986,...,0.963501,0.947570,0.928578,0.979188,0.960578,0.973118,0.977977,0.940996,0.965334,0.899346
364223,cg27657363,0.924865,0.938085,0.932968,0.936173,0.940811,0.941744,0.931885,0.959380,0.934351,...,0.734614,0.946445,0.864682,0.956747,0.952311,0.959375,0.688525,0.823765,0.748757,0.914971
364224,cg27657537,0.067138,0.130632,0.066866,0.105192,0.080608,0.109999,0.099383,0.112128,0.075398,...,0.063255,0.075057,0.089792,0.060118,0.047457,0.069348,0.038607,0.059501,0.071004,0.063170
364225,cg27662611,0.017825,0.051081,0.056978,0.036000,0.054314,0.043427,0.039723,0.032144,0.018054,...,0.026045,0.051144,0.055598,0.008559,0.032049,0.032776,0.030342,0.033784,0.060608,0.025639


In [10]:
# 檢查挑選出的特徵是否都有出現
data_450k_test = data_450k_test[data_450k_test["Unnamed: 0"].isin(final_gene["ID"])]
data_450k_test

Unnamed: 0.1,Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,1.70,1.71,1.72,1.73,1.74,1.75,1.76,1.77,1.78,1.79
145131,cg03623835,0.835409,0.558956,0.410664,0.437383,0.490781,0.412512,0.63879,0.602848,0.636551,...,0.233041,0.146891,0.151397,0.41134,0.309217,0.297739,0.117183,0.294462,0.232998,0.481209
177522,cg16462183,0.453217,0.453904,0.444456,0.393933,0.355217,0.399137,0.41484,0.442978,0.402264,...,0.2454,0.124137,0.336501,0.479167,0.284337,0.156505,0.158488,0.252308,0.15742,0.292446
281342,cg23998391,0.475745,0.541269,0.299819,0.429055,0.532486,0.447408,0.507242,0.50059,0.501028,...,0.344454,0.28054,0.507421,0.418927,0.334097,0.301518,0.269049,0.329684,0.335595,0.35595
294750,cg13995230,0.645737,0.443829,0.378987,0.320422,0.455547,0.33001,0.337708,0.415765,0.637772,...,0.199654,0.333945,0.691154,0.675661,0.283505,0.329887,0.2587,0.224727,0.235881,0.371209
300809,cg06530558,0.595874,0.533171,0.529668,0.37625,0.522682,0.374542,0.491007,0.468704,0.629327,...,0.205633,0.224212,0.194244,0.57279,0.340881,0.36468,0.129917,0.304763,0.26487,0.3378
337504,cg24959938,0.890263,0.899898,0.811715,0.888924,0.900804,0.867255,0.896918,0.919894,0.897054,...,0.491078,0.284143,0.873924,0.845877,0.449985,0.618401,0.527336,0.559561,0.485245,0.507912
344040,cg12315311,0.502864,0.338522,0.242266,0.274285,0.27102,0.203599,0.144594,0.243362,0.33024,...,0.168751,0.104582,0.291822,0.241479,0.253541,0.217065,0.03716,0.146127,0.131724,0.164691
353336,cg08101264,0.199735,0.226434,0.170371,0.184577,0.133099,0.225432,0.215585,0.149264,0.158454,...,0.219024,0.057423,0.069843,0.28616,0.632681,0.114645,0.013943,0.03053,0.014844,0.578503
356892,cg26097573,0.518767,0.348254,0.413238,0.302565,0.332476,0.276977,0.296997,0.271061,0.259465,...,0.110714,0.065885,0.083991,0.269917,0.048094,0.243027,0.036614,0.038315,0.07861,0.092298


In [11]:
y_test_450k = [(0 if i < 11 else 1) for i in range((data_450k_test.shape[1] - 1))]
len(y_test_450k)

91

In [12]:
param_grids_450k = {
    "SVM": {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"]},
    "Logistic Regression": {"C": [0.1, 1, 10], "solver": ["liblinear"]},
    "Decision Tree": {"max_depth": [3, 5, 7], "min_samples_split": [2, 5, 10]},
    "RandomForest": {
        "n_estimators": [50, 100, 200],
        "max_depth": [3, 5, 7],
        "min_samples_split": [2, 5, 10],
    },
    "XGBoost": {
        "n_estimators": [50, 100, 200],
        "max_depth": [3, 5, 7],
        "learning_rate": [0.01, 0.1, 0.2],
    },
}

In [13]:
result_450k = predict_comb(
    data_450k_train,
    y_train_450k,
    data_450k_test,
    y_test_450k,
    cluster_num,
    param_grids_450k,
    final_gene,
)
result_450k

Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
Best Score: 0.9924242424242425
Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
Best Score: 0.9909090909090909
Best Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
Best Score: 0.9818181818181818
Best Parameters: {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 200}
Best Score: 0.9863636363636363
Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
Best Score: 0.9878787878787879
Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200}
Best Score: 0.9863636363636363
Best Parameters: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 200}
Best Score: 0.9696969696969697
Best Parameters: {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 200}
Best Score: 0.9712121212121211
Best Parameters: {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 200}
Best Score: 0.9939393939393939
Best Parameters: {'learning_r

Unnamed: 0,Model,ID1,ID2,ID3,ID4,accuracy,sensitivity,specificity,precision,f1_score,mcc
0,XGBoost,cg23998391,cg16462183,cg03623835,cg24959938,0.96,0.96,0.91,0.99,0.97,0.81
1,XGBoost,cg23998391,cg16462183,cg06530558,cg24959938,0.96,0.96,0.91,0.99,0.97,0.81
2,XGBoost,cg23998391,cg16462183,cg03623835,cg08101264,0.96,0.98,0.82,0.98,0.98,0.79
3,XGBoost,cg23998391,cg16462183,cg06530558,cg08101264,0.95,0.98,0.73,0.96,0.97,0.73
4,XGBoost,cg12315311,cg16462183,cg03623835,cg24959938,0.96,0.96,0.91,0.99,0.97,0.81
...,...,...,...,...,...,...,...,...,...,...,...
115,Decision Tree,cg12315311,cg13995230,cg06530558,cg08101264,0.95,0.96,0.82,0.97,0.97,0.75
116,Decision Tree,cg12315311,cg26097573,cg03623835,cg24959938,0.96,0.96,0.91,0.99,0.97,0.81
117,Decision Tree,cg12315311,cg26097573,cg06530558,cg24959938,0.92,0.96,0.64,0.95,0.96,0.62
118,Decision Tree,cg12315311,cg26097573,cg03623835,cg08101264,0.95,0.95,0.91,0.99,0.97,0.78


In [14]:
os.makedirs(f"../result/GDC_rectal_tissue_450k", exist_ok=True)
result_450k.to_csv(
    "../result/GDC_rectal_tissue_450k/predict_combination.csv", index=False
)