#### Combination Validation Filter
#### Steps
1. Execute the prediction combination function.
2. Input the selected gene file.
3. Create training/testing set labels.
4. Select one gene from each group for combination.
5. Create training/testing datasets.
6. Adjust model parameters.
7. Perform model prediction.
8. Output and save the final prediction results.
#### Important notes
* The input file must include the `ID` and `cluster` columns, plus the `gene` column that is used to merge it with the dbeta table.
* Ensure that all genes are present in each dataset (the code includes a check to verify that the selected features are present).
* Adjust parameters manually until no overfitting occurs (you can remove unsuitable models).
* Other than the file paths, `cluster_num`, `param_grids`, and the hard-coded class-0 sample counts used to build the labels (e.g. `218`, `50`, `637`, `18`), no further changes are necessary.

In [1]:
import os
import pandas as pd

from itertools import combinations
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

from sklearn.ensemble import RandomForestClassifier

from sklearn.svm import SVC

from sklearn.linear_model import LogisticRegression

from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import (
    confusion_matrix,
    precision_score,
    accuracy_score,
    matthews_corrcoef,
    f1_score,
)

In [2]:
def is_unique(lst):
    """Return True when no element of ``lst`` occurs more than once."""
    distinct = set(lst)
    return len(distinct) == len(lst)


def prepare_data(data, combination, label, cluster_num):
    """Build the sample-by-feature matrix for one feature combination.

    Args:
        data: DataFrame whose first column holds the feature IDs and whose
            remaining columns hold the values (one column per sample).
        combination: Sequence of feature IDs; the first ``cluster_num``
            entries are used.
        label: Sample labels. Only its length is used, to decide how many
            rows to build.
        cluster_num: Number of features to take from ``combination``.

    Returns:
        A list with ``len(label)`` rows; each row is a list of
        ``cluster_num`` values ordered like ``combination``.

    Raises:
        ValueError: If a feature ID is not present in ``data`` or provides
            fewer values than there are labels.
    """
    id_column = data.columns[0]
    n_samples = len(label)

    cluster_values = []
    for i in range(cluster_num):
        feature_id = combination[i]
        rows = data.loc[data[id_column].isin([feature_id])]
        if rows.empty:
            # Fail with a clear message instead of an IndexError further down.
            raise ValueError(
                f"Feature {feature_id!r} not found in column {id_column!r} "
                "of the dataset"
            )
        # Drop the ID column and flatten the remaining values into one list.
        gene_values = rows.iloc[:, 1:].values.flatten().tolist()
        if len(gene_values) < n_samples:
            raise ValueError(
                f"Feature {feature_id!r} has {len(gene_values)} values but "
                f"{n_samples} labels were given"
            )
        cluster_values.append(gene_values)

    # Transpose: one row per sample, one column per selected feature.
    return [
        [gene_values[i] for gene_values in cluster_values]
        for i in range(n_samples)
    ]

In [3]:
def predict_comb(
    train_data, train_label, test_data, test_label, cluster_num, param_grids, final_gene
):
    """Grid-search each classifier on every cross-cluster feature combination.

    Every ``cluster_num``-sized combination of ``final_gene["ID"]`` whose
    members all belong to different clusters is turned into a training and a
    test matrix with ``prepare_data``. Each model is tuned with a 5-fold
    ``GridSearchCV`` on the training matrix and then scored on both sets. A
    fit is reported as overfitting, and left out of the result, when the
    train and test accuracy differ by more than 0.1.

    Labels are expected to be 0 (negative) and 1 (positive): the confusion
    matrix is computed with ``labels=[0, 1]``.

    Args:
        train_data: Training table passed to ``prepare_data``.
        train_label: Labels of the training samples.
        test_data: Test table passed to ``prepare_data``.
        test_label: Labels of the test samples.
        cluster_num: Number of features in each combination.
        param_grids: Mapping from model name to its ``GridSearchCV``
            parameter grid. Models without an entry are skipped.
        final_gene: DataFrame with at least the ``ID`` and ``cluster``
            columns.

    Returns:
        DataFrame with one row per accepted (model, combination) pair and
        the columns ``Model``, ``ID1`` .. ``ID<cluster_num>`` (feature IDs
        ordered by cluster), ``accuracy``, ``sensitivity``, ``specificity``,
        ``precision``, ``f1_score`` and ``mcc``, all computed on the test set
        and rounded to two decimals.
    """
    models = {
        "XGBoost": xgb.XGBClassifier(),
        "RandomForest": RandomForestClassifier(),
        "SVM": SVC(),
        "Logistic Regression": LogisticRegression(),
        "Decision Tree": DecisionTreeClassifier(),
    }

    # Look up the cluster of each ID once; the first row wins for a repeated ID.
    cluster_of = {}
    for gene_id, cluster in zip(final_gene["ID"], final_gene["cluster"]):
        if gene_id not in cluster_of:
            cluster_of[gene_id] = int(cluster)

    # The valid combinations do not depend on the model, so filter them once.
    valid_combinations = []
    for combination in combinations(final_gene["ID"], cluster_num):
        clusters = [cluster_of[gene_id] for gene_id in combination]
        # Keep the combination only if every feature is from a different cluster.
        if is_unique(clusters):
            valid_combinations.append((combination, clusters))

    result = []
    for model_name, base_model in models.items():
        # Allow a model to be disabled by leaving it out of ``param_grids``.
        if model_name not in param_grids:
            print(f"Skipping {model_name}: no parameter grid provided")
            continue
        param_grid = param_grids[model_name]

        for combination, clusters in valid_combinations:
            # Build the training / test matrices for this combination.
            X_test = prepare_data(test_data, combination, test_label, cluster_num)
            X_train = prepare_data(train_data, combination, train_label, cluster_num)

            # Hyper-parameter tuning.
            grid_search = GridSearchCV(
                estimator=base_model, param_grid=param_grid, cv=5, n_jobs=-1
            )
            grid_search.fit(X_train, train_label)
            print("Best Parameters:", grid_search.best_params_)
            print("Best Score:", grid_search.best_score_)
            # Bind the tuned estimator to its own name so that the next search
            # still starts from the untouched base estimator.
            best_model = grid_search.best_estimator_

            # Predict on both sets.
            y_pred_train = best_model.predict(X_train)
            accuracy_train = accuracy_score(train_label, y_pred_train)
            y_pred = best_model.predict(X_test)
            accuracy = accuracy_score(test_label, y_pred)

            # Overfitting check: report and drop fits whose accuracies diverge.
            if abs(accuracy_train - accuracy) > 0.1:
                print("Train accuracy: ", round(accuracy_train, 2))
                print("Test accuracy: ", round(accuracy, 2))
                print(f"========={model_name} overfitting =========\n")
                continue

            # Fixing the labels keeps the matrix 2x2 even when only one class
            # shows up in the test labels and predictions.
            tn, fp, fn, tp = confusion_matrix(
                test_label, y_pred, labels=[0, 1]
            ).ravel()
            sensitivity = tp / (tp + fn) if (tp + fn) else float("nan")
            specificity = tn / (tn + fp) if (tn + fp) else float("nan")
            precision = precision_score(test_label, y_pred)
            f1 = f1_score(test_label, y_pred)
            mcc = matthews_corrcoef(test_label, y_pred)

            # Report the feature IDs ordered by their cluster number.
            sorted_combination = [x for _, x in sorted(zip(clusters, combination))]
            result.append(
                [model_name]
                + sorted_combination
                + [
                    round(accuracy, 2),
                    round(sensitivity, 2),
                    round(specificity, 2),
                    round(precision, 2),
                    round(f1, 2),
                    round(mcc, 2),
                ]
            )

    id_columns = [f"ID{i+1}" for i in range(cluster_num)]
    result = pd.DataFrame(
        result,
        columns=["Model"]
        + id_columns
        + ["accuracy", "sensitivity", "specificity", "precision", "f1_score", "mcc"],
    )

    return result

#### 輸入特徵資料

In [None]:
# Load the dbeta table from CSV and display it.
dbeta = pd.read_csv(
    filepath_or_buffer="../result/GDC_breast_tissue_450k/train80/dbeta_GSE243529_TSS_0.15.csv"
)
dbeta

In [None]:
cluster_num = 4  # total number of clusters
# The input file must contain the ID and cluster columns.
input_path = "../result/GDC_breast_tissue_450k_GSE243529/RFE/cluster_RFE.csv"  # example
final_gene = pd.read_csv(input_path)
# Inner join on "gene": only genes present in both tables are kept.
final_gene = final_gene.merge(dbeta, on="gene", how="inner")
final_gene

#### 液態驗證

#### GSE243529 訓練80%資料

In [None]:
# Load the GSE243529 training (80%) table from CSV and display it.
beta_normalized_243529_train = "../result/GSE243529/train80/all_beta_normalized_train.csv"
data_243529_train = pd.read_csv(filepath_or_buffer=beta_normalized_243529_train)
data_243529_train

In [None]:
# 檢查挑選出的特徵是否都有出現
data_243529_train = data_243529_train[
    data_243529_train["Unnamed: 0"].isin(final_gene["ID"])
]
data_243529_train

In [12]:
# One label per sample column (every column except the first): the first 218
# sample columns are labelled 0 and the remaining ones 1.
y_train_243529 = [int(i >= 218) for i in range(data_243529_train.shape[1] - 1)]

#### GSE243529 驗證20%資料

In [None]:
# Load the GSE243529 validation (20%) table from CSV and display it.
beta_normalized_243529_val = (
    "../result/GSE243529/test20/all_beta_normalized_test.csv"
)
data_243529_val = pd.read_csv(filepath_or_buffer=beta_normalized_243529_val)
data_243529_val

In [None]:
# Keep only the rows of the selected features and verify that every selected
# feature is actually present in this dataset.
_selected_ids_243529_val = set(final_gene["ID"])
data_243529_val = data_243529_val[
    data_243529_val["Unnamed: 0"].isin(_selected_ids_243529_val)
]
_missing_ids_243529_val = _selected_ids_243529_val - set(
    data_243529_val["Unnamed: 0"]
)
if _missing_ids_243529_val:
    raise ValueError(
        "Selected features missing from the GSE243529 validation data: "
        f"{sorted(map(str, _missing_ids_243529_val))}"
    )
data_243529_val

In [15]:
# One label per sample column (every column except the first): the first 50
# sample columns are labelled 0 and the remaining ones 1.
y_val_243529 = [int(i >= 50) for i in range(data_243529_val.shape[1] - 1)]

In [21]:
# Hyper-parameter grids for the GSE243529 run, keyed by the model names used
# in predict_comb.
param_grids_243529 = {
    "SVM": {
        "C": [0.1, 1, 10],
        "kernel": ["linear", "rbf"],
    },
    "Logistic Regression": {
        "C": [0.1, 1, 10],
        "solver": ["liblinear"],
    },
    "Decision Tree": {
        "max_depth": [3, 5, 7],
        "min_samples_split": [2, 5, 10],
    },
    "RandomForest": {
        "n_estimators": [50, 100, 200],
        "max_depth": [3, 5, 7],
        "min_samples_split": [2, 5, 10],
    },
    "XGBoost": {
        "n_estimators": [50, 100, 200],
        "max_depth": [3, 5, 7],
        "learning_rate": [0.01, 0.1, 0.2],
    },
}

In [None]:
# Run the combination search on the GSE243529 train / validation split and
# display the resulting table.
result_243529 = predict_comb(
    train_data=data_243529_train,
    train_label=y_train_243529,
    test_data=data_243529_val,
    test_label=y_val_243529,
    cluster_num=cluster_num,
    param_grids=param_grids_243529,
    final_gene=final_gene,
)
result_243529

In [40]:
# Make sure the output directory exists, then write the results without the
# DataFrame index.
os.makedirs("../result/GSE243529", exist_ok=True)
result_243529.to_csv(
    path_or_buf="../result/GSE243529/predict_combination.csv",
    index=False,
)

#### 組織驗證

#### 450K 訓練80%資料

In [None]:
# Load the 450K training (80%) table from CSV and display it.
beta_normalized_450K_train = (
    "../result/GDC_breast_tissue_450k/train80/all_beta_normalized_train.csv"
)
data_450K_train = pd.read_csv(filepath_or_buffer=beta_normalized_450K_train)
data_450K_train

In [None]:
# 檢查挑選出的特徵是否都有出現
data_450K_train = data_450K_train[
    data_450K_train["Unnamed: 0"].isin(final_gene["ID"])
]
data_450K_train

In [25]:
# One label per sample column (every column except the first): the first 637
# sample columns are labelled 0 and the remaining ones 1.
y_train_450K = [int(i >= 637) for i in range(data_450K_train.shape[1] - 1)]

#### 450K 測試20%資料

In [None]:
# Load the 450K test (20%) table from CSV and display it.
beta_normalized_450K_test = (
    "../result/GDC_breast_tissue_450k/test20/all_beta_normalized_test.csv"
)
data_450K_test = pd.read_csv(filepath_or_buffer=beta_normalized_450K_test)
data_450K_test

In [None]:
# One label per sample column (every column except the first): the first 18
# sample columns are labelled 0 and the remaining ones 1.
y_test_450K = [(0 if i < 18 else 1) for i in range(data_450K_test.shape[1] - 1)]

# Keep only the rows of the selected features and verify that every selected
# feature is actually present in this dataset.
_selected_ids_450K_test = set(final_gene["ID"])
data_450K_test = data_450K_test[
    data_450K_test["Unnamed: 0"].isin(_selected_ids_450K_test)
]
_missing_ids_450K_test = _selected_ids_450K_test - set(data_450K_test["Unnamed: 0"])
if _missing_ids_450K_test:
    raise ValueError(
        "Selected features missing from the 450K test data: "
        f"{sorted(map(str, _missing_ids_450K_test))}"
    )
data_450K_test

In [28]:
# Hyper-parameter grids for the 450K run, keyed by the model names used in
# predict_comb.
param_grids_450K = {
    "SVM": {
        "C": [0.1, 1, 10],
        "kernel": ["linear", "rbf"],
    },
    "Logistic Regression": {
        "C": [0.1, 1, 10],
        "solver": ["liblinear"],
    },
    "Decision Tree": {
        "max_depth": [3, 5, 7],
        "min_samples_split": [2, 5, 10],
    },
    "RandomForest": {
        "n_estimators": [50, 100, 200],
        "max_depth": [3, 5, 7],
        "min_samples_split": [2, 5, 10],
    },
    "XGBoost": {
        "n_estimators": [50, 100, 200],
        "max_depth": [3, 5, 7],
        "learning_rate": [0.01, 0.1, 0.2],
    },
}

In [None]:
# Run the combination search on the 450K train / test split and display the
# resulting table.
result_450K = predict_comb(
    train_data=data_450K_train,
    train_label=y_train_450K,
    test_data=data_450K_test,
    test_label=y_test_450K,
    cluster_num=cluster_num,
    param_grids=param_grids_450K,
    final_gene=final_gene,
)
result_450K

In [52]:
# Make sure the output directory exists, then write the results without the
# DataFrame index.
# NOTE(review): this directory is spelled "450K" while the input paths use
# "450k" - confirm this is intended on case-sensitive filesystems.
os.makedirs("../result/GDC_breast_tissue_450K", exist_ok=True)
result_450K.to_csv(
    path_or_buf="../result/GDC_breast_tissue_450K/predict_combination.csv",
    index=False,
)