In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split,StratifiedKFold,cross_val_score
from sklearn.feature_selection import RFECV,RFE
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score,confusion_matrix
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
from collections import Counter
from imblearn.over_sampling import RandomOverSampler


#### GSE243529液態RFE分析 (過程中有嘗試使用450K組織分析)

In [1]:
import pandas as pd
gene = pd.read_csv("../result/GDC_breast_tissue_450k/train80/dbeta_GSE243529_TSS_0.15.csv")
gene

Unnamed: 0,ID,gene,dbeta,feature
0,cg03760483,ALOX12,0.157175,TSS200
1,cg00074348,APLNR,-0.187434,TSS1500
2,cg04290171,CD46,-0.201098,TSS1500
3,cg00044665,CDH5,0.206568,TSS200
4,cg14666310,CEACAM5,-0.210341,TSS1500
5,cg07157107,CHRNA6,-0.24219,TSS200
6,cg02295216,CKLF,0.17259,TSS1500
7,cg19223467,CLEC9A,-0.152339,TSS200
8,cg23631930,CMTM5,0.172075,TSS200
9,cg12440062,CRISP2,0.200273,TSS200


In [2]:
beta_normalized_train_GSE243529 = pd.read_csv(
    "../result/GSE243529/train80/all_beta_normalized_train.csv"
)
beta_normalized_val_GSE243529 = pd.read_csv(
    "../result/GSE243529/test20/all_beta_normalized_test.csv"
)
beta_normalized_train_450k = pd.read_csv(
    "../result/GDC_breast_tissue_450k/train80/all_beta_normalized_train_oversample_smote_GSE243529.csv"
)
beta_normalized_val_450k = pd.read_csv(
    "../result/GDC_breast_tissue_450k/test20/all_beta_normalized_test.csv"
)

In [3]:
print(beta_normalized_train_GSE243529.head())
print(beta_normalized_train_450k.head())

   Unnamed: 0         0         1         2         3         4         5  \
0  cg07881041  0.926368  0.940674  0.946855  0.939445  0.939303  0.940384   
1  cg03513874  0.950997  0.971682  0.962746  0.942406  0.943713  0.967288   
2  cg05451842  0.059963  0.019628  0.017954  0.025166  0.039009  0.022807   
3  cg14797042  0.951165  0.983220  0.972501  0.971123  0.983623  0.978537   
4  cg09838562  0.038691  0.010217  0.018484  0.023737  0.023379  0.014493   

          6         7         8  ...       408       409       410       411  \
0  0.956939  0.940137  0.929561  ...  0.942149  0.935299  0.952236  0.907088   
1  0.961146  0.959091  0.958049  ...  0.932625  0.964292  0.952845  0.946885   
2  0.043550  0.041088  0.030474  ...  0.018186  0.022440  0.048736  0.033335   
3  0.969215  0.978323  0.971569  ...  0.975007  0.973183  0.972620  0.974413   
4  0.013606  0.024030  0.028147  ...  0.016635  0.022670  0.022354  0.033122   

        412       413       414       415       416     

In [4]:
def intersect_gene(beta_normalized):
    return beta_normalized[beta_normalized["Unnamed: 0"].isin(gene["ID"])]


beta_normalized_train_GSE243529 = intersect_gene(beta_normalized_train_GSE243529)
beta_normalized_val_GSE243529 = intersect_gene(beta_normalized_val_GSE243529)
beta_normalized_train_450k = intersect_gene(beta_normalized_train_450k)
beta_normalized_val_450k = intersect_gene(beta_normalized_val_450k)
print(beta_normalized_train_GSE243529.shape)
print(beta_normalized_train_450k.shape)
print(beta_normalized_val_GSE243529.shape)
print(beta_normalized_val_450k.shape)

(60, 419)
(60, 1275)
(60, 106)
(60, 180)


In [5]:
normal_count_train_GSE243529 = 218
train_normal_GSE243529 = beta_normalized_train_GSE243529.iloc[
    :, 1 : normal_count_train_GSE243529 + 1
]
train_tumor_GSE243529 = beta_normalized_train_GSE243529.iloc[
    :, normal_count_train_GSE243529 + 1 :
]
normal_count_train_450k = 637
train_normal_450k = beta_normalized_train_450k.iloc[:,1:normal_count_train_450k+1]
train_tumor_450k = beta_normalized_train_450k.iloc[:,normal_count_train_450k+1:]

In [6]:
X_train_GSE243529 = pd.concat([train_normal_GSE243529, train_tumor_GSE243529], axis=1).T
y_train_GSE243529 = [(0 if i < normal_count_train_GSE243529 else 1) for i in range(X_train_GSE243529.shape[0])]
print(X_train_GSE243529.shape)
print("GSE243529訓練集樣本數量: ", len(y_train_GSE243529))
print(
    f"GSE243529訓練集中各類別樣本數量: \nnormal={normal_count_train_GSE243529}, tumor={X_train_GSE243529.shape[0]-normal_count_train_GSE243529}"
)

(418, 60)
GSE243529訓練集樣本數量:  418
GSE243529訓練集中各類別樣本數量: 
normal=218, tumor=200


In [7]:
X_train_450k = pd.concat([train_normal_450k, train_tumor_450k], axis=1).T
y_train_450k = [(0 if i < normal_count_train_450k else 1) for i in range(X_train_450k.shape[0])]
print(X_train_450k.shape)
print("450K訓練集樣本數量: ", len(y_train_450k))
print(
    f"450K訓練集中各類別樣本數量: \nnormal={normal_count_train_450k}, tumor={X_train_450k.shape[0]-normal_count_train_450k}"
)

(1274, 60)
450K訓練集樣本數量:  1274
450K訓練集中各類別樣本數量: 
normal=637, tumor=637


In [8]:
normal_count_val_GSE243529 =  50
val_normal_GSE243529 = beta_normalized_val_GSE243529.iloc[
    :, 1 : normal_count_val_GSE243529 + 1
]
val_tumor_GSE243529 = beta_normalized_val_GSE243529.iloc[
    :, normal_count_val_GSE243529 + 1 :
]

In [9]:
normal_count_val_450k =  18
val_normal_450k = beta_normalized_val_450k.iloc[
    :, 1 : normal_count_val_450k + 1
]
val_tumor_450k = beta_normalized_val_450k.iloc[
    :, normal_count_val_450k + 1 :
]

In [10]:
X_val_GSE243529 = pd.concat([val_normal_GSE243529, val_tumor_GSE243529], axis=1).T
y_val_GSE243529 = [(0 if i < normal_count_val_GSE243529 else 1) for i in range(X_val_GSE243529.shape[0])]
print(X_val_GSE243529.shape)
print("GSE243529測試集樣本數量: ", len(y_val_GSE243529))
print(
    f"GSE243529測試集中各類別樣本數量: \nnormal={normal_count_val_GSE243529}, tumor={X_val_GSE243529.shape[0]-normal_count_val_GSE243529}"
)

(105, 60)
GSE243529測試集樣本數量:  105
GSE243529測試集中各類別樣本數量: 
normal=50, tumor=55


In [11]:
X_val_450k = pd.concat([val_normal_450k, val_tumor_450k], axis=1).T
y_val_450k = [(0 if i < normal_count_val_450k else 1) for i in range(X_val_450k.shape[0])]
print(X_val_450k.shape)
print("450K測試集樣本數量: ", len(y_val_450k))
print(
    f"450K測試集中各類別樣本數量: \nnormal={normal_count_val_450k}, tumor={X_val_450k.shape[0]-normal_count_val_450k}"
)

(179, 60)
450K測試集樣本數量:  179
450K測試集中各類別樣本數量: 
normal=18, tumor=161


In [12]:
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE
from sklearn.metrics import (
    precision_score,
    accuracy_score,
    matthews_corrcoef,
    confusion_matrix,
    f1_score,
)
from sklearn import svm
from sklearn import ensemble
import xgboost as xgb
from sklearn.model_selection import GridSearchCV


def set_parameters(model, param_grid):
    grid_search = GridSearchCV(
        estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2
    )
    return grid_search


# xgb
param_xgb = {

    "learning_rate": [0.001, 0.01, 0.1],
    "n_estimators": [50, 100, 200],
    "max_depth": [2, 3, 4],
    "subsample": [0.6, 0.7, 0.8],
    "colsample_bytree": [0.6,0.7,0.8],
    "min_child_weight": [40,50,60],
}
xgb_model = set_parameters(xgb.XGBClassifier(random_state=42), param_xgb)

# rf
param_rf = {

    "n_estimators": [50, 100, 150,200],
    "min_samples_split": [80,90],
    "min_samples_leaf": [80,90],
    "max_features": [5, 10, 20],
}
rf_model = set_parameters(ensemble.RandomForestClassifier(random_state=42), param_rf)

# SVM
param_svm = {

    'kernel': ['rbf', 'poly', 'sigmoid'],
    'C':[50,100,200]
}
svm_model = set_parameters(svm.SVC(random_state=42), param_svm)

# dt
param_dt = {
    
    "min_samples_split": [70,80,90],
    "min_samples_leaf": [70,80,90],
    "max_features": [5, 10, 20],
}
dt_model = set_parameters(DecisionTreeClassifier(random_state=42), param_dt)

In [15]:
select_models = {
    "XGBoost": xgb.XGBClassifier(random_state=42),
    # "Random Forest": ensemble.RandomForestClassifier(random_state=42),
    # "Decision Tree": DecisionTreeClassifier(random_state=42),
    # "SVM": svm.SVC(kernel='linear',random_state=42),
}
models = {
    "XGBoost": xgb_model,
    "Random Forest": rf_model,
    "Decision Tree": dt_model,
    "SVM": svm_model,
}

In [439]:
results = {
    model_name: {
        "select_model":[],
        "num_features": [],
        "accuracy": [],
        "sensitivity": [],
        "specificity": [],
        "precision":[],
        "f1":[],
        "mcc": []
    }
    for model_name in models
}

In [16]:
import seaborn as sns
import numpy as np

feature_count = 35
for select_name,select_model in select_models.items():
    for i in range(25, feature_count + 1, 5):
        print("Select", i, "genes:")
        rfe = RFE(estimator=select_model, n_features_to_select=i)
        X_train_rfe = rfe.fit_transform(X_train_GSE243529, y_train_GSE243529)
        X_val_rfe = rfe.transform(X_val_GSE243529)
        selected_genes = np.where(rfe.support_)[0]
        df_feature_importances = pd.DataFrame(
            {
                "gene": gene['ID'].iloc[selected_genes],
                # "importance": rfe.estimator_.feature_importances_,
            }
        )
        # df_feature_importances.sort_values("importance", ascending=False, inplace=True)

        print("gene importances:")
        print(pd.DataFrame(df_feature_importances))
        # pd.DataFrame(df_feature_importances).to_csv("../result/GDC_breast_tissue_450k_GSE243529/RFE/method_2/GSE243529_SVM_filter_35.csv",index=False)
        # plt.figure(figsize=(10, 8))
        # sns.barplot(
        #     x="importance",
        #     y="gene",
        #     hue="gene",
        #     data=df_feature_importances,
        #     palette="magma",
        # )
        # plt.title("Gene Importances")
        # plt.xlabel("Importance")
        # plt.ylabel("Gene")
        # plt.show()

        for test_name, test_model in models.items():
            if (i==30) and (test_name=="Decision Tree"):
                break
            print("Validation model:", test_name)
            grid_search = test_model
            grid_search.fit(X_train_rfe, y_train_GSE243529)
            print("Best Parameters:", grid_search.best_params_)
            print("Best Score:", grid_search.best_score_)
            best_model = grid_search.best_estimator_

            # train
            y_pred_train = best_model.predict(X_train_rfe)
            accuracy_train = accuracy_score(y_train_GSE243529, y_pred_train)
            print("Training Accuracy:", accuracy_train)

            # test
            y_pred = best_model.predict(X_val_rfe)
            accuracy = accuracy_score(y_val_GSE243529, y_pred)
            print("Testing Accuracy:", accuracy)
            
            tn, fp, fn, tp = confusion_matrix(y_val_GSE243529, y_pred).ravel()
            sensitivity = tp / (tp + fn)
            specificity = tn / (tn + fp)
            precision = precision_score(y_val_GSE243529, y_pred)
            mcc = matthews_corrcoef(y_val_GSE243529, y_pred)
            f1 = f1_score(y_val_GSE243529, y_pred)


            results[test_name]["mcc"].append(round(mcc, 2))
            results[test_name]["accuracy"].append(round(accuracy,2))
            results[test_name]["sensitivity"].append(round(sensitivity,2))
            results[test_name]["specificity"].append(round(specificity,2))
            results[test_name]["precision"].append(round(precision))
            results[test_name]["f1"].append(round(f1))
            results[test_name]["num_features"].append(i)
            results[test_name]["select_model"].append(select_name)

Select 35 genes:
gene importances:
          gene
0   cg03760483
1   cg00074348
2   cg04290171
3   cg00044665
5   cg07157107
6   cg02295216
8   cg23631930
9   cg12440062
12  cg05724197
13  cg03003745
15  cg20051772
17  cg13816423
18  cg07014349
19  cg19772011
21  cg04225088
23  cg00727675
24  cg00239353
25  cg15059851
26  cg11775521
27  cg08375658
28  cg07925549
33  cg04927004
34  cg10734581
36  cg16570507
37  cg02577745
43  cg07211259
44  cg12871376
46  cg22746058
48  cg26106778
49  cg05884032
51  cg16536739
52  cg04433322
55  cg06314202
57  cg14153654
58  cg24073122


In [428]:
df_list = []
for model, metrics in results.items():
    df_model = pd.DataFrame(metrics)
    df_model.insert(1, 'validation_model', model)
    df_list.insert(0, df_model)

df = pd.concat(df_list, ignore_index=True)

i = "SVM"
if i=="XGBoost":
    df_dict = {}
df_dict[f'{i}'] = df[df["select_model"] == i]
print(df_dict)

{'XGBoost':   select_model validation_model  num_features  accuracy  sensitivity  \
0      XGBoost    Decision Tree            25      0.70         0.69   
1      XGBoost    Decision Tree            30      0.68         0.67   
2      XGBoost    Decision Tree            35      0.70         0.69   
3      XGBoost    Random Forest            25      0.72         0.75   
4      XGBoost    Random Forest            30      0.70         0.71   
5      XGBoost    Random Forest            35      0.70         0.73   

   specificity  precision  f1   mcc  
0         0.72          1   1  0.41  
1         0.68          1   1  0.35  
2         0.72          1   1  0.41  
3         0.70          1   1  0.45  
4         0.68          1   1  0.39  
5         0.68          1   1  0.41  , 'Random Forest':     select_model validation_model  num_features  accuracy  sensitivity  \
0  Random Forest    Decision Tree            25      0.67         0.67   
1  Random Forest    Decision Tree            30    

In [430]:
df_combined = pd.concat([
    df_dict["XGBoost"], 
    df_dict["Random Forest"], 
    df_dict["Decision Tree"], 
    df_dict["SVM"]
], ignore_index=True)
df_combined.to_csv("../result/GDC_breast_tissue_450k_GSE243529/RFE/predict_GSE243529.csv",index=False)
df_combined

Unnamed: 0,select_model,validation_model,num_features,accuracy,sensitivity,specificity,precision,f1,mcc
0,XGBoost,Decision Tree,25,0.7,0.69,0.72,1,1,0.41
1,XGBoost,Decision Tree,30,0.68,0.67,0.68,1,1,0.35
2,XGBoost,Decision Tree,35,0.7,0.69,0.72,1,1,0.41
3,XGBoost,Random Forest,25,0.72,0.75,0.7,1,1,0.45
4,XGBoost,Random Forest,30,0.7,0.71,0.68,1,1,0.39
5,XGBoost,Random Forest,35,0.7,0.73,0.68,1,1,0.41
6,Random Forest,Decision Tree,25,0.67,0.67,0.66,1,1,0.33
7,Random Forest,Decision Tree,30,0.69,0.67,0.7,1,1,0.37
8,Random Forest,Decision Tree,35,0.7,0.69,0.72,1,1,0.41
9,Random Forest,XGBoost,25,0.48,0.0,1.0,0,0,0.0
