In [29]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split,StratifiedKFold,cross_val_score
from sklearn.feature_selection import RFECV,RFE
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score,confusion_matrix
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
from collections import Counter
from imblearn.over_sampling import RandomOverSampler


#### GDC 450K組織RFE分析

In [30]:
import pandas as pd
gene = pd.read_csv("../result/GDC_rectal_tissue_450k/train80/dbeta_TSS_0.1.csv")
gene

Unnamed: 0,ID,gene,dbeta,feature
0,cg00134295,A2M,-0.106726,TSS1500
1,cg00910127,AARS2,-0.114824,TSS1500
2,cg06895831,ABCA4,-0.120163,TSS1500
3,cg14909495,ABP1,-0.104112,TSS200
4,cg08101264,ACOT8,-0.103003,TSS1500
...,...,...,...,...
672,cg07643930,ZNF598,-0.139233,TSS1500
673,cg07463519,ZNF720,-0.128613,TSS1500
674,cg13388277,ZNF727,-0.101489,TSS1500
675,cg08744726,ZNF767,-0.109304,TSS1500


In [31]:
beta_normalized_train_450k = pd.read_csv(
    "../result/GDC_rectal_tissue_450k/train80/all_beta_normalized_train_oversample_smote.csv"
)
beta_normalized_val_450k = pd.read_csv(
    "../result/GDC_rectal_tissue_450k/test20/all_beta_normalized_test.csv"
)

In [32]:
beta_normalized_train_450k.head()

Unnamed: 0.1,Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,1.320,1.321,1.322,1.323,1.324,1.325,1.326,1.327,1.328,1.329
0,cg00000957,0.856113,0.839813,0.836635,0.8396,0.872039,0.8273,0.867226,0.856248,0.844131,...,0.888416,0.837523,0.88436,0.875896,0.859817,0.84534,0.885423,0.805013,0.740653,0.824682
1,cg00001349,0.804604,0.819941,0.846237,0.840729,0.873266,0.785063,0.836406,0.826497,0.811564,...,0.905431,0.816566,0.915958,0.831712,0.862241,0.805669,0.853217,0.86871,0.755751,0.86771
2,cg00002719,0.185599,0.100295,0.095473,0.241155,0.081563,0.074698,0.085243,0.095008,0.074505,...,0.315933,0.025216,0.678708,0.512912,0.456226,0.825931,0.707738,0.300958,0.737305,0.481995
3,cg00002837,0.522515,0.489466,0.523515,0.450116,0.455411,0.53992,0.507077,0.486059,0.485272,...,0.432962,0.509807,0.604417,0.438432,0.282946,0.159257,0.691693,0.283691,0.216872,0.658773
4,cg00003287,0.17013,0.250159,0.257916,0.176988,0.135848,0.194919,0.151219,0.182795,0.184079,...,0.109457,0.083747,0.11175,0.11237,0.181156,0.083998,0.105068,0.089882,0.113225,0.124865


In [33]:
def intersect_gene(beta_normalized):
    return beta_normalized[beta_normalized["Unnamed: 0"].isin(gene["ID"])]

beta_normalized_train_450k = intersect_gene(beta_normalized_train_450k)
beta_normalized_val_450k = intersect_gene(beta_normalized_val_450k)
print(beta_normalized_train_450k.shape)
print(beta_normalized_val_450k.shape)

(677, 661)
(677, 92)


In [34]:
normal_count_train_450k = 330
train_normal_450k = beta_normalized_train_450k.iloc[:,1:normal_count_train_450k+1]
train_tumor_450k = beta_normalized_train_450k.iloc[:,normal_count_train_450k+1:]

In [35]:
X_train_450k = pd.concat([train_normal_450k, train_tumor_450k], axis=1).T
y_train_450k = [(0 if i < normal_count_train_450k else 1) for i in range(X_train_450k.shape[0])]
print(X_train_450k.shape)
print("450K訓練集樣本數量: ", len(y_train_450k))
print(
    f"450K訓練集中各類別樣本數量: \nnormal={normal_count_train_450k}, tumor={X_train_450k.shape[0]-normal_count_train_450k}"
)

(660, 677)
450K訓練集樣本數量:  660
450K訓練集中各類別樣本數量: 
normal=330, tumor=330


In [36]:
normal_count_val_450k =  11
val_normal_450k = beta_normalized_val_450k.iloc[
    :, 1 : normal_count_val_450k + 1
]
val_tumor_450k = beta_normalized_val_450k.iloc[
    :, normal_count_val_450k + 1 :
]

In [37]:
X_val_450k = pd.concat([val_normal_450k, val_tumor_450k], axis=1).T
y_val_450k = [(0 if i < normal_count_val_450k else 1) for i in range(X_val_450k.shape[0])]
print(X_val_450k.shape)
print("450K測試集樣本數量: ", len(y_val_450k))
print(
    f"450K測試集中各類別樣本數量: \nnormal={normal_count_val_450k}, tumor={X_val_450k.shape[0]-normal_count_val_450k}"
)

(91, 677)
450K測試集樣本數量:  91
450K測試集中各類別樣本數量: 
normal=11, tumor=80


In [38]:
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE
from sklearn.metrics import (
    precision_score,
    accuracy_score,
    matthews_corrcoef,
    confusion_matrix,
    f1_score,
)
from sklearn import svm
from sklearn import ensemble
import xgboost as xgb
from sklearn.model_selection import GridSearchCV


def set_parameters(model, param_grid):
    grid_search = GridSearchCV(
        estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2
    )
    return grid_search


# xgb
param_xgb = {

    "learning_rate": [0.001, 0.01, 0.1],
    "n_estimators": [50, 100, 200],
    "max_depth": [2, 3, 4],
    "subsample": [0.6, 0.7, 0.8],
    "colsample_bytree": [0.6,0.7,0.8],
    "min_child_weight": [40,50,60],
}
xgb_model = set_parameters(xgb.XGBClassifier(random_state=42), param_xgb)

# rf
param_rf = {

    "n_estimators": [50, 100, 150,200],
    "min_samples_split": [80,90],
    "min_samples_leaf": [80,90],
    "max_features": [5, 10, 20],
}
rf_model = set_parameters(ensemble.RandomForestClassifier(random_state=42), param_rf)

# SVM
param_svm = {

    'kernel': ['rbf', 'poly', 'sigmoid'],
    'C':[50,100,200]
}
svm_model = set_parameters(svm.SVC(random_state=42), param_svm)

# dt
param_dt = {
    
    "min_samples_split": [70,80,90],
    "min_samples_leaf": [70,80,90],
    "max_features": [5, 10, 20],
}
dt_model = set_parameters(DecisionTreeClassifier(random_state=42), param_dt)

In [61]:
select_models = {
    # "XGBoost": xgb.XGBClassifier(random_state=42),
    # "Random Forest": ensemble.RandomForestClassifier(random_state=42),
    # "Decision Tree": DecisionTreeClassifier(random_state=42),
    "SVM": svm.SVC(kernel='linear',random_state=42),
}
models = {
    "XGBoost": xgb_model,
    "Random Forest": rf_model,
    "Decision Tree": dt_model,
    "SVM": svm_model,
}

In [40]:
results = {
    model_name: {
        "select_model":[],
        "num_features": [],
        "accuracy": [],
        "sensitivity": [],
        "specificity": [],
        "precision":[],
        "f1":[],
        "mcc": []
    }
    for model_name in models
}
results

{'XGBoost': {'select_model': [],
  'num_features': [],
  'accuracy': [],
  'sensitivity': [],
  'specificity': [],
  'precision': [],
  'f1': [],
  'mcc': []},
 'Random Forest': {'select_model': [],
  'num_features': [],
  'accuracy': [],
  'sensitivity': [],
  'specificity': [],
  'precision': [],
  'f1': [],
  'mcc': []},
 'Decision Tree': {'select_model': [],
  'num_features': [],
  'accuracy': [],
  'sensitivity': [],
  'specificity': [],
  'precision': [],
  'f1': [],
  'mcc': []},
 'SVM': {'select_model': [],
  'num_features': [],
  'accuracy': [],
  'sensitivity': [],
  'specificity': [],
  'precision': [],
  'f1': [],
  'mcc': []}}

In [62]:
import seaborn as sns
import numpy as np

feature_count = 3
for select_name,select_model in select_models.items():
    for i in range(3, feature_count + 1, 1):
        print("Select", i, "genes:")
        rfe = RFE(estimator=select_model, n_features_to_select=i)
        # X_train_rfe = rfe.fit_transform(X_train_450k, y_train_450k)
        X_train_rfe = rfe.fit_transform(X_train_450k, y_train_450k)
        X_val_rfe = rfe.transform(X_val_450k)
        selected_genes = np.where(rfe.support_)[0]
        df_feature_importances = pd.DataFrame(
            {
                "gene": gene['ID'].iloc[selected_genes],
                # "importance": rfe.estimator_.feature_importances_,
            }
        )
        # df_feature_importances.sort_values("importance", ascending=False, inplace=True)

        print("gene importances:")
        print(pd.DataFrame(df_feature_importances))
        # pd.DataFrame(df_feature_importances).to_csv("../result/GDC_breast_tissue_450k_GSE243529/RFE/method_2/GSE243529_SVM_filter_35.csv",index=False)
        # plt.figure(figsize=(10, 8))
        # sns.barplot(
        #     x="importance",
        #     y="gene",
        #     hue="gene",
        #     data=df_feature_importances,
        #     palette="magma",
        # )
        # plt.title("Gene Importances")
        # plt.xlabel("Importance")
        # plt.ylabel("Gene")
        # plt.show()

        for test_name, test_model in models.items():
            # if (i==30) and (test_name=="Decision Tree"):
            #     break
            print("Validation model:", test_name)
            grid_search = test_model
            grid_search.fit(X_train_rfe, y_train_450k)
            print("Best Parameters:", grid_search.best_params_)
            print("Best Score:", grid_search.best_score_)
            best_model = grid_search.best_estimator_

            # train
            y_pred_train = best_model.predict(X_train_rfe)
            accuracy_train = accuracy_score(y_train_450k, y_pred_train)
            print("Training Accuracy:", accuracy_train)

            # test
            y_pred = best_model.predict(X_val_rfe)
            accuracy = accuracy_score(y_val_450k, y_pred)
            print("Testing Accuracy:", accuracy)
            if abs(accuracy_train - accuracy)>0.1:
                print("=====Select", i, "genes", "Test model:",test_name, "overfitting=====")
                
            else:
                tn, fp, fn, tp = confusion_matrix(y_val_450k, y_pred).ravel()
                sensitivity = tp / (tp + fn)
                specificity = tn / (tn + fp)
                precision = precision_score(y_val_450k, y_pred)
                mcc = matthews_corrcoef(y_val_450k, y_pred)
                f1 = f1_score(y_val_450k, y_pred)


                results[test_name]["mcc"].append(round(mcc, 2))
                results[test_name]["accuracy"].append(round(accuracy,2))
                results[test_name]["sensitivity"].append(round(sensitivity,2))
                results[test_name]["specificity"].append(round(specificity,2))
                results[test_name]["precision"].append(round(precision))
                results[test_name]["f1"].append(round(f1))
                results[test_name]["num_features"].append(i)
                results[test_name]["select_model"].append(select_name)

Select 3 genes:
gene importances:
           gene
4    cg08101264
371  cg13995230
482  cg26097573
Validation model: XGBoost
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
Best Parameters: {'colsample_bytree': 0.6, 'learning_rate': 0.001, 'max_depth': 2, 'min_child_weight': 40, 'n_estimators': 100, 'subsample': 0.6}
Best Score: 0.9969696969696968
Training Accuracy: 0.996969696969697
Testing Accuracy: 0.978021978021978
Validation model: Random Forest
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best Parameters: {'max_features': 5, 'min_samples_leaf': 80, 'min_samples_split': 80, 'n_estimators': 50}
Best Score: 0.9878787878787879
Training Accuracy: 0.9893939393939394
Testing Accuracy: 0.989010989010989
Validation model: Decision Tree
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best Parameters: {'max_features': 5, 'min_samples_leaf': 70, 'min_samples_split': 70}
Best Score: 0.9787878787878788
Training Accuracy: 0.9893939393939394
Testing 

In [53]:
df_list = []
for model, metrics in results.items():
    df_model = pd.DataFrame(metrics)
    df_model.insert(1, 'validation_model', model)
    df_list.insert(0, df_model)

df = pd.concat(df_list, ignore_index=True)

i = "SVM"
if i=="XGBoost":
    df_dict = {}
df_dict[f'{i}'] = df[df["select_model"] == i]
print(df_dict)

{'XGBoost':    select_model validation_model  num_features  accuracy  sensitivity  \
0       XGBoost              SVM             1      0.99         0.99   
1       XGBoost              SVM             2      1.00         1.00   
2       XGBoost              SVM             3      1.00         1.00   
3       XGBoost              SVM             4      1.00         1.00   
4       XGBoost              SVM             5      1.00         1.00   
5       XGBoost    Decision Tree             1      0.99         0.99   
6       XGBoost    Decision Tree             2      0.99         0.99   
7       XGBoost    Decision Tree             3      0.99         0.99   
8       XGBoost    Decision Tree             4      0.99         0.99   
9       XGBoost    Decision Tree             5      0.99         0.99   
10      XGBoost    Random Forest             1      0.99         0.99   
11      XGBoost    Random Forest             2      0.99         0.99   
12      XGBoost    Random Forest       

In [54]:
df_combined = pd.concat([
    df_dict["XGBoost"], 
    df_dict["Random Forest"], 
    df_dict["Decision Tree"], 
    df_dict["SVM"]
], ignore_index=True)
df_combined.to_csv("../result/GDC_rectal_tissue_450k/RFE/predict_450k.csv",index=False)
df_combined

Unnamed: 0,select_model,validation_model,num_features,accuracy,sensitivity,specificity,precision,f1,mcc
0,XGBoost,SVM,1,0.99,0.99,1.00,1,1,0.95
1,XGBoost,SVM,2,1.00,1.00,1.00,1,1,1.00
2,XGBoost,SVM,3,1.00,1.00,1.00,1,1,1.00
3,XGBoost,SVM,4,1.00,1.00,1.00,1,1,1.00
4,XGBoost,SVM,5,1.00,1.00,1.00,1,1,1.00
...,...,...,...,...,...,...,...,...,...
75,SVM,XGBoost,1,0.99,1.00,0.91,1,1,0.95
76,SVM,XGBoost,2,0.99,1.00,0.91,1,1,0.95
77,SVM,XGBoost,3,0.98,0.98,1.00,1,1,0.91
78,SVM,XGBoost,4,0.98,0.98,1.00,1,1,0.91
