### Sec. -1 Initialization

In [1]:
import os
import pandas as pd
from utils.config_helper import update_nested_toml, load_config
os.makedirs("logs", exist_ok=True)


In [5]:
# Name of the TOML profile under ../config/ (file name without ".toml").
# To choose the profile interactively, use the prompt below instead:
# TYPE = input("Enter the type of the config file: ")
TYPE = "prostate_plan_A"
CONFIG_PATH = "../config/" + TYPE + ".toml"
config = load_config(CONFIG_PATH)

In [6]:
# Input files and output folders for the two datasets used in Sec. 1-2,
# all read from config["init"]["hyper"].
init_hyper = config["init"]["hyper"]
df_file_1, df_file_2 = init_hyper["df_file_1"], init_hyper["df_file_2"]
dmp_file_1, dmp_file_2 = init_hyper["dmp_file_1"], init_hyper["dmp_file_2"]
majority_out_path_1 = init_hyper["majority_out_path_1"]
majority_out_path_2 = init_hyper["majority_out_path_2"]
joined_out_path = init_hyper["joined_out_path"]

### Sec. 0 process norm examples

#### 1. Split Dataset Example

In [7]:
import json

# Explicit names instead of `import *`, so it is clear which helpers the
# examples in this section take from utils.process_norm.
from utils.process_norm import (
    inspect_nan,
    merge_datasets,
    organize_dataset,
    split_dataset,
)

In [8]:
# Load the dataset description for the chosen cancer type from
# ../config/<cancer type>.json.
cancer_type = input("Enter the cancer type: ")
json_path = f"../config/{cancer_type}.json"
with open(json_path) as json_file:
    J = json.load(json_file)

In [9]:
data_source = input("Enter the raw all_beta_normalized file: ")
df = pd.read_csv(J[data_source]['file'])


In [11]:
# Organise the raw table, split it, and write both parts to the working directory.
source_cfg = J[data_source]
dfo = organize_dataset(
    df,
    source_cfg["normal"],
    source_cfg["tumor"],
    source_cfg["sample_count"],
)
complement_df, ratio_df = split_dataset(
    dfo,
    source_cfg["split_test"],
    source_cfg["random_state"],
)
# NOTE(review): the recorded run logged 39+39 vs 13+13 samples (a 75/25 split),
# so the 80/20 in the file names may not match `split_test` — confirm.
for frame, out_name in (
    (complement_df, "all_beta_normalized_80.csv"),
    (ratio_df, "all_beta_normalized_20.csv"),
):
    frame.to_csv(out_name, index=False)

INFO complement_df feature: 581635
INFO complement_df sample (normal, tumor): (39, 39)
INFO ratio_df feature: 581635
INFO ratio_df sample (normal, tumor): (13, 13)


#### 2. Merge Dataset Example

In [3]:
# The two beta tables to merge; both live in the same champ_result folder.
champ_dir = "../stomach/champ_result/gdc_stomach_GSE99553"
df0 = pd.read_csv(champ_dir + "/all_beta_normalized_0.csv")
df1 = pd.read_csv(champ_dir + "/all_beta_normalized_GSE99553.csv")

In [4]:
df0 = organize_dataset(df0, 2, 395, 2)
df1 = organize_dataset(df1, 84, 0, 1)

In [None]:
merged_df = merge_datasets(df0, df1)
merged_df.to_csv("merged.csv", index=False)

#### 3. Inspect NaN Example

In [None]:
# Toy frame for demonstrating inspect_nan: Col1 and Col2 each hold one missing value.
example_columns = {
    'ID': ['A1', 'A2', 'A3', 'A4', 'label'],
    'Col1': [1, 2, None, 4, 1],
    'Col2': [5, None, 7, 8, 1],
    'Col3': [9, 10, 11, 12, 0],
}
df = pd.DataFrame(example_columns)

# Column-wise check first, then row-wise check.
for nan_mode in ("column", "row"):
    print(inspect_nan(df, mode=nan_mode))

### Sec. 1 Delta Beta Calculation

In [None]:
from utils.dbeta_avg_helper import get_dbeta_avg, drop_dbeta_nan, get_dbeta_info

# Section-1 outputs go to a section_1/ folder under each dataset's output path.
for dataset_out_path in (majority_out_path_1, majority_out_path_2):
    os.makedirs(f"{dataset_out_path}/section_1", exist_ok=True)

In [6]:
train_df = pd.read_csv(df_file_1)

In [None]:
delta_beta = get_dbeta_avg(train_df)

In [None]:
delta_beta

In [9]:
# Pass delta_beta through drop_dbeta_nan and keep the returned table.
# NOTE(review): per the original note this records the features whose dbeta is NaN;
# presumably they are logged under the "TCGA" postfix and dropped — confirm in
# utils.dbeta_avg_helper.
delta_beta = drop_dbeta_nan(delta_beta, log_postfix="TCGA")

In [10]:
dmp = pd.read_csv(dmp_file_1)

In [11]:
dbeta_info = get_dbeta_info(delta_beta, dmp, log_postfix="TCGA")
dbeta_info

Unnamed: 0,ID,gene,dbeta,feature
0,cg03630821,A1BG,0.256535,Body
1,cg27394794,A1CF,-0.299349,Body
2,cg07027430,A2BP1,0.332115,Body
3,cg01723761,A2LD1,-0.040842,TSS200
4,cg08300930,A2M,0.151782,Body
...,...,...,...,...
18637,cg03489712,ZYX,-0.150827,TSS1500
18638,cg21851534,ZZEF1,0.148607,3'UTR
18639,cg10895547,ZZZ3,0.101143,Body
18640,cg20009101,psiTPTE22,0.295602,Body


In [12]:
# Round dbeta to 6 decimals, then save the section-1 table for dataset 1.
dbeta_info["dbeta"] = dbeta_info["dbeta"].map(lambda value: round(value, 6))
dbeta_csv_path = f"{majority_out_path_1}/section_1/dbeta.csv"
dbeta_info.to_csv(dbeta_csv_path, index=False)

In [14]:
train_df = pd.read_csv(df_file_2)

In [15]:
delta_beta = get_dbeta_avg(train_df)

In [16]:
delta_beta = drop_dbeta_nan(delta_beta, log_postfix="GEO")

In [17]:
dmp = pd.read_csv(dmp_file_2)

In [18]:
dbeta_info = get_dbeta_info(delta_beta, dmp, log_postfix="GEO")
dbeta_info

Unnamed: 0,ID,gene,dbeta,feature
0,cg03630821,A1BG,0.052089,Body
1,cg20509831,A1BG-AS1,-0.068123,Body
2,cg11955117,A1CF,-0.133728,Body
3,cg07027430,A2BP1,0.239209,Body
4,cg19815813,A2LD1,-0.050767,TSS200
...,...,...,...,...
22551,cg07472835,ZYG11B,-0.103101,Body
22552,cg12529637,ZYX,0.077073,Body
22553,cg16463044,ZZEF1,0.100958,Body
22554,cg10895547,ZZZ3,0.041425,Body


In [19]:
# Round dbeta to 6 decimals, then save the section-1 table for dataset 2.
dbeta_info["dbeta"] = dbeta_info["dbeta"].map(lambda value: round(value, 6))
dbeta_csv_path = f"{majority_out_path_2}/section_1/dbeta.csv"
dbeta_info.to_csv(dbeta_csv_path, index=False)

### Sec. 2 Filter Genes by Average Delta Beta Values


#### 2.1 Filtering TSS

In [8]:
# Section-2 outputs go to a section_2/ folder under each dataset's output path.
for dataset_out_path in (majority_out_path_1, majority_out_path_2):
    os.makedirs(f"{dataset_out_path}/section_2", exist_ok=True)

In [9]:
dbeta_info_1 = pd.read_csv(f"{majority_out_path_1}/section_1/dbeta.csv")
dbeta_info_2 = pd.read_csv(f"{majority_out_path_2}/section_1/dbeta.csv")

In [10]:
# Keep only the rows whose `feature` annotation mentions "TSS"
# (the recorded outputs show values such as "TSS200" and "TSS1500").
# na=False treats a missing annotation as "not TSS"; without it a NaN in the
# mask makes the boolean indexing raise.
TSS_1 = dbeta_info_1[dbeta_info_1["feature"].str.contains("TSS", na=False)]
TSS_2 = dbeta_info_2[dbeta_info_2["feature"].str.contains("TSS", na=False)]

In [11]:
TSS_1.to_csv(f"{majority_out_path_1}/section_2/dbeta_TSS.csv", index=False)
TSS_2.to_csv(f"{majority_out_path_2}/section_2/dbeta_TSS.csv", index=False)

#### 2.2 Thresholding

In [12]:
from utils.dbeta_avg_helper import detect_threshold

In [13]:
dbeta_TSS_threshold_1, threshold_1 = detect_threshold(TSS_1, config=config, log_postfix="_TCGA")
dbeta_TSS_threshold_1.to_csv(f"{majority_out_path_1}/section_2/dbeta_TSS_{threshold_1}.csv", index=False)

In [14]:
dbeta_TSS_threshold_2, threshold_2 = detect_threshold(TSS_2, config=config, log_postfix="_GEO")
dbeta_TSS_threshold_2.to_csv(f"{majority_out_path_2}/section_2/dbeta_TSS_{threshold_2}.csv", index=False)

#### 2.3 Visualization

In [11]:
from utils.dbeta_avg_helper import dbeta_graph, pca_graph

# One dbeta plot per dataset, saved under the same name as the thresholded CSV.
for thresholded, out_dir, cutoff in (
    (dbeta_TSS_threshold_1, majority_out_path_1, threshold_1),
    (dbeta_TSS_threshold_2, majority_out_path_2, threshold_2),
):
    dbeta_graph(thresholded, f"{out_dir}/section_2/dbeta_TSS_{cutoff}.png")


In [12]:
df_1 = pd.read_csv(df_file_1)

In [13]:
pca_graph(dbeta_info_1, df_1, f"{majority_out_path_1}/section_2/pca.html")

In [14]:
df_2 = pd.read_csv(df_file_2)

In [15]:
pca_graph(dbeta_info_2, df_2, f"{majority_out_path_2}/section_2/pca.html")

#### 2.4 join dbeta_info

In [30]:
os.makedirs(f"{joined_out_path}/section_2", exist_ok=True)

In [16]:
# Reload the thresholded TSS tables written in 2.2.
# NOTE(review): this rebinds dbeta_info_1 / dbeta_info_2, which until here held the
# full section_1 tables; re-running the pca_graph cells after this cell would feed
# them the thresholded tables instead.
dbeta_info_1 = pd.read_csv(f"{majority_out_path_1}/section_2/dbeta_TSS_{threshold_1}.csv")
dbeta_info_2 = pd.read_csv(f"{majority_out_path_2}/section_2/dbeta_TSS_{threshold_2}.csv")

In [31]:
# Inner-join the two thresholded tables on gene; clashing columns get the
# _TCGA / _GEO suffixes, and `gene` is moved to the front before saving.
merged_df = dbeta_info_1.merge(
    dbeta_info_2,
    on="gene",
    how="inner",
    suffixes=('_TCGA', '_GEO'),
)
other_columns = [col for col in merged_df.columns if col != "gene"]
merged_df = merged_df[["gene", *other_columns]]
merged_df.to_csv(f"{joined_out_path}/section_2/dbeta_TSS_threshold_joined.csv", index=False)


#### Remember to run clustering on the filtered genes first, then continue to the next step

### Sec. 3 Feature Selection with ML (SFS)
sequential forward selection

#### Remove previous results
Warning: This step is not reversible

In [None]:
# import shutil
# import os
# if os.path.exists(f"{majority_out_path}/section_3/sfs"):
#     shutil.rmtree(f"{majority_out_path}/section_3/sfs")
# if os.path.exists(f"{minority_out_path}/section_3/sfs"):
#     shutil.rmtree(f"{minority_out_path}/section_3/sfs")

#### 3.1 Preparation(SFS)

In [None]:
config = load_config(CONFIG_PATH)

train_out_path = config["feature_selection"]["hyper"]["train_out_path"]
validate_out_path = config["feature_selection"]["hyper"]["validate_out_path"]

In [None]:
os.makedirs(f"{train_out_path}", exist_ok=True)
os.makedirs(f"{validate_out_path}", exist_ok=True)

In [6]:
# Reload the config and read the dbeta table used for feature selection.
config = load_config(CONFIG_PATH)
dbeta_info_file = config["feature_selection"]["hyper"]["dbeta_info_file"]
# NOTE(review): `majority_out_path` is not assigned in any cell shown above (only
# majority_out_path_1 / majority_out_path_2 are), so this line raises NameError on
# a fresh kernel unless it is defined elsewhere — confirm the intended folder.
dbeta_info = pd.read_csv(f"{majority_out_path}/{dbeta_info_file}")

In [8]:
# check if logs/ folder exists
os.makedirs("logs", exist_ok=True)
from utils.train_helper import TrainHelper

In [9]:
# note that there is setup_dbeta in TrainHelper to further cut down the feature size
th = TrainHelper(dbeta_info)

#### 3.2 Selection(SFS)

In [10]:
# Load the training and validation tables and hand them to the TrainHelper.
# NOTE(review): `majority_out_path`, `majority_df_path`, `minority_out_path` and
# `minority_df_path` are not assigned in any cell shown above, so this cell raises
# NameError on a fresh kernel unless they are defined elsewhere — confirm.
train_df = pd.read_csv(f"{majority_out_path}/{majority_df_path}")
validate_df = pd.read_csv(f"{minority_out_path}/{minority_df_path}")
th.set_train_validate_df(train_df, validate_df)

In [11]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Estimators handed to th.set_selection_models for the SFS run; every model
# uses random_state=42, and the two ensembles use 10 estimators each.
selection_models = dict(
    SVM=SVC(kernel="linear", random_state=42),
    DecisionTree=DecisionTreeClassifier(random_state=42),
    RandomForest=RandomForestClassifier(random_state=42, n_estimators=10),
    XGBoost=XGBClassifier(random_state=42, n_estimators=10),
)

In [12]:
th.set_selection_models(selection_models)
th.set_train_validate()

In [13]:
# Run sequential forward selection and write the selected features to sfs/.
# NOTE(review): `majority_out_path` is not assigned in any cell shown above —
# confirm where it is meant to come from.
os.makedirs(f"{majority_out_path}/sfs", exist_ok=True)

th.select_feature_sfs(
    out_path = f"{majority_out_path}/sfs/selected_feature.txt",
    step= 4,
    n_features_to_select="cluster"
)

INFO Training SVM with SFS
INFO Training SVM with 3 clusters selected
INFO Training finished with 4 clusters selected
INFO Training DecisionTree with SFS
INFO Training DecisionTree with 3 clusters selected
INFO Training finished with 4 clusters selected
INFO Training RandomForest with SFS
INFO Training RandomForest with 3 clusters selected
INFO Training finished with 4 clusters selected
INFO Training XGBoost with SFS
INFO Training XGBoost with 3 clusters selected
INFO Training finished with 4 clusters selected


### Sec. 3 Feature Selection with ML (RFE)
recursive feature elimination

#### Remove previous results
Warning: This step is not reversible

In [16]:
# import shutil
# import os
# if os.path.exists(f"{majority_out_path}/section_3/rfe"):
#     shutil.rmtree(f"{majority_out_path}/section_3/rfe")
# if os.path.exists(f"{minority_out_path}/section_3/rfe"):
#     shutil.rmtree(f"{minority_out_path}/section_3/rfe")

#### 3.1 Preparation

In [None]:
config = load_config(CONFIG_PATH)

# First-round RFE settings, read from config["feature_selection"]["hyper"].
fs_hyper = config["feature_selection"]["hyper"]
train_out_path = fs_hyper["train_out_path"]
validate_out_path = fs_hyper["validate_out_path"]
training_param_file = fs_hyper["training_param_file"]
for out_dir in (train_out_path, validate_out_path):
    os.makedirs(f"{out_dir}", exist_ok=True)

In [None]:
dbeta_info_file = config["feature_selection"]["hyper"]["dbeta_info_file"]
TSS_threshold = pd.read_csv(dbeta_info_file)

In [None]:
from utils.train_helper import TrainHelper
# note that there is setup_dbeta in TrainHelper to further cut down the feature size
th = TrainHelper(TSS_threshold)

#### 3.2 Selection

In [None]:
# from utils.process_norm import *

In [10]:
train_df = pd.read_csv(df_file_1)
validate_df_file = config["feature_selection"]["hyper"]["validate_df_file"]
validate_df = pd.read_csv(validate_df_file)

In [None]:
# train_df = inspect_nan(train_df, mode="column", remove=True)
# train_df

In [20]:
th.set_train_validate_df(train_df, validate_df)

In [None]:
# from utils.train_helper import set_parameters
# from sklearn.svm import SVC
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier
# from xgboost import XGBClassifier

# import json

# with open(training_param_file, "r") as f:
#     training_param = json.load(f)
# xgb_grid = set_parameters(XGBClassifier(random_state=42), training_param["XGBoost"])
# rf_grid = set_parameters(
#     RandomForestClassifier(random_state=42), training_param["RandomForest"]
# )
# svm_grid = set_parameters(SVC(random_state=42, probability=True), training_param["SVM"])
# dt_grid = set_parameters(
#     DecisionTreeClassifier(random_state=42), training_param["DecisionTree"]
# )

# train_models = {
#     "XGBoost": xgb_grid,
#     "RandomForest": rf_grid,
#     "SVM": svm_grid,
#     "DecisionTree": dt_grid,
# }

In [25]:
# Imported in this cell so the RFE section runs without the SFS section, which
# is the only other place these estimators are imported.
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Estimators handed to th.set_selection_models for the first RFE round; every
# model uses random_state=42, and the two ensembles use 50 estimators each.
selection_models = {
    "SVM": SVC(kernel="linear", random_state=42),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "RandomForest": RandomForestClassifier(
        random_state=42,
        n_estimators=50,
    ),
    "XGBoost": XGBClassifier(
        random_state=42,
        n_estimators=50,
    ),
}

In [None]:
th.set_selection_models(selection_models)
# th.set_grid_estimators(train_models)
th.set_train_validate(ID="ID_TCGA")

In [27]:
th.select_feature_rfe(
    train_out_path = train_out_path,
    validate_out_path = validate_out_path,
    selected_feature_path = f"{train_out_path}/selected_feature.txt",
    feature_range = "cluster"
)

INFO Training SVM with RFE
INFO Training SVM with 1 clusters selected
INFO Training SVM with 2 clusters selected
INFO Training SVM with 2 clusters selected
INFO Training SVM with 2 clusters selected
INFO Training SVM with 2 clusters selected
INFO Training SVM with 2 clusters selected
INFO Training SVM with 2 clusters selected
INFO Training SVM with 2 clusters selected
INFO Training finished with 3 clusters selected
INFO Training DecisionTree with RFE
INFO Training DecisionTree with 1 clusters selected
INFO Training DecisionTree with 1 clusters selected
INFO Training DecisionTree with 1 clusters selected
INFO Training DecisionTree with 1 clusters selected
INFO Training DecisionTree with 2 clusters selected
INFO Training DecisionTree with 2 clusters selected
INFO Training DecisionTree with 2 clusters selected
INFO Training DecisionTree with 2 clusters selected
INFO Training DecisionTree with 2 clusters selected
INFO Training DecisionTree with 2 clusters selected
INFO Training DecisionTre

#### 3.3 Preparation (second round)

In [29]:
config = load_config(CONFIG_PATH)

# Second-round RFE settings, read from config["feature_selection_2"]["hyper"].
fs2_hyper = config["feature_selection_2"]["hyper"]
train_out_path = fs2_hyper["train_out_path"]
validate_out_path = fs2_hyper["validate_out_path"]
training_param_file = fs2_hyper["training_param_file"]
for out_dir in (train_out_path, validate_out_path):
    os.makedirs(f"{out_dir}", exist_ok=True)

In [30]:
# Second round: read the dbeta table from the feature_selection_2 section, like
# every other second-round setting (the original read it from the first-round
# section "feature_selection").
# NOTE(review): if both sections point at the same joined file this changes
# nothing; otherwise confirm the second-round file is the intended one.
dbeta_info_file = config["feature_selection_2"]["hyper"]["dbeta_info_file"]
TSS_threshold = pd.read_csv(dbeta_info_file)

In [31]:
from utils.train_helper import TrainHelper
# note that there is setup_dbeta in TrainHelper to further cut down the feature size
th = TrainHelper(TSS_threshold)

#### 3.4 Selection (second round)

In [32]:
train_df = pd.read_csv(df_file_2)
validate_df_file = config["feature_selection_2"]["hyper"]["validate_df_file"]
validate_df = pd.read_csv(validate_df_file)

In [33]:
th.set_train_validate_df(train_df, validate_df)

In [34]:
# Estimators handed to th.set_selection_models for the second RFE round; every
# model uses random_state=42, and the two ensembles use 50 estimators each.
selection_models = dict(
    SVM=SVC(kernel="linear", random_state=42),
    DecisionTree=DecisionTreeClassifier(random_state=42),
    RandomForest=RandomForestClassifier(random_state=42, n_estimators=50),
    XGBoost=XGBClassifier(random_state=42, n_estimators=50),
)

In [35]:
th.set_selection_models(selection_models)
# th.set_grid_estimators(train_models)
th.set_train_validate(ID="ID_GEO")

In [36]:
th.select_feature_rfe(
    train_out_path = train_out_path,
    validate_out_path = validate_out_path,
    selected_feature_path = f"{train_out_path}/selected_feature.txt",
    feature_range = "cluster"
)

INFO Training SVM with RFE
INFO Training SVM with 2 clusters selected
INFO Training SVM with 2 clusters selected
INFO Training SVM with 2 clusters selected
INFO Training SVM with 2 clusters selected
INFO Training SVM with 2 clusters selected
INFO Training SVM with 2 clusters selected
INFO Training SVM with 2 clusters selected
INFO Training SVM with 2 clusters selected
INFO Training SVM with 2 clusters selected
INFO Training SVM with 2 clusters selected
INFO Training SVM with 2 clusters selected
INFO Training SVM with 2 clusters selected
INFO Training SVM with 2 clusters selected
INFO Training SVM with 2 clusters selected
INFO Training SVM with 2 clusters selected
INFO Training SVM with 2 clusters selected
INFO Training SVM with 2 clusters selected
INFO Training SVM with 2 clusters selected
INFO Training SVM with 2 clusters selected
INFO Training finished with 3 clusters selected
INFO Training DecisionTree with RFE
INFO Training DecisionTree with 2 clusters selected
INFO Training Decisi

### Sec. 4 Clean Selected Features

#### 4.1 Generate Feature json for SimpleModel

In [37]:
from utils.train_helper import read_selected_features, read_selected_features_json, TrainHelper

# Build a TrainHelper on the first-round dbeta table.
config = load_config(CONFIG_PATH)
dbeta_info_file = config["feature_selection"]["hyper"]["dbeta_info_file"]
dbeta_info = pd.read_csv(dbeta_info_file)
th = TrainHelper(dbeta_info)

# Convert the selected-feature text file of dataset 1 into JSON (mode="min").
method = "rfe"
method_dir = f"{majority_out_path_1}/section_3/{method}"
selected_json_path = f"{method_dir}/selected_features.json"
features = read_selected_features(f"{method_dir}/selected_feature.txt")
th.generate_selected_features(features, selected_json_path, mode="min", out_format="json")

# Read the JSON back; as the last expression it is the cell's displayed output.
read_selected_features_json(selected_json_path)

defaultdict(list,
            {'SVM': ['CCDC8', 'DEGS1', 'TUBB6', 'HOXD12', 'MIR654'],
             'DecisionTree': ['FOXD2', 'HOXD8', 'ZFP42', 'TRIM59', 'MIR654'],
             'RandomForest': ['ALX4', 'FOXD2', 'CCDC8', 'MIR1197'],
             'XGBoost': ['ALX4', 'FOXD2', 'TAC1', 'MIR654']})

In [38]:
from utils.train_helper import read_selected_features, read_selected_features_json, TrainHelper

# Build a TrainHelper on the second-round dbeta table.
config = load_config(CONFIG_PATH)
dbeta_info_file = config["feature_selection_2"]["hyper"]["dbeta_info_file"]
dbeta_info = pd.read_csv(dbeta_info_file)
th = TrainHelper(dbeta_info)

# Convert the selected-feature text file of dataset 2 into JSON (mode="min").
method = "rfe"
method_dir = f"{majority_out_path_2}/section_3/{method}"
selected_json_path = f"{method_dir}/selected_features.json"
features = read_selected_features(f"{method_dir}/selected_feature.txt")
th.generate_selected_features(features, selected_json_path, mode="min", out_format="json")

# use this to read json
read_selected_features_json(selected_json_path)

defaultdict(list,
            {'SVM': ['ALX1', 'TBX20', 'EMID2', 'MIR495'],
             'DecisionTree': ['ESRRG', 'TBX20', 'TMEM196', 'MIR654'],
             'RandomForest': ['ESRRG', 'MEOX2', 'TBX20', 'LIMD2', 'MIR495'],
             'XGBoost': ['MEOX2', 'TBX20', 'SNORD93', 'MIR10B']})

#### 4.2 Gather Selected gene list from best selection model

In [23]:
# Join the train/validate RFE metrics and ROC points on the run identifiers.
merge_keys = ["selection_model", "train_model", "features"]
rfe_train = pd.read_csv(f"{train_out_path}/rfe.csv")
rfe_validate = pd.read_csv(f"{validate_out_path}/rfe.csv")
fpr_tpr_train = pd.read_csv(f"{train_out_path}/roc_curve.csv")
fpr_tpr_validate = pd.read_csv(f"{validate_out_path}/roc_curve.csv")
rfe_j = rfe_train.merge(rfe_validate, on=merge_keys, suffixes=('_train', '_validate'))
fpr_tpr_j = fpr_tpr_train.merge(fpr_tpr_validate, on=merge_keys, suffixes=('_train', '_validate'))
# NOTE: J is rebound here; in Sec. 0 the same name holds the loaded JSON config.
J = rfe_j.merge(fpr_tpr_j, on=merge_keys)

In [24]:
import ast

# The ROC columns are read from CSV as text; parse each cell back into a Python
# literal. Re-running this cell on already-parsed values will fail.
for curve_column in ("fpr_train", "tpr_train", "fpr_validate", "tpr_validate"):
    J[curve_column] = J[curve_column].apply(ast.literal_eval)

In [25]:
from utils.painter import plot_roc_curve, create_performance_barchart

In [26]:
# Train-minus-validate gap for each performance metric.
for metric in ("accuracy", "recall", "f1_score", "AUC", "MCC", "fbeta2_score"):
    J[f"{metric}_diff"] = J[f"{metric}_train"] - J[f"{metric}_validate"]

In [28]:
# One ROC figure per split, each saved to that split's output folder.
# (original note: width and height are tweakable)
for split, heading, out_dir in (
    ("train", "ROC Curves on Training Set", train_out_path),
    ("validate", "ROC Curves on Testing Set", validate_out_path),
):
    plot_roc_curve(
        J,
        heading,
        f"{out_dir}/roc_curve.html",
        x_column=f"fpr_{split}",
        y_column=f"tpr_{split}",
        trace_name=["selection_model", "train_model", "features"],
    )

ROC curve saved to ../lung/result/GDC_lung_tissue/split80/section_3/rfe/roc_curve.html
ROC curve saved to ../lung/result/GDC_lung_tissue/split20/section_3/rfe/roc_curve.html


In [29]:
# Mean train-minus-validate gap per training model, saved as CSV and bar chart.
performance_metrics = [
    f"{metric}_diff"
    for metric in ("accuracy", "recall", "f1_score", "AUC", "MCC", "fbeta2_score")
]
color_mapping = dict(
    zip(performance_metrics, ("blue", "red", "green", "purple", "orange", "brown"))
)
ground_by_train_model = (
    J.groupby('train_model')[performance_metrics]
    .mean()
    # copy the group labels into a column because the CSV is written without the index
    .assign(train_model=lambda grouped: grouped.index)
)
ground_by_train_model.to_csv(
    f"{validate_out_path}/performance_diff_grouped_by_train_model.csv", index=False
)
create_performance_barchart(
    df=ground_by_train_model,
    color_mapping=color_mapping,
    metric="train_model",
    out_path=f"{validate_out_path}/performance_diff_grouped_by_train_model.html",
    title="Grouped Performance Difference between Training and Testing Set",
    x_axis_label="Performance Difference (Training - Testing)",
    y_axis_label="Train Model",
    orientation="h",
)

Performance difference saved to ../lung/result/GDC_lung_tissue/split20/section_3/rfe/performance_diff_grouped_by_train_model.html


In [30]:
J = J[["selection_model", "train_model", "features", "accuracy_validate", "recall_validate", "f1_score_validate", "AUC_validate", "MCC_validate", "fbeta2_score_validate"]]

In [31]:
# Validation metrics that are averaged and charted, with one colour per metric.
performance_metrics = [
    'accuracy_validate',
    'recall_validate',
    'f1_score_validate',
    'AUC_validate',
    'MCC_validate',
]
color_mapping = dict(
    zip(performance_metrics, ("blue", "red", "green", "purple", "orange"))
)

# Step 1: mean validation metrics per training model.
ground_by_train_model = (
    J.groupby('train_model')[performance_metrics]
    .mean()
    # copy the group labels into a column because the CSV is written without the index
    .assign(train_model=lambda grouped: grouped.index)
)
ground_by_train_model.to_csv(
    f"{validate_out_path}/performance_metrics_grouped_by_train_model.csv", index=False
)
create_performance_barchart(
    df=ground_by_train_model,
    color_mapping=color_mapping,
    metric="train_model",
    out_path=f"{validate_out_path}/performance_metrics_grouped_by_train_model.html",
    title="Grouped Performance Metrics by Train Model",
    x_axis_label="Performance",
    y_axis_label="Train Model",
    orientation="h",
)

# Step 2: the best training model is the one with the highest mean validation MCC.
best_train_model = ground_by_train_model['MCC_validate'].idxmax()
print(f"Best train model: {best_train_model}")

# Step 3: within that model, mean validation metrics per value of `features`.
is_best_model = J['train_model'] == best_train_model
ground_by_feature = (
    J.loc[is_best_model]
    .groupby('features')[performance_metrics]
    .mean()
    .assign(features=lambda grouped: grouped.index)
)
ground_by_feature.to_csv(
    f"{validate_out_path}/performance_metrics_grouped_by_feature.csv", index=False
)
create_performance_barchart(
    df=ground_by_feature,
    color_mapping=color_mapping,
    metric="features",
    out_path=f"{validate_out_path}/performance_metrics_grouped_by_feature.html",
    title="Grouped Performance Metrics by Feature",
    x_axis_label="Performance",
    y_axis_label="Feature",
    orientation="h",
)

# Step 4: keep the records of the best model at its best `features` value.
best_num_of_feature = ground_by_feature['MCC_validate'].idxmax()
print(f"Best number of feature: {best_num_of_feature}")
is_best_feature_count = J['features'] == best_num_of_feature
best_performance_records = J.loc[is_best_model & is_best_feature_count]
best_performance_records.to_csv(
    f"{validate_out_path}/best_performance_records.csv", index=False
)

Performance difference saved to ../lung/result/GDC_lung_tissue/split20/section_3/rfe/performance_metrics_grouped_by_train_model.html
Best train model: XGBoost
Performance difference saved to ../lung/result/GDC_lung_tissue/split20/section_3/rfe/performance_metrics_grouped_by_feature.html
Best number of feature: 5


In [32]:
from utils.train_helper import read_selected_features, read_selected_features_json, TrainHelper

# Build a TrainHelper on the first-round dbeta table.
config = load_config(CONFIG_PATH)
dbeta_info_file = config["feature_selection"]["hyper"]["dbeta_info_file"]
dbeta_info = pd.read_csv(dbeta_info_file)
th = TrainHelper(dbeta_info)

# Write the selected features as JSON, with the best feature count found above
# passed as `mode`.
features = read_selected_features(f"{train_out_path}/selected_feature.txt")
best_json_path = f"{validate_out_path}/selected_features.json"
th.generate_selected_features(
    features,
    best_json_path,
    mode=int(best_num_of_feature),
    out_format="json",
)

# use this to read json
read_selected_features_json(best_json_path)

defaultdict(list,
            {'best': ['SNORD115-10',
              'TMEM196',
              'MIR377',
              'PCK1',
              'PATE2',
              'RALYL',
              'ATP5G2',
              'C1orf150',
              'C1orf114',
              'PCDHB15',
              'ATG16L1',
              'DLC1',
              'DOC2A',
              'AIM2',
              'DCD',
              'SCARF1',
              'BHLHE23']})

### Sec. 5 Clustering Visualization

#### 1. load data

Remember to calculate the distance matrices first.

In [81]:
import pandas as pd
import numpy as np
from utils.clustering_helper import hierarchical_clustering, check_distance_matrix

In [82]:
# File locations and naming for the clustering visualisation, read from
# config["clustering_visual"]["hyper"].
cv_hyper = config["clustering_visual"]["hyper"]
(
    result_prefix,
    dbeta_file,
    bp_file,
    cc_file,
    mf_file,
    terms_count_file,
    result_out_path,
) = (
    cv_hyper[key]
    for key in (
        "result_prefix",
        "dbeta_file",
        "bp_file",
        "cc_file",
        "mf_file",
        "terms_count_file",
        "result_out_path",
    )
)

In [83]:
os.makedirs(result_out_path, exist_ok=True)

In [84]:
gene_set = pd.read_csv(dbeta_file, index_col=0)
distance_matrix_bp = pd.read_csv(bp_file, index_col=0)
distance_matrix_cc = pd.read_csv(cc_file, index_col=0)
distance_matrix_mf = pd.read_csv(mf_file, index_col=0)
terms_count = pd.read_csv(terms_count_file, index_col=0)

In [85]:
# Replace every NaN in the three ontology distance matrices with 0.
distance_matrix_bp, distance_matrix_cc, distance_matrix_mf = (
    matrix.fillna(0)
    for matrix in (distance_matrix_bp, distance_matrix_cc, distance_matrix_mf)
)

In [86]:
# Put the three matrices on one shared gene index (the union of their indexes);
# rows/columns a matrix lacks are filled with 0.
index_bp, index_cc, index_mf = (
    matrix.index
    for matrix in (distance_matrix_bp, distance_matrix_cc, distance_matrix_mf)
)
index = index_bp.union(index_cc).union(index_mf)
distance_matrix_bp_, distance_matrix_cc_, distance_matrix_mf_ = (
    matrix.reindex(index=index, columns=index, fill_value=0)
    for matrix in (distance_matrix_bp, distance_matrix_cc, distance_matrix_mf)
)

In [87]:
# Reindexed distance matrices in the order bp, cc, mf.
distance_matrix = [distance_matrix_bp_, distance_matrix_cc_, distance_matrix_mf_]

#### 2. Weighted Sum

In [88]:
# Weight each ontology's distance matrix by its share of the term counts.
# NOTE(review): assumes terms_count["count"] lists the ontologies in the same
# order as `distance_matrix` — confirm.
weight = terms_count["count"].to_numpy()
weight = weight / np.sum(weight)

n_matrices = len(distance_matrix)
# True where a matrix has a real value at (i, j).
# NOTE(review): the matrices were NaN-filled with 0 and reindexed with
# fill_value=0 in the load cells, so these masks are currently all True and a
# gene missing from an ontology contributes distance 0 rather than being
# skipped — confirm this is intended.
masks = np.array([~np.isnan(distance_matrix[i].values) for i in range(n_matrices)])

valid_weights = np.array([weight[i] for i in range(n_matrices)])[:, None, None] * masks

weight_sums = valid_weights.sum(axis=0)

# `out` is required together with `where`: without it, entries where
# weight_sums == 0 would be left as uninitialised memory.
normalized_weights = np.divide(
    valid_weights,
    weight_sums,
    out=np.zeros(valid_weights.shape, dtype=float),
    where=weight_sums != 0,
)
weighted_sum = sum(
    np.nan_to_num(distance_matrix[i].values) * normalized_weights[i]
    for i in range(n_matrices)
)


weighted_sum_dataframe = pd.DataFrame(weighted_sum, index=index, columns=index)

weighted_sum_dataframe.head()

Unnamed: 0,ADAMTS20,ADCY4,ALX1,ALX4,C1orf114,C1orf150,CA3,CCDC8,CCDC81,CFTR,...,TAC1,TBX20,TMEM196,TRIM59,TUBB6,WDR8,WNT3,ZFP42,ZNF781,ZSCAN18
ADAMTS20,0.0,0.577176,0.811536,0.793435,0.288938,0.673585,0.724191,0.763671,0.243684,0.810076,...,0.718813,0.792564,0.469702,0.664124,0.746487,0.763971,0.691306,0.797409,0.772388,0.774362
ADCY4,0.577176,0.0,0.746949,0.773581,0.154626,0.45256,0.677446,0.575162,0.040336,0.727661,...,0.577385,0.793741,0.480065,0.640341,0.685082,0.616422,0.685534,0.717358,0.700533,0.67443
ALX1,0.811536,0.746949,0.0,0.329106,0.159371,0.594067,0.791724,0.632017,0.049826,0.782787,...,0.787421,0.407062,0.604176,0.778586,0.79339,0.650497,0.651433,0.400449,0.196909,0.151033
ALX4,0.793435,0.773581,0.329106,0.0,0.164211,0.610369,0.80162,0.635208,0.05609,0.740674,...,0.744857,0.403263,0.600006,0.799133,0.779394,0.650163,0.548934,0.468631,0.132878,0.056669
C1orf114,0.288938,0.154626,0.159371,0.164211,0.0,0.108026,0.142762,0.147033,0.137163,0.159371,...,0.267654,0.225775,0.040146,0.16554,0.135834,0.137922,0.169811,0.155385,0.265407,0.164211


In [89]:
cluster_result_weighted = hierarchical_clustering(
    weighted_sum_dataframe,
    range_min=2,
    range_max=4,
    cluster_number=3,
    out_path=f"{result_out_path}/hierarchical_clustering_weighted_sum.png",
)

chosen number of clusters: 3


In [90]:
cluster_result_weighted.head()

Unnamed: 0,gene,cluster
0,ADAMTS20,3
1,ADCY4,3
2,ALX1,1
3,ALX4,1
4,C1orf114,3


#### 3. Simple average

In [91]:
# Equal weights: each ontology's distance matrix counts the same.
weight = [1, 1, 1]
n_matrices = len(distance_matrix)
# True where a matrix has a real value at (i, j).
# NOTE(review): the matrices were NaN-filled with 0 and reindexed with
# fill_value=0 in the load cells, so these masks are currently all True —
# confirm this is intended.
masks = np.array([~np.isnan(distance_matrix[i].values) for i in range(n_matrices)])
valid_weights = np.array([weight[i] for i in range(n_matrices)])[:, None, None] * masks
weight_sums = valid_weights.sum(axis=0)
# `out` is required together with `where`: without it, entries where
# weight_sums == 0 would be left as uninitialised memory.
normalized_weights = np.divide(
    valid_weights,
    weight_sums,
    out=np.zeros(valid_weights.shape, dtype=float),
    where=weight_sums != 0,
)
weighted_sum = sum(
    np.nan_to_num(distance_matrix[i].values) * normalized_weights[i]
    for i in range(n_matrices)
)
simple_sum_dataframe = pd.DataFrame(weighted_sum, index=index, columns=index)
simple_sum_dataframe.head()

Unnamed: 0,ADAMTS20,ADCY4,ALX1,ALX4,C1orf114,C1orf150,CA3,CCDC8,CCDC81,CFTR,...,TAC1,TBX20,TMEM196,TRIM59,TUBB6,WDR8,WNT3,ZFP42,ZNF781,ZSCAN18
ADAMTS20,0.0,0.600667,0.799667,0.792,0.524,0.491,0.645667,0.719333,0.456667,0.790333,...,0.68,0.781333,0.407,0.663667,0.696,0.738,0.631667,0.752333,0.742667,0.75
ADCY4,0.600667,0.0,0.678333,0.703,0.306,0.231333,0.527333,0.386,0.141667,0.600667,...,0.549333,0.685,0.294667,0.597667,0.577,0.466333,0.581667,0.603,0.646,0.602667
ALX1,0.799667,0.678333,0.0,0.220667,0.322667,0.303667,0.684667,0.435,0.175,0.687667,...,0.711667,0.277667,0.468333,0.632333,0.637333,0.480333,0.590667,0.368667,0.182667,0.114333
ALX4,0.792,0.703,0.220667,0.0,0.339667,0.312,0.709,0.440333,0.197,0.675,...,0.691333,0.305667,0.485,0.682333,0.642333,0.471333,0.540667,0.36,0.143,0.074667
C1orf114,0.524,0.306,0.322667,0.339667,0.0,0.142333,0.264333,0.279333,0.244667,0.322667,...,0.487,0.379333,0.141,0.344333,0.24,0.247333,0.359333,0.308667,0.473,0.339667


In [92]:
cluster_result_simple = hierarchical_clustering(
    simple_sum_dataframe,
    range_min=2,
    range_max=4,
    cluster_number=3,
    out_path=f"{result_out_path}/hierarchical_clustering_simple_sum.png",
)

chosen number of clusters: 3


In [93]:
cluster_result_simple.head()

Unnamed: 0,gene,cluster
0,ADAMTS20,3
1,ADCY4,3
2,ALX1,1
3,ALX4,1
4,C1orf114,3


#### 4. Consensus clustering 

In [94]:
cluster_bp = hierarchical_clustering(
    distance_matrix_bp, out_path=f"{result_out_path}/hierarchical_clustering_bp.png"
)
cluster_cc = hierarchical_clustering(
    distance_matrix_cc, out_path=f"{result_out_path}/hierarchical_clustering_cc.png"
)
cluster_mf = hierarchical_clustering(
    distance_matrix_mf, out_path=f"{result_out_path}/hierarchical_clustering_mf.png"
)

Best number of clusters: 24
Best number of clusters: 7
Best number of clusters: 7


In [95]:
# Name each ontology's cluster column, then outer-join the three tables on gene;
# a gene missing from an ontology gets the placeholder label -1.
for per_ontology, label_column in (
    (cluster_bp, "cluster_bp"),
    (cluster_cc, "cluster_cc"),
    (cluster_mf, "cluster_mf"),
):
    per_ontology.columns = ["gene", label_column]
cluster_bp_cc = cluster_bp.merge(cluster_cc, on="gene", how="outer")
cluster_go = cluster_bp_cc.merge(cluster_mf, on="gene", how="outer").fillna(-1)
print(cluster_go.shape)
cluster_go.head()

(91, 4)


Unnamed: 0,gene,cluster_bp,cluster_cc,cluster_mf
0,ADAMTS20,14.0,4.0,7.0
1,ADCY4,5.0,5.0,3.0
2,ALX1,1.0,1.0,1.0
3,ALX4,2.0,1.0,1.0
4,C1orf114,-1.0,7.0,3.0


In [96]:
# Consensus distance between two genes:
#   1 - (number of ontologies in which they share a cluster label) / 3.
# NOTE(review): missing labels were filled with -1 when cluster_go was built, so
# two genes that are both absent from an ontology compare equal here and count
# as co-clustered — confirm this is intended.
num_genes = cluster_go.shape[0]
labels = cluster_go[["cluster_bp", "cluster_cc", "cluster_mf"]].to_numpy()

# agreement[i, j] = number of ontologies where gene i and gene j share a label.
# One broadcast comparison replaces the per-pair .iloc lookups of a nested loop.
agreement = (labels[:, None, :] == labels[None, :, :]).sum(axis=2)

# Keep the upper triangle (diagonal included), then mirror it below the
# diagonal; the diagonal is therefore counted twice, as in the loop version.
consensus_matrix = pd.DataFrame(
    np.triu(agreement).astype(float),
    index=cluster_go["gene"],
    columns=cluster_go["gene"],
)
consensus_matrix += consensus_matrix.T

# Compute on a fresh array and zero its diagonal there, rather than writing
# through DataFrame.values, which is not guaranteed to be a writable view.
distance_values = 1 - consensus_matrix.to_numpy() / 3
np.fill_diagonal(distance_values, 0)
distance_matrix_consensus = pd.DataFrame(
    distance_values,
    index=consensus_matrix.index,
    columns=consensus_matrix.columns,
)
distance_matrix_consensus.head()

gene,ADAMTS20,ADCY4,ALX1,ALX4,C1orf114,C1orf150,CA3,CCDC8,CCDC81,CFTR,...,TAC1,TBX20,TMEM196,TRIM59,TUBB6,WDR8,WNT3,ZFP42,ZNF781,ZSCAN18
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ADAMTS20,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.666667,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
ADCY4,1.0,0.0,1.0,1.0,0.666667,0.666667,0.666667,0.666667,0.666667,0.333333,...,0.666667,1.0,1.0,0.666667,0.666667,0.666667,0.333333,1.0,1.0,1.0
ALX1,1.0,1.0,0.0,0.333333,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,0.333333,1.0,1.0,1.0,1.0,1.0,0.333333,0.333333,0.333333
ALX4,1.0,1.0,0.333333,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,0.333333,1.0,1.0,1.0,1.0,1.0,0.666667,0.666667,0.333333
C1orf114,1.0,0.666667,1.0,1.0,0.0,0.666667,0.333333,0.333333,0.0,0.666667,...,0.666667,1.0,0.666667,0.666667,0.333333,0.333333,0.666667,0.666667,1.0,1.0


In [105]:
cluster_result_consensus = hierarchical_clustering(
    distance_matrix_consensus,
    range_min=2,
    range_max=4,
    cluster_number=3,
    out_path=f"{result_out_path}/hierarchical_clustering_consensus.png",
)

chosen number of clusters: 3


In [106]:
cluster_result_consensus.head()

Unnamed: 0,gene,cluster
0,ADAMTS20,3
1,ADCY4,3
2,ALX1,1
3,ALX4,1
4,C1orf114,3


#### 5. Compare 

In [107]:
from utils.clustering_helper import hierarchical_clustering_compare

# Compare the three distance matrices side by side; the labels are given in the
# same order as the matrices.
# NOTE(review): range_min/range_max presumably bound the cluster counts evaluated for
# each matrix, and out_path is presumably where the comparison figure is saved — confirm.
hierarchical_clustering_compare(
    [weighted_sum_dataframe, simple_sum_dataframe, distance_matrix_consensus],
    ["Weighted Average", "Simple Average", "Consensus"],
    out_path=f"{result_out_path}/hierarchical_clustering_compare.png",
    range_min=2,
    range_max=4,
)

Best number of clusters for Weighted Average: 4
Best number of clusters for Simple Average: 3
Best number of clusters for Consensus: 3


In [108]:
# Load the dbeta table from the CSV path held in `dbeta_file`; it is filtered by
# its "gene" column in the next cell.
dbeta_info = pd.read_csv(dbeta_file)

In [109]:
# For each distance matrix, keep only the dbeta rows whose "gene" value appears
# in that matrix's index. The three results are unpacked in the same order as
# the matrices are listed.
weighted_dbeta, simple_dbeta, consensus_dbeta = (
    dbeta_info.loc[dbeta_info["gene"].isin(distance_matrix.index)]
    for distance_matrix in (
        weighted_sum_dataframe,
        simple_sum_dataframe,
        distance_matrix_consensus,
    )
)

In [110]:
# Each entry pairs a filtered dbeta table with its clustering result and the CSV
# file the joined table is written to.
cluster_exports = (
    (
        weighted_dbeta,
        cluster_result_weighted,
        f"{result_out_path}/{result_prefix}_weighted.csv",
    ),
    (
        simple_dbeta,
        cluster_result_simple,
        f"{result_out_path}/{result_prefix}_simple.csv",
    ),
    (
        consensus_dbeta,
        cluster_result_consensus,
        f"{result_out_path}/{result_prefix}_consensus.csv",
    ),
)

# Join on "gene" and write each merged table without the row index.
for dbeta_subset, cluster_table, csv_path in cluster_exports:
    merged_table = dbeta_subset.merge(cluster_table, on="gene")
    merged_table.to_csv(csv_path, index=False)

### Sec. 6 SimpleModel Training

#### 1. Load Data

In [40]:
from utils.train_helper import read_selected_features_json
import pandas as pd
from utils.process_norm import *

In [49]:
# All settings for this section live under [simple_model.hyper] in the config;
# look the table up once and pull the individual entries from it.
simple_model_hyper = config["simple_model"]["hyper"]

# Input tables and feature lists.
dbeta_file = simple_model_hyper["dbeta_file"]
selected_feature_file = simple_model_hyper["selected_feature_file"]
selected_feature_file_2 = simple_model_hyper["selected_feature_file_2"]

# Output locations for training and validation results.
train_out_path = simple_model_hyper["train_out_path"]
validate_out_path = simple_model_hyper["validate_out_path"]

# Train/test CSVs for the first and second data source.
df_train_file = simple_model_hyper["df_train_file"]
df_test_file = simple_model_hyper["df_test_file"]
df_train_file_2 = simple_model_hyper["df_train_file_2"]
df_test_file_2 = simple_model_hyper["df_test_file_2"]

# JSON file holding the per-estimator training parameters.
training_param_file = simple_model_hyper["training_param_file"]

In [50]:
# Load the two selected-feature definitions; both are iterated with .items()
# by the training cells below.
selected_feature = read_selected_features_json(selected_feature_file)
selected_feature_2 = read_selected_features_json(selected_feature_file_2)

# Load the dbeta table. The CSV path is read straight from the config instead of
# from `dbeta_file` itself: this cell rebinds `dbeta_file` from a path to a
# DataFrame, so `pd.read_csv(dbeta_file)` would fail when the cell is re-run.
# The name `dbeta_file` is kept because the training cells pass it as `dbeta_info=`.
dbeta_file = pd.read_csv(config["simple_model"]["hyper"]["dbeta_file"])

In [26]:
# Load the first train/test pair from the CSV paths taken from the config.
train = pd.read_csv(df_train_file)
test = pd.read_csv(df_test_file)

In [29]:
# Check the training frame for NaNs column-wise; the logged output lists the
# affected columns. With remove=True the helper presumably drops them — confirm
# in utils.process_norm.
# NOTE(review): `train` is overwritten here and `test` is not passed through the
# same check — verify that this asymmetry is intended.
train = inspect_nan(train, mode="column", remove=True)

INFO Columns with NaNs: {'102': np.float64(1.0)}


In [32]:
# Load the second train/test pair; these are merged into `train` / `test` below.
train_2 = pd.read_csv(df_train_file_2)
test_2 = pd.read_csv(df_test_file_2)

In [34]:
# Combine the two training frames with merge_datasets and display the result.
# NOTE(review): `train` is rebound to the merged frame, so re-running this cell
# merges `train_2` into an already-merged frame — re-run the loading cells first.
train = merge_datasets(train, train_2)
train

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,616,617,618,619,620,621,622,623,624,625
0,cg00000957,0.873862,0.892332,0.883532,0.841300,0.878719,0.867264,0.870131,0.818329,0.852370,...,0.874586,0.872533,0.835085,0.895114,0.826937,0.849945,0.887410,0.790712,0.823082,0.874806
1,cg00001583,0.106812,0.068196,0.108394,0.139665,0.079447,0.146764,0.130379,0.090545,0.143240,...,0.504870,0.423366,0.603300,0.513820,0.210275,0.500739,0.043536,0.438405,0.187056,0.469474
2,cg00002028,0.035183,0.021341,0.029587,0.037926,0.032354,0.026836,0.027859,0.031436,0.035310,...,0.036546,0.018422,0.052466,0.057909,0.042918,0.047720,0.030559,0.022480,0.020501,0.024932
3,cg00002719,0.019621,0.019265,0.044525,0.062630,0.046182,0.027035,0.024905,0.056112,0.034194,...,0.026150,0.020414,0.707763,0.738072,0.428430,0.034929,0.040819,0.440246,0.739535,0.474644
4,cg00002837,0.218565,0.276446,0.312116,0.293055,0.241271,0.242700,0.317879,0.279957,0.379785,...,0.171936,0.484733,0.257450,0.474271,0.424260,0.298486,0.442149,0.360239,0.114212,0.604305
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294377,cg27657363,0.940878,0.965408,0.944938,0.949426,0.955888,0.906193,0.950697,0.921229,0.944898,...,0.914353,0.779031,0.931848,0.943105,0.928437,0.842096,0.953970,0.919841,0.918249,0.958372
294378,cg27657537,0.055777,0.064906,0.045291,0.158475,0.102932,0.056264,0.060546,0.075328,0.094149,...,0.085146,0.174162,0.053053,0.098441,0.073162,0.096444,0.063295,0.054044,0.178369,0.075318
294379,cg27662611,0.047670,0.044439,0.020585,0.058815,0.047977,0.044475,0.034528,0.052030,0.052138,...,0.067540,0.028001,0.064793,0.046730,0.030347,0.048390,0.058986,0.035668,0.043356,0.020780
294380,cg27665648,0.795897,0.891454,0.838094,0.764569,0.835856,0.818406,0.859977,0.774933,0.745618,...,0.637837,0.668517,0.901890,0.876000,0.749287,0.691579,0.586243,0.866546,0.453257,0.801099


In [35]:
# Combine the two test frames with merge_datasets and display the result.
# NOTE(review): `test` is rebound to the merged frame, so re-running this cell
# merges `test_2` into an already-merged frame — re-run the loading cells first.
test = merge_datasets(test, test_2)
test

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,200,201,202,203,204,205,206,207,208,209
0,cg00000957,0.880918,0.889138,0.874959,0.885774,0.896747,0.879334,0.858834,0.837964,0.876630,...,0.824397,0.897076,0.884351,0.891127,0.937575,0.778774,0.867760,0.874922,0.839929,0.889266
1,cg00001583,0.066911,0.076375,0.096527,0.040940,0.083554,0.087588,0.061910,0.069474,0.093006,...,0.576252,0.053651,0.576651,0.146280,0.380901,0.073419,0.165535,0.066819,0.094443,0.132342
2,cg00002028,0.023719,0.037065,0.021660,0.010759,0.010474,0.027399,0.024558,0.035275,0.017022,...,0.137957,0.097596,0.101154,0.114899,0.162179,0.096768,0.092990,0.058361,0.124010,0.119324
3,cg00002719,0.018851,0.024845,0.023367,0.012765,0.010090,0.017774,0.046654,0.026357,0.026142,...,0.801550,0.147161,0.645241,0.285699,0.941672,0.161863,0.075074,0.279016,0.330795,0.039894
4,cg00002837,0.231168,0.263481,0.326748,0.212321,0.233503,0.228290,0.271203,0.376430,0.252944,...,0.243319,0.499297,0.359858,0.280982,0.340304,0.419111,0.396845,0.550041,0.452268,0.356127
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294377,cg27657363,0.965252,0.881412,0.961724,0.957276,0.956891,0.954883,0.953429,0.948396,0.957826,...,0.856380,0.791323,0.927085,0.842508,0.755541,0.882560,0.884853,0.734444,0.651562,0.877115
294378,cg27657537,0.049765,0.117931,0.061232,0.034888,0.055366,0.067702,0.054141,0.083854,0.062192,...,0.038267,0.039257,0.054078,0.060248,0.065395,0.052062,0.068284,0.049476,0.031694,0.089036
294379,cg27662611,0.042660,0.063932,0.055080,0.034882,0.017662,0.042007,0.042645,0.047950,0.047642,...,0.025175,0.025053,0.013499,0.017525,0.135137,0.031272,0.009832,0.023495,0.026024,0.017780
294380,cg27665648,0.872041,0.797600,0.829041,0.863131,0.896679,0.838608,0.709252,0.742108,0.832386,...,0.793646,0.676673,0.932568,0.792866,0.962920,0.757222,0.865069,0.638925,0.638460,0.927500


#### 2. Training

In [43]:
from utils.train_helper import set_parameters
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

import json
from sklearn.ensemble import VotingClassifier


# Read the per-estimator training parameters from the JSON file named in the config.
with open(training_param_file, "r") as param_handle:
    training_param = json.load(param_handle)

# One wrapped estimator per algorithm, each seeded with random_state=42.
# NOTE(review): set_parameters presumably combines the estimator with its parameter
# settings into a grid search — confirm in utils.train_helper.
xgb_grid = set_parameters(
    XGBClassifier(random_state=42),
    training_param["XGBoost"],
)
rf_grid = set_parameters(
    RandomForestClassifier(random_state=42),
    training_param["RandomForest"],
)
svm_grid = set_parameters(
    SVC(random_state=42, probability=True),
    training_param["SVM"],
)
dt_grid = set_parameters(
    DecisionTreeClassifier(random_state=42),
    training_param["DecisionTree"],
)

# Soft-voting ensemble built from freshly constructed base estimators
# (separate instances from the *_grid objects above).
voting = VotingClassifier(
    estimators=[
        ("XGBoost", XGBClassifier(random_state=42)),
        ("RandomForest", RandomForestClassifier(random_state=42)),
        ("SVM", SVC(random_state=42, probability=True)),
        ("DecisionTree", DecisionTreeClassifier(random_state=42)),
    ],
    voting="soft",
)

# comment out the model you don't want to use
models = {
    estimator_name: {"is_grid_search": uses_grid_search, "model": estimator}
    for estimator_name, estimator, uses_grid_search in (
        ("XGBoost", xgb_grid, True),
        ("RandomForest", rf_grid, True),
        ("SVM", svm_grid, True),
        ("DecisionTree", dt_grid, True),
        ("Voting", voting, False),
    )
}

In [None]:
from utils.simple_model import SimpleModel

# Train every configured estimator on every gene list in `selected_feature`.
# The same identifier string is passed to both setup_train_test() and train().
id_column = "ID_TCGA"

# The outer key is unused, so it gets its own underscore-prefixed name; it was
# previously also called `model_name` and was silently overwritten by the inner loop.
for _feature_set, gene_list in selected_feature.items():
    for model_name, model_config in models.items():
        # NOTE(review): the SimpleModel is rebuilt and its setup_* steps re-run for
        # every estimator; hoisting it above this loop would save work if train()
        # does not mutate the instance — confirm before changing.
        model = SimpleModel(
            train_df=train,
            test_df=test,
            gene_list=gene_list,
            dbeta_info=dbeta_file,
        )
        model.setup_dbeta()
        model.setup_train_test(id_column)
        model.setup_combinations()
        model.train(
            model_name,
            model_config["model"],
            train_out_path,
            validate_out_path,
            model_config["is_grid_search"],
            id_column,
        )

INFO Training for combination: ('CCDC8', 'HOXD12', 'MIR654') with estimator: XGBoost
Fitting 5 folds for each of 729 candidates, totalling 3645 fits


  _data = np.array(data, dtype=dtype, copy=copy,


INFO Training for combination: ('DEGS1', 'HOXD12', 'MIR654') with estimator: XGBoost
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
INFO Training for combination: ('TUBB6', 'HOXD12', 'MIR654') with estimator: XGBoost


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 729 candidates, totalling 3645 fits
INFO Training for combination: ('CCDC8', 'HOXD12', 'MIR654') with estimator: RandomForest
Fitting 5 folds for each of 27 candidates, totalling 135 fits
INFO Training for combination: ('DEGS1', 'HOXD12', 'MIR654') with estimator: RandomForest
Fitting 5 folds for each of 27 candidates, totalling 135 fits
INFO Training for combination: ('TUBB6', 'HOXD12', 'MIR654') with estimator: RandomForest
Fitting 5 folds for each of 27 candidates, totalling 135 fits
INFO Training for combination: ('CCDC8', 'HOXD12', 'MIR654') with estimator: SVM
Fitting 5 folds for each of 8 candidates, totalling 40 fits
INFO Training for combination: ('DEGS1', 'HOXD12', 'MIR654') with estimator: SVM
Fitting 5 folds for each of 8 candidates, totalling 40 fits
INFO Training for combination: ('TUBB6', 'HOXD12', 'MIR654') with estimator: SVM
Fitting 5 folds for each of 8 candidates, totalling 40 fits
INFO Training for combination: ('CCDC8', 'HOXD12', 'MIR65

In [None]:
from utils.simple_model import SimpleModel

# Train every configured estimator on every gene list in `selected_feature_2`.
# The same identifier string is passed to both setup_train_test() and train().
id_column = "ID_GEO"

# The outer key is unused, so it gets its own underscore-prefixed name; it was
# previously also called `model_name` and was silently overwritten by the inner loop.
for _feature_set, gene_list in selected_feature_2.items():
    for model_name, model_config in models.items():
        # NOTE(review): the SimpleModel is rebuilt and its setup_* steps re-run for
        # every estimator; hoisting it above this loop would save work if train()
        # does not mutate the instance — confirm before changing.
        model = SimpleModel(
            train_df=train,
            test_df=test,
            gene_list=gene_list,
            dbeta_info=dbeta_file,
        )
        model.setup_dbeta()
        model.setup_train_test(id_column)
        model.setup_combinations()
        model.train(
            model_name,
            model_config["model"],
            train_out_path,
            validate_out_path,
            model_config["is_grid_search"],
            id_column,
        )

INFO Training for combination: ('ALX1', 'EMID2', 'MIR495') with estimator: XGBoost
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
INFO Training for combination: ('TBX20', 'EMID2', 'MIR495') with estimator: XGBoost
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
INFO Training for combination: ('ALX1', 'EMID2', 'MIR495') with estimator: RandomForest
Fitting 5 folds for each of 27 candidates, totalling 135 fits
INFO Training for combination: ('TBX20', 'EMID2', 'MIR495') with estimator: RandomForest
Fitting 5 folds for each of 27 candidates, totalling 135 fits
INFO Training for combination: ('ALX1', 'EMID2', 'MIR495') with estimator: SVM
Fitting 5 folds for each of 8 candidates, totalling 40 fits
INFO Training for combination: ('TBX20', 'EMID2', 'MIR495') with estimator: SVM
Fitting 5 folds for each of 8 candidates, totalling 40 fits
INFO Training for combination: ('ALX1', 'EMID2', 'MIR495') with estimator: DecisionTree
Fitting 5 folds for each of 18 candid

#### 3. Visualization

##### Congratulations! You have finished the whole pipeline 🎉 <br>

![title](cat.png)