### Sec. -1 Initiation

In [16]:
import os
import pandas as pd
from utils.config_helper import update_nested_toml, load_config
from utils.process_norm import *
os.makedirs("logs", exist_ok=True)


In [2]:
TYPE = input("Enter the type of the config file: ")
CONFIG_PATH = f"../config/{TYPE}.toml"
config = load_config(CONFIG_PATH)

In [3]:
df_file_1 = config["init"]["hyper"]["df_file_1"]
df_file_2 = config["init"]["hyper"]["df_file_2"]
dmp_file_1 = config["init"]["hyper"]["dmp_file_1"]
dmp_file_2 = config["init"]["hyper"]["dmp_file_2"]
majority_out_path_1 = config["init"]["hyper"]["majority_out_path_1"]
majority_out_path_2 = config["init"]["hyper"]["majority_out_path_2"]
joined_out_path = config["init"]["hyper"]["joined_out_path"]

### Sec. 0 process norm examples

#### 1. Split Dataset Example

In [1]:
import json
from utils.process_norm import *

In [4]:
# read the json file
cancer_type = input("Enter the cancer type: ")
with open(f"../config/{cancer_type}.json") as f:
    J = json.load(f)

In [10]:
data_source = input("Enter the raw all_beta_normalized file: ")
df = pd.read_csv(J[data_source]['file'])


In [11]:
dfo = organize_dataset(df, J[data_source]["normal"], J[data_source]["tumor"], J[data_source]["sample_count"])
complement_df, ratio_df = split_dataset(dfo, J[data_source]["split_test"], J[data_source]["random_state"])
complement_df.to_csv("all_beta_normalized_complement.csv", index=False)
ratio_df.to_csv("all_beta_normalized_ratio.csv", index=False)

INFO complement_df feature: 581635
INFO complement_df sample (normal, tumor): (39, 39)
INFO ratio_df feature: 581635
INFO ratio_df sample (normal, tumor): (13, 13)


#### 2. Merge Dataset Example

In [3]:
df0 = pd.read_csv(f"../stomach/champ_result/gdc_stomach_GSE99553/all_beta_normalized_0.csv")
df1 = pd.read_csv(f"../stomach/champ_result/gdc_stomach_GSE99553/all_beta_normalized_GSE99553.csv")

In [4]:
df0 = organize_dataset(df0, 2, 395, 2)
df1 = organize_dataset(df1, 84, 0, 1)

In [None]:
merged_df = merge_datasets(df0, df1)
merged_df.to_csv("merged.csv", index=False)

#### 3. Inspect NaN Example

In [None]:
# Example usage:
df = pd.DataFrame({
    'ID': ['A1', 'A2', 'A3', 'A4', 'label'],
    'Col1': [1, 2, None, 4, 1],
    'Col2': [5, None, 7, 8, 1],
    'Col3': [9, 10, 11, 12, 0]
})

# Column-wise check
print(inspect_nan(df, mode="column"))

# Row-wise check
print(inspect_nan(df, mode="row"))

### Sec. 1 Delta Beta Calculation

In [49]:
from utils.dbeta_avg_helper import get_dbeta_avg, drop_dbeta_nan, get_dbeta_info

os.makedirs(f"{majority_out_path_1}/section_1", exist_ok=True)
os.makedirs(f"{majority_out_path_2}/section_1", exist_ok=True)

In [50]:
train_df = pd.read_csv(df_file_1)

In [51]:
delta_beta = get_dbeta_avg(train_df)

In [52]:
delta_beta

Unnamed: 0.1,Unnamed: 0,dbeta
0,cg00000957,-0.007306
1,cg00001349,0.012221
2,cg00001583,0.238457
3,cg00002028,0.006614
4,cg00002719,0.293580
...,...,...
353502,cg27656573,0.001667
353503,cg27657363,-0.024430
353504,cg27657537,0.008913
353505,cg27662611,0.002383


In [53]:
# record the list of feature with dbeta being NaN
delta_beta = drop_dbeta_nan(delta_beta, log_postfix="TCGA")

In [54]:
dmp = pd.read_csv(dmp_file_1)

In [55]:
dbeta_info = get_dbeta_info(delta_beta, dmp, log_postfix="TCGA")
dbeta_info

Unnamed: 0,ID,gene,dbeta,feature
0,cg03630821,A1BG,0.253155,Body
1,cg27394794,A1CF,-0.294116,Body
2,cg07027430,A2BP1,0.328562,Body
3,cg01723761,A2LD1,-0.043872,TSS200
4,cg11139127,A2M,-0.155606,Body
...,...,...,...,...
18637,cg03489712,ZYX,-0.144462,TSS1500
18638,cg21851534,ZZEF1,0.138291,3'UTR
18639,cg10895547,ZZZ3,0.097124,Body
18640,cg20009101,psiTPTE22,0.294720,Body


In [56]:
dbeta_info["dbeta"] = dbeta_info["dbeta"].apply(lambda x: round(x, 6))
dbeta_info.to_csv(f"{majority_out_path_1}/section_1/dbeta.csv", index=False)

In [57]:
train_df = pd.read_csv(df_file_2)

In [58]:
delta_beta = get_dbeta_avg(train_df)

In [59]:
delta_beta = drop_dbeta_nan(delta_beta, log_postfix="GEO")

In [60]:
dmp = pd.read_csv(dmp_file_2)

In [61]:
dbeta_info = get_dbeta_info(delta_beta, dmp, log_postfix="GEO")
dbeta_info

Unnamed: 0,ID,gene,dbeta,feature
0,cg03630821,A1BG,0.045360,Body
1,cg20509831,A1BG-AS1,-0.070945,Body
2,cg11955117,A1CF,-0.144298,Body
3,cg07027430,A2BP1,0.227393,Body
4,cg19815813,A2LD1,-0.058239,TSS200
...,...,...,...,...
22551,cg07472835,ZYG11B,-0.098248,Body
22552,cg11769486,ZYX,0.090796,ExonBnd
22553,cg16463044,ZZEF1,0.100958,Body
22554,cg10895547,ZZZ3,0.038448,Body


In [62]:
dbeta_info["dbeta"] = dbeta_info["dbeta"].apply(lambda x: round(x, 6))
dbeta_info.to_csv(f"{majority_out_path_2}/section_1/dbeta.csv", index=False)

### Sec. 2 Filter Genes by Average Delta Beta Values


#### 2.1 Filtering TSS

In [63]:
os.makedirs(f"{majority_out_path_1}/section_2", exist_ok=True)
os.makedirs(f"{majority_out_path_2}/section_2", exist_ok=True)

In [64]:
dbeta_info_1 = pd.read_csv(f"{majority_out_path_1}/section_1/dbeta.csv")
dbeta_info_2 = pd.read_csv(f"{majority_out_path_2}/section_1/dbeta.csv")

In [65]:
TSS_1 = dbeta_info_1[dbeta_info_1["feature"].str.contains("TSS")]
TSS_2 = dbeta_info_2[dbeta_info_2["feature"].str.contains("TSS")]

In [66]:
TSS_1.to_csv(f"{majority_out_path_1}/section_2/dbeta_TSS.csv", index=False)
TSS_2.to_csv(f"{majority_out_path_2}/section_2/dbeta_TSS.csv", index=False)

#### 2.2 Thresholding

In [67]:
from utils.dbeta_avg_helper import detect_threshold

In [68]:
dbeta_TSS_threshold_1, threshold_1 = detect_threshold(TSS_1, config=config, log_postfix="_TCGA")
dbeta_TSS_threshold_1.to_csv(f"{majority_out_path_1}/section_2/dbeta_TSS_{threshold_1}.csv", index=False)

In [69]:
dbeta_TSS_threshold_2, threshold_2 = detect_threshold(TSS_2, config=config, log_postfix="_GEO")
dbeta_TSS_threshold_2.to_csv(f"{majority_out_path_2}/section_2/dbeta_TSS_{threshold_2}.csv", index=False)

#### 2.3 Visualization

In [70]:
from utils.dbeta_avg_helper import dbeta_graph, pca_graph

dbeta_graph(dbeta_TSS_threshold_1, f"{majority_out_path_1}/section_2/dbeta_TSS_{threshold_1}.png")
dbeta_graph(dbeta_TSS_threshold_2, f"{majority_out_path_2}/section_2/dbeta_TSS_{threshold_2}.png")


In [71]:
df_1 = pd.read_csv(df_file_1)

In [72]:
pca_graph(dbeta_info_1, df_1, f"{majority_out_path_1}/section_2/pca.html")

In [73]:
df_2 = pd.read_csv(df_file_2)

In [74]:
pca_graph(dbeta_info_2, df_2, f"{majority_out_path_2}/section_2/pca.html")

#### 2.4 join dbeta_info

In [75]:
os.makedirs(f"{joined_out_path}/section_2", exist_ok=True)

In [76]:
dbeta_info_1 = pd.read_csv(f"{majority_out_path_1}/section_2/dbeta_TSS_{threshold_1}.csv")
dbeta_info_2 = pd.read_csv(f"{majority_out_path_2}/section_2/dbeta_TSS_{threshold_2}.csv")

In [77]:
merged_df = pd.merge(dbeta_info_1, dbeta_info_2, on="gene", how="inner", suffixes=('_TCGA', '_GEO'))
merged_df = merged_df[["gene"] + [col for col in merged_df.columns if col != "gene"]]
merged_df.to_csv(f"{joined_out_path}/section_2/dbeta_TSS_threshold_joined.csv", index=False)


#### remember to run clustering on the filtered genes first continue to the next step

### Sec. 3 Feature Selection with ML (SFS)
sequential forward selection

#### Remember to remove previous results when rerun the code

#### 3.1 Preparation

In [None]:
config = load_config(CONFIG_PATH)

train_out_path = config["feature_selection"]["sfs"]["hyper"]["train_out_path"]
training_param_file = config["feature_selection"]["sfs"]["hyper"]["training_param_file"]
os.makedirs(f"{train_out_path}", exist_ok=True)

In [None]:
dbeta_info_file = config["feature_selection"]["sfs"]["hyper"]["dbeta_info_file"]
dbeta_info = pd.read_csv(dbeta_info_file)

In [9]:
from utils.train_helper import TrainHelper

In [10]:
# note that there is setup_dbeta in TrainHelper to further cut down the feature size
th = TrainHelper(dbeta_info)

#### 3.2 Selection

In [None]:
train_df = pd.read_csv(df_file_1)

In [24]:
train_df = inspect_nan(train_df, mode="column", remove=True)

INFO Columns with NaNs: {'265': np.float64(1.0)}


In [25]:
th.set_train_df(train_df)

In [19]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

In [13]:
selection_models = {
    "SVM": SVC(kernel="linear", random_state=42),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "LogisticRegression": LogisticRegression(random_state=42),
    "RandomForest": RandomForestClassifier(
        random_state=42,
        n_estimators=50,
    ),
    "XGBoost": XGBClassifier(
        random_state=42,
        n_estimators=50,
        ),
}

In [None]:
th.set_selection_models(selection_models)
th.set_train_validate(ID="ID_TCGA", do_validate=False)
th.select_feature_sfs(
    out_path = f"{train_out_path}/selected_feature.txt",
    step= 1,
    n_features_to_select="cluster"
)

#### 3.3 Preparation(second round)

In [29]:
config = load_config(CONFIG_PATH)

train_out_path = config["feature_selection_2"]["sfs"]["hyper"]["train_out_path"]
training_param_file = config["feature_selection_2"]["sfs"]["hyper"]["training_param_file"]
os.makedirs(f"{train_out_path}", exist_ok=True)

In [30]:
dbeta_info_file = config["feature_selection_2"]["sfs"]["hyper"]["dbeta_info_file"]
dbeta_info = pd.read_csv(dbeta_info_file)

In [31]:
from utils.train_helper import TrainHelper

In [32]:
# note that there is setup_dbeta in TrainHelper to further cut down the feature size
th = TrainHelper(dbeta_info)

#### 3.4 Selection (second round)

In [33]:
train_df = pd.read_csv(df_file_2)

In [34]:
train_df = inspect_nan(train_df, mode="column", remove=True)

INFO Columns with NaNs: {}


In [35]:
th.set_train_df(train_df)

In [36]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

In [37]:
selection_models = {
    "SVM": SVC(kernel="linear", random_state=42),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "LogisticRegression": LogisticRegression(random_state=42),
    "RandomForest": RandomForestClassifier(
        random_state=42,
        n_estimators=50,
    ),
    "XGBoost": XGBClassifier(
        random_state=42,
        n_estimators=50,
        ),
}

In [None]:
th.set_selection_models(selection_models)
th.set_train_validate(ID="ID_GEO", do_validate=False)
th.select_feature_sfs(
    out_path = f"{train_out_path}/selected_feature.txt",
    step= 1,
    n_features_to_select="cluster"
)

### Sec. 3 Feature Selection with ML (RFE)
recursive feature elimination

#### Remember to remove previous results when rerun the code

#### 3.1 Preparation

In [40]:
config = load_config(CONFIG_PATH)

train_out_path = config["feature_selection"]["rfe"]["hyper"]["train_out_path"]
training_param_file = config["feature_selection"]["rfe"]["hyper"]["training_param_file"]
os.makedirs(f"{train_out_path}", exist_ok=True)


In [41]:
dbeta_info_file = config["feature_selection"]["rfe"]["hyper"]["dbeta_info_file"]
TSS_threshold = pd.read_csv(dbeta_info_file)

In [42]:
from utils.train_helper import TrainHelper
# note that there is setup_dbeta in TrainHelper to further cut down the feature size
th = TrainHelper(TSS_threshold)

#### 3.2 Selection

In [43]:
from utils.process_norm import *

In [44]:
train_df = pd.read_csv(df_file_1)
# validate_df_file = config["feature_selection"]["hyper"]["validate_df_file"]
# validate_df = pd.read_csv(validate_df_file)

In [45]:
train_df = inspect_nan(train_df, mode="column", remove=True)

INFO Columns with NaNs: {'265': np.float64(1.0)}


In [11]:
# train_df = inspect_nan(train_df, mode="column", remove=True)
# train_df

In [46]:
th.set_train_df(train_df)

In [47]:
from utils.train_helper import set_parameters
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

# import json

# with open(training_param_file, "r") as f:
#     training_param = json.load(f)
# xgb_grid = set_parameters(XGBClassifier(random_state=42), training_param["XGBoost"])
# rf_grid = set_parameters(
#     RandomForestClassifier(random_state=42), training_param["RandomForest"]
# )
# svm_grid = set_parameters(SVC(random_state=42, probability=True), training_param["SVM"])
# dt_grid = set_parameters(
#     DecisionTreeClassifier(random_state=42), training_param["DecisionTree"]
# )

# train_models = {
#     "XGBoost": xgb_grid,
#     "RandomForest": rf_grid,
#     "SVM": svm_grid,
#     "DecisionTree": dt_grid,
# }

In [48]:
selection_models = {
    "SVM": SVC(kernel="linear", random_state=42),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "LogisticRegression": LogisticRegression(random_state=42),
    "RandomForest": RandomForestClassifier(
        random_state=42,
        n_estimators=50,
    ),
    "XGBoost": XGBClassifier(
        random_state=42,
        n_estimators=50,
        ),
}

In [50]:
th.set_selection_models(selection_models)
# th.set_grid_estimators(train_models)
th.set_train_validate(ID="ID_TCGA", do_validate=False)

In [51]:
th.select_feature_rfe(
    train_out_path = train_out_path,
    selected_feature_path = f"{train_out_path}/selected_feature.txt",
    feature_range = "cluster",
    do_validation=False
)

INFO Training SVM with RFE
INFO Training SVM with 3 clusters selected
INFO Training finished with 4 clusters selected
INFO Training DecisionTree with RFE
INFO Training DecisionTree with 2 clusters selected
INFO Training DecisionTree with 2 clusters selected
INFO Training DecisionTree with 2 clusters selected
INFO Training DecisionTree with 2 clusters selected
INFO Training DecisionTree with 2 clusters selected
INFO Training DecisionTree with 2 clusters selected
INFO Training DecisionTree with 2 clusters selected
INFO Training DecisionTree with 2 clusters selected
INFO Training DecisionTree with 2 clusters selected
INFO Training DecisionTree with 2 clusters selected
INFO Training DecisionTree with 2 clusters selected
INFO Training DecisionTree with 2 clusters selected
INFO Training DecisionTree with 2 clusters selected
INFO Training DecisionTree with 2 clusters selected
INFO Training DecisionTree with 2 clusters selected
INFO Training DecisionTree with 2 clusters selected
INFO Training 

#### 3.3 Preparation (second round)

In [52]:
config = load_config(CONFIG_PATH)

train_out_path = config["feature_selection_2"]["rfe"]["hyper"]["train_out_path"]
training_param_file = config["feature_selection_2"]["rfe"]["hyper"]["training_param_file"]
os.makedirs(f"{train_out_path}", exist_ok=True)

In [53]:
dbeta_info_file = config["feature_selection"]["rfe"]["hyper"]["dbeta_info_file"]
TSS_threshold = pd.read_csv(dbeta_info_file)

In [54]:
from utils.train_helper import TrainHelper
# note that there is setup_dbeta in TrainHelper to further cut down the feature size
th = TrainHelper(TSS_threshold)

#### 3.4 Selection (second round)

In [55]:
train_df = pd.read_csv(df_file_2)

In [56]:
th.set_train_df(train_df)

In [57]:
selection_models = {
    "SVM": SVC(kernel="linear", random_state=42),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "LogisticRegression": LogisticRegression(random_state=42),
    "RandomForest": RandomForestClassifier(
        random_state=42,
        n_estimators=50,
    ),
    "XGBoost": XGBClassifier(
        random_state=42,
        n_estimators=50,
        ),
}

In [58]:
th.set_selection_models(selection_models)
# th.set_grid_estimators(train_models)
th.set_train_validate(ID="ID_GEO")

In [59]:
th.select_feature_rfe(
    train_out_path = train_out_path,
    selected_feature_path = f"{train_out_path}/selected_feature.txt",
    feature_range = "cluster",
    do_validation=False
)

INFO Training SVM with RFE
INFO Training SVM with 2 clusters selected
INFO Training SVM with 2 clusters selected
INFO Training SVM with 3 clusters selected
INFO Training SVM with 3 clusters selected
INFO Training SVM with 3 clusters selected
INFO Training SVM with 3 clusters selected
INFO Training SVM with 3 clusters selected
INFO Training SVM with 3 clusters selected
INFO Training SVM with 3 clusters selected
INFO Training SVM with 3 clusters selected
INFO Training SVM with 3 clusters selected
INFO Training SVM with 3 clusters selected
INFO Training SVM with 3 clusters selected
INFO Training SVM with 3 clusters selected
INFO Training SVM with 3 clusters selected
INFO Training finished with 4 clusters selected
INFO Training DecisionTree with RFE
INFO Training DecisionTree with 2 clusters selected
INFO Training DecisionTree with 2 clusters selected
INFO Training DecisionTree with 2 clusters selected
INFO Training DecisionTree with 2 clusters selected
INFO Training DecisionTree with 2 cl

### Sec. 4 Clean Selected Features

#### 4.1 Generate Feature json for SimpleModel

In [60]:
from utils.train_helper import read_selected_features, read_selected_features_json, TrainHelper
config = load_config(CONFIG_PATH)
dbeta_info_file = config["feature_selection"]["rfe"]["hyper"]["dbeta_info_file"]
dbeta_info = pd.read_csv(dbeta_info_file)
th = TrainHelper(dbeta_info)
method = "rfe"
features = read_selected_features(f"{majority_out_path_1}/section_3/{method}/selected_feature.txt")
th.generate_selected_features(
    features,
    f"{majority_out_path_1}/section_3/{method}/selected_features.json",
    mode="min",
    out_format="json",
)

read_selected_features_json(f"{majority_out_path_1}/section_3/{method}/selected_features.json")

defaultdict(list,
            {'SVM': ['FOXD2', 'GFRA2', 'GHSR', 'MIR196A2', 'SCT'],
             'DecisionTree': ['ALX1',
              'GHITM',
              'KRTAP20-1',
              'NEFM',
              'PRLHR',
              'MIR654'],
             'LogisticRegression': ['ALX1',
              'FOXD2',
              'ATP5G2',
              'TAC1',
              'GFRA2',
              'MIR196A2'],
             'RandomForest': ['ALX1',
              'FOXD2',
              'CA3',
              'FRZB',
              'MIR1197',
              'GFRA2'],
             'XGBoost': ['ALX1',
              'FOXD2',
              'POU3F3',
              'CCDC8',
              'GFRA2',
              'MIR654']})

In [61]:
from utils.train_helper import read_selected_features, read_selected_features_json, TrainHelper
config = load_config(CONFIG_PATH)
dbeta_info_file = config["feature_selection_2"]["rfe"]["hyper"]["dbeta_info_file"]
dbeta_info = pd.read_csv(dbeta_info_file)
th = TrainHelper(dbeta_info)
method = "rfe"
features = read_selected_features(f"{majority_out_path_2}/section_3/{method}/selected_feature.txt")
th.generate_selected_features(
    features,
    f"{majority_out_path_2}/section_3/{method}/selected_features.json",
    mode="min",
    out_format="json",
)

# use this to read json
read_selected_features_json(f"{majority_out_path_2}/section_3/{method}/selected_features.json")

defaultdict(list,
            {'SVM': ['DBC1', 'DEGS1', 'TRIM59', 'FOXB2', 'GRIK3', 'MIR129-2'],
             'DecisionTree': ['CA3',
              'NID2',
              'TRIM59',
              'ZSCAN18',
              'PRLHR',
              'MIR654'],
             'LogisticRegression': ['DGKI',
              'HMGCL',
              'TRIM59',
              'FOXB2',
              'MIR129-2',
              'GRIK3'],
             'RandomForest': ['DGKI',
              'SUB1',
              'TRIM59',
              'FOXB2',
              'MIR124-2',
              'KCNK2'],
             'XGBoost': ['FOXB2',
              'NID2',
              'SUB1',
              'TRIM59',
              'MIR124-2',
              'PRLHR']})

### Sec. 5 Clustering Visualization

#### 1. load data

remember to calculate distance matrix first

In [34]:
import pandas as pd
import numpy as np
from utils.clustering_helper import hierarchical_clustering, check_distance_matrix

In [35]:
result_prefix = config["clustering_visual"]["hyper"]["result_prefix"]
dbeta_file = config["clustering_visual"]["hyper"]["dbeta_file"]
bp_file = config["clustering_visual"]["hyper"]["bp_file"]
cc_file = config["clustering_visual"]["hyper"]["cc_file"]
mf_file = config["clustering_visual"]["hyper"]["mf_file"]
terms_count_file = config["clustering_visual"]["hyper"]["terms_count_file"]
result_out_path = config["clustering_visual"]["hyper"]["result_out_path"]

In [36]:
os.makedirs(result_out_path, exist_ok=True)

In [37]:
gene_set = pd.read_csv(dbeta_file, index_col=0)
distance_matrix_bp = pd.read_csv(bp_file, index_col=0)
distance_matrix_cc = pd.read_csv(cc_file, index_col=0)
distance_matrix_mf = pd.read_csv(mf_file, index_col=0)
terms_count = pd.read_csv(terms_count_file, index_col=0)

In [38]:
# replace NaN with 0
distance_matrix_bp = distance_matrix_bp.fillna(0)
distance_matrix_cc = distance_matrix_cc.fillna(0)
distance_matrix_mf = distance_matrix_mf.fillna(0)

In [39]:
# reindex distance matrix
index_bp = distance_matrix_bp.index
index_cc = distance_matrix_cc.index
index_mf = distance_matrix_mf.index
index = index_bp.union(index_cc).union(index_mf)
distance_matrix_bp_ = distance_matrix_bp.reindex(index=index, columns=index, fill_value=0)
distance_matrix_cc_ = distance_matrix_cc.reindex(index=index, columns=index, fill_value=0)
distance_matrix_mf_ = distance_matrix_mf.reindex(index=index, columns=index, fill_value=0)

In [40]:
# make a array of distance matrix for each ontology
distance_matrix = []

distance_matrix.append(distance_matrix_bp_)
distance_matrix.append(distance_matrix_cc_)
distance_matrix.append(distance_matrix_mf_)

#### 2. Weighted Sum

In [41]:
weight = [count for count in terms_count["count"]]
weight = weight / np.sum(weight)
masks = np.array([~np.isnan(distance_matrix[i].values) for i in range(3)])

valid_weights = np.array([weight[i] for i in range(3)])[:, None, None] * masks

weight_sums = valid_weights.sum(axis=0)

normalized_weights = np.divide(valid_weights, weight_sums, where=weight_sums != 0)
weighted_sum = sum(
    np.nan_to_num(distance_matrix[i].values) * normalized_weights[i] for i in range(3)
)


weighted_sum_dataframe = pd.DataFrame(weighted_sum, index=index, columns=index)

weighted_sum_dataframe.head()

Unnamed: 0,ADAMTS20,ADCY4,AIP,ALX1,ALX4,ATG16L1,ATP5G2,C1orf114,CA3,CCDC8,...,TAC1,TMEM196,TRIM59,TUBB6,WDR8,WNT3,ZC3H12D,ZFP42,ZNF781,ZSCAN18
ADAMTS20,0.0,0.577176,0.689548,0.811536,0.793435,0.79605,0.865384,0.288938,0.724191,0.763671,...,0.718813,0.469702,0.664124,0.746487,0.763971,0.691306,0.779107,0.797409,0.772388,0.774362
ADCY4,0.577176,0.0,0.572087,0.746949,0.773581,0.737435,0.691075,0.154626,0.677446,0.575162,...,0.577385,0.480065,0.640341,0.685082,0.616422,0.685534,0.723424,0.717358,0.700533,0.67443
AIP,0.689548,0.572087,0.0,0.525277,0.684875,0.63031,0.736473,0.154056,0.66224,0.564364,...,0.700663,0.547216,0.638608,0.71194,0.608903,0.669713,0.522115,0.637378,0.303839,0.290034
ALX1,0.811536,0.746949,0.525277,0.0,0.329106,0.734757,0.769313,0.159371,0.791724,0.632017,...,0.787421,0.604176,0.778586,0.79339,0.650497,0.651433,0.496802,0.400449,0.196909,0.151033
ALX4,0.793435,0.773581,0.684875,0.329106,0.0,0.67485,0.810003,0.164211,0.80162,0.635208,...,0.744857,0.600006,0.799133,0.779394,0.650163,0.548934,0.548359,0.468631,0.132878,0.056669


In [52]:
cluster_result_weighted = hierarchical_clustering(
    weighted_sum_dataframe,
    range_min=3,
    range_max=4,
    out_path=f"{result_out_path}/hierarchical_clustering_weighted_sum.png",
)

Best number of clusters: 4


In [53]:
cluster_result_weighted.head()

Unnamed: 0,gene,cluster
0,ADAMTS20,3
1,ADCY4,3
2,AIP,4
3,ALX1,1
4,ALX4,1


#### 3. Simple average

In [54]:
weight = [1, 1, 1]
masks = np.array([~np.isnan(distance_matrix[i].values) for i in range(3)])
valid_weights = np.array([weight[i] for i in range(3)])[:, None, None] * masks
weight_sums = valid_weights.sum(axis=0)
normalized_weights = np.divide(valid_weights, weight_sums, where=weight_sums != 0)
weighted_sum = sum(
    np.nan_to_num(distance_matrix[i].values) * normalized_weights[i] for i in range(3)
)
simple_sum_dataframe = pd.DataFrame(weighted_sum, index=index, columns=index)
simple_sum_dataframe.head()

Unnamed: 0,ADAMTS20,ADCY4,AIP,ALX1,ALX4,ATG16L1,ATP5G2,C1orf114,CA3,CCDC8,...,TAC1,TMEM196,TRIM59,TUBB6,WDR8,WNT3,ZC3H12D,ZFP42,ZNF781,ZSCAN18
ADAMTS20,0.0,0.600667,0.694667,0.799667,0.792,0.768667,0.851,0.524,0.645667,0.719333,...,0.68,0.407,0.663667,0.696,0.738,0.631667,0.746667,0.752333,0.742667,0.75
ADCY4,0.600667,0.0,0.478333,0.678333,0.703,0.610333,0.659,0.306,0.527333,0.386,...,0.549333,0.294667,0.597667,0.577,0.466333,0.581667,0.625667,0.603,0.646,0.602667
AIP,0.694667,0.478333,0.0,0.485667,0.581,0.532333,0.666667,0.304,0.528,0.336333,...,0.647667,0.372,0.587667,0.6,0.451667,0.551,0.487333,0.538,0.386667,0.331667
ALX1,0.799667,0.678333,0.485667,0.0,0.220667,0.635,0.682667,0.322667,0.684667,0.435,...,0.711667,0.468333,0.632333,0.637333,0.480333,0.590667,0.428667,0.368667,0.182667,0.114333
ALX4,0.792,0.703,0.581,0.220667,0.0,0.621667,0.717333,0.339667,0.709,0.440333,...,0.691333,0.485,0.682333,0.642333,0.471333,0.540667,0.466667,0.36,0.143,0.074667


In [55]:
cluster_result_simple = hierarchical_clustering(
    simple_sum_dataframe,
    range_min=3,
    range_max=4,
    out_path=f"{result_out_path}/hierarchical_clustering_simple_sum.png",
)

Best number of clusters: 4


In [56]:
cluster_result_simple.head()

Unnamed: 0,gene,cluster
0,ADAMTS20,4
1,ADCY4,4
2,AIP,1
3,ALX1,1
4,ALX4,1


#### 4. Consensus clustering 

In [57]:
cluster_bp = hierarchical_clustering(
    distance_matrix_bp, out_path=f"{result_out_path}/hierarchical_clustering_bp.png"
)
cluster_cc = hierarchical_clustering(
    distance_matrix_cc, out_path=f"{result_out_path}/hierarchical_clustering_cc.png"
)
cluster_mf = hierarchical_clustering(
    distance_matrix_mf, out_path=f"{result_out_path}/hierarchical_clustering_mf.png"
)

Best number of clusters: 26
Best number of clusters: 7
Best number of clusters: 6


In [58]:
cluster_bp.columns = ["gene", "cluster_bp"]
cluster_cc.columns = ["gene", "cluster_cc"]
cluster_mf.columns = ["gene", "cluster_mf"]
cluster_bp_cc = pd.merge(cluster_bp, cluster_cc, on="gene", how="outer")
cluster_go = pd.merge(cluster_bp_cc, cluster_mf, on="gene", how="outer")
cluster_go = cluster_go.fillna(-1)
print(cluster_go.shape)
cluster_go.head()

(97, 4)


Unnamed: 0,gene,cluster_bp,cluster_cc,cluster_mf
0,ADAMTS20,11.0,3,6.0
1,ADCY4,11.0,7,2.0
2,AIP,1.0,7,2.0
3,ALX1,1.0,1,1.0
4,ALX4,2.0,1,1.0


In [59]:
num_genes = cluster_go.shape[0]
consensus_matrix = np.zeros((num_genes, num_genes))
for i in range(num_genes):
    for j in range(i, num_genes):
        if cluster_go.iloc[i]["cluster_bp"] == cluster_go.iloc[j]["cluster_bp"]:
            consensus_matrix[i][j] += 1

        if cluster_go.iloc[i]["cluster_cc"] == cluster_go.iloc[j]["cluster_cc"]:
            consensus_matrix[i][j] += 1

        if cluster_go.iloc[i]["cluster_mf"] == cluster_go.iloc[j]["cluster_mf"]:
            consensus_matrix[i][j] += 1

consensus_matrix = pd.DataFrame(
    consensus_matrix, index=cluster_go["gene"], columns=cluster_go["gene"]
)
consensus_matrix += consensus_matrix.T
distance_matrix_consensus = 1 - consensus_matrix / 3
np.fill_diagonal(distance_matrix_consensus.values, 0)
distance_matrix_consensus.head()

gene,ADAMTS20,ADCY4,AIP,ALX1,ALX4,ATG16L1,ATP5G2,C1orf114,CA3,CCDC8,...,TAC1,TMEM196,TRIM59,TUBB6,WDR8,WNT3,ZC3H12D,ZFP42,ZNF781,ZSCAN18
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ADAMTS20,0.0,0.666667,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.666667,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.666667,1.0
ADCY4,0.666667,0.0,0.333333,1.0,1.0,0.333333,0.666667,0.333333,0.333333,0.333333,...,0.666667,0.666667,0.666667,0.333333,0.333333,0.666667,0.666667,1.0,1.0,1.0
AIP,1.0,0.333333,0.0,0.666667,1.0,0.333333,0.666667,0.333333,0.333333,0.333333,...,0.666667,0.666667,0.666667,0.333333,0.333333,0.666667,0.666667,0.666667,0.666667,1.0
ALX1,1.0,1.0,0.666667,0.0,0.333333,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.666667,0.0,0.333333,0.333333
ALX4,1.0,1.0,1.0,0.333333,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.666667,0.333333,0.666667,0.333333


In [60]:
cluster_result_consensus = hierarchical_clustering(
    distance_matrix_consensus,
    range_min=3,
    range_max=4,
    out_path=f"{result_out_path}/hierarchical_clustering_consensus.png",
)

Best number of clusters: 4


In [61]:
cluster_result_consensus.head()

Unnamed: 0,gene,cluster
0,ADAMTS20,2
1,ADCY4,2
2,AIP,2
3,ALX1,1
4,ALX4,1


#### 5. Compare 

In [63]:
from utils.clustering_helper import hierarchical_clustering_compare

hierarchical_clustering_compare(
    [weighted_sum_dataframe, simple_sum_dataframe, distance_matrix_consensus],
    ["Weighted Average", "Simple Average", "Consensus"],
    out_path=f"{result_out_path}/hierarchical_clustering_compare.png",
    range_min=3,
    range_max=4,
)

Best number of clusters for Weighted Average: 4
Best number of clusters for Simple Average: 4
Best number of clusters for Consensus: 4


In [64]:
dbeta_info = pd.read_csv(dbeta_file)

In [65]:
# column gene isin weighted_sum_dataframe
weighted_dbeta = dbeta_info[dbeta_info["gene"].isin(weighted_sum_dataframe.index)]
simple_dbeta = dbeta_info[dbeta_info["gene"].isin(simple_sum_dataframe.index)]
consensus_dbeta = dbeta_info[dbeta_info["gene"].isin(distance_matrix_consensus.index)]

In [66]:
weighted_dbeta.merge(cluster_result_weighted, on="gene").to_csv(
    f"{result_out_path}/{result_prefix}_weighted.csv", index=False
)
simple_dbeta.merge(cluster_result_simple, on="gene").to_csv(
    f"{result_out_path}/{result_prefix}_simple.csv", index=False
)
consensus_dbeta.merge(cluster_result_consensus, on="gene").to_csv(
    f"{result_out_path}/{result_prefix}_consensus.csv", index=False
)

### Sec. 6 SimpleModel Training

#### 1. Load Data

In [62]:
from utils.train_helper import read_selected_features_json
import pandas as pd
from utils.process_norm import *

In [63]:
dbeta_file = config["simple_model"]["hyper"]["dbeta_file"]
selected_feature_file = config["simple_model"]["hyper"]["selected_feature_file"]
selected_feature_file_2 = config["simple_model"]["hyper"]["selected_feature_file_2"]
train_out_path = config["simple_model"]["hyper"]["train_out_path"]
validate_out_path = config["simple_model"]["hyper"]["validate_out_path"]
df_train_file = config["simple_model"]["hyper"]["df_train_file"]
df_test_file = config["simple_model"]["hyper"]["df_test_file"]
df_train_file_2 = config["simple_model"]["hyper"]["df_train_file_2"]
df_test_file_2 = config["simple_model"]["hyper"]["df_test_file_2"]
training_param_file = config["simple_model"]["hyper"]["training_param_file"]

In [64]:
selected_feature = read_selected_features_json(selected_feature_file)
selected_feature_2 = read_selected_features_json(selected_feature_file_2)
dbeta_file = pd.read_csv(dbeta_file)

In [65]:
train = pd.read_csv(df_train_file)
test = pd.read_csv(df_test_file)

In [66]:
train = inspect_nan(train, mode="column", remove=True)

INFO Columns with NaNs: {'265': np.float64(1.0)}


In [67]:
train_2 = pd.read_csv(df_train_file_2)
test_2 = pd.read_csv(df_test_file_2)

In [68]:
train = merge_datasets(train, train_2)
train

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,826,827,828,829,830,831,832,833,834,835
0,cg00000957,0.870131,0.866782,0.841300,0.867264,0.870941,0.835596,0.882055,0.847299,0.867437,...,0.669044,0.870513,0.878363,0.867401,0.872652,0.893914,0.881826,0.861232,0.863374,0.864288
1,cg00001583,0.130379,0.070445,0.139665,0.146764,0.071685,0.126561,0.123222,0.127817,0.102428,...,0.044154,0.479830,0.324485,0.064659,0.625846,0.716061,0.646429,0.143131,0.651526,0.436335
2,cg00002028,0.027859,0.034231,0.037926,0.026836,0.025741,0.016993,0.085896,0.033704,0.028174,...,0.093667,0.060649,0.059475,0.058768,0.028895,0.042236,0.049330,0.020692,0.053677,0.048020
3,cg00002719,0.024905,0.049881,0.062630,0.027035,0.023960,0.037078,0.063606,0.044801,0.026103,...,0.143065,0.240834,0.220422,0.032740,0.238104,0.674085,0.644529,0.098214,0.760026,0.395033
4,cg00002837,0.317879,0.276213,0.293055,0.242700,0.251165,0.384913,0.300542,0.275221,0.380311,...,0.400248,0.492348,0.294186,0.500739,0.414480,0.451829,0.721043,0.266971,0.179254,0.562186
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294377,cg27657363,0.950697,0.966018,0.949426,0.906193,0.957120,0.938661,0.946524,0.928793,0.955680,...,0.746055,0.925792,0.939424,0.897395,0.903703,0.958058,0.944749,0.942778,0.941170,0.899832
294378,cg27657537,0.060546,0.093363,0.158475,0.056264,0.060162,0.088070,0.160369,0.080986,0.097606,...,0.098622,0.148348,0.166300,0.183956,0.136002,0.075223,0.076596,0.059452,0.077612,0.095113
294379,cg27662611,0.034528,0.047549,0.058815,0.044475,0.029043,0.037124,0.068912,0.054865,0.037603,...,0.027905,0.061222,0.045846,0.050612,0.076919,0.074944,0.065488,0.036783,0.069584,0.055491
294380,cg27665648,0.859977,0.905516,0.764569,0.818406,0.854048,0.672517,0.738127,0.798509,0.846691,...,0.879646,0.764174,0.847019,0.794012,0.674822,0.908401,0.890954,0.846817,0.890767,0.811262


In [69]:
test = merge_datasets(test, test_2)
test

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,200,201,202,203,204,205,206,207,208,209
0,cg00000957,0.880918,0.889138,0.874959,0.885774,0.896747,0.879334,0.858834,0.837964,0.876630,...,0.824397,0.897076,0.884351,0.891127,0.937575,0.778774,0.867760,0.874922,0.839929,0.889266
1,cg00001583,0.066911,0.076375,0.096527,0.040940,0.083554,0.087588,0.061910,0.069474,0.093006,...,0.576252,0.053651,0.576651,0.146280,0.380901,0.073419,0.165535,0.066819,0.094443,0.132342
2,cg00002028,0.023719,0.037065,0.021660,0.010759,0.010474,0.027399,0.024558,0.035275,0.017022,...,0.137957,0.097596,0.101154,0.114899,0.162179,0.096768,0.092990,0.058361,0.124010,0.119324
3,cg00002719,0.018851,0.024845,0.023367,0.012765,0.010090,0.017774,0.046654,0.026357,0.026142,...,0.801550,0.147161,0.645241,0.285699,0.941672,0.161863,0.075074,0.279016,0.330795,0.039894
4,cg00002837,0.231168,0.263481,0.326748,0.212321,0.233503,0.228290,0.271203,0.376430,0.252944,...,0.243319,0.499297,0.359858,0.280982,0.340304,0.419111,0.396845,0.550041,0.452268,0.356127
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294377,cg27657363,0.965252,0.881412,0.961724,0.957276,0.956891,0.954883,0.953429,0.948396,0.957826,...,0.856380,0.791323,0.927085,0.842508,0.755541,0.882560,0.884853,0.734444,0.651562,0.877115
294378,cg27657537,0.049765,0.117931,0.061232,0.034888,0.055366,0.067702,0.054141,0.083854,0.062192,...,0.038267,0.039257,0.054078,0.060248,0.065395,0.052062,0.068284,0.049476,0.031694,0.089036
294379,cg27662611,0.042660,0.063932,0.055080,0.034882,0.017662,0.042007,0.042645,0.047950,0.047642,...,0.025175,0.025053,0.013499,0.017525,0.135137,0.031272,0.009832,0.023495,0.026024,0.017780
294380,cg27665648,0.872041,0.797600,0.829041,0.863131,0.896679,0.838608,0.709252,0.742108,0.832386,...,0.793646,0.676673,0.932568,0.792866,0.962920,0.757222,0.865069,0.638925,0.638460,0.927500


#### 2. Training

In [72]:
from utils.train_helper import set_parameters
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

import json
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression

with open(training_param_file, "r") as f:
    training_param = json.load(f)

xgb_grid = set_parameters(XGBClassifier(random_state=42), training_param["XGBoost"])
rf_grid = set_parameters(RandomForestClassifier(random_state=42), training_param["RandomForest"])
svm_grid = set_parameters(SVC(random_state=42, probability=True), training_param["SVM"])
dt_grid = set_parameters(DecisionTreeClassifier(random_state=42), training_param["DecisionTree"])
lr_grid = set_parameters(LogisticRegression(random_state=42), training_param["LogisticRegression"])
lr_elastic_grid = set_parameters(LogisticRegression(random_state=42), training_param["LogisticRegressionElastic"])
voting = VotingClassifier(
    estimators=[
        ("XGBoost", XGBClassifier(random_state=42, n_estimators=50)), 
        ("RandomForest", RandomForestClassifier(random_state=42, n_estimators=50)),
        ("SVM", SVC(random_state=42, probability=True)),
        ("DecisionTree", DecisionTreeClassifier(random_state=42, max_depth=5)),
        ("LogisticRegression", LogisticRegression(random_state=42, C=0.1)),
    ],
    voting="soft",
)

# comment out the model you don't want to use
models = {
    "XGBoost": {
        "is_grid_search": True,
        "model": xgb_grid,
    },
    "RandomForest": {
        "is_grid_search": True,
        "model": rf_grid,
    },
    "SVM": {
        "is_grid_search": True,
        "model": svm_grid,
    },
    "DecisionTree": {
        "is_grid_search": True,
        "model": dt_grid,
    },
    "LogisticRegression": {
        "is_grid_search": True,
        "model": lr_grid,
    },
    "LogisticRegressionElastic": {
        "is_grid_search": True,
        "model": lr_elastic_grid,
    },
    "Voting": {
        "is_grid_search": False,
        "model": voting,
    },
}

In [73]:
from utils.simple_model import SimpleModel

for model_name, gene_list in selected_feature.items():
    for model_name, model_config in models.items():
        model = SimpleModel(
            train_df=train,
            test_df=test,
            gene_list=gene_list,
            dbeta_info=dbeta_file,
        )
        model.setup_dbeta()
        model.setup_train_test("ID_TCGA")
        model.setup_combinations()
        model.train(
            model_name,
            model_config["model"],
            train_out_path,
            validate_out_path,
            model_config["is_grid_search"],
            "ID_TCGA"
        )

INFO Training for combination: ('FOXD2', 'GFRA2', 'MIR196A2', 'SCT') with estimator: XGBoost
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
INFO Training for combination: ('FOXD2', 'GHSR', 'MIR196A2', 'SCT') with estimator: XGBoost
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
INFO Training for combination: ('FOXD2', 'GFRA2', 'MIR196A2', 'SCT') with estimator: RandomForest
Fitting 5 folds for each of 27 candidates, totalling 135 fits
INFO Training for combination: ('FOXD2', 'GHSR', 'MIR196A2', 'SCT') with estimator: RandomForest
Fitting 5 folds for each of 27 candidates, totalling 135 fits
INFO Training for combination: ('FOXD2', 'GFRA2', 'MIR196A2', 'SCT') with estimator: SVM
Fitting 5 folds for each of 8 candidates, totalling 40 fits
INFO Training for combination: ('FOXD2', 'GHSR', 'MIR196A2', 'SCT') with estimator: SVM
Fitting 5 folds for each of 8 candidates, totalling 40 fits
INFO Training for combination: ('FOXD2', 'GFRA2', 'MIR196A2', 'SCT') wi

In [74]:
from utils.train_helper import set_parameters
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

import json
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression

with open(training_param_file, "r") as f:
    training_param = json.load(f)

xgb_grid = set_parameters(XGBClassifier(random_state=42), training_param["XGBoost"])
rf_grid = set_parameters(RandomForestClassifier(random_state=42), training_param["RandomForest"])
svm_grid = set_parameters(SVC(random_state=42, probability=True), training_param["SVM"])
dt_grid = set_parameters(DecisionTreeClassifier(random_state=42), training_param["DecisionTree"])
lr_grid = set_parameters(LogisticRegression(random_state=42), training_param["LogisticRegression"])
lr_elastic_grid = set_parameters(LogisticRegression(random_state=42), training_param["LogisticRegressionElastic"])
voting = VotingClassifier(
    estimators=[
        ("XGBoost", XGBClassifier(random_state=42, n_estimators=50)), 
        ("RandomForest", RandomForestClassifier(random_state=42, n_estimators=50)),
        ("SVM", SVC(random_state=42, probability=True)),
        ("DecisionTree", DecisionTreeClassifier(random_state=42, max_depth=5)),
        ("LogisticRegression", LogisticRegression(random_state=42, C=0.1)),
    ],
    voting="soft",
)

# comment out the model you don't want to use
models = {
    "XGBoost": {
        "is_grid_search": True,
        "model": xgb_grid,
    },
    "RandomForest": {
        "is_grid_search": True,
        "model": rf_grid,
    },
    "SVM": {
        "is_grid_search": True,
        "model": svm_grid,
    },
    "DecisionTree": {
        "is_grid_search": True,
        "model": dt_grid,
    },
    "LogisticRegression": {
        "is_grid_search": True,
        "model": lr_grid,
    },
    "LogisticRegressionElastic": {
        "is_grid_search": True,
        "model": lr_elastic_grid,
    },
    "Voting": {
        "is_grid_search": False,
        "model": voting,
    },
}

In [75]:
from utils.simple_model import SimpleModel

for model_name, gene_list in selected_feature_2.items():
    for model_name, model_config in models.items():
        model = SimpleModel(
            train_df=train,
            test_df=test,
            gene_list=gene_list,
            dbeta_info=dbeta_file,
        )
        model.setup_dbeta()
        model.setup_train_test("ID_GEO")
        model.setup_combinations()
        model.train(
            model_name,
            model_config["model"],
            train_out_path,
            validate_out_path,
            model_config["is_grid_search"],
            "ID_GEO"
        )

INFO Training for combination: ('DBC1', 'FOXB2', 'GRIK3', 'MIR129-2') with estimator: XGBoost
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
INFO Training for combination: ('DEGS1', 'FOXB2', 'GRIK3', 'MIR129-2') with estimator: XGBoost
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
INFO Training for combination: ('TRIM59', 'FOXB2', 'GRIK3', 'MIR129-2') with estimator: XGBoost
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
INFO Training for combination: ('DBC1', 'FOXB2', 'GRIK3', 'MIR129-2') with estimator: RandomForest
Fitting 5 folds for each of 27 candidates, totalling 135 fits
INFO Training for combination: ('DEGS1', 'FOXB2', 'GRIK3', 'MIR129-2') with estimator: RandomForest
Fitting 5 folds for each of 27 candidates, totalling 135 fits
INFO Training for combination: ('TRIM59', 'FOXB2', 'GRIK3', 'MIR129-2') with estimator: RandomForest
Fitting 5 folds for each of 27 candidates, totalling 135 fits
INFO Training for combination: ('DBC1



Fitting 5 folds for each of 12 candidates, totalling 60 fits




INFO Training for combination: ('CA3', 'MIR654', 'PRLHR', 'ZSCAN18') with estimator: Voting
INFO Training for combination: ('NID2', 'MIR654', 'PRLHR', 'ZSCAN18') with estimator: Voting
INFO Training for combination: ('TRIM59', 'MIR654', 'PRLHR', 'ZSCAN18') with estimator: Voting
INFO Training for combination: ('DGKI', 'FOXB2', 'GRIK3', 'MIR129-2') with estimator: XGBoost
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
INFO Training for combination: ('HMGCL', 'FOXB2', 'GRIK3', 'MIR129-2') with estimator: XGBoost
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
INFO Training for combination: ('TRIM59', 'FOXB2', 'GRIK3', 'MIR129-2') with estimator: XGBoost
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
INFO Training for combination: ('DGKI', 'FOXB2', 'GRIK3', 'MIR129-2') with estimator: RandomForest
Fitting 5 folds for each of 27 candidates, totalling 135 fits
INFO Training for combination: ('HMGCL', 'FOXB2', 'GRIK3', 'MIR129-2') with estima

#### 3. Visualization

In [76]:
train_voting = pd.read_csv(f"{train_out_path}/Voting_metrics.csv")
train_voting.sort_values(by="f1_score", ascending=False).head()

Unnamed: 0,estimator_name,gene_0,gene_1,gene_2,gene_3,accuracy,recall,specificity,precision,f1_score,AUC,MCC,fbeta2_score
12,Voting,FOXD2,FRZB,GFRA2,MIR1197,0.996411,0.998621,0.981982,0.997245,0.997932,0.999925,0.984371,0.998345
20,Voting,NID2,MIR654,PRLHR,ZSCAN18,0.996411,0.997241,0.990991,0.998619,0.99793,0.999925,0.98449,0.997517
10,Voting,ALX1,FRZB,GFRA2,MIR1197,0.995215,0.997241,0.981982,0.997241,0.997241,0.999901,0.979223,0.997241
13,Voting,ALX1,CCDC8,GFRA2,MIR654,0.995215,0.995862,0.990991,0.998617,0.997238,0.999789,0.979433,0.996412
11,Voting,FOXD2,CA3,GFRA2,MIR1197,0.995215,0.995862,0.990991,0.998617,0.997238,0.999776,0.979433,0.996412


In [77]:
test_voting = pd.read_csv(f"{validate_out_path}/Voting_metrics_avg.csv")
test_voting.sort_values(by="f1_score", ascending=False).head()

Unnamed: 0,estimator_name,gene_0,gene_1,gene_2,gene_3,accuracy,recall,specificity,precision,f1_score,AUC,MCC,fbeta2_score
21,Voting,TRIM59,MIR654,PRLHR,ZSCAN18,0.958929,0.989286,0.928571,0.932644,0.960073,0.993878,0.919693,0.977375
11,Voting,FOXD2,CA3,GFRA2,MIR1197,0.957143,0.985714,0.928571,0.932414,0.958258,0.991709,0.91594,0.974528
9,Voting,ALX1,CA3,GFRA2,MIR1197,0.951786,0.975,0.928571,0.931708,0.952748,0.991327,0.904808,0.965947
7,Voting,FOXD2,ATP5G2,GFRA2,MIR196A2,0.948214,0.967857,0.928571,0.931248,0.949118,0.984184,0.897303,0.960253
23,Voting,HMGCL,FOXB2,GRIK3,MIR129-2,0.944643,0.996429,0.892857,0.902903,0.947341,0.984821,0.894158,0.976189


##### Congratulation! You have finished the whole pipeline🎉. <br>

![title](cat.png)