### Sec. -1 Initialization


In [1]:
import os
import pandas as pd
from utils.config_helper import update_nested_toml, load_config
os.makedirs("logs", exist_ok=True)


In [48]:
# Select which pipeline configuration to load. The interactive prompt is kept
# (commented out) for manual use; the hard-coded value lets the notebook run
# without prompting.
# TYPE = input("Enter the type of the config file: ")
TYPE = "prostate_plan_A"
# TOML file for this run, relative to the notebook's working directory.
CONFIG_PATH = f"../config/{TYPE}.toml"
config = load_config(CONFIG_PATH)

In [49]:
# Unpack the Sec. 1-2 input files and output directories from
# config["init"]["hyper"], in the order the names are listed.
(
    df_file_1,
    df_file_2,
    dmp_file_1,
    dmp_file_2,
    majority_out_path_1,
    majority_out_path_2,
    joined_out_path,
) = (
    config["init"]["hyper"][key]
    for key in (
        "df_file_1",
        "df_file_2",
        "dmp_file_1",
        "dmp_file_2",
        "majority_out_path_1",
        "majority_out_path_2",
        "joined_out_path",
    )
)

### Sec. 0 process norm examples


#### 1. Split Dataset Example


In [23]:
import json
from utils.process_norm import *

In [28]:
# Load the per-cancer-type JSON file; later cells index it by dataset name.
# cancer_type = input("Enter the cancer type: ")
cancer_type = "prostate"
# JSON is UTF-8 by specification; pass the encoding explicitly so the read does
# not depend on the platform's locale default.
with open(f"../config/{cancer_type}.json", encoding="utf-8") as f:
    J = json.load(f)

In [36]:
# Pick one dataset entry from the JSON loaded above and read the CSV that its
# 'file' key points to.
# data_source = input("Enter the raw all_beta_normalized file: ")
data_source = "GSE269244_80"
df = pd.read_csv(J[data_source]['file'])


In [37]:
print(J[data_source]["normal"],J[data_source]["tumor"],J[data_source]["sample_count"])

94 96 1


In [38]:
# Settings of the chosen dataset entry.
source_cfg = J[data_source]

# Organize the raw table using the entry's normal / tumor / sample_count values.
dfo = organize_dataset(
    df,
    source_cfg["normal"],
    source_cfg["tumor"],
    source_cfg["sample_count"],
)

# Split using the entry's split_test and random_state values, then write both
# parts to the current working directory.
complement_df, ratio_df = split_dataset(
    dfo,
    source_cfg["split_test"],
    source_cfg["random_state"],
)
complement_df.to_csv("all_beta_normalized_80.csv", index=False)
ratio_df.to_csv("all_beta_normalized_20.csv", index=False)

INFO complement_df feature: 695603
INFO complement_df sample (normal, tumor): (70, 72)
INFO ratio_df feature: 695603
INFO ratio_df sample (normal, tumor): (24, 24)


#### 2. Merge Dataset Example


In [None]:
# Load the two normalized beta tables that the merge example combines.
# The paths contain no placeholders, so plain string literals are used.
df0 = pd.read_csv(
    "../stomach/champ_result/gdc_stomach_GSE99553/all_beta_normalized_0.csv"
)
df1 = pd.read_csv(
    "../stomach/champ_result/gdc_stomach_GSE99553/all_beta_normalized_GSE99553.csv"
)

In [None]:
# Organize each table before merging. The positional arguments sit in the same
# positions as the normal / tumor / sample_count values passed in the split
# example — presumably sample counts; verify against organize_dataset.
# NOTE(review): df0/df1 are rebound to the organized result, so re-running this
# cell without re-running the load cell organizes already-organized frames.
df0 = organize_dataset(df0, 2, 395, 2)
df1 = organize_dataset(df1, 84, 0, 1)

In [None]:
# Combine the two organized tables and write the result to the current
# working directory (row index omitted).
merged_df = merge_datasets(df0, df1)
merged_df.to_csv("merged.csv", index=False)

#### 3. Inspect NaN Example


In [None]:
# Example usage on a small hand-built frame with one missing value in Col1 and
# one in Col2.
# NOTE(review): this rebinds `df`, replacing the dataset frame loaded in the
# split example above.
df = pd.DataFrame({
    'ID': ['A1', 'A2', 'A3', 'A4', 'label'],
    'Col1': [1, 2, None, 4, 1],
    'Col2': [5, None, 7, 8, 1],
    'Col3': [9, 10, 11, 12, 0]
})

# Column-wise check
print(inspect_nan(df, mode="column"))

# Row-wise check
print(inspect_nan(df, mode="row"))

### Sec. 1 Delta Beta Calculation


In [39]:
from utils.dbeta_avg_helper import get_dbeta_avg, drop_dbeta_nan, get_dbeta_info

# Create the section_1 output directory under each dataset's output root
# (no error if it already exists).
os.makedirs(f"{majority_out_path_1}/section_1", exist_ok=True)
os.makedirs(f"{majority_out_path_2}/section_1", exist_ok=True)

In [40]:
train_df = pd.read_csv(df_file_1)

In [41]:
delta_beta = get_dbeta_avg(train_df)

In [42]:
delta_beta

Unnamed: 0.1,Unnamed: 0,dbeta
0,cg00000957,0.018932
1,cg00001349,0.040178
2,cg00001583,0.256413
3,cg00002028,0.004746
4,cg00002719,0.423447
...,...,...
373336,cg27657363,-0.005472
373337,cg27657537,0.260572
373338,cg27662611,-0.004693
373339,cg27665648,0.000021


In [43]:
# Drop features whose dbeta is NaN. log_postfix tags this dataset's log output
# (presumably the affected features are recorded in a log file — verify in
# drop_dbeta_nan).
delta_beta = drop_dbeta_nan(delta_beta, log_postfix="TCGA")

In [50]:
dmp = pd.read_csv(dmp_file_1)

In [51]:
# Combine the dbeta values with the DMP table; the displayed result carries
# ID, gene, dbeta and feature columns.
dbeta_info = get_dbeta_info(delta_beta, dmp, log_postfix="TCGA")
dbeta_info

Unnamed: 0,ID,gene,dbeta,feature
0,cg03630821,A1BG,0.173038,Body
1,cg27394794,A1CF,-0.329788,Body
2,cg07196505,A2BP1,-0.414458,Body
3,cg00336946,A2LD1,-0.258079,TSS1500
4,cg00134295,A2M,-0.192465,TSS1500
...,...,...,...,...
18647,cg23995459,ZYG11B,-0.164786,TSS1500
18648,cg18074834,ZYX,0.037660,Body
18649,cg16463044,ZZEF1,-0.142714,Body
18650,cg22221847,ZZZ3,-0.027186,3'UTR


In [52]:
# Round dbeta to 6 decimal places, then persist the table for Sec. 2.
dbeta_info["dbeta"] = dbeta_info["dbeta"].map(lambda value: round(value, 6))
dbeta_info.to_csv(
    f"{majority_out_path_1}/section_1/dbeta.csv",
    index=False,
)

In [53]:
train_df = pd.read_csv(df_file_2)

In [54]:
delta_beta = get_dbeta_avg(train_df)

In [55]:
delta_beta = drop_dbeta_nan(delta_beta, log_postfix="GEO")

In [56]:
dmp = pd.read_csv(dmp_file_2)

In [57]:
dbeta_info = get_dbeta_info(delta_beta, dmp, log_postfix="GEO")
dbeta_info

Unnamed: 0,ID,gene,dbeta,feature
0,cg22286978,A1BG,0.031693,Body
1,cg03817621,A1CF,-0.132061,5'UTR
2,cg08390979,A2BP1,-0.203781,5'UTR
3,cg07218357,A2LD1,-0.093872,TSS200
4,cg00146928,A2M,-0.081349,TSS1500
...,...,...,...,...
17989,cg09704136,ZYX,0.080000,Body
17990,cg16463044,ZZEF1,-0.095281,Body
17991,cg17033546,ZZZ3,-0.041094,5'UTR
17992,cg20009101,psiTPTE22,0.087387,Body


In [58]:
# Round dbeta to 6 decimal places, then persist the second dataset's table.
dbeta_info["dbeta"] = dbeta_info["dbeta"].map(lambda value: round(value, 6))
dbeta_info.to_csv(
    f"{majority_out_path_2}/section_1/dbeta.csv",
    index=False,
)

### Sec. 2 Filter Genes by Average Delta Beta Values


#### 2.1 Filtering TSS


In [59]:
os.makedirs(f"{majority_out_path_1}/section_2", exist_ok=True)
os.makedirs(f"{majority_out_path_2}/section_2", exist_ok=True)

In [60]:
# Reload the Sec. 1 results from disk so Sec. 2 can start from the saved files.
dbeta_info_1 = pd.read_csv(f"{majority_out_path_1}/section_1/dbeta.csv")
dbeta_info_2 = pd.read_csv(f"{majority_out_path_2}/section_1/dbeta.csv")

In [61]:
# Keep only rows whose feature annotation contains "TSS" (e.g. TSS200, TSS1500).
# na=False makes rows with a missing feature value evaluate to False; without
# it the mask would contain NaN and boolean indexing would raise.
TSS_1 = dbeta_info_1[dbeta_info_1["feature"].str.contains("TSS", na=False)]
TSS_2 = dbeta_info_2[dbeta_info_2["feature"].str.contains("TSS", na=False)]

In [62]:
TSS_1.to_csv(f"{majority_out_path_1}/section_2/dbeta_TSS.csv", index=False)
TSS_2.to_csv(f"{majority_out_path_2}/section_2/dbeta_TSS.csv", index=False)

#### 2.2 Thresholding


In [63]:
from utils.dbeta_avg_helper import detect_threshold

In [64]:
# detect_threshold returns the filtered table together with the threshold it
# used; the threshold value becomes part of the output file name.
# NOTE(review): log_postfix is "_TCGA" here but "TCGA" in Sec. 1 — confirm the
# leading underscore is intended.
dbeta_TSS_threshold_1, threshold_1 = detect_threshold(TSS_1, config=config, log_postfix="_TCGA")
dbeta_TSS_threshold_1.to_csv(f"{majority_out_path_1}/section_2/dbeta_TSS_{threshold_1}.csv", index=False)

In [65]:
# Same thresholding step for the second dataset; the threshold value becomes
# part of the output file name.
dbeta_TSS_threshold_2, threshold_2 = detect_threshold(TSS_2, config=config, log_postfix="_GEO")
dbeta_TSS_threshold_2.to_csv(f"{majority_out_path_2}/section_2/dbeta_TSS_{threshold_2}.csv", index=False)

#### 2.3 Visualization


In [66]:
from utils.dbeta_avg_helper import dbeta_graph, pca_graph

# Plot each thresholded TSS table; the image is written next to the matching
# CSV, with the threshold in the file name.
dbeta_graph(dbeta_TSS_threshold_1, f"{majority_out_path_1}/section_2/dbeta_TSS_{threshold_1}.png")
dbeta_graph(dbeta_TSS_threshold_2, f"{majority_out_path_2}/section_2/dbeta_TSS_{threshold_2}.png")


In [67]:
df_1 = pd.read_csv(df_file_1)

In [68]:
pca_graph(dbeta_info_1, df_1, f"{majority_out_path_1}/section_2/pca.html")

In [69]:
df_2 = pd.read_csv(df_file_2)

In [70]:
pca_graph(dbeta_info_2, df_2, f"{majority_out_path_2}/section_2/pca.html")

#### 2.4 join dbeta_info


In [71]:
os.makedirs(f"{joined_out_path}/section_2", exist_ok=True)

In [72]:
# NOTE: dbeta_info_1 / dbeta_info_2 are rebound here — from this cell on they
# hold the thresholded TSS tables, not the full Sec. 1 tables.
dbeta_info_1 = pd.read_csv(f"{majority_out_path_1}/section_2/dbeta_TSS_{threshold_1}.csv")
dbeta_info_2 = pd.read_csv(f"{majority_out_path_2}/section_2/dbeta_TSS_{threshold_2}.csv")

In [73]:
# Inner-join the two thresholded tables on gene; columns present in both get
# the _TCGA / _GEO suffixes.
merged_df = pd.merge(
    dbeta_info_1,
    dbeta_info_2,
    on="gene",
    how="inner",
    suffixes=('_TCGA', '_GEO'),
)
# Put "gene" first and keep the remaining columns in their existing order.
non_gene_columns = [name for name in merged_df.columns if name != "gene"]
merged_df = merged_df[["gene"] + non_gene_columns]
merged_df.to_csv(
    f"{joined_out_path}/section_2/dbeta_TSS_threshold_joined.csv",
    index=False,
)


#### Remember to run clustering on the filtered genes before continuing to the next step


### Sec. 3 Feature Selection with ML (SFS)

sequential forward selection


#### Remove previous results

Warning: This step is not reversible


In [None]:
# import shutil
# import os
# if os.path.exists(f"{majority_out_path}/section_3/sfs"):
#     shutil.rmtree(f"{majority_out_path}/section_3/sfs")
# if os.path.exists(f"{minority_out_path}/section_3/sfs"):
#     shutil.rmtree(f"{minority_out_path}/section_3/sfs")

#### 3.1 Preparation(SFS)


In [None]:
# Reload the config so edits made to the TOML file since the first cell are
# picked up, then read the output directories for feature selection.
config = load_config(CONFIG_PATH)

train_out_path = config["feature_selection"]["hyper"]["train_out_path"]
validate_out_path = config["feature_selection"]["hyper"]["validate_out_path"]

In [None]:
os.makedirs(f"{train_out_path}", exist_ok=True)
os.makedirs(f"{validate_out_path}", exist_ok=True)

In [None]:
config = load_config(CONFIG_PATH)
dbeta_info_file = config["feature_selection"]["hyper"]["dbeta_info_file"]
# dbeta_info_file is read as-is, the same way the RFE and Sec. 4 cells load it.
# This notebook defines majority_out_path_1 / majority_out_path_2 only, so there
# is no `majority_out_path` prefix to join onto.
dbeta_info = pd.read_csv(dbeta_info_file)

In [None]:
# check if logs/ folder exists
os.makedirs("logs", exist_ok=True)
from utils.train_helper import TrainHelper

In [None]:
# note that there is setup_dbeta in TrainHelper to further cut down the feature size
th = TrainHelper(dbeta_info)

#### 3.2 Selection(SFS)


In [None]:
# Load the training and validation tables and hand them to the TrainHelper.
# NOTE(review): majority_out_path, minority_out_path, majority_df_path and
# minority_df_path are not defined anywhere in this notebook (only
# majority_out_path_1 / _2 exist), so this cell raises NameError on a fresh
# kernel. The RFE section loads df_file_1 and the configured validate_df_file
# instead — confirm which inputs SFS should use.
train_df = pd.read_csv(f"{majority_out_path}/{majority_df_path}")
validate_df = pd.read_csv(f"{minority_out_path}/{minority_df_path}")
th.set_train_validate_df(train_df, validate_df)

In [None]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Estimators used for SFS, keyed by display name. All are seeded with
# random_state=42; the two ensembles use only 10 estimators here (the RFE
# section uses 50).
selection_models = {
    "SVM": SVC(kernel="linear", random_state=42),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "RandomForest": RandomForestClassifier(
        random_state=42,
        n_estimators=10,
    ),
    "XGBoost": XGBClassifier(
        random_state=42,
        n_estimators=10,
        ),
}

In [None]:
# Register the estimators, then prepare the train/validate inputs.
# NOTE(review): set_train_validate() is called without ID here, while the RFE
# cells pass ID="ID_TCGA" / "ID_GEO" — confirm the default fits the SFS input.
th.set_selection_models(selection_models)
th.set_train_validate()

In [None]:
# Run sequential forward selection and write the selected features to a text
# file under <out>/sfs.
# NOTE(review): `majority_out_path` is not defined in this notebook (only
# majority_out_path_1 / _2) — this cell raises NameError on a fresh kernel.
os.makedirs(f"{majority_out_path}/sfs", exist_ok=True)

th.select_feature_sfs(
    out_path = f"{majority_out_path}/sfs/selected_feature.txt",
    step= 4,
    n_features_to_select="cluster"
)

### Sec. 3 Feature Selection with ML (RFE)

recursive feature elimination


#### Remove previous results

Warning: This step is not reversible


In [None]:
# import shutil
# import os
# if os.path.exists(f"{majority_out_path}/section_3/rfe"):
#     shutil.rmtree(f"{majority_out_path}/section_3/rfe")
# if os.path.exists(f"{minority_out_path}/section_3/rfe"):
#     shutil.rmtree(f"{minority_out_path}/section_3/rfe")

#### 3.1 Preparation


In [None]:
# Reload the config and read the first-round RFE settings from the
# [feature_selection.hyper] table.
config = load_config(CONFIG_PATH)

train_out_path = config["feature_selection"]["hyper"]["train_out_path"]
validate_out_path = config["feature_selection"]["hyper"]["validate_out_path"]
training_param_file = config["feature_selection"]["hyper"]["training_param_file"]
# Make sure both output directories exist before anything is written to them.
os.makedirs(f"{train_out_path}", exist_ok=True)
os.makedirs(f"{validate_out_path}", exist_ok=True)

In [None]:
dbeta_info_file = config["feature_selection"]["hyper"]["dbeta_info_file"]
TSS_threshold = pd.read_csv(dbeta_info_file)

In [None]:
from utils.train_helper import TrainHelper
# note that there is setup_dbeta in TrainHelper to further cut down the feature size
th = TrainHelper(TSS_threshold)

#### 3.2 Selection


In [None]:
# from utils.process_norm import *

In [None]:
# First round: train on df_file_1 (from the init config) and validate on the
# file named in the feature_selection config.
train_df = pd.read_csv(df_file_1)
validate_df_file = config["feature_selection"]["hyper"]["validate_df_file"]
validate_df = pd.read_csv(validate_df_file)

In [None]:
# train_df = inspect_nan(train_df, mode="column", remove=True)
# train_df

In [None]:
th.set_train_validate_df(train_df, validate_df)

In [None]:
# from utils.train_helper import set_parameters
# from sklearn.svm import SVC
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier
# from xgboost import XGBClassifier

# import json

# with open(training_param_file, "r") as f:
#     training_param = json.load(f)
# xgb_grid = set_parameters(XGBClassifier(random_state=42), training_param["XGBoost"])
# rf_grid = set_parameters(
#     RandomForestClassifier(random_state=42), training_param["RandomForest"]
# )
# svm_grid = set_parameters(SVC(random_state=42, probability=True), training_param["SVM"])
# dt_grid = set_parameters(
#     DecisionTreeClassifier(random_state=42), training_param["DecisionTree"]
# )

# train_models = {
#     "XGBoost": xgb_grid,
#     "RandomForest": rf_grid,
#     "SVM": svm_grid,
#     "DecisionTree": dt_grid,
# }

In [None]:
# Imported in this cell so the RFE section runs on a fresh kernel without first
# executing the SFS section, which is the only other place these classes are
# imported.
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Estimators used for RFE, keyed by display name; all seeded with
# random_state=42 for repeatable runs.
selection_models = {
    "SVM": SVC(kernel="linear", random_state=42),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "RandomForest": RandomForestClassifier(
        random_state=42,
        n_estimators=50,
    ),
    "XGBoost": XGBClassifier(
        random_state=42,
        n_estimators=50,
    ),
}

In [None]:
# Register the estimators and prepare the train/validate inputs, using the
# "ID_TCGA" column as the ID (the joined dbeta table carries _TCGA / _GEO
# suffixed columns).
th.set_selection_models(selection_models)
# th.set_grid_estimators(train_models)
th.set_train_validate(ID="ID_TCGA")

In [None]:
# Run recursive feature elimination; results go to the two configured output
# directories and the selected features to a text file in train_out_path.
th.select_feature_rfe(
    train_out_path = train_out_path,
    validate_out_path = validate_out_path,
    selected_feature_path = f"{train_out_path}/selected_feature.txt",
    feature_range = "cluster"
)

#### 3.3 Preparation (second round)


In [None]:
# Second round: the same settings, read from [feature_selection_2.hyper].
# NOTE: this rebinds train_out_path / validate_out_path, so every later cell
# that uses them refers to the second-round directories.
config = load_config(CONFIG_PATH)

train_out_path = config["feature_selection_2"]["hyper"]["train_out_path"]
validate_out_path = config["feature_selection_2"]["hyper"]["validate_out_path"]
training_param_file = config["feature_selection_2"]["hyper"]["training_param_file"]
os.makedirs(f"{train_out_path}", exist_ok=True)
os.makedirs(f"{validate_out_path}", exist_ok=True)

In [None]:
# Second round: take dbeta_info_file from the feature_selection_2 table, the
# same table the second-round output paths and the second-round Sec. 4 cell
# read from.
dbeta_info_file = config["feature_selection_2"]["hyper"]["dbeta_info_file"]
TSS_threshold = pd.read_csv(dbeta_info_file)

In [None]:
from utils.train_helper import TrainHelper
# note that there is setup_dbeta in TrainHelper to further cut down the feature size
th = TrainHelper(TSS_threshold)

#### 3.4 Selection (second round)


In [None]:
# Second round: train on df_file_2 (from the init config) and validate on the
# file named in the feature_selection_2 config.
train_df = pd.read_csv(df_file_2)
validate_df_file = config["feature_selection_2"]["hyper"]["validate_df_file"]
validate_df = pd.read_csv(validate_df_file)

In [None]:
th.set_train_validate_df(train_df, validate_df)

In [None]:
# Imported in this cell so the second RFE round runs on a fresh kernel without
# first executing the SFS section, which is the only other place these classes
# are imported.
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Fresh (unfitted) estimators for the second round, keyed by display name; all
# seeded with random_state=42 for repeatable runs.
selection_models = {
    "SVM": SVC(kernel="linear", random_state=42),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "RandomForest": RandomForestClassifier(
        random_state=42,
        n_estimators=50,
    ),
    "XGBoost": XGBClassifier(
        random_state=42,
        n_estimators=50,
    ),
}

In [None]:
th.set_selection_models(selection_models)
# th.set_grid_estimators(train_models)
th.set_train_validate(ID="ID_GEO")

In [None]:
th.select_feature_rfe(
    train_out_path = train_out_path,
    validate_out_path = validate_out_path,
    selected_feature_path = f"{train_out_path}/selected_feature.txt",
    feature_range = "cluster"
)

### Sec. 4 Clean Selected Features


#### 4.1 Generate Feature json for SimpleModel


In [None]:
from utils.train_helper import read_selected_features, read_selected_features_json, TrainHelper
# Build a TrainHelper on the first-round dbeta table, then turn the RFE
# text output into a JSON file stored next to it.
config = load_config(CONFIG_PATH)
dbeta_info_file = config["feature_selection"]["hyper"]["dbeta_info_file"]
dbeta_info = pd.read_csv(dbeta_info_file)
th = TrainHelper(dbeta_info)
method = "rfe"
# NOTE(review): the RFE cells write selected_feature.txt to the configured
# train_out_path; this cell assumes that path equals
# <majority_out_path_1>/section_3/rfe — confirm against the config.
features = read_selected_features(f"{majority_out_path_1}/section_3/{method}/selected_feature.txt")
th.generate_selected_features(
    features,
    f"{majority_out_path_1}/section_3/{method}/selected_features.json",
    mode="min",
    out_format="json",
)

# Read the file back and display it as a check.
read_selected_features_json(f"{majority_out_path_1}/section_3/{method}/selected_features.json")

In [None]:
from utils.train_helper import read_selected_features, read_selected_features_json, TrainHelper
# Same as the previous cell, for the second round: dbeta table from
# feature_selection_2 and paths under majority_out_path_2.
config = load_config(CONFIG_PATH)
dbeta_info_file = config["feature_selection_2"]["hyper"]["dbeta_info_file"]
dbeta_info = pd.read_csv(dbeta_info_file)
th = TrainHelper(dbeta_info)
method = "rfe"
features = read_selected_features(f"{majority_out_path_2}/section_3/{method}/selected_feature.txt")
th.generate_selected_features(
    features,
    f"{majority_out_path_2}/section_3/{method}/selected_features.json",
    mode="min",
    out_format="json",
)

# use this to read json
read_selected_features_json(f"{majority_out_path_2}/section_3/{method}/selected_features.json")

#### 4.2 Gather Selected gene list from best selection model


In [None]:
# Columns that identify one RFE run; every merge below joins on them.
merge_keys = ["selection_model", "train_model", "features"]

# Metric tables and ROC-curve tables written by the RFE step, for the training
# and validation sets.
rfe_train = pd.read_csv(f"{train_out_path}/rfe.csv")
rfe_validate = pd.read_csv(f"{validate_out_path}/rfe.csv")
fpr_tpr_train = pd.read_csv(f"{train_out_path}/roc_curve.csv")
fpr_tpr_validate = pd.read_csv(f"{validate_out_path}/roc_curve.csv")

# Pair train with validate (overlapping columns get _train / _validate
# suffixes), then combine metrics and ROC data into one table.
rfe_j = pd.merge(
    rfe_train,
    rfe_validate,
    on=merge_keys,
    suffixes=('_train', '_validate'),
)
fpr_tpr_j = pd.merge(
    fpr_tpr_train,
    fpr_tpr_validate,
    on=merge_keys,
    suffixes=('_train', '_validate'),
)
J = pd.merge(rfe_j, fpr_tpr_j, on=merge_keys)

In [None]:
import ast

# The ROC columns come back from CSV as strings; parse each one into a Python
# object with ast.literal_eval.
for roc_column in ("fpr_train", "tpr_train", "fpr_validate", "tpr_validate"):
    J[roc_column] = J[roc_column].apply(ast.literal_eval)

In [None]:
from utils.painter import plot_roc_curve, create_performance_barchart

In [None]:
# Train-minus-validate gap for each performance metric, stored as <metric>_diff.
for metric in ("accuracy", "recall", "f1_score", "AUC", "MCC", "fbeta2_score"):
    J[f"{metric}_diff"] = J[f"{metric}_train"] - J[f"{metric}_validate"]

In [None]:
# ROC curves for the training set; one trace per row of J, named from the
# selection model, train model and feature columns.
# tweakable width and height
plot_roc_curve(
    J,
    "ROC Curves on Training Set",
    f"{train_out_path}/roc_curve.html",
    x_column = "fpr_train",
    y_column = "tpr_train",
    trace_name = ["selection_model", "train_model", "features"],
)
# Same plot for the validation set (titled "Testing Set").
# tweakable width and height
plot_roc_curve(
    J,
    "ROC Curves on Testing Set",
    f"{validate_out_path}/roc_curve.html",
    x_column = "fpr_validate",
    y_column = "tpr_validate",
    trace_name = ["selection_model", "train_model", "features"],
)

In [None]:
# Average train-minus-validate gap per train model, saved as CSV and drawn as a
# horizontal bar chart.
performance_metrics = [
    'accuracy_diff',
    'recall_diff',
    'f1_score_diff',
    'AUC_diff',
    'MCC_diff',
    'fbeta2_score_diff',
]
ground_by_train_model = J.groupby('train_model')[performance_metrics].mean()
# Copy the group key into a regular column because the CSV is written without
# its index.
ground_by_train_model['train_model'] = ground_by_train_model.index
ground_by_train_model.to_csv(
    f"{validate_out_path}/performance_diff_grouped_by_train_model.csv",
    index=False,
)
# One colour per metric, in the same order as performance_metrics.
color_mapping = dict(
    zip(
        performance_metrics,
        ["blue", "red", "green", "purple", "orange", "brown"],
    )
)
create_performance_barchart(
    df=ground_by_train_model,
    color_mapping=color_mapping,
    metric="train_model",
    out_path=f"{validate_out_path}/performance_diff_grouped_by_train_model.html",
    title="Grouped Performance Difference between Training and Testing Set",
    x_axis_label="Performance Difference (Training - Testing)",
    y_axis_label="Train Model",
    orientation="h",
)

In [None]:
# Keep only the identifying columns and the validation-set metrics.
J = J[
    ["selection_model", "train_model", "features"]
    + [
        f"{metric}_validate"
        for metric in ("accuracy", "recall", "f1_score", "AUC", "MCC", "fbeta2_score")
    ]
]

In [None]:
# group by train_model, for each train_model, calculate the mean of each performance metric
# NOTE(review): fbeta2_score_validate is kept in J but is not in this list, so
# it is not averaged or plotted — confirm that is intended.
performance_metrics = ['accuracy_validate', 'recall_validate',
                       'f1_score_validate', 'AUC_validate', 'MCC_validate']
ground_by_train_model = J.groupby('train_model')[performance_metrics].mean()
# Copy the group key into a column because the CSV is written without its index.
ground_by_train_model['train_model'] = ground_by_train_model.index
ground_by_train_model.to_csv(
    f"{validate_out_path}/performance_metrics_grouped_by_train_model.csv", index=False)
color_mapping = {
    "accuracy_validate": "blue",
    "recall_validate": "red",
    "f1_score_validate": "green",
    "AUC_validate": "purple",
    "MCC_validate": "orange",
}
create_performance_barchart(
    df=ground_by_train_model,
    color_mapping=color_mapping,
    metric="train_model",
    out_path=f"{validate_out_path}/performance_metrics_grouped_by_train_model.html",
    title="Grouped Performance Metrics by Train Model",
    x_axis_label="Performance",
    y_axis_label="Train Model",
    orientation="h",
)
# The best train model is the one with the highest mean validation MCC.
best_train_model = ground_by_train_model['MCC_validate'].idxmax()
print(f"Best train model: {best_train_model}")
# Within the best train model, average the metrics per value of `features`.
ground_by_feature = J[J['train_model'] == best_train_model].groupby('features')[
    performance_metrics].mean()
ground_by_feature['features'] = ground_by_feature.index
ground_by_feature.to_csv(
    f"{validate_out_path}/performance_metrics_grouped_by_feature.csv", index=False)
create_performance_barchart(
    df=ground_by_feature,
    color_mapping=color_mapping,
    metric="features",
    out_path=f"{validate_out_path}/performance_metrics_grouped_by_feature.html",
    title="Grouped Performance Metrics by Feature",
    x_axis_label="Performance",
    y_axis_label="Feature",
    orientation="h",
)
# The best `features` value is again chosen by highest mean validation MCC.
best_num_of_feature = ground_by_feature['MCC_validate'].idxmax()
print(f"Best number of feature: {best_num_of_feature}")
# Save every record matching both the best train model and the best feature value.
best_performance_records = J[(J['train_model'] == best_train_model) & (
    J['features'] == best_num_of_feature)]
best_performance_records.to_csv(
    f"{validate_out_path}/best_performance_records.csv", index=False)

In [None]:
from utils.train_helper import read_selected_features, read_selected_features_json, TrainHelper
# Write the selected features for the best feature count found above.
# NOTE(review): the dbeta table always comes from the "feature_selection"
# table, while train_out_path / validate_out_path hold whatever the most
# recently run preparation cell assigned (the second round, when the notebook
# is run top to bottom) — confirm the two match.
config = load_config(CONFIG_PATH)
dbeta_info_file = config["feature_selection"]["hyper"]["dbeta_info_file"]
dbeta_info = pd.read_csv(dbeta_info_file)
th = TrainHelper(dbeta_info)

features = read_selected_features(f"{train_out_path}/selected_feature.txt")

th.generate_selected_features(
    features,
    f"{validate_out_path}/selected_features.json",
    mode=int(best_num_of_feature),
    out_format="json",
)

# use this to read json
read_selected_features_json(f"{validate_out_path}/selected_features.json")

### Sec. 5 Clustering Visualization


#### 1. load data

Remember to calculate the distance matrices first.


In [None]:
import pandas as pd
import numpy as np
from utils.clustering_helper import hierarchical_clustering, check_distance_matrix

In [None]:
# Unpack the clustering inputs and output locations from
# config["clustering_visual"]["hyper"], in the order the names are listed.
(
    result_prefix,
    dbeta_file,
    bp_file,
    cc_file,
    mf_file,
    terms_count_file,
    result_out_path,
) = (
    config["clustering_visual"]["hyper"][key]
    for key in (
        "result_prefix",
        "dbeta_file",
        "bp_file",
        "cc_file",
        "mf_file",
        "terms_count_file",
        "result_out_path",
    )
)

In [None]:
os.makedirs(result_out_path, exist_ok=True)

In [None]:
# Load the gene table, the three distance matrices (one each from bp_file,
# cc_file and mf_file) and the term counts; the first CSV column becomes the
# index in every case.
gene_set = pd.read_csv(dbeta_file, index_col=0)
distance_matrix_bp = pd.read_csv(bp_file, index_col=0)
distance_matrix_cc = pd.read_csv(cc_file, index_col=0)
distance_matrix_mf = pd.read_csv(mf_file, index_col=0)
terms_count = pd.read_csv(terms_count_file, index_col=0)

In [None]:
# replace NaN with 0
# NOTE(review): after this fill a missing distance is indistinguishable from a
# true distance of 0, and the NaN masks built in the weighted-sum cells are
# all True — confirm that missing pairs should count as "identical".
distance_matrix_bp = distance_matrix_bp.fillna(0)
distance_matrix_cc = distance_matrix_cc.fillna(0)
distance_matrix_mf = distance_matrix_mf.fillna(0)

In [None]:
# Reindex every distance matrix onto the union of the three gene indexes so
# they share one shape and ordering. Rows/columns for genes a matrix does not
# contain are created with the value 0.
index_bp = distance_matrix_bp.index
index_cc = distance_matrix_cc.index
index_mf = distance_matrix_mf.index
index = index_bp.union(index_cc).union(index_mf)
distance_matrix_bp_ = distance_matrix_bp.reindex(index=index, columns=index, fill_value=0)
distance_matrix_cc_ = distance_matrix_cc.reindex(index=index, columns=index, fill_value=0)
distance_matrix_mf_ = distance_matrix_mf.reindex(index=index, columns=index, fill_value=0)

In [None]:
# Collect the reindexed distance matrices in a list, in the order bp, cc, mf.
distance_matrix = [
    distance_matrix_bp_,
    distance_matrix_cc_,
    distance_matrix_mf_,
]

#### 2. Weighted Sum


In [None]:
# Per-ontology weights proportional to the term counts, normalized to sum to 1.
weight = [count for count in terms_count["count"]]
weight = weight / np.sum(weight)

# True where a matrix has a usable (non-NaN) distance. Earlier cells replace
# NaN with 0, so in the current pipeline every entry is True; the masking only
# takes effect if that fill is removed.
masks = np.array([~np.isnan(distance_matrix[i].values) for i in range(3)])

# Weight of each ontology at each gene pair, zeroed where the distance is missing.
valid_weights = np.array([weight[i] for i in range(3)])[:, None, None] * masks

weight_sums = valid_weights.sum(axis=0)

# Renormalize so the weights at each gene pair sum to 1. `out` pre-fills the
# result with zeros: with `where=` alone, positions where weight_sums == 0
# would be left uninitialized and could leak arbitrary values into the sum.
normalized_weights = np.divide(
    valid_weights,
    weight_sums,
    out=np.zeros_like(valid_weights, dtype=float),
    where=weight_sums != 0,
)
weighted_sum = sum(
    np.nan_to_num(distance_matrix[i].values) * normalized_weights[i] for i in range(3)
)


weighted_sum_dataframe = pd.DataFrame(weighted_sum, index=index, columns=index)

weighted_sum_dataframe.head()

In [None]:
# Hierarchical clustering on the weighted-sum distances; the figure is saved to
# out_path and the returned table is merged on "gene" in the Compare section.
cluster_result_weighted = hierarchical_clustering(
    weighted_sum_dataframe,
    range_min=2,
    range_max=4,
    cluster_number=3,
    out_path=f"{result_out_path}/hierarchical_clustering_weighted_sum.png",
)

In [None]:
cluster_result_weighted.head()

#### 3. Simple average


In [None]:
# Equal weight for each of the three ontologies.
weight = [1, 1, 1]
# True where a matrix has a usable (non-NaN) distance. Earlier cells replace
# NaN with 0, so in the current pipeline every entry is True.
masks = np.array([~np.isnan(distance_matrix[i].values) for i in range(3)])
valid_weights = np.array([weight[i] for i in range(3)])[:, None, None] * masks
weight_sums = valid_weights.sum(axis=0)
# Renormalize so the weights at each gene pair sum to 1. `out` pre-fills the
# result with zeros: with `where=` alone, positions where weight_sums == 0
# would be left uninitialized.
normalized_weights = np.divide(
    valid_weights,
    weight_sums,
    out=np.zeros_like(valid_weights, dtype=float),
    where=weight_sums != 0,
)
weighted_sum = sum(
    np.nan_to_num(distance_matrix[i].values) * normalized_weights[i] for i in range(3)
)
simple_sum_dataframe = pd.DataFrame(weighted_sum, index=index, columns=index)
simple_sum_dataframe.head()

In [None]:
cluster_result_simple = hierarchical_clustering(
    simple_sum_dataframe,
    range_min=2,
    range_max=4,
    cluster_number=3,
    out_path=f"{result_out_path}/hierarchical_clustering_simple_sum.png",
)

In [None]:
cluster_result_simple.head()

#### 4. Consensus clustering


In [None]:
# Cluster each ontology on its own. These calls use the NaN-filled matrices
# before reindexing (no trailing underscore), so each result covers only the
# genes present in that ontology's matrix. range_min / range_max /
# cluster_number are not passed, so the helper's defaults apply.
cluster_bp = hierarchical_clustering(
    distance_matrix_bp, out_path=f"{result_out_path}/hierarchical_clustering_bp.png"
)
cluster_cc = hierarchical_clustering(
    distance_matrix_cc, out_path=f"{result_out_path}/hierarchical_clustering_cc.png"
)
cluster_mf = hierarchical_clustering(
    distance_matrix_mf, out_path=f"{result_out_path}/hierarchical_clustering_mf.png"
)

In [None]:
# Give each result distinct column names (assumes hierarchical_clustering
# returns exactly two columns: gene, cluster — TODO confirm), then outer-join
# them so every gene seen in any ontology gets a row.
cluster_bp.columns = ["gene", "cluster_bp"]
cluster_cc.columns = ["gene", "cluster_cc"]
cluster_mf.columns = ["gene", "cluster_mf"]
cluster_bp_cc = pd.merge(cluster_bp, cluster_cc, on="gene", how="outer")
cluster_go = pd.merge(cluster_bp_cc, cluster_mf, on="gene", how="outer")
# Genes missing from an ontology get label -1. NOTE(review): two genes that are
# both missing from the same ontology then compare equal in the consensus step
# and count as co-clustered — confirm that is intended.
cluster_go = cluster_go.fillna(-1)
print(cluster_go.shape)
cluster_go.head()

In [None]:
# For every pair of genes, count in how many of the three ontologies they share
# a cluster label (0-3). Each column is compared against itself with NumPy
# broadcasting, which replaces a Python double loop of per-element .iloc
# lookups and gives the same pairwise counts.
cluster_labels = cluster_go[["cluster_bp", "cluster_cc", "cluster_mf"]].to_numpy()
num_genes = cluster_labels.shape[0]
agreement_counts = np.zeros((num_genes, num_genes))
for ontology_pos in range(cluster_labels.shape[1]):
    ontology_labels = cluster_labels[:, ontology_pos]
    agreement_counts += ontology_labels[:, None] == ontology_labels[None, :]

consensus_matrix = pd.DataFrame(
    agreement_counts, index=cluster_go["gene"], columns=cluster_go["gene"]
)

# Distance = share of ontologies in which the pair is NOT co-clustered. The
# diagonal is zeroed on the NumPy array before wrapping it in a DataFrame, so
# nothing depends on DataFrame.values being a writable view.
consensus_distance = 1 - agreement_counts / 3
np.fill_diagonal(consensus_distance, 0)
distance_matrix_consensus = pd.DataFrame(
    consensus_distance, index=cluster_go["gene"], columns=cluster_go["gene"]
)
distance_matrix_consensus.head()

In [None]:
cluster_result_consensus = hierarchical_clustering(
    distance_matrix_consensus,
    range_min=2,
    range_max=4,
    cluster_number=3,
    out_path=f"{result_out_path}/hierarchical_clustering_consensus.png",
)

In [None]:
cluster_result_consensus.head()

#### 5. Compare


In [None]:
from utils.clustering_helper import hierarchical_clustering_compare

# Compare the three combined distance matrices in one figure; the labels are
# given in the same order as the matrices.
hierarchical_clustering_compare(
    [weighted_sum_dataframe, simple_sum_dataframe, distance_matrix_consensus],
    ["Weighted Average", "Simple Average", "Consensus"],
    out_path=f"{result_out_path}/hierarchical_clustering_compare.png",
    range_min=2,
    range_max=4,
)

In [None]:
dbeta_info = pd.read_csv(dbeta_file)

In [None]:
# Keep only the dbeta rows whose gene appears in the index of each combined
# distance matrix.
weighted_dbeta = dbeta_info[dbeta_info["gene"].isin(weighted_sum_dataframe.index)]
simple_dbeta = dbeta_info[dbeta_info["gene"].isin(simple_sum_dataframe.index)]
consensus_dbeta = dbeta_info[dbeta_info["gene"].isin(distance_matrix_consensus.index)]

In [None]:
# Attach each clustering result to its dbeta rows (joined on "gene") and write
# one CSV per method, named <result_prefix>_<method>.csv.
weighted_dbeta.merge(cluster_result_weighted, on="gene").to_csv(
    f"{result_out_path}/{result_prefix}_weighted.csv", index=False
)
simple_dbeta.merge(cluster_result_simple, on="gene").to_csv(
    f"{result_out_path}/{result_prefix}_simple.csv", index=False
)
consensus_dbeta.merge(cluster_result_consensus, on="gene").to_csv(
    f"{result_out_path}/{result_prefix}_consensus.csv", index=False
)

### Sec. 6 SimpleModel Training


#### 1. Load Data


In [None]:
from utils.train_helper import read_selected_features_json
import pandas as pd
from utils.process_norm import *

In [None]:
# Unpack the SimpleModel inputs and output locations from
# config["simple_model"]["hyper"], in the order the names are listed.
(
    dbeta_file,
    selected_feature_file,
    selected_feature_file_2,
    train_out_path,
    validate_out_path,
    df_train_file,
    df_test_file,
    df_train_file_2,
    df_test_file_2,
    training_param_file,
) = (
    config["simple_model"]["hyper"][key]
    for key in (
        "dbeta_file",
        "selected_feature_file",
        "selected_feature_file_2",
        "train_out_path",
        "validate_out_path",
        "df_train_file",
        "df_test_file",
        "df_train_file_2",
        "df_test_file_2",
        "training_param_file",
    )
)

In [None]:
# Selected gene lists for the two datasets, as written by Sec. 4.
selected_feature = read_selected_features_json(selected_feature_file)
selected_feature_2 = read_selected_features_json(selected_feature_file_2)
# `dbeta_file` is rebound from the path to the loaded table (the training cells
# pass it as dbeta_info). The path is read from config rather than from the
# variable itself, so re-running this cell does not hand the DataFrame back to
# read_csv.
dbeta_file = pd.read_csv(config["simple_model"]["hyper"]["dbeta_file"])

In [None]:
train = pd.read_csv(df_train_file)
test = pd.read_csv(df_test_file)

In [None]:
# Column-wise NaN inspection with remove=True; the result replaces `train`.
# NOTE(review): `test` is not passed through the same step — confirm that is
# intended.
train = inspect_nan(train, mode="column", remove=True)

In [None]:
train_2 = pd.read_csv(df_train_file_2)
test_2 = pd.read_csv(df_test_file_2)

In [None]:
# Combine the two training tables into one.
# NOTE(review): `train` is rebound in place, so re-running this cell alone
# merges train_2 into an already merged frame; re-run from the load cell.
train = merge_datasets(train, train_2)
train

In [None]:
test = merge_datasets(test, test_2)
test

#### 2. Training


In [None]:
from utils.train_helper import set_parameters
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

import json
from sklearn.ensemble import VotingClassifier


# JSON is UTF-8 by specification; pass the encoding explicitly so the read does
# not depend on the platform's locale default.
with open(training_param_file, "r", encoding="utf-8") as f:
    training_param = json.load(f)

# Wrap each base estimator with its parameter set from the JSON file
# (presumably a grid search — the models dict flags these as is_grid_search).
xgb_grid = set_parameters(XGBClassifier(random_state=42), training_param["XGBoost"])
rf_grid = set_parameters(RandomForestClassifier(random_state=42), training_param["RandomForest"])
svm_grid = set_parameters(SVC(random_state=42, probability=True), training_param["SVM"])
dt_grid = set_parameters(DecisionTreeClassifier(random_state=42), training_param["DecisionTree"])

# Soft-voting ensemble over freshly constructed estimators of the same four
# types; these do not use the parameter sets above. SVC is created with
# probability=True so it can supply class probabilities for soft voting.
voting = VotingClassifier(
    estimators=[
        ("XGBoost", XGBClassifier(random_state=42)),
        ("RandomForest", RandomForestClassifier(random_state=42)),
        ("SVM", SVC(random_state=42, probability=True)),
        ("DecisionTree", DecisionTreeClassifier(random_state=42)),
    ],
    voting="soft",
)

# Models to train, keyed by display name. "is_grid_search" is forwarded to
# SimpleModel.train together with the model object.
# comment out the model you don't want to use
models = {
    "XGBoost": {
        "is_grid_search": True,
        "model": xgb_grid,
    },
    "RandomForest": {
        "is_grid_search": True,
        "model": rf_grid,
    },
    "SVM": {
        "is_grid_search": True,
        "model": svm_grid,
    },
    "DecisionTree": {
        "is_grid_search": True,
        "model": dt_grid,
    },
    "Voting": {
        "is_grid_search": False,
        "model": voting,
    },
}

In [None]:
from utils.simple_model import SimpleModel

# Train every configured model on each selected gene list, using the TCGA ID
# column. The key of `selected_feature` is bound to its own name so the inner
# loop's `model_name` does not overwrite it.
# NOTE(review): train() receives only model_name and the same output paths for
# every gene list, so results for different gene lists may overwrite each
# other — confirm in SimpleModel.train.
for selection_name, gene_list in selected_feature.items():
    for model_name, model_config in models.items():
        # A fresh SimpleModel per (gene list, model) pair.
        model = SimpleModel(
            train_df=train,
            test_df=test,
            gene_list=gene_list,
            dbeta_info=dbeta_file,
        )
        model.setup_dbeta()
        model.setup_train_test("ID_TCGA")
        model.setup_combinations()
        model.train(
            model_name,
            model_config["model"],
            train_out_path,
            validate_out_path,
            model_config["is_grid_search"],
            "ID_TCGA"
        )

In [None]:
from utils.simple_model import SimpleModel

# Train every configured model on each gene list from the second selection,
# using the GEO ID column. The key of `selected_feature_2` is bound to its own
# name so the inner loop's `model_name` does not overwrite it.
# NOTE(review): this loop writes to the same train_out_path /
# validate_out_path as the TCGA loop, with the same model names — confirm the
# outputs do not overwrite each other.
for selection_name, gene_list in selected_feature_2.items():
    for model_name, model_config in models.items():
        # A fresh SimpleModel per (gene list, model) pair.
        model = SimpleModel(
            train_df=train,
            test_df=test,
            gene_list=gene_list,
            dbeta_info=dbeta_file,
        )
        model.setup_dbeta()
        model.setup_train_test("ID_GEO")
        model.setup_combinations()
        model.train(
            model_name,
            model_config["model"],
            train_out_path,
            validate_out_path,
            model_config["is_grid_search"],
            "ID_GEO"
        )

#### 3. Visualization


##### Congratulations! You have finished the whole pipeline 🎉. <br>

![title](cat.png)
