### Sec. -1 Initialization

In [1]:
import os
import pandas as pd
from utils.config_helper import update_nested_toml, load_config
os.makedirs("logs", exist_ok=True)


In [2]:
TYPE = input("Enter the type of the config file: ")
CONFIG_PATH = f"../config/{TYPE}.toml"
config = load_config(CONFIG_PATH)

In [3]:
# Look up the [init.hyper] table once, then bind each setting to the
# module-level name that later cells read.
_init_hyper = config["init"]["hyper"]
split_ratio = _init_hyper["split_ratio"]
train_df_file = _init_hyper["train_df_file"]
validate_df_file = _init_hyper["validate_df_file"]
test_df_file = _init_hyper["test_df_file"]
dmp_files1 = _init_hyper["dmp_files1"]
dmp_files2 = _init_hyper["dmp_files2"]
majority_out_path = _init_hyper["majority_out_path"]
minority_out_path = _init_hyper["minority_out_path"]

### Sec. 0 process norm examples

#### 1. Split Dataset Example

In [15]:
import json
from utils.process_norm import *

In [154]:
# read the json file
cancer_type = input("Enter the cancer type: ")
with open(f"../config/{cancer_type}.json") as f:
    J = json.load(f)

In [156]:
data_source = input("Enter the raw all_beta_normalized file: ")
df = pd.read_csv(J[data_source]['file'])


In [None]:
# Settings for the chosen data source, taken from the cancer-type JSON.
_source_cfg = J[data_source]
dfo = organize_dataset(
    df,
    _source_cfg["normal"],
    _source_cfg["tumor"],
    _source_cfg["sample_count"],
)
complement_df, ratio_df = split_dataset(
    dfo, _source_cfg["split_test"], _source_cfg["random_state"]
)
# Write both parts of the split into the current working directory.
for _part, _csv_name in (
    (complement_df, "all_beta_normalized_complement.csv"),
    (ratio_df, "all_beta_normalized_ratio.csv"),
):
    _part.to_csv(_csv_name, index=False)

#### 2. Merge Dataset Example

In [3]:
# Both example inputs live in the same ChAMP result folder.
_champ_dir = "../stomach/champ_result/gdc_stomach_GSE99553"
df0 = pd.read_csv(_champ_dir + "/all_beta_normalized_0.csv")
df1 = pd.read_csv(_champ_dir + "/all_beta_normalized_GSE99553.csv")

In [4]:
df0 = organize_dataset(df0, 2, 395, 2)
df1 = organize_dataset(df1, 84, 0, 1)

In [None]:
merged_df = merge_datasets(df0, df1)
merged_df.to_csv("merged.csv", index=False)

#### 3. Inspect NaN Example

In [None]:
# Example usage:
df = pd.DataFrame({
    'ID': ['A1', 'A2', 'A3', 'A4', 'label'],
    'Col1': [1, 2, None, 4, 1],
    'Col2': [5, None, 7, 8, 1],
    'Col3': [9, 10, 11, 12, 0]
})

# Column-wise check
print(inspect_nan(df, mode="column"))

# Row-wise check
print(inspect_nan(df, mode="row"))

### Sec. 1 Delta Beta Calculation

In [None]:
os.makedirs(f"{majority_out_path}/section_1", exist_ok=True)

In [4]:
train_df = pd.read_csv(train_df_file)

In [5]:
# Column-wise outlier masking with Tukey's fences (1.5 x inter-quartile range).
def IQR(df):
    """Return the Tukey outlier fences of every column of ``df``.

    Args:
        df: DataFrame of numeric values; quartiles are computed per column.

    Returns:
        ``(upper_fence, lower_fence)`` — in that order — as per-column Series,
        where ``upper_fence = Q3 + 1.5 * (Q3 - Q1)`` and
        ``lower_fence = Q1 - 1.5 * (Q3 - Q1)``.
    """
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    # Named `spread` rather than `IQR` so the local does not shadow this function.
    spread = Q3 - Q1
    upper_fence = Q3 + spread * 1.5
    lower_fence = Q1 - spread * 1.5
    return upper_fence, lower_fence


def no_outlier(df):
    """Mask the outliers of every column of ``df`` with NaN.

    A value is kept when it lies within its column's fences, bounds included.
    The comparison is inclusive on purpose: with strict inequalities a column
    whose inter-quartile range is zero (for example a constant column) has
    ``lower_fence == upper_fence`` and every one of its values would be masked.

    Args:
        df: DataFrame of numeric values.

    Returns:
        A DataFrame with the same shape and labels as ``df`` in which values
        outside the fences are replaced by NaN.
    """
    upper_fence, lower_fence = IQR(df)
    within_fences = (df >= lower_fence) & (df <= upper_fence)
    return df[within_fences]

In [6]:
# The last row carries one class label per sample column; the number of
# zeros in it is the number of normal samples.
_label_row = train_df.iloc[-1, 1:]
normal_count = int((_label_row == 0).sum())

# NOTE(review): the positional slices below assume the normal samples occupy
# the first `normal_count` sample columns — confirm against how the training
# file is written.
_beta_rows = train_df.iloc[:-1]
all_beta_normalized_normal = _beta_rows.iloc[:, 1 : normal_count + 1].T

all_beta_normalized_tumor = _beta_rows.iloc[:, normal_count + 1 :].T

In [7]:
all_beta_normalized_normal = no_outlier(all_beta_normalized_normal)
all_beta_normalized_tumor = no_outlier(all_beta_normalized_tumor)

In [8]:
train_normal_avg = all_beta_normalized_normal.mean(skipna=True, axis=0)

In [9]:
all_beta_normalized_tumor = all_beta_normalized_tumor.subtract(
    train_normal_avg, axis=1
)

In [10]:
all_beta_normalized_tumor = no_outlier(all_beta_normalized_tumor)

In [11]:
train_tumor_mean = all_beta_normalized_tumor.mean(skipna=True, axis=0)

In [12]:
# Attach the mean tumour delta-beta to the probe-ID column by joining on the
# row index.
_probe_ids = train_df.iloc[:-1, :1]
_mean_dbeta = pd.DataFrame(train_tumor_mean, columns=["dbeta"])
delta_beta = _probe_ids.merge(_mean_dbeta, left_index=True, right_index=True)


In [13]:
# Log the probe count, the probes whose dbeta is NaN, and the probe count
# after those rows are dropped.
_dbeta_is_nan = delta_beta["dbeta"].isna()
update_nested_toml(
    "preprocess.dbeta", "delta_beta_avg_feature_num", len(delta_beta)
)
update_nested_toml(
    "preprocess.dbeta",
    "NaN_dbeta_feature",
    delta_beta.loc[_dbeta_is_nan, "Unnamed: 0"].tolist(),
)
delta_beta = delta_beta.dropna()
update_nested_toml(
    "preprocess.dbeta", "delta_beta_avg_feature_num_remove_NaN", len(delta_beta)
)

In [14]:
dmp = pd.read_csv(dmp_files1)
dmp = dmp[["Unnamed: 0", "gene", "feature"]]

In [15]:
# Record the DMP row count before and after dropping rows with a missing field.
_dmp_rows_before = len(dmp)
update_nested_toml(
    "preprocess.dbeta", "dmp_before_dropna_shape_feature", _dmp_rows_before
)

dmp = dmp.dropna()

update_nested_toml("preprocess.dbeta", "dmp_after_dropna_shape_feature", len(dmp))

In [16]:
result = pd.merge(delta_beta, dmp, on="Unnamed: 0", how="left")
update_nested_toml(
    "preprocess.dbeta", "delta_beta_avg_feature_num_remove_NaN_join_dmp", result.shape[0]
)

In [17]:
def find_max_dBeta_grouped(group):
    """Return the row of ``group`` whose ``dbeta`` has the largest magnitude.

    Args:
        group: DataFrame with a numeric ``dbeta`` column.

    Returns:
        The matching row as a Series, selected by index label.
    """
    return group.loc[group["dbeta"].abs().idxmax()]


dbeta_info = result.groupby("gene", as_index=False).apply(
    find_max_dBeta_grouped, include_groups=False
)

In [18]:
dbeta_info.columns = ["gene", "ID", "dbeta", "feature"]
dbeta_info = dbeta_info[["ID", "gene", "dbeta", "feature"]]

In [19]:
dbeta_info

Unnamed: 0,ID,gene,dbeta,feature
0,cg03630821,A1BG,0.253155,Body
1,cg27394794,A1CF,-0.294116,Body
2,cg07027430,A2BP1,0.328562,Body
3,cg01723761,A2LD1,-0.043872,TSS200
4,cg11139127,A2M,-0.155606,Body
...,...,...,...,...
18637,cg03489712,ZYX,-0.144462,TSS1500
18638,cg21851534,ZZEF1,0.138291,3'UTR
18639,cg10895547,ZZZ3,0.097124,Body
18640,cg20009101,psiTPTE22,0.294720,Body


In [None]:
# comorbidity = pd.read_csv(
#     "../external_result/matchgene174_single_3Y10__OR2.txt", sep="\t", header=None
# )
# dbeta = dbeta[
#     dbeta["gene"].isin(comorbidity[0])
# ]

# result_max_per_gene_single

In [None]:
dbeta_info["dbeta"] = dbeta_info["dbeta"].apply(lambda x: round(x, 6))
dbeta_info.to_csv(f"{majority_out_path}/section_1/dbeta.csv", index=False)

### Sec. 2 Filter Genes by Average Delta Beta Values


#### 2.1 Filtering TSS

In [None]:
os.makedirs(f"{majority_out_path}/section_2", exist_ok=True)

In [None]:
# dbeta_info = pd.read_csv(f"{majority_out_path}/dbeta.csv")

In [21]:
TSS = dbeta_info[dbeta_info["feature"].str.contains("TSS")]

In [22]:
TSS.to_csv(f"{majority_out_path}/section_2/dbeta_TSS.csv", index=False)

#### 2.2 Thresholding

In [23]:
# Lower the |dbeta| cut-off from 1.00 in 0.01 steps until the number of TSS
# probes that pass lies inside the configured [lower, upper] window.
_filter_hyper = config["preprocess"]["filtering"]["hyper"]
_min_count = _filter_hyper["avg_dbeta_lower_bound"]
_max_count = _filter_hyper["avg_dbeta_upper_bound"]

dbeta_TSS_threshold = None
# Integer steps avoid the drift of a repeated `threshold -= 0.01`, so the
# cut-off applied to the data is exactly the two-decimal value that is
# recorded below and reused in the output file names.
for _step in range(100, -1, -1):
    threshold = _step / 100
    _passing = TSS[TSS["dbeta"].abs() > threshold]
    count = _passing.shape[0]
    if _min_count <= count <= _max_count:
        dbeta_TSS_threshold = _passing
        break
    if count > _max_count:
        # The count can only grow as the cut-off drops, so the window has been
        # stepped over and will never be hit; stop instead of looping forever.
        break

if dbeta_TSS_threshold is None:
    raise ValueError(
        "No |dbeta| threshold in [0, 1] (step 0.01) yields a TSS probe count "
        f"within [{_min_count}, {_max_count}]; last count was {count} "
        f"at threshold {threshold}."
    )
update_nested_toml("preprocess.filtering", "threshold", threshold)

In [24]:
dbeta_TSS_threshold.to_csv(f"{majority_out_path}/section_2/dbeta_TSS_{threshold}.csv", index=False)

#### 2.3 Visualization

In [25]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.kdeplot(dbeta_TSS_threshold["dbeta"])
plt.xlabel("delta Beta value")
plt.title("Density plot of delta Beta value")
plt.savefig(f"{majority_out_path}/section_2/dbeta_TSS_{threshold}.png")
plt.close()

In [None]:
# train_df = pd.read_csv(train_df_file)

In [37]:
train_df

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,723,724,725,726,727,728,729,730,731,732
0,cg00000957,0.870131,0.866782,0.841300,0.867264,0.866280,0.873862,0.873237,0.873795,0.880335,...,0.820601,0.860153,0.884188,0.875615,0.848019,0.837902,0.912506,0.863320,0.852141,0.884989
1,cg00001349,0.805265,0.925544,0.810149,0.793186,0.845653,0.751198,0.813589,0.762273,0.874750,...,0.691991,0.780731,0.895832,0.658911,0.714955,0.689503,0.917036,0.735942,0.583788,0.754644
2,cg00001583,0.130379,0.070445,0.139665,0.146764,0.111296,0.106812,0.060897,0.099203,0.098281,...,0.142127,0.065369,0.666453,0.316554,0.345282,0.203418,0.640967,0.533204,0.104613,0.078772
3,cg00002028,0.027859,0.034231,0.037926,0.026836,0.032477,0.035183,0.020690,0.021046,0.036342,...,0.008943,0.057376,0.016075,0.028498,0.044060,0.032517,0.009303,0.016617,0.010686,0.027857
4,cg00002719,0.024905,0.049881,0.062630,0.027035,0.054883,0.019621,0.031898,0.026258,0.057241,...,0.163660,0.015071,0.683539,0.600469,0.217289,0.453588,0.727385,0.433241,0.234015,0.180881
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
353503,cg27657363,0.950697,0.966018,0.949426,0.906193,0.939164,0.940878,0.950655,0.946933,0.944137,...,0.912375,0.596657,0.939392,0.945487,0.912449,0.936289,0.966679,0.733868,0.945120,0.576744
353504,cg27657537,0.060546,0.093363,0.158475,0.056264,0.069728,0.055777,0.074001,0.071694,0.063404,...,0.045037,0.404847,0.119601,0.075252,0.069653,0.101835,0.111734,0.038236,0.165920,0.068246
353505,cg27662611,0.034528,0.047549,0.058815,0.044475,0.067018,0.047670,0.029004,0.026949,0.041777,...,0.021831,0.046173,0.045751,0.027354,0.030641,0.053635,0.032107,0.030105,0.017506,0.025558
353506,cg27665648,0.859977,0.905516,0.764569,0.818406,0.758578,0.795897,0.827701,0.776402,0.780268,...,0.731754,0.629923,0.804026,0.761049,0.851841,0.653382,0.677859,0.853094,0.633259,0.498804


In [52]:
# Keep only the probes that survived the dbeta stage, then build a
# samples-by-probes matrix X and the per-sample label list y.
_probe_rows = train_df.iloc[:-1, :]
_is_selected = _probe_rows["Unnamed: 0"].isin(dbeta_info["ID"])
beta_df = _probe_rows[_is_selected]
# Drop probes that have any NaN, then transpose so that each row is a sample.
X = beta_df.iloc[:, 1:].dropna().transpose()
# The labels come from the last row of the training frame.
y = train_df.iloc[-1, 1:].astype(int).tolist()

In [53]:
# DEBUG
print(f"X shape: {X.shape}")
print(f"y shape: {len(y)}")
# END

X shape: (733, 6571)
y shape: 733


In [54]:
import plotly.express as px
import pandas as pd
from sklearn.decomposition import PCA

# Project the samples onto their first three principal components and save an
# interactive 3-D scatter coloured by class label.
# random_state is pinned (42, like every other estimator in this notebook):
# PCA's default solver selection can pick a randomized SVD for a matrix of the
# shape printed above, which would make the projection differ between runs.
pca = PCA(n_components=3, random_state=42)
X_pca = pca.fit_transform(X)

_pc_labels = [
    "Principal Component 1",
    "Principal Component 2",
    "Principal Component 3",
]
# NOTE: this rebinds the notebook-level name `df`.
df = pd.DataFrame(
    {
        _pc_labels[0]: X_pca[:, 0],
        _pc_labels[1]: X_pca[:, 1],
        _pc_labels[2]: X_pca[:, 2],
        "Class": y,
    }
)
fig = px.scatter_3d(
    df,
    x=_pc_labels[0],
    y=_pc_labels[1],
    z=_pc_labels[2],
    color="Class",
    title="PCA of Dataset",
    color_continuous_scale="Viridis",
)

fig.update_layout(
    scene=dict(
        xaxis_title=_pc_labels[0],
        yaxis_title=_pc_labels[1],
        zaxis_title=_pc_labels[2],
    )
)

fig.write_html(f"{majority_out_path}/section_2/preprocess_filtering_pca.html")


#### Remember to run clustering on the filtered genes first, then continue to the next step

### Sec. 3 Feature Selection with ML (SFS)
sequential forward selection

#### Remove previous results
Warning: This step is not reversible

In [6]:
import shutil
import os
if os.path.exists(f"{majority_out_path}/sfs"):
    shutil.rmtree(f"{majority_out_path}/sfs")


#### 3.1 Preparation(SFS)

In [None]:
config = load_config(CONFIG_PATH)

train_out_path = config["feature_selection"]["hyper"]["train_out_path"]
validate_out_path = config["feature_selection"]["hyper"]["validate_out_path"]

In [None]:
os.makedirs(f"{train_out_path}", exist_ok=True)
os.makedirs(f"{validate_out_path}", exist_ok=True)

In [6]:
config = load_config(CONFIG_PATH)
dbeta_info_file = config["feature_selection"]["hyper"]["dbeta_info_file"]
dbeta_info = pd.read_csv(f"{majority_out_path}/{dbeta_info_file}")

In [8]:
# check if logs/ folder exists
os.makedirs("logs", exist_ok=True)
from utils.train_helper import TrainHelper

In [9]:
# note that there is setup_dbeta in TrainHelper to further cut down the feature size
th = TrainHelper(dbeta_info)

#### 3.2 Selection(SFS)

In [10]:
# Load the training and validation matrices for sequential forward selection.
# The original cell built these paths from `majority_df_path` and
# `minority_df_path`, which are not assigned anywhere in the visible notebook
# (NameError on a fresh kernel). Read the dataset files from [init.hyper]
# instead, the same inputs the RFE section hands to the TrainHelper.
# NOTE(review): confirm these are the intended SFS inputs.
train_df = pd.read_csv(train_df_file)
validate_df = pd.read_csv(validate_df_file)
th.set_train_validate_df(train_df, validate_df)

In [11]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Estimators used to rank features, keyed by display name. They all share one
# seed, and the two ensembles are kept small (10 trees).
_SEED = 42
_N_TREES = 10
selection_models = dict(
    SVM=SVC(kernel="linear", random_state=_SEED),
    DecisionTree=DecisionTreeClassifier(random_state=_SEED),
    RandomForest=RandomForestClassifier(random_state=_SEED, n_estimators=_N_TREES),
    XGBoost=XGBClassifier(random_state=_SEED, n_estimators=_N_TREES),
)

In [12]:
th.set_selection_models(selection_models)
th.set_train_validate()

In [13]:
os.makedirs(f"{majority_out_path}/sfs", exist_ok=True)

th.select_feature_sfs(
    out_path = f"{majority_out_path}/sfs/selected_feature.txt",
    step= 4,
    n_features_to_select="cluster"
)

INFO Training SVM with SFS
INFO Training SVM with 3 clusters selected
INFO Training finished with 4 clusters selected
INFO Training DecisionTree with SFS
INFO Training DecisionTree with 3 clusters selected
INFO Training finished with 4 clusters selected
INFO Training RandomForest with SFS
INFO Training RandomForest with 3 clusters selected
INFO Training finished with 4 clusters selected
INFO Training XGBoost with SFS
INFO Training XGBoost with 3 clusters selected
INFO Training finished with 4 clusters selected


### Sec. 3 Feature Selection with ML (RFE)
recursive feature elimination

#### Remove previous results
Warning: This step is not reversible

In [16]:
# import shutil
# import os
# if os.path.exists(f"{majority_out_path}/section_3/rfe"):
#     shutil.rmtree(f"{majority_out_path}/section_3/rfe")
# if os.path.exists(f"{minority_out_path}/section_3/rfe"):
#     shutil.rmtree(f"{minority_out_path}/section_3/rfe")

#### 3.1 Preparation

In [4]:
# Reload the config and read the [feature_selection.hyper] settings that the
# RFE cells use.
config = load_config(CONFIG_PATH)
_fs_hyper = config["feature_selection"]["hyper"]

train_out_path = _fs_hyper["train_out_path"]
validate_out_path = _fs_hyper["validate_out_path"]
training_param_file = _fs_hyper["training_param_file"]

In [5]:
os.makedirs(f"{train_out_path}", exist_ok=True)
os.makedirs(f"{validate_out_path}", exist_ok=True)

In [6]:
dbeta_info_file = config["feature_selection"]["hyper"]["dbeta_info_file"]

TSS_threshold = pd.read_csv(dbeta_info_file)

In [7]:
from utils.train_helper import TrainHelper

In [8]:
# note that there is setup_dbeta in TrainHelper to further cut down the feature size
th = TrainHelper(TSS_threshold)

#### 3.2 Selection

In [9]:
from utils.process_norm import *

In [10]:
train_df = pd.read_csv(train_df_file)
validate_df = pd.read_csv(validate_df_file)

In [11]:
# train_df = inspect_nan(train_df, mode="column", remove=True)

INFO Columns with NaNs: {'265': np.float64(1.0)}


In [16]:
inspect_nan(train_df, mode="column")
inspect_nan(validate_df, mode="column")

INFO Columns with NaNs: {}
INFO Columns with NaNs: {}


Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,174,175,176,177,178,179,180,181,182,183
0,cg00000957,0.880918,0.889138,0.874959,0.885774,0.896747,0.879334,0.858834,0.837964,0.876630,...,0.873810,0.862322,0.822148,0.863990,0.826029,0.881792,0.872564,0.747145,0.888961,0.753592
1,cg00001349,0.825366,0.711134,0.814811,0.868791,0.860436,0.682503,0.893290,0.812133,0.811409,...,0.683113,0.816664,0.581452,0.797393,0.728201,0.479400,0.831466,0.497443,0.814902,0.890325
2,cg00001583,0.066911,0.076375,0.096527,0.040940,0.083554,0.087588,0.061910,0.069474,0.093006,...,0.387439,0.401727,0.050542,0.075421,0.508549,0.535652,0.243229,0.116555,0.045325,0.356311
3,cg00002028,0.023719,0.037065,0.021660,0.010759,0.010474,0.027399,0.024558,0.035275,0.017022,...,0.032520,0.050525,0.025038,0.031182,0.032184,0.027876,0.029761,0.030817,0.037673,0.026897
4,cg00002719,0.018851,0.024845,0.023367,0.012765,0.010090,0.017774,0.046654,0.026357,0.026142,...,0.239434,0.035720,0.552907,0.390014,0.373817,0.504740,0.041114,0.032883,0.021213,0.011091
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
353503,cg27657363,0.965252,0.881412,0.961724,0.957276,0.956891,0.954883,0.953429,0.948396,0.957826,...,0.930339,0.955986,0.919047,0.968376,0.919547,0.946943,0.931814,0.914465,0.645025,0.924954
353504,cg27657537,0.049765,0.117931,0.061232,0.034888,0.055366,0.067702,0.054141,0.083854,0.062192,...,0.084689,0.151546,0.116393,0.067462,0.108101,0.073478,0.085709,0.262332,0.095875,0.445458
353505,cg27662611,0.042660,0.063932,0.055080,0.034882,0.017662,0.042007,0.042645,0.047950,0.047642,...,0.045012,0.036242,0.046424,0.039219,0.071840,0.067796,0.071095,0.057768,0.048707,0.056252
353506,cg27665648,0.872041,0.797600,0.829041,0.863131,0.896679,0.838608,0.709252,0.742108,0.832386,...,0.888940,0.660242,0.763887,0.808614,0.637782,0.858033,0.749160,0.803346,0.485776,0.809212


In [17]:
th.set_train_validate_df(train_df, validate_df)

In [18]:
from utils.train_helper import set_parameters
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

import json

# Load the per-model parameter settings from the JSON file.
with open(training_param_file, "r") as f:
    training_param = json.load(f)

# Base estimators, keyed by the name also used in the parameter file.
_base_estimators = {
    "XGBoost": XGBClassifier(random_state=42),
    "RandomForest": RandomForestClassifier(random_state=42),
    "SVM": SVC(random_state=42, probability=True),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
}

# Pass each estimator and its settings to set_parameters, in the order above.
train_models = {
    _name: set_parameters(_estimator, training_param[_name])
    for _name, _estimator in _base_estimators.items()
}

# Keep the individual names bound as well.
xgb_grid = train_models["XGBoost"]
rf_grid = train_models["RandomForest"]
svm_grid = train_models["SVM"]
dt_grid = train_models["DecisionTree"]

In [19]:
# Estimators used to rank features, keyed by display name. They all share one
# seed, and the two ensembles are kept small (10 trees).
_SEED = 42
_N_TREES = 10
selection_models = dict(
    SVM=SVC(kernel="linear", random_state=_SEED),
    DecisionTree=DecisionTreeClassifier(random_state=_SEED),
    RandomForest=RandomForestClassifier(random_state=_SEED, n_estimators=_N_TREES),
    XGBoost=XGBClassifier(random_state=_SEED, n_estimators=_N_TREES),
)

In [20]:
th.set_selection_models(selection_models)
th.set_grid_estimators(train_models)
th.set_train_validate()

In [None]:
th.select_feature_rfe(
    train_out_path = train_out_path,
    validate_out_path = validate_out_path,
    selected_feature_path = f"{train_out_path}/selected_feature.txt",
    feature_range = (1, 6, 1),
)

### Sec. 4 Clean Selected Features

#### 4.1 Generate Feature json for SimpleModel

In [6]:
from utils.train_helper import read_selected_features, read_selected_features_json, TrainHelper
config = load_config(CONFIG_PATH)
dbeta_info_file = config["machine_learning"]["hyper"]["dbeta_info_file"]
dbeta_info = pd.read_csv(f"{majority_out_path}/{dbeta_info_file}")
th = TrainHelper(dbeta_info)

folder = "sfs"
features = read_selected_features(f"{majority_out_path}/{folder}/selected_feature.txt")
th.generate_selected_features(
    features,
    f"{majority_out_path}/{folder}/selected_features.json",
    mode="min",
    out_format="json",
)

# use this to read json
read_selected_features_json(f"{majority_out_path}/{folder}/selected_features.json")

defaultdict(list,
            {'SVM': ['APOC3', 'ATG3', 'FBRSL1', 'RNH1', 'C15orf2'],
             'DecisionTree': ['C2orf89', 'MME', 'IKBKB', 'MNX1', 'C1orf212'],
             'RandomForest': ['ATG3', 'ICOSLG', 'PTPRG', 'MME', 'IRF7'],
             'XGBoost': ['ACCN4', 'DNMT1', 'FXR2', 'MME', 'APBB1', 'ARRB2']})

#### 4.2 Gather Selected gene list from best selection model

In [23]:
# Every result row is identified by (selection model, training model, feature count).
_run_keys = ["selection_model", "train_model", "features"]
_split_suffixes = ("_train", "_validate")

rfe_train = pd.read_csv(f"{train_out_path}/rfe.csv")
rfe_validate = pd.read_csv(f"{validate_out_path}/rfe.csv")
fpr_tpr_train = pd.read_csv(f"{train_out_path}/roc_curve.csv")
fpr_tpr_validate = pd.read_csv(f"{validate_out_path}/roc_curve.csv")

# Put the train and validation columns side by side, then join the metrics
# with the ROC points. Note that this rebinds the notebook-level name `J`.
rfe_j = rfe_train.merge(rfe_validate, on=_run_keys, suffixes=_split_suffixes)
fpr_tpr_j = fpr_tpr_train.merge(
    fpr_tpr_validate, on=_run_keys, suffixes=_split_suffixes
)
J = rfe_j.merge(fpr_tpr_j, on=_run_keys)

In [24]:
import ast

# The ROC points are read back from CSV as text; parse each cell into the
# Python literal it spells.
for _roc_column in ("fpr_train", "tpr_train", "fpr_validate", "tpr_validate"):
    J[_roc_column] = J[_roc_column].apply(ast.literal_eval)

In [25]:
from utils.painter import plot_roc_curve, create_performance_barchart

In [26]:
# Train-minus-validation gap for every metric, stored as `<metric>_diff`.
for _metric in ("accuracy", "recall", "f1_score", "AUC", "MCC", "fbeta2_score"):
    J[f"{_metric}_diff"] = J[f"{_metric}_train"] - J[f"{_metric}_validate"]

In [28]:
# One ROC figure per split, written next to that split's results.
# (Original note: width and height are tweakable.)
_trace_keys = ["selection_model", "train_model", "features"]
for _title, _out_dir, _split in (
    ("ROC Curves on Training Set", train_out_path, "train"),
    ("ROC Curves on Testing Set", validate_out_path, "validate"),
):
    plot_roc_curve(
        J,
        _title,
        f"{_out_dir}/roc_curve.html",
        x_column=f"fpr_{_split}",
        y_column=f"tpr_{_split}",
        # Pass a fresh list to each call.
        trace_name=list(_trace_keys),
    )

ROC curve saved to ../lung/result/GDC_lung_tissue/split80/section_3/rfe/roc_curve.html
ROC curve saved to ../lung/result/GDC_lung_tissue/split20/section_3/rfe/roc_curve.html


In [29]:
# Average train-minus-validation gap per training model, saved as a CSV and
# as a horizontal bar chart.
color_mapping = {
    "accuracy_diff": "blue",
    "recall_diff": "red",
    "f1_score_diff": "green",
    "AUC_diff": "purple",
    "MCC_diff": "orange",
    "fbeta2_score_diff": "brown",
}
# The metrics to plot are the keys of the colour table, in the same order.
performance_metrics = list(color_mapping)
ground_by_train_model = (
    J.groupby("train_model")[performance_metrics]
    .mean()
    # Copy the group label into a regular column, since the index is not written.
    .assign(train_model=lambda grouped: grouped.index)
)
ground_by_train_model.to_csv(
    f"{validate_out_path}/performance_diff_grouped_by_train_model.csv", index=False
)
create_performance_barchart(
    df=ground_by_train_model,
    color_mapping=color_mapping,
    metric="train_model",
    out_path=f"{validate_out_path}/performance_diff_grouped_by_train_model.html",
    title="Grouped Performance Difference between Training and Testing Set",
    x_axis_label="Performance Difference (Training - Testing)",
    y_axis_label="Train Model",
    orientation="h",
)

Performance difference saved to ../lung/result/GDC_lung_tissue/split20/section_3/rfe/performance_diff_grouped_by_train_model.html


In [30]:
J = J[["selection_model", "train_model", "features", "accuracy_validate", "recall_validate", "f1_score_validate", "AUC_validate", "MCC_validate", "fbeta2_score_validate"]]

In [31]:
# Pick the best training model and the best feature count by mean validation MCC.
# Step 1: average every validation metric per train_model.
# NOTE(review): J still carries fbeta2_score_validate (kept by the previous
# cell) but it is not in this list, so it is neither averaged nor plotted
# here — confirm that is intended.
performance_metrics = ['accuracy_validate', 'recall_validate',
                       'f1_score_validate', 'AUC_validate', 'MCC_validate']
ground_by_train_model = J.groupby('train_model')[performance_metrics].mean()
# Copy the group label into a regular column because the CSV is written without its index.
ground_by_train_model['train_model'] = ground_by_train_model.index
ground_by_train_model.to_csv(
    f"{validate_out_path}/performance_metrics_grouped_by_train_model.csv", index=False)
# Bar colour per metric; reused for both charts in this cell.
color_mapping = {
    "accuracy_validate": "blue",
    "recall_validate": "red",
    "f1_score_validate": "green",
    "AUC_validate": "purple",
    "MCC_validate": "orange",
}
create_performance_barchart(
    df=ground_by_train_model,
    color_mapping=color_mapping,
    metric="train_model",
    out_path=f"{validate_out_path}/performance_metrics_grouped_by_train_model.html",
    title="Grouped Performance Metrics by Train Model",
    x_axis_label="Performance",
    y_axis_label="Train Model",
    orientation="h",
)
# Step 2: the best training model is the one with the highest mean validation MCC.
best_train_model = ground_by_train_model['MCC_validate'].idxmax()
print(f"Best train model: {best_train_model}")
# Step 3: within that model only, average the metrics per feature count.
ground_by_feature = J[J['train_model'] == best_train_model].groupby('features')[
    performance_metrics].mean()
ground_by_feature['features'] = ground_by_feature.index
ground_by_feature.to_csv(
    f"{validate_out_path}/performance_metrics_grouped_by_feature.csv", index=False)
create_performance_barchart(
    df=ground_by_feature,
    color_mapping=color_mapping,
    metric="features",
    out_path=f"{validate_out_path}/performance_metrics_grouped_by_feature.html",
    title="Grouped Performance Metrics by Feature",
    x_axis_label="Performance",
    y_axis_label="Feature",
    orientation="h",
)
# Step 4: the best feature count is again the one with the highest mean validation MCC.
best_num_of_feature = ground_by_feature['MCC_validate'].idxmax()
print(f"Best number of feature: {best_num_of_feature}")
# Step 5: keep every record (one per selection model) for the winning combination.
best_performance_records = J[(J['train_model'] == best_train_model) & (
    J['features'] == best_num_of_feature)]
best_performance_records.to_csv(
    f"{validate_out_path}/best_performance_records.csv", index=False)

Performance difference saved to ../lung/result/GDC_lung_tissue/split20/section_3/rfe/performance_metrics_grouped_by_train_model.html
Best train model: XGBoost
Performance difference saved to ../lung/result/GDC_lung_tissue/split20/section_3/rfe/performance_metrics_grouped_by_feature.html
Best number of feature: 5


In [32]:
from utils.train_helper import read_selected_features, read_selected_features_json, TrainHelper
config = load_config(CONFIG_PATH)
dbeta_info_file = config["feature_selection"]["hyper"]["dbeta_info_file"]
dbeta_info = pd.read_csv(dbeta_info_file)
th = TrainHelper(dbeta_info)

features = read_selected_features(f"{train_out_path}/selected_feature.txt")

th.generate_selected_features(
    features,
    f"{validate_out_path}/selected_features.json",
    mode=int(best_num_of_feature),
    out_format="json",
)

# use this to read json
read_selected_features_json(f"{validate_out_path}/selected_features.json")

defaultdict(list,
            {'best': ['SNORD115-10',
              'TMEM196',
              'MIR377',
              'PCK1',
              'PATE2',
              'RALYL',
              'ATP5G2',
              'C1orf150',
              'C1orf114',
              'PCDHB15',
              'ATG16L1',
              'DLC1',
              'DOC2A',
              'AIM2',
              'DCD',
              'SCARF1',
              'BHLHE23']})

### Sec. 5 Clustering Visualization

#### 1. load data

Remember to calculate the distance matrices first.

In [83]:
import pandas as pd
import numpy as np
from utils.clustering_helper import hierarchical_clustering, check_distance_matrix

In [91]:
# Look up the [clustering_visual.hyper] table once, then bind each setting.
_cv_hyper = config["clustering_visual"]["hyper"]
result_prefix = _cv_hyper["result_prefix"]
dbeta_file = _cv_hyper["dbeta_file"]
bp_file = _cv_hyper["bp_file"]
cc_file = _cv_hyper["cc_file"]
mf_file = _cv_hyper["mf_file"]
terms_count_file = _cv_hyper["terms_count_file"]
result_out_path = _cv_hyper["result_out_path"]

In [98]:
os.makedirs(result_out_path, exist_ok=True)

In [92]:
gene_set = pd.read_csv(dbeta_file, index_col=0)
distance_matrix_bp = pd.read_csv(bp_file, index_col=0)
distance_matrix_cc = pd.read_csv(cc_file, index_col=0)
distance_matrix_mf = pd.read_csv(mf_file, index_col=0)
terms_count = pd.read_csv(terms_count_file, index_col=0)

In [93]:
# replace NaN with 0
distance_matrix_bp = distance_matrix_bp.fillna(0)
distance_matrix_cc = distance_matrix_cc.fillna(0)
distance_matrix_mf = distance_matrix_mf.fillna(0)

In [94]:
# Put the three ontology matrices on one shared gene axis (the union of their
# row labels) so they can be combined element-wise.
index_bp = distance_matrix_bp.index
index_cc = distance_matrix_cc.index
index_mf = distance_matrix_mf.index
index = index_bp.union(index_cc).union(index_mf)

# NOTE(review): fill_value=0 gives a gene that is absent from an ontology a
# distance of 0 to every other gene in that ontology, and leaves no NaN for a
# later `np.isnan` mask to find — confirm this is the intended treatment of
# missing annotations.
_shared_axes = dict(index=index, columns=index, fill_value=0)
distance_matrix_bp_ = distance_matrix_bp.reindex(**_shared_axes)
distance_matrix_cc_ = distance_matrix_cc.reindex(**_shared_axes)
distance_matrix_mf_ = distance_matrix_mf.reindex(**_shared_axes)

In [95]:
# The re-indexed matrices in a fixed order: BP, CC, MF.
distance_matrix = [
    distance_matrix_bp_,
    distance_matrix_cc_,
    distance_matrix_mf_,
]

#### 2. Weighted Sum

In [96]:
# Weight each ontology by its share of the term counts.
weight = terms_count["count"].to_numpy()
weight = weight / np.sum(weight)

# True wherever an ontology has a (non-NaN) distance for a gene pair.
# NOTE(review): the matrices in `distance_matrix` were built with fillna(0) and
# reindex(fill_value=0), so these masks appear to be all True here — confirm
# whether missing entries were meant to be excluded from the average.
masks = np.array([~np.isnan(distance_matrix[i].values) for i in range(3)])

# Per-pair weights: an ontology's weight where it has a value, otherwise 0.
valid_weights = np.array([weight[i] for i in range(3)])[:, None, None] * masks

weight_sums = valid_weights.sum(axis=0)

# Renormalise so the weights of the available ontologies sum to 1 per pair.
# `out` is pre-filled with zeros because np.divide leaves the output
# uninitialised wherever `where` is False.
normalized_weights = np.divide(
    valid_weights,
    weight_sums,
    out=np.zeros_like(valid_weights, dtype=float),
    where=weight_sums != 0,
)
weighted_sum = sum(
    np.nan_to_num(distance_matrix[i].values) * normalized_weights[i] for i in range(3)
)


weighted_sum_dataframe = pd.DataFrame(weighted_sum, index=index, columns=index)

weighted_sum_dataframe.head()

Unnamed: 0,ACOT11,ADAM11,ADAM5P,ADAMTS20,ADAMTS5,ADCY2,ADCY3,ADCY4,AGXT2,AIM2,...,ZNF471,ZNF492,ZNF560,ZNF572,ZNF577,ZNF667,ZNF709,ZNF781,ZNF98,ZSCAN18
ACOT11,0.0,0.669925,0.597705,0.690038,0.671824,0.593297,0.633828,0.531955,0.735806,0.676058,...,0.7714,0.812483,0.770642,0.732916,0.7714,0.749643,0.770642,0.761281,0.812483,0.751161
ADAM11,0.669925,0.0,0.674579,0.534639,0.598009,0.681184,0.664305,0.631173,0.851705,0.728104,...,0.826384,0.883769,0.825878,0.791013,0.826384,0.796531,0.825878,0.825169,0.883769,0.797796
ADAM5P,0.597705,0.674579,0.0,0.679536,0.673202,0.72833,0.680214,0.695374,0.817316,0.595749,...,0.826487,0.826487,0.826487,0.636887,0.826487,0.804477,0.826487,0.811651,0.826487,0.804477
ADAMTS20,0.690038,0.534639,0.679536,0.0,0.510913,0.601255,0.644948,0.577176,0.837219,0.717878,...,0.773603,0.81925,0.773097,0.775932,0.773603,0.773097,0.773097,0.772388,0.81925,0.774362
ADAMTS5,0.671824,0.598009,0.673202,0.510913,0.0,0.796112,0.775743,0.760975,0.832178,0.669158,...,0.762367,0.83149,0.762114,0.729119,0.762367,0.71354,0.762114,0.762417,0.83149,0.714299


In [99]:
cluster_result_weighted = hierarchical_clustering(
    weighted_sum_dataframe,
    range_min=2,
    range_max=4,
    cluster_number=3,
    out_path=f"{result_out_path}/hierarchical_clustering_weighted_sum.png",
)

chosen number of clusters: 3


In [100]:
cluster_result_weighted.head()

Unnamed: 0,gene,cluster
0,ACOT11,3
1,ADAM11,3
2,ADAM5P,3
3,ADAMTS20,3
4,ADAMTS5,3


#### 3. Simple average

In [101]:
# Equal weight for each of the three ontologies.
weight = [1, 1, 1]
# True wherever an ontology has a (non-NaN) distance for a gene pair.
masks = np.array([~np.isnan(distance_matrix[i].values) for i in range(3)])
# Per-pair weights: an ontology's weight where it has a value, otherwise 0.
valid_weights = np.array([weight[i] for i in range(3)])[:, None, None] * masks
weight_sums = valid_weights.sum(axis=0)
# Renormalise so the weights of the available ontologies sum to 1 per pair.
# `out` is pre-filled with zeros because np.divide leaves the output
# uninitialised wherever `where` is False.
normalized_weights = np.divide(
    valid_weights,
    weight_sums,
    out=np.zeros_like(valid_weights, dtype=float),
    where=weight_sums != 0,
)
weighted_sum = sum(
    np.nan_to_num(distance_matrix[i].values) * normalized_weights[i] for i in range(3)
)
simple_sum_dataframe = pd.DataFrame(weighted_sum, index=index, columns=index)
simple_sum_dataframe.head()

Unnamed: 0,ACOT11,ADAM11,ADAM5P,ADAMTS20,ADAMTS5,ADCY2,ADCY3,ADCY4,AGXT2,AIM2,...,ZNF471,ZNF492,ZNF560,ZNF572,ZNF577,ZNF667,ZNF709,ZNF781,ZNF98,ZSCAN18
ACOT11,0.0,0.69,0.59,0.678667,0.606,0.586,0.598,0.537,0.63,0.566333,...,0.695667,0.716667,0.694667,0.668333,0.695667,0.667,0.694667,0.682333,0.716667,0.669
ADAM11,0.69,0.0,0.684,0.475,0.502333,0.625333,0.622667,0.581667,0.840333,0.733,...,0.825,0.854333,0.824333,0.781333,0.825,0.785667,0.824333,0.818667,0.854333,0.787333
ADAM5P,0.59,0.684,0.0,0.671,0.604667,0.661,0.626,0.616,0.783,0.589,...,0.764,0.764,0.764,0.649333,0.764,0.735,0.764,0.738667,0.764,0.735
ADAMTS20,0.678667,0.475,0.671,0.0,0.378,0.609333,0.640667,0.600667,0.826667,0.734333,...,0.749,0.772333,0.748333,0.757,0.749,0.748333,0.748333,0.742667,0.772333,0.75
ADAMTS5,0.606,0.502333,0.604667,0.378,0.0,0.691667,0.665,0.650333,0.755667,0.616333,...,0.680667,0.716,0.680333,0.646,0.680667,0.616333,0.680333,0.676,0.716,0.617333


In [102]:
cluster_result_simple = hierarchical_clustering(
    simple_sum_dataframe,
    range_min=2,
    range_max=4,
    cluster_number=3,
    out_path=f"{result_out_path}/hierarchical_clustering_simple_sum.png",
)

chosen number of clusters: 3


In [103]:
cluster_result_simple.head()

Unnamed: 0,gene,cluster
0,ACOT11,3
1,ADAM11,3
2,ADAM5P,3
3,ADAMTS20,3
4,ADAMTS5,3


#### 4. Consensus clustering 

In [105]:
cluster_bp = hierarchical_clustering(
    distance_matrix_bp, out_path=f"{result_out_path}/hierarchical_clustering_bp.png"
)
cluster_cc = hierarchical_clustering(
    distance_matrix_cc, out_path=f"{result_out_path}/hierarchical_clustering_cc.png"
)
cluster_mf = hierarchical_clustering(
    distance_matrix_mf, out_path=f"{result_out_path}/hierarchical_clustering_mf.png"
)

Best number of clusters: 26
Best number of clusters: 6
Best number of clusters: 7


In [106]:
cluster_bp.columns = ["gene", "cluster_bp"]
cluster_cc.columns = ["gene", "cluster_cc"]
cluster_mf.columns = ["gene", "cluster_mf"]
cluster_bp_cc = pd.merge(cluster_bp, cluster_cc, on="gene", how="outer")
cluster_go = pd.merge(cluster_bp_cc, cluster_mf, on="gene", how="outer")
cluster_go = cluster_go.fillna(-1)
print(cluster_go.shape)
cluster_go.head()

(327, 4)


Unnamed: 0,gene,cluster_bp,cluster_cc,cluster_mf
0,ACOT11,9.0,2.0,7.0
1,ADAM11,13.0,5.0,7.0
2,ADAM5P,25.0,6.0,7.0
3,ADAMTS20,13.0,6.0,5.0
4,ADAMTS5,13.0,6.0,4.0


In [107]:
# Consensus distance between two genes: the fraction of the three GO
# clusterings (BP, CC, MF) in which they do NOT share a cluster label.
num_genes = cluster_go.shape[0]

# Count label agreements for every gene pair with one broadcast comparison per
# ontology, instead of a Python double loop with per-row `.iloc` lookups.
_agreement = np.zeros((num_genes, num_genes))
for _label_column in ("cluster_bp", "cluster_cc", "cluster_mf"):
    _labels = cluster_go[_label_column].to_numpy()
    _agreement += _labels[:, None] == _labels[None, :]
# NOTE(review): genes missing from an ontology carry the placeholder label -1
# (set by the fillna(-1) above), so two such genes count as agreeing in that
# ontology — confirm that is intended.

# Number of ontologies (0-3) in which each pair of genes shares a cluster.
consensus_matrix = pd.DataFrame(
    _agreement, index=cluster_go["gene"], columns=cluster_go["gene"]
)

# Zero the diagonal on the NumPy array before wrapping it, so nothing depends
# on `DataFrame.values` handing back a writable view.
_distance = 1 - _agreement / 3
np.fill_diagonal(_distance, 0)
distance_matrix_consensus = pd.DataFrame(
    _distance, index=cluster_go["gene"], columns=cluster_go["gene"]
)
distance_matrix_consensus.head()

gene,ACOT11,ADAM11,ADAM5P,ADAMTS20,ADAMTS5,ADCY2,ADCY3,ADCY4,AGXT2,AIM2,...,ZNF471,ZNF492,ZNF560,ZNF572,ZNF577,ZNF667,ZNF709,ZNF781,ZNF98,ZSCAN18
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ACOT11,0.0,0.666667,0.666667,1.0,1.0,0.666667,1.0,0.666667,0.666667,0.666667,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
ADAM11,0.666667,0.0,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
ADAM5P,0.666667,0.666667,0.0,0.666667,0.666667,1.0,1.0,1.0,0.666667,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
ADAMTS20,1.0,0.666667,0.666667,0.0,0.333333,0.666667,0.666667,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
ADAMTS5,1.0,0.666667,0.666667,0.333333,0.0,1.0,1.0,0.666667,1.0,0.666667,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [108]:
cluster_result_consensus = hierarchical_clustering(
    distance_matrix_consensus,
    range_min=2,
    range_max=4,
    cluster_number=4,
    out_path=f"{result_out_path}/hierarchical_clustering_consensus.png",
)

chosen number of clusters: 4


In [109]:
cluster_result_consensus.head()

Unnamed: 0,gene,cluster
0,ACOT11,4
1,ADAM11,4
2,ADAM5P,4
3,ADAMTS20,2
4,ADAMTS5,2


#### 5. Compare 

In [122]:
from utils.clustering_helper import hierarchical_clustering_compare

# Compare the three distance matrices side by side; the labels are given in
# the same order as the matrices. The call prints a "Best number of clusters"
# line per label and is given `out_path` for the comparison figure.
# NOTE(review): `range_min`/`range_max` presumably bound the candidate cluster
# counts — confirm against utils.clustering_helper.
hierarchical_clustering_compare(
    [weighted_sum_dataframe, simple_sum_dataframe, distance_matrix_consensus],
    ["Weighted Average", "Simple Average", "Consensus"],
    out_path=f"{result_out_path}/hierarchical_clustering_compare.png",
    range_min=2,
    range_max=4,
)

Best number of clusters for Weighted Average: 3
Best number of clusters for Simple Average: 4
Best number of clusters for Consensus: 4


In [115]:
# Load the dbeta table from the path in `dbeta_file` (set earlier in the
# notebook); the cells below rely on it having a `gene` column.
dbeta_info = pd.read_csv(dbeta_file)

In [117]:
# Keep only the dbeta rows whose gene appears in the index of each distance
# matrix (one subset per aggregation method).
gene_column = dbeta_info["gene"]
weighted_dbeta = dbeta_info.loc[gene_column.isin(weighted_sum_dataframe.index)]
simple_dbeta = dbeta_info.loc[gene_column.isin(simple_sum_dataframe.index)]
consensus_dbeta = dbeta_info.loc[
    gene_column.isin(distance_matrix_consensus.index)
]

In [123]:
# Attach the cluster labels to each dbeta subset (joined on `gene`) and write
# one CSV per aggregation method.
cluster_exports = [
    (
        weighted_dbeta,
        cluster_result_weighted,
        f"{result_out_path}/{result_prefix}_weighted.csv",
    ),
    (
        simple_dbeta,
        cluster_result_simple,
        f"{result_out_path}/{result_prefix}_simple.csv",
    ),
    (
        consensus_dbeta,
        cluster_result_consensus,
        f"{result_out_path}/{result_prefix}_consensus.csv",
    ),
]
for dbeta_subset, cluster_labels, csv_path in cluster_exports:
    labelled = dbeta_subset.merge(cluster_labels, on="gene")
    labelled.to_csv(csv_path, index=False)

### Sec. 6 SimpleModel Training

#### 1. Load Data

In [4]:
from utils.train_helper import read_selected_features_json
import pandas as pd
from utils.process_norm import *

In [5]:
# Every Sec. 6 setting is read from the `simple_model` -> `hyper` section of
# the loaded config.
simple_model_hyper = config["simple_model"]["hyper"]

dbeta_file = simple_model_hyper["dbeta_file"]
selected_feature_file = simple_model_hyper["selected_feature_file"]
train_out_path = simple_model_hyper["train_out_path"]
validate_out_path = simple_model_hyper["validate_out_path"]
df_file = simple_model_hyper["df_file"]
training_param_file = simple_model_hyper["training_param_file"]

In [6]:
# Take the paths from `config` rather than from the variables being
# overwritten. The original fed `selected_feature_file` / `dbeta_file` back
# into their own loaders, so re-running this cell passed the parsed objects
# (not paths) to the readers and failed.
# The variable names are kept because later cells read the parsed objects
# through them.
simple_model_paths = config["simple_model"]["hyper"]
selected_feature_file = read_selected_features_json(
    simple_model_paths["selected_feature_file"]
)
dbeta_file = pd.read_csv(simple_model_paths["dbeta_file"])

In [7]:
# Load the table at `df_file` (path taken from the simple_model config above).
# NOTE(review): the next cell rebinds `all_df` to the organized version, so
# this cell must be re-run before that one is run a second time.
all_df = pd.read_csv(df_file)

In [8]:
import json
# Read the JSON config that sits next to the TOML one for the same TYPE, then
# ask which of its data-source entries describes `all_df`.
with open(f"../config/{TYPE}.json") as f:
    J = json.load(f)
data_source = input("Enter the data source: ")

# presumably `normal` / `tumor` / `sample_count` describe the sample layout of
# the raw table — confirm against utils.process_norm.organize_dataset.
source_layout = J[data_source]
all_df = organize_dataset(
    all_df,
    source_layout["normal"],
    source_layout["tumor"],
    source_layout["sample_count"],
)

In [9]:
# Split the organized table into two partitions. Per the logged sample counts,
# the first return value (complement_df) is the larger part and the second
# (ratio_df) the smaller part.
# NOTE(review): 0.2 and 42 look like the held-out ratio and the random seed
# (an earlier cell passes `split_test` / `random_state` from the JSON config
# in these positions) — consider reading them from config here too.
sp80, sp20 = split_dataset(all_df, 0.2, 42)

INFO complement_df feature: 581634
INFO complement_df sample (normal, tumor): (52, 52)
INFO ratio_df feature: 581634
INFO ratio_df sample (normal, tumor): (13, 13)


In [10]:
# Display the larger partition returned by split_dataset (used as train_df below).
sp80

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,94,95,96,97,98,99,100,101,102,103
0,cg07881041,0.922064,0.772699,0.947640,0.906835,0.922833,0.955707,0.962887,0.973165,0.978423,...,0.952590,0.685911,0.924956,0.968588,0.864975,0.955348,0.901759,0.984879,0.962641,0.959947
1,cg03513874,0.886635,0.905678,0.902759,0.949006,0.861383,0.923783,0.952101,0.954939,0.914129,...,0.941010,0.902040,0.924603,0.948913,0.829853,0.943103,0.940713,0.931432,0.884628,0.903237
2,cg05451842,0.131998,0.076887,0.098717,0.119298,0.068282,0.043533,0.066844,0.071026,0.042721,...,0.135811,0.069962,0.074974,0.063027,0.115757,0.055754,0.066508,0.079357,0.059651,0.059002
3,cg14797042,0.747154,0.915432,0.884322,0.728397,0.908885,0.953022,0.926101,0.955328,0.924597,...,0.942715,0.874629,0.878009,0.927949,0.806703,0.961924,0.934733,0.932034,0.840970,0.918369
4,cg09838562,0.081135,0.026118,0.042629,0.080403,0.077673,0.035371,0.046142,0.021307,0.030269,...,0.049759,0.040542,0.025722,0.040417,0.074214,0.026951,0.047519,0.031496,0.022080,0.077950
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581630,cg06272054,0.020629,0.015493,0.027054,0.018932,0.030767,0.022423,0.014894,0.012303,0.018975,...,0.013464,0.023619,0.025366,0.021927,0.062814,0.016165,0.015958,0.031758,0.012685,0.031398
581631,cg07255356,0.026093,0.101632,0.039477,0.026144,0.034018,0.027180,0.046892,0.038662,0.049252,...,0.037902,0.021123,0.034570,0.025360,0.066376,0.039427,0.034301,0.028468,0.071658,0.033413
581632,cg24220897,0.941737,0.931526,0.957356,0.952761,0.942807,0.948706,0.908193,0.924203,0.919992,...,0.961426,0.958118,0.891759,0.850887,0.847808,0.938657,0.932081,0.952047,0.960075,0.913116
581633,cg12325588,0.066014,0.023164,0.039293,0.052662,0.057087,0.034126,0.041324,0.020782,0.041233,...,0.056860,0.049979,0.102587,0.034306,0.072589,0.024533,0.089700,0.022562,0.026170,0.028324


In [11]:
# Display the smaller partition returned by split_dataset (used as test_df below).
sp20

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,16,17,18,19,20,21,22,23,24,25
0,cg07881041,0.940279,0.967124,0.956306,0.976382,0.981460,0.934334,0.986245,0.963054,0.947689,...,0.970815,0.777135,0.975642,0.960455,0.914977,0.938691,0.969299,0.969110,0.966672,0.985221
1,cg03513874,0.932805,0.936430,0.934026,0.926603,0.920859,0.877375,0.930253,0.920702,0.968674,...,0.906787,0.626086,0.938125,0.846584,0.965262,0.833087,0.901131,0.947899,0.880481,0.948724
2,cg05451842,0.085244,0.085770,0.067723,0.071205,0.072631,0.106054,0.073678,0.055949,0.057820,...,0.046188,0.063359,0.044734,0.100236,0.089291,0.075774,0.057132,0.060959,0.040080,0.043854
3,cg14797042,0.941151,0.945447,0.940632,0.909522,0.961239,0.756330,0.951041,0.907743,0.944639,...,0.935103,0.824291,0.938701,0.965735,0.678206,0.917456,0.957823,0.890775,0.687828,0.980666
4,cg09838562,0.049288,0.017643,0.030479,0.004853,0.046588,0.096990,0.042188,0.044392,0.032362,...,0.046395,0.055794,0.018954,0.065677,0.118807,0.071327,0.023887,0.026934,0.044218,0.027489
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581630,cg06272054,0.020802,0.017459,0.014804,0.021564,0.019553,0.022790,0.019578,0.019163,0.014666,...,0.006060,0.013253,0.018461,0.035401,0.054587,0.012906,0.019826,0.028195,0.019621,0.006389
581631,cg07255356,0.032713,0.031669,0.023020,0.023431,0.033434,0.036052,0.021340,0.029745,0.020298,...,0.024304,0.022484,0.044218,0.040682,0.048324,0.031221,0.032395,0.025121,0.028226,0.037326
581632,cg24220897,0.963746,0.801776,0.946192,0.945664,0.952774,0.931175,0.956058,0.890870,0.937956,...,0.959835,0.881203,0.954542,0.951465,0.907383,0.890302,0.947733,0.952149,0.950509,0.965631
581633,cg12325588,0.027775,0.043463,0.033532,0.027520,0.029936,0.060947,0.032499,0.027690,0.020632,...,0.034456,0.036183,0.029613,0.038345,0.174442,0.031148,0.019245,0.026788,0.044237,0.018378


#### 2. Training

In [12]:
from utils.train_helper import set_parameters
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

import json
from sklearn.ensemble import VotingClassifier


# Per-estimator parameter settings are kept in a single JSON file, keyed by
# estimator name.
with open(training_param_file, "r") as f:
    training_param = json.load(f)

# Pair a fresh, seeded estimator with its entry from the JSON file.
# NOTE(review): set_parameters presumably returns a grid-search wrapper —
# confirm against utils.train_helper.
xgb_grid = set_parameters(
    XGBClassifier(random_state=42),
    training_param["XGBoost"],
)
rf_grid = set_parameters(
    RandomForestClassifier(random_state=42),
    training_param["RandomForest"],
)
svm_grid = set_parameters(
    SVC(random_state=42, probability=True),
    training_param["SVM"],
)
dt_grid = set_parameters(
    DecisionTreeClassifier(random_state=42),
    training_param["DecisionTree"],
)

# The soft-voting ensemble gets its own fresh estimator instances with default
# hyper-parameters; it does not reuse the objects built above.
voting_members = [
    ("XGBoost", XGBClassifier(random_state=42)),
    ("RandomForest", RandomForestClassifier(random_state=42)),
    ("SVM", SVC(random_state=42, probability=True)),
    ("DecisionTree", DecisionTreeClassifier(random_state=42)),
]
voting = VotingClassifier(estimators=voting_members, voting="soft")

# comment out the model you don't want to use
grid_searched = {
    "XGBoost": xgb_grid,
    "RandomForest": rf_grid,
    "SVM": svm_grid,
    "DecisionTree": dt_grid,
}
models = {
    estimator_name: {"is_grid_search": True, "model": estimator}
    for estimator_name, estimator in grid_searched.items()
}
models["Voting"] = {"is_grid_search": False, "model": voting}

In [13]:
from utils.simple_model import SimpleModel

# Train every configured estimator on every selected gene list.
# The outer key used to be called `model_name` as well and was immediately
# shadowed by the inner loop variable; it now has its own name so the two
# cannot be confused.
# NOTE(review): `feature_set_name` is not passed to `train`, so runs for
# different feature sets share `train_out_path` / `validate_out_path` —
# confirm that SimpleModel.train keeps their outputs apart.
for feature_set_name, gene_list in selected_feature_file.items():
    for model_name, model_config in models.items():
        # A fresh SimpleModel per (feature set, estimator) pair, as before.
        model = SimpleModel(
            train_df=sp80,
            test_df=sp20,
            gene_list=gene_list,
            dbeta_info=dbeta_file,
        )
        model.setup_dbeta()
        model.setup_train_test()
        model.setup_combinations()
        model.train(
            model_name,
            model_config["model"],
            train_out_path,
            validate_out_path,
            model_config["is_grid_search"],
        )

INFO Training for combination: ('AIM2', 'BHLHE23', 'MIR377', 'SCARF1') with estimator: XGBoost
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
INFO Training for combination: ('AIM2', 'BHLHE23', 'PATE2', 'SCARF1') with estimator: XGBoost
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
INFO Training for combination: ('AIM2', 'BHLHE23', 'PCK1', 'SCARF1') with estimator: XGBoost
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
INFO Training for combination: ('AIM2', 'BHLHE23', 'RALYL', 'SCARF1') with estimator: XGBoost


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 729 candidates, totalling 3645 fits
INFO Training for combination: ('AIM2', 'BHLHE23', 'SNORD115-10', 'SCARF1') with estimator: XGBoost
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
INFO Training for combination: ('AIM2', 'BHLHE23', 'TMEM196', 'SCARF1') with estimator: XGBoost
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
INFO Training for combination: ('ATG16L1', 'BHLHE23', 'MIR377', 'SCARF1') with estimator: XGBoost
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
INFO Training for combination: ('ATG16L1', 'BHLHE23', 'PATE2', 'SCARF1') with estimator: XGBoost


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 729 candidates, totalling 3645 fits
INFO Training for combination: ('ATG16L1', 'BHLHE23', 'PCK1', 'SCARF1') with estimator: XGBoost
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
INFO Training for combination: ('ATG16L1', 'BHLHE23', 'RALYL', 'SCARF1') with estimator: XGBoost


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 729 candidates, totalling 3645 fits
INFO Training for combination: ('ATG16L1', 'BHLHE23', 'SNORD115-10', 'SCARF1') with estimator: XGBoost
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
INFO Training for combination: ('ATG16L1', 'BHLHE23', 'TMEM196', 'SCARF1') with estimator: XGBoost
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
INFO Training for combination: ('ATP5G2', 'BHLHE23', 'MIR377', 'SCARF1') with estimator: XGBoost
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
INFO Training for combination: ('ATP5G2', 'BHLHE23', 'PATE2', 'SCARF1') with estimator: XGBoost
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
INFO Training for combination: ('ATP5G2', 'BHLHE23', 'PCK1', 'SCARF1') with estimator: XGBoost
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
INFO Training for combination: ('ATP5G2', 'BHLHE23', 'RALYL', 'SCARF1') with estimator: XGBoost
Fitting 5 folds for each of 72

  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 729 candidates, totalling 3645 fits
INFO Training for combination: ('C1orf150', 'BHLHE23', 'SNORD115-10', 'SCARF1') with estimator: XGBoost


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 729 candidates, totalling 3645 fits
INFO Training for combination: ('C1orf150', 'BHLHE23', 'TMEM196', 'SCARF1') with estimator: XGBoost
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
INFO Training for combination: ('DCD', 'BHLHE23', 'MIR377', 'SCARF1') with estimator: XGBoost
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
INFO Training for combination: ('DCD', 'BHLHE23', 'PATE2', 'SCARF1') with estimator: XGBoost
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
INFO Training for combination: ('DCD', 'BHLHE23', 'PCK1', 'SCARF1') with estimator: XGBoost
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
INFO Training for combination: ('DCD', 'BHLHE23', 'RALYL', 'SCARF1') with estimator: XGBoost
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
INFO Training for combination: ('DCD', 'BHLHE23', 'SNORD115-10', 'SCARF1') with estimator: XGBoost
Fitting 5 folds for each of 729 candidates, t

  _data = np.array(data, dtype=dtype, copy=copy,


INFO Training for combination: ('AIM2', 'BHLHE23', 'PATE2', 'SCARF1') with estimator: RandomForest
Fitting 5 folds for each of 27 candidates, totalling 135 fits
INFO Training for combination: ('AIM2', 'BHLHE23', 'PCK1', 'SCARF1') with estimator: RandomForest
Fitting 5 folds for each of 27 candidates, totalling 135 fits
INFO Training for combination: ('AIM2', 'BHLHE23', 'RALYL', 'SCARF1') with estimator: RandomForest
Fitting 5 folds for each of 27 candidates, totalling 135 fits
INFO Training for combination: ('AIM2', 'BHLHE23', 'SNORD115-10', 'SCARF1') with estimator: RandomForest
Fitting 5 folds for each of 27 candidates, totalling 135 fits
INFO Training for combination: ('AIM2', 'BHLHE23', 'TMEM196', 'SCARF1') with estimator: RandomForest
Fitting 5 folds for each of 27 candidates, totalling 135 fits
INFO Training for combination: ('ATG16L1', 'BHLHE23', 'MIR377', 'SCARF1') with estimator: RandomForest
Fitting 5 folds for each of 27 candidates, totalling 135 fits
INFO Training for combi

#### 3. Visualization

In [None]:
# Prompt for a model name; nothing in this cell uses the answer yet.
# NOTE(review): this rebinds `model`, which the training loop above uses for
# SimpleModel instances, to a plain string — consider a distinct name.
model = input("Enter the model name: ")


##### Congratulations! You have finished the whole pipeline 🎉 <br>

![title](cat.png)