### Sec. -1 Initiation

In [3]:
import os
import pandas as pd
from utils.config_helper import update_nested_toml, load_config

breast
lung
prostate
stomach
rectal

In [7]:
# Ask interactively which config to use; TYPE also names the per-type data
# directories used in the paths built further down.
TYPE = input("Enter the type of the config file: ")
CONFIG_PATH = f"../config/{TYPE}.toml"
config = load_config(CONFIG_PATH)

In [8]:
def inspect_nan(df, name):
    """Print the rows of ``df`` whose ``name`` column holds a missing value."""
    missing_rows = df.loc[df[name].isna()]
    print(missing_rows)

In [10]:
# Pull this run's settings out of the [init.hyper] table of the loaded config.
init_hyper = config["init"]["hyper"]

beta_file_number = init_hyper["beta_file_number"]
test_ratio = init_hyper["test_ratio"]
seed = init_hyper["splitting_seed"]
normal_number_0 = init_hyper["normal_number_0"]
# The second normal-sample count is only read when two beta files are used.
if beta_file_number == 2:
    normal_number_1 = init_hyper["normal_number_1"]
data_source = init_hyper["data_source"]
is_columns_duplicated = init_hyper["is_columns_duplicated"]
is_oversample = init_hyper["is_oversample"]
training_set_path = init_hyper["training_set_path"]
test_set_path = init_hyper["test_set_path"]

In [11]:
# Build the train/test output directories, e.g. train80 / test20 for test_ratio = 0.2.
# The percentages are rounded rather than truncated: 0.3 * 100 evaluates to
# 30.000000000000004, so int(100 - 0.3 * 100) would give 69 instead of 70.
train_pct = round(100 - test_ratio * 100)
test_pct = round(test_ratio * 100)
result_root = f"../{TYPE}/result/{data_source}"

if is_oversample:
    print("train on oversampled dataset.")
    trainOutPath = f"{result_root}/train{train_pct}_oversample"
else:
    print("train on original dataset.")
    trainOutPath = f"{result_root}/train{train_pct}"
testOutPath = f"{result_root}/test{test_pct}"

train on original dataset.


### Sec. 0 Merge and Split Champ Data
- Set `is_oversample = 0` before running this section so that all three datasets are stored in the proper locations.

- file paths
  - {TYPE}/result/{data_source}/test20/all_beta_normalized_1.csv
  - {TYPE}/result/{data_source}/train80/all_beta_normalized_0.csv
  - {TYPE}/result/{data_source}/train80_oversample/all_beta_normalized_0_oversample.csv


#### 0.1 Merge Dataset (if possible)

In [None]:
df0 = pd.read_csv(f"../{TYPE}/champ_result/{data_source}/all_beta_normalized_0.csv")

In [None]:
if beta_file_number == 2:
    df1 = pd.read_csv(f"../{TYPE}/champ_result/{data_source}/all_beta_normalized_1.csv")

In [None]:
# DEBUG
df0
# END

In [None]:
# DEBUG
# df1 is only loaded when two beta files are configured; display nothing otherwise
# instead of raising NameError.
df1 if beta_file_number == 2 else None
# END

In [None]:
# Build `feature_name`, the ordered list of feature IDs (first CSV column) kept for
# the rest of the notebook, and record the feature counts in the config file.
# potential feature loss: with two files only the shared features are kept.
if beta_file_number == 2:
    feature_name_0 = df0.iloc[:, 0].tolist()
    feature_name_1 = df1.iloc[:, 0].tolist()

    # Keep df0's row order (de-duplicated) instead of list(set(...)): set iteration
    # order is arbitrary, but `feature_name` is later assigned positionally as
    # column labels, so it has to follow the order of the data rows.
    features_in_1 = set(feature_name_1)
    feature_name = list(
        dict.fromkeys(name for name in feature_name_0 if name in features_in_1)
    )
    update_nested_toml(
        "preprocess.merge_and_split", "feature_size_0", len(feature_name_0)
    )
    update_nested_toml(
        "preprocess.merge_and_split", "feature_size_1", len(feature_name_1)
    )
    update_nested_toml(
        "preprocess.merge_and_split", "feature_size_intersection", len(feature_name)
    )
elif beta_file_number == 1:
    feature_name = df0.iloc[:, 0].tolist()
    update_nested_toml(
        "preprocess.merge_and_split", "feature_size_0", len(feature_name)
    )

In [None]:
if beta_file_number == 2:
    # Select the shared features from both files in the order given by `feature_name`.
    # A plain isin() filter keeps each file's own row order, so the two frames could
    # list the same features in different orders; they are later concatenated and
    # labelled purely by position, which would silently mix up features.
    df0_join = df0.set_index(df0.columns[0]).loc[feature_name].reset_index()
    df1_join = df1.set_index(df1.columns[0]).loc[feature_name].reset_index()

In [None]:
# Drop the feature-ID column, keep every `is_columns_duplicated`-th sample column,
# then split each file into its leading normal columns and trailing tumor columns.
if beta_file_number == 2:
    df0_join = df0_join.iloc[:, 1::is_columns_duplicated]
    if data_source == "GDC_stomach_GSE99553":  # this dataset needs its own stride for file 1
        # `is_columns_duplicated_1` was never assigned anywhere in this notebook, so
        # this branch raised NameError. Read it the same way as `normal_number_1`.
        # NOTE(review): the config key name is assumed -- confirm it exists in the
        # [init.hyper] table of the stomach config.
        is_columns_duplicated_1 = config["init"]["hyper"]["is_columns_duplicated_1"]
        df1_join = df1_join.iloc[:, 1::is_columns_duplicated_1]
    else:
        df1_join = df1_join.iloc[:, 1::is_columns_duplicated]
    # Re-assign instead of resetting in place: both frames are slices of other frames.
    df0_join = df0_join.reset_index(drop=True)
    df1_join = df1_join.reset_index(drop=True)
    df0_join_normal = df0_join.iloc[:, :normal_number_0]
    df0_join_tumor = df0_join.iloc[:, normal_number_0:]
    df1_join_normal = df1_join.iloc[:, :normal_number_1]
    df1_join_tumor = df1_join.iloc[:, normal_number_1:]
elif beta_file_number == 1:
    df0_join = df0.iloc[:, 1::is_columns_duplicated]

In [None]:
if beta_file_number == 2:
    df_normal = pd.concat([df0_join_normal, df1_join_normal], axis=1)
    df_tumor = pd.concat([df0_join_tumor, df1_join_tumor], axis=1)

In [None]:
# Remove every sample (column) that contains a missing value, logging the frame
# shapes before and after to the config file.
# note: imputation could be used instead of dropping the samples.
merge_split_section = "preprocess.merge_and_split"

if beta_file_number == 2:
    update_nested_toml(merge_split_section, "Before_dropna_dfn_shape", df_normal.shape)
    update_nested_toml(merge_split_section, "Before_dropna_dfc_shape", df_tumor.shape)
    df_normal = df_normal.dropna(axis=1)
    df_tumor = df_tumor.dropna(axis=1)
    update_nested_toml(merge_split_section, "After_dropna_dfn_shape", df_normal.shape)
    update_nested_toml(merge_split_section, "After_dropna_dfc_shape", df_tumor.shape)
elif beta_file_number == 1:
    update_nested_toml(merge_split_section, "Before_dropna_df_shape", df0_join.shape)
    df0_join = df0_join.dropna(axis=1)
    update_nested_toml(merge_split_section, "After_dropna_df_shape", df0_join.shape)

In [None]:
# Combine the normal and tumor data into a samples-by-features matrix X and a
# label list y (0 = normal, 1 = tumor).
if beta_file_number == 2:
    X = pd.concat([df_normal, df_tumor], axis=1).T
    y = [0] * df_normal.shape[1] + [1] * df_tumor.shape[1]
elif beta_file_number == 1:
    X = df0_join.T
    # Label each surviving column by whether it was one of the first
    # `normal_number_0` sample columns of the file. Counting
    # [0] * normal_number_0 would mislabel samples whenever the earlier dropna
    # removed a normal sample.
    normal_columns = set(df0.columns[1::is_columns_duplicated][:normal_number_0])
    y = [0 if column in normal_columns else 1 for column in df0_join.columns]


#### 0.2 Split Dataset

In [None]:
from collections import Counter
from sklearn.model_selection import train_test_split

# Hold out `test_ratio` of the samples as the test set; `seed` makes the split
# reproducible.
# NOTE(review): the split is not stratified (no `stratify=y`), so the class ratio
# of the two parts can differ -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_ratio, random_state=seed
)

In [None]:
X_train

In [None]:
X_test

#### 0.3 Oversample

In [None]:
# Log the training-set size before oversampling.
update_nested_toml(
    "preprocess.merge_and_split", "Before_SMOTE_X_train_shape", X_train.shape
)
update_nested_toml(
    "preprocess.merge_and_split", "Before_SMOTE_y_train_shape", len(y_train)
)

from imblearn.over_sampling import SMOTE

# Oversample the training split only (the test split is left untouched), seeded
# with the same `seed` as the split.
smote = SMOTE(random_state=seed)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
# Log the training-set size after oversampling.
update_nested_toml(
    "preprocess.merge_and_split", "After_SMOTE_X_train_shape", X_resampled.shape
)
update_nested_toml(
    "preprocess.merge_and_split", "After_SMOTE_y_train_shape", len(y_resampled)
)


In [None]:
X_resampled

In [None]:
# Count the samples per label in each of the three datasets.
train_class_distribution_oversample = Counter(y_resampled)
train_class_distribution = Counter(y_train)
testing_class_distribution = Counter(y_test)

# Record dataset sizes, then the per-class counts as [label 0, label 1] pairs.
# "training_class_distribution" is read back later in the delta-beta section.
update_nested_toml("preprocess.merge_and_split", "training_set_samples_oversample", len(X_resampled))
update_nested_toml("preprocess.merge_and_split", "training_set_samples", len(X_train))
update_nested_toml("preprocess.merge_and_split", "testing_set_samples", len(X_test))
update_nested_toml(
    "preprocess.merge_and_split",
    "training_class_distribution_oversample",
    [train_class_distribution_oversample[0], train_class_distribution_oversample[1]]
)
update_nested_toml(
    "preprocess.merge_and_split",
    "training_class_distribution",
    [train_class_distribution[0], train_class_distribution[1]],
)
update_nested_toml(
    "preprocess.merge_and_split",
    "testing_class_distribution",
    [testing_class_distribution[0], testing_class_distribution[1]],
)

In [None]:
def _to_feature_major(samples, labels, feature_names):
    """Convert a samples-by-features matrix into the on-disk feature-major layout.

    The result has one row per feature plus a final ``label`` row, a leading
    ``"Unnamed: 0"`` column holding the feature names, and one integer-named column
    per sample, with the samples sorted by label.

    Works on a copy, so the input frame is left untouched and the cell can be
    re-run safely (the previous in-place version failed on a second run because
    the ``label`` column had already been appended).
    """
    labelled = samples.copy()
    labelled.columns = feature_names
    labelled["label"] = labels
    labelled = labelled.sort_values(by=["label"])

    feature_major = labelled.T
    feature_major.columns = range(feature_major.shape[1])
    feature_major.insert(0, "Unnamed: 0", feature_major.index)
    return feature_major.reset_index(drop=True)


train_df_os = _to_feature_major(X_resampled, y_resampled, feature_name)
train_df = _to_feature_major(X_train, y_train, feature_name)
test_df = _to_feature_major(X_test, y_test, feature_name)

In [None]:
train_df_os

In [None]:
train_df_os

In [None]:
# DEBUG
train_df
# END

In [None]:
# DEBUG
test_df
# END

In [None]:
# Write the three datasets. The oversampled training set goes to a sibling
# "<trainOutPath>_oversample" directory, which has to be created as well --
# previously only the other two directories were, so to_csv could fail.
oversample_dir = f"{trainOutPath}_oversample"
for out_dir in (trainOutPath, oversample_dir, testOutPath):
    os.makedirs(out_dir, exist_ok=True)

train_df_os.to_csv(f"{oversample_dir}/all_beta_normalized_0_oversample.csv", index=False)
train_df.to_csv(f"{trainOutPath}/all_beta_normalized_0.csv", index=False)
test_df.to_csv(f"{testOutPath}/all_beta_normalized_1.csv", index=False)

In [None]:
del train_df, test_df, train_df_os

#### 0.4 Upload Datasets

In [None]:
zip_filename = f"{TYPE}_beta_files.zip"

In [None]:
import zipfile

# Bundle the three dataset CSVs into one archive with flat names.
with zipfile.ZipFile(zip_filename, "w") as zipf:
    # The oversampled CSV is written to "<trainOutPath>_oversample/", not to
    # trainOutPath itself, so it must be read from there.
    zipf.write(
        f"{trainOutPath}_oversample/all_beta_normalized_0_oversample.csv",
        arcname="all_beta_normalized_0_oversample.csv",
    )
    zipf.write(
        f"{trainOutPath}/all_beta_normalized_0.csv", arcname="all_beta_normalized_0.csv"
    )
    zipf.write(
        f"{testOutPath}/all_beta_normalized_1.csv", arcname="all_beta_normalized_1.csv"
    )

In [None]:
from api import utils
service = utils.authenticate_drive()

In [None]:
directory = utils.create_folder(service, TYPE)

In [None]:
utils.run_upload_with_separate_thread(service, directory, zip_filename)

### Sec. 1 Delta Beta Calculation

#### 1.1 Download

In [None]:
import gdown

In [None]:
url = input()

output = 'download.zip'

In [None]:
gdown.download(url, output, quiet=False)

In [None]:
import zipfile
import shutil

# Unpack the downloaded archive into a scratch directory.
with zipfile.ZipFile("download.zip", "r") as archive:
    archive.extractall("download")

# Move the two beta CSVs into the working directory.
for csv_name in ("all_beta_normalized_0.csv", "all_beta_normalized_1.csv"):
    shutil.move(f"download/{csv_name}", csv_name)

# Clean up the archive and the scratch directory.
os.remove("download.zip")
shutil.rmtree("download")

#### 1.2 Average Delta Beta Calculation

In [None]:
train_df = pd.read_csv(f"{trainOutPath}/{training_set_path}.csv")

In [None]:
# remove outlier in terms of every column
def IQR(df):
    """Return the 1.5 * IQR outlier fences of ``df`` as ``(upper_fence, lower_fence)``.

    Quantiles are computed per column, so each fence holds one value per column.
    """
    first_quartile, third_quartile = df.quantile(0.25), df.quantile(0.75)
    whisker = (third_quartile - first_quartile) * 1.5
    return third_quartile + whisker, first_quartile - whisker


def no_outlier(df):
    """Return ``df`` with every value outside its column's fences replaced by NaN.

    The comparison is strict: values equal to a fence are masked as well.
    """
    upper_fence, lower_fence = IQR(df)
    within_fences = (df > lower_fence) & (df < upper_fence)
    return df.where(within_fences)

In [None]:
# Reload the config so values written by earlier sections are visible.
config = load_config(CONFIG_PATH)

# Count the normal samples from the label row (last row of train_df), as the PCA
# cell further down does. The previous value came from
# config[...]["training_class_distribution"], which describes the non-oversampled
# split and would be wrong if `training_set_path` names the oversampled file.
normal_count = int((train_df.iloc[-1, 1:] == 0).sum())

# Column 0 holds the feature IDs and the last row holds the labels; samples are
# stored normal-first, so the first `normal_count` sample columns are normal.
all_beta_normalized_normal = train_df.iloc[:-1, 1 : normal_count + 1].T
all_beta_normalized_tumor = train_df.iloc[:-1, normal_count + 1 :].T

In [None]:
all_beta_normalized_normal = no_outlier(all_beta_normalized_normal)
all_beta_normalized_tumor = no_outlier(all_beta_normalized_tumor)

In [None]:
train_normal_avg = all_beta_normalized_normal.mean(skipna=True, axis=0)

In [None]:
train_normal_avg

In [None]:
all_beta_normalized_tumor

In [None]:
train_normal_avg

In [None]:
all_beta_normalized_tumor

In [None]:
all_beta_normalized_tumor = all_beta_normalized_tumor.subtract(
    train_normal_avg, axis=1
)

In [None]:
all_beta_normalized_tumor = no_outlier(all_beta_normalized_tumor)

In [None]:
train_tumor_mean = all_beta_normalized_tumor.mean(skipna=True, axis=0)

In [None]:
delta_beta = pd.merge(
    train_df.iloc[:-1, :1],
    pd.DataFrame(train_tumor_mean, columns=["dbeta"]),
    left_index=True,
    right_index=True,
)
update_nested_toml("preprocess.dbeta", "delta_beta_avg", delta_beta.shape[0])

In [None]:
# print(delta_beta[pd.isna(delta_beta["dbeta"])])
# record the list of feature with dbeta being NaN
update_nested_toml(
    "preprocess.dbeta",
    "NaN_dbeta_feature",
    delta_beta.loc[pd.isna(delta_beta["dbeta"]), "Unnamed: 0"].tolist(),
)
delta_beta.dropna(inplace=True, axis=0)
update_nested_toml("preprocess.dbeta", "delta_beta_avg_remove_NaN", delta_beta.shape[0])

In [None]:
# Load the DMP result and keep only the probe ID, gene and feature columns,
# logging how many rows remain once incomplete rows are removed.
dmp_csv = f"../{TYPE}/champ_result/{data_source}/DMP_result_0.csv"
dmp = pd.read_csv(dmp_csv).loc[:, ["Unnamed: 0", "gene", "feature"]]
update_nested_toml("preprocess.dbeta", "dmp_before_dropna_shape_feature", dmp.shape[0])
dmp = dmp.dropna()
update_nested_toml("preprocess.dbeta", "dmp_after_dropna_shape_feature", dmp.shape[0])

In [None]:
result = pd.merge(delta_beta, dmp, on="Unnamed: 0", how="left")
update_nested_toml(
    "preprocess.dbeta", "delta_beta_avg_remove_NaN_with_gene_name", result.shape[0]
)

In [None]:
def find_max_dBeta_grouped(group):
    """Return the row of ``group`` whose ``dbeta`` has the largest absolute value."""
    strongest_label = group["dbeta"].abs().idxmax()
    return group.loc[strongest_label]


dbeta = result.groupby("gene", as_index=False).apply(
    find_max_dBeta_grouped, include_groups=False
)

In [None]:
dbeta.columns = ["gene", "ID", "dbeta", "feature"]
dbeta = dbeta[["ID", "gene", "dbeta", "feature"]]
# DEBUG
dbeta
# END

In [None]:
# comorbidity = pd.read_csv(
#     "../external_result/matchgene174_single_3Y10__OR2.txt", sep="\t", header=None
# )
# dbeta = dbeta[
#     dbeta["gene"].isin(comorbidity[0])
# ]

# result_max_per_gene_single

In [None]:
dbeta["dbeta"] = dbeta["dbeta"].apply(lambda x: round(x, 6))
dbeta.to_csv(f"{trainOutPath}/dbeta.csv", index=False)

### Sec. 2 Filter Genes by Average Delta Beta Values
1. Filter genes by dbeta values
2. Filter genes by TSS position
3. Plot the distribution of dbeta values
4. Plot PCA for normal and tumor samples


#### 2.1 Filtering TSS

In [None]:
# dbeta = pd.read_csv(f"{trainOutPath}/dbeta.csv")

In [None]:
TSS = dbeta[dbeta["feature"].str.contains("TSS")]

In [None]:
TSS.to_csv(f"{trainOutPath}/dbeta_TSS.csv", index=False)

#### 2.2 Thresholding

In [None]:
# Lower the |dbeta| threshold from 1 in steps of 0.01 until the number of
# surviving TSS probes falls inside the configured [lower, upper] window.
filter_hyper = config["preprocess"]["filtering"]["hyper"]
lower_bound = filter_hyper["avg_dbeta_lower_bound"]
upper_bound = filter_hyper["avg_dbeta_upper_bound"]
abs_dbeta = TSS["dbeta"].abs()

threshold = 1
# Each threshold is derived from the step index instead of repeated `-= 0.01`, so
# no floating-point drift accumulates and the value used for filtering is exactly
# the value that is stored. The search stops at -0.01, where every non-NaN probe
# passes; the old `while True` looped forever if the window was skipped.
for step in range(102):
    if step:
        threshold = round(1 - step * 0.01, 2)
    TSS_threshold = TSS[abs_dbeta > threshold]
    count = TSS_threshold.shape[0]
    if lower_bound <= count <= upper_bound:
        break
else:
    raise ValueError(
        f"No |dbeta| threshold in [-0.01, 1] keeps between {lower_bound} and "
        f"{upper_bound} TSS probes; adjust avg_dbeta_lower_bound/avg_dbeta_upper_bound."
    )
update_nested_toml("preprocess.filtering", "threshold", threshold)

In [None]:
TSS_threshold.to_csv(f"{trainOutPath}/dbeta_TSS_{threshold}.csv", index=False)

#### 2.3 Visualization

In [None]:
# DEBUG
import seaborn as sns
import matplotlib.pyplot as plt

sns.kdeplot(TSS_threshold["dbeta"])
plt.xlabel("delta Beta value")
plt.title("Density plot of delta Beta value")
# save the plot
plt.savefig(f"{trainOutPath}/dbeta_TSS_{threshold}.png")
plt.close()
# END

In [None]:
# train_df = pd.read_csv(f"{trainOutPath}/all_beta_normalized_0.csv")

In [None]:
normal_count = (train_df.iloc[-1, 1:] == 0).sum()
df_gene = train_df.iloc[:-1, :]
df_gene = df_gene[df_gene[df_gene.columns[0]].isin(dbeta["ID"])]
X = df_gene.iloc[:, 1:].reset_index(drop=True).T
y = [0 if i < normal_count else 1 for i in range(X.shape[0])]
# DEBUG
print(f"X shape: {X.shape}")
print(f"y shape: {len(y)}")
# END

In [None]:
import plotly.express as px
import pandas as pd
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
X_pca = pca.fit_transform(X)

df = pd.DataFrame(
    {
        "Principal Component 1": X_pca[:, 0],
        "Principal Component 2": X_pca[:, 1],
        "Principal Component 3": X_pca[:, 2],
        "Class": y,
    }
)
print(df.shape)
fig = px.scatter_3d(
    df,
    x="Principal Component 1",
    y="Principal Component 2",
    z="Principal Component 3",
    color="Class",
    title="PCA of Dataset",
    color_continuous_scale="Viridis",
)

fig.update_layout(
    scene=dict(
        xaxis_title="Principal Component 1",
        yaxis_title="Principal Component 2",
        zaxis_title="Principal Component 3",
    )
)

# fig.show()

fig.write_html(f"{trainOutPath}/preprocess_filtering_pca.html")

# open in browser
# save the plot

### Sec. 3 Feature Selection with ML

#### 3.1 Preparation

In [12]:
config = load_config(CONFIG_PATH)
threshold_file = config["machine_learning"]["hyper"]["TSS_threshold"]

TSS_threshold = pd.read_csv(f"{trainOutPath}/{threshold_file}.csv")

In [None]:
# TSS_threshold_hyper = TSS_threshold[TSS_threshold["dbeta"] > 0]
# # DEBUG
# TSS_threshold_hyper
# # END

In [13]:
# check if logs/ folder exists
os.makedirs("logs", exist_ok=True)
from utils.train_helper import TrainHelper

In [None]:
# Discard any TrainHelper left over from a previous run. `del th` raised NameError
# on a fresh kernel because `th` is only created in the next cell.
globals().pop("th", None)

In [14]:
# note that there is setup_dbeta in TrainHelper to further cut down the feature size
th = TrainHelper(TSS_threshold)

#### 3.2 Selection (SFS)

In [15]:
train_df = pd.read_csv(f"{trainOutPath}/{training_set_path}.csv")
# test_df = pd.read_csv(f"{testOutPath}/all_beta_normalized_1.csv")


In [16]:
 th.set_train_test(train_df)



In [None]:
th.set_train_test2()

In [17]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

selection_models = {
    "SVM": SVC(kernel="linear", random_state=42),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "RandomForest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42),
}


In [18]:
th.set_selection_models(selection_models)

In [19]:
th.select_feature_sfs(
    TrainOutPath = trainOutPath,
    step= 4,
    n_features_to_select="cluster"
)

DEBUG Training SVM with SFS


AttributeError: 'TrainHelper' object has no attribute 'X_train'

#### 3.3.1 Visualization (RFE)

In [None]:
# fpr_tpr_train = pd.read_csv(f"{trainOutPath}/roc_curve.csv")
# fpr_tpr_test = pd.read_csv(f"{testOutPath}/roc_curve.csv")
# rfe_train = pd.read_csv(f"{trainOutPath}/rfe.csv")
# rfe_test = pd.read_csv(f"{testOutPath}/rfe.csv")

In [None]:
# rfe_j = pd.merge(rfe_train, rfe_test, on=["selection_model", "train_model", "features"], suffixes=('_train', '_test'))
# fpr_tpr_j = pd.merge(fpr_tpr_train, fpr_tpr_test, on=["selection_model", "train_model", "features"], suffixes=('_train', '_test'))
# J = pd.merge(rfe_j, fpr_tpr_j, on=["selection_model", "train_model", "features"])

In [None]:
# import ast

# J["fpr_train"] = J["fpr_train"].apply(ast.literal_eval)
# J["tpr_train"] = J["tpr_train"].apply(ast.literal_eval)
# J["fpr_test"] = J["fpr_test"].apply(ast.literal_eval)
# J["tpr_test"] = J["tpr_test"].apply(ast.literal_eval)

In [None]:
# from utils.painter import plot_roc_curve, create_performance_barchart

In [None]:
# J['accuracy_diff'] = J['accuracy_train'] - J['accuracy_test']
# J['recall_diff'] = J['recall_train'] - J['recall_test']
# J['f1_score_diff'] = J['f1_score_train'] - J['f1_score_test']
# J['AUC_diff'] = J['AUC_train'] - J['AUC_test']
# J['MCC_diff'] = J['MCC_train'] - J['MCC_test']

In [None]:
# # tweakable width and height
# plot_roc_curve(J, "ROC Curves on Training Set", f"{trainOutPath}/roc_train.html")

In [None]:
# # tweakable width and height
# plot_roc_curve(J, "ROC Curves on Testing Set", f"{testOutPath}/roc_test.html", mode="test")

In [None]:
# Start from an empty frame that only carries the three key columns.
rfe_acc = pd.DataFrame(
    columns=[
        "selection_model",
        "train_model",
        "features",
    ]
)
# Outer-merge the five per-run result files (rfe_0.csv ... rfe_4.csv) on the key
# columns; clashing metric columns from run i get the suffix "_i".
for i in range(5):
    rfe_i = pd.read_csv(f"{testOutPath}/rfe_{i}.csv")
    rfe_acc = pd.merge(rfe_acc, rfe_i, on=["selection_model", "train_model", "features"], suffixes=('', f'_{i}'), how='outer')
rfe_acc

#### 3.3.2 Filter Combinations (RFE)
Combinations are keyed by "selection_model", "train_model", and "features".

In [None]:
# # eliminate the those with abs accuracy difference greater than 0.1
# J = J[abs(J["accuracy_diff"]) < 0.1]

In [None]:
# # plot difference
# performance_metrics = ['accuracy_diff', 'recall_diff', 'f1_score_diff', 'AUC_diff', 'MCC_diff']
# ground_by_train_model = J.groupby('train_model')[performance_metrics].mean()
# ground_by_train_model['train_model'] = ground_by_train_model.index
# ground_by_train_model.to_csv(f"{trainOutPath}/performance_diff_grouped_by_train_model.csv", index=False)
# color_mapping = {
#     "accuracy_diff": "blue",
#     "recall_diff": "red",
#     "f1_score_diff": "green",
#     "AUC_diff": "purple",
#     "MCC_diff": "orange",
# }
# create_performance_barchart(
#     df=ground_by_train_model,
#     color_mapping=color_mapping,
#     metric="train_model",
#     out_path=f"{trainOutPath}/performance_diff_grouped_by_train_model.html",
#     title="Grouped Performance Difference between Training and Testing Set",
#     x_axis_label="Performance Difference (Training - Testing)",
#     y_axis_label="Train Model",
#     orientation="h",
# )

In [None]:
# J = J[["selection_model", "train_model", "features", "accuracy_test", "recall_test", "f1_score_test", "AUC_test", "MCC_test"]]

In [None]:
# # group by train_model, for each train_model, calculate the mean of each performance metric
# performance_metrics = ['accuracy_test', 'recall_test',
#                        'f1_score_test', 'AUC_test', 'MCC_test']
# ground_by_train_model = J.groupby('train_model')[performance_metrics].mean()
# ground_by_train_model['train_model'] = ground_by_train_model.index
# ground_by_train_model.to_csv(
#     f"{trainOutPath}/performance_metrics_grouped_by_train_model.csv", index=False)
# color_mapping = {
#     "accuracy_test": "blue",
#     "recall_test": "red",
#     "f1_score_test": "green",
#     "AUC_test": "purple",
#     "MCC_test": "orange",
# }
# create_performance_barchart(
#     df=ground_by_train_model,
#     color_mapping=color_mapping,
#     metric="train_model",
#     out_path=f"{trainOutPath}/performance_metrics_grouped_by_train_model.html",
#     title="Grouped Performance Metrics by Train Model",
#     x_axis_label="Performance",
#     y_axis_label="Train Model",
#     orientation="h",
# )
# best_train_model = ground_by_train_model['MCC_test'].idxmax()
# print(f"Best train model: {best_train_model}")
# ground_by_feature = J[J['train_model'] == best_train_model].groupby('features')[
#     performance_metrics].mean()
# ground_by_feature['features'] = ground_by_feature.index
# ground_by_feature.to_csv(
#     f"{trainOutPath}/performance_metrics_grouped_by_feature.csv", index=False)
# create_performance_barchart(
#     df=ground_by_feature,
#     color_mapping=color_mapping,
#     metric="features",
#     out_path=f"{trainOutPath}/performance_metrics_grouped_by_feature.html",
#     title="Grouped Performance Metrics by Feature",
#     x_axis_label="Performance",
#     y_axis_label="Feature",
#     orientation="h",
# )
# best_num_of_feature = ground_by_feature['MCC_test'].idxmax()
# print(f"Best number of feature: {best_num_of_feature}")
# best_performance_records = J[(J['train_model'] == best_train_model) & (
#     J['features'] == best_num_of_feature)]
# best_performance_records.to_csv(
#     f"{trainOutPath}/best_performance_records.csv", index=False)

In [None]:
# first = True
# gene_set = set()
# with open(f"{trainOutPath}/selected_feature_names.csv", "r") as f:
#     for line in f:
#         if first:
#             first = False
#             continue
#         if line.split(",")[0].endswith(str(best_num_of_feature)):
#             selected_feature_names = line.split(",")[1:]
#             selected_feature_names[-1] = selected_feature_names[-1].strip()
#             gene_set.update(selected_feature_names)

# gene_list = pd.DataFrame(list(gene_set), columns=["gene"])
# gene_list.to_csv(f"{trainOutPath}/selected_feature_set.csv", index=False)

#### 3.4 Clean Selected Features (SFS)

In [None]:
from utils.train_helper import read_selected_features, read_selected_features_json
features = read_selected_features(f"{trainOutPath}/sfs/selected_feature_names_sfs.txt")
th.generate_selected_features(features, f"{trainOutPath}/sfs/selected_features.json")
read_selected_features_json(f"{trainOutPath}/sfs/selected_features.json")

### Sec. 4 Clustering

#### 4.1 load data

Remember to calculate the distance matrices first.

In [None]:
import pandas as pd
import numpy as np
from utils.clustering_helper import hierarchical_clustering, check_distance_matrix

In [None]:
dbeta_path = config["clustering"]["hyper"]["dbeta_file"]
gene_set_file = config["clustering"]["hyper"]["gene_set_file"]
bp_file = config["clustering"]["hyper"]["bp_file"]
cc_file = config["clustering"]["hyper"]["cc_file"]
mf_file = config["clustering"]["hyper"]["mf_file"]
terms_count_file = config["clustering"]["hyper"]["terms_count_file"]

In [None]:
gene_set = pd.read_csv(f"{trainOutPath}/{gene_set_file}.csv", index_col=0)
distance_matrix_bp = pd.read_csv(f"{trainOutPath}/{bp_file}.csv", index_col=0)
distance_matrix_cc = pd.read_csv(f"{trainOutPath}/{cc_file}.csv", index_col=0)
distance_matrix_mf = pd.read_csv(f"{trainOutPath}/{mf_file}.csv", index_col=0)
terms_count = pd.read_csv(f"{trainOutPath}/{terms_count_file}.csv", index_col=0)

In [None]:
# replace NaN with 0
distance_matrix_bp = distance_matrix_bp.fillna(0)
distance_matrix_cc = distance_matrix_cc.fillna(0)
distance_matrix_mf = distance_matrix_mf.fillna(0)

In [None]:
# reindex distance matrix
index_bp = distance_matrix_bp.index
index_cc = distance_matrix_cc.index
index_mf = distance_matrix_mf.index
index = index_bp.union(index_cc).union(index_mf)
distance_matrix_bp_ = distance_matrix_bp.reindex(index=index, columns=index, fill_value=0)
distance_matrix_cc_ = distance_matrix_cc.reindex(index=index, columns=index, fill_value=0)
distance_matrix_mf_ = distance_matrix_mf.reindex(index=index, columns=index, fill_value=0)

In [None]:
# make a array of distance matrix for each ontology
distance_matrix = []

distance_matrix.append(distance_matrix_bp_)
distance_matrix.append(distance_matrix_cc_)
distance_matrix.append(distance_matrix_mf_)

#### 4.2 Weighted Sum

In [None]:
# Weight each ontology's distance matrix by its share of the term counts.
weight = np.asarray([count for count in terms_count["count"]], dtype=float)
weight = weight / np.sum(weight)
# True wherever an ontology has a real (non-NaN) distance for a gene pair.
masks = np.array([~np.isnan(distance_matrix[i].values) for i in range(3)])

valid_weights = np.array([weight[i] for i in range(3)])[:, None, None] * masks

weight_sums = valid_weights.sum(axis=0)

# `where=` without `out=` leaves the skipped entries uninitialised; give the
# division a zero-filled output so pairs with no valid ontology get weight 0.
normalized_weights = np.divide(
    valid_weights,
    weight_sums,
    out=np.zeros(valid_weights.shape, dtype=float),
    where=weight_sums != 0,
)
weighted_sum = sum(
    np.nan_to_num(distance_matrix[i].values) * normalized_weights[i] for i in range(3)
)


weighted_sum_dataframe = pd.DataFrame(weighted_sum, index=index, columns=index)

weighted_sum_dataframe.head()

In [None]:
cluster_result_weighted = hierarchical_clustering(
    weighted_sum_dataframe,
    range_min=2,
    range_max=4,
    cluster_number=3,
    out_path=f"{trainOutPath}/hierarchical_clustering_weighted_sum.png",
)

In [None]:
J = J[["selection_model", "train_model", "features", "accuracy_test", "recall_test", "f1_score_test", "AUC_test", "MCC_test"]]

In [None]:
# NOTE(review): the cells that build `J` and import `create_performance_barchart`
# (from utils.painter) are commented out earlier in this notebook, so this cell
# only runs if both names are defined by hand first.
# group by train_model, for each train_model, calculate the mean of each performance metric
performance_metrics = ['accuracy_test', 'recall_test',
                       'f1_score_test', 'AUC_test', 'MCC_test']
ground_by_train_model = J.groupby('train_model')[performance_metrics].mean()
ground_by_train_model['train_model'] = ground_by_train_model.index
ground_by_train_model.to_csv(
    f"{trainOutPath}/performance_metrics_grouped_by_train_model.csv", index=False)
color_mapping = {
    "accuracy_test": "blue",
    "recall_test": "red",
    "f1_score_test": "green",
    "AUC_test": "purple",
    "MCC_test": "orange",
}
create_performance_barchart(
    df=ground_by_train_model,
    color_mapping=color_mapping,
    metric="train_model",
    out_path=f"{trainOutPath}/performance_metrics_grouped_by_train_model.html",
    title="Grouped Performance Metrics by Train Model",
    x_axis_label="Performance",
    y_axis_label="Train Model",
    orientation="h",
)
# The train model with the highest mean test MCC is considered the best.
best_train_model = ground_by_train_model['MCC_test'].idxmax()
print(f"Best train model: {best_train_model}")
# For that model, average the metrics per feature count.
ground_by_feature = J[J['train_model'] == best_train_model].groupby('features')[
    performance_metrics].mean()
ground_by_feature['features'] = ground_by_feature.index
ground_by_feature.to_csv(
    f"{trainOutPath}/performance_metrics_grouped_by_feature.csv", index=False)
create_performance_barchart(
    df=ground_by_feature,
    color_mapping=color_mapping,
    metric="features",
    out_path=f"{trainOutPath}/performance_metrics_grouped_by_feature.html",
    title="Grouped Performance Metrics by Feature",
    x_axis_label="Performance",
    y_axis_label="Feature",
    orientation="h",
)
best_num_of_feature = ground_by_feature['MCC_test'].idxmax()
print(f"Best number of feature: {best_num_of_feature}")
best_performance_records = J[(J['train_model'] == best_train_model) & (
    J['features'] == best_num_of_feature)]
# A stray "g" after the opening parenthesis made this whole cell a SyntaxError.
best_performance_records.to_csv(
    f"{trainOutPath}/best_performance_records.csv", index=False)

In [None]:
# Collect the union of the features selected by every run whose first field ends
# with the best feature count.
gene_set = set()
with open(f"{trainOutPath}/selected_feature_names.csv", "r") as f:
    next(f, None)  # skip the header line
    for line in f:
        run_key, *selected_feature_names = line.split(",")
        if not run_key.endswith(str(best_num_of_feature)):
            continue
        # Only the last field still carries the trailing newline.
        selected_feature_names[-1] = selected_feature_names[-1].strip()
        gene_set.update(selected_feature_names)

gene_list = pd.DataFrame(list(gene_set), columns=["gene"])
gene_list.to_csv(f"{trainOutPath}/selected_feature_set.csv", index=False)

### Sec. 4 Clustering

#### 4.1 load data

In [None]:
import pandas as pd
import numpy as np
from utils.clustering_helper import hierarchical_clustering, check_distance_matrix

In [None]:
dbeta_path = config["clustering"]["hyper"]["dbeta_file"]
gene_set_file = config["clustering"]["hyper"]["gene_set_file"]
bp_file = config["clustering"]["hyper"]["bp_file"]
cc_file = config["clustering"]["hyper"]["cc_file"]
mf_file = config["clustering"]["hyper"]["mf_file"]
terms_count_file = config["clustering"]["hyper"]["terms_count_file"]

In [None]:
dbeta = pd.read_csv(f"{trainOutPath}/{dbeta_path}.csv", index_col=0)
gene_set = pd.read_csv(f"{trainOutPath}/{gene_set_file}.csv", index_col=0)
distance_matrix_bp = pd.read_csv(f"{trainOutPath}/{bp_file}.csv", index_col=0)
distance_matrix_cc = pd.read_csv(f"{trainOutPath}/{cc_file}.csv", index_col=0)
distance_matrix_mf = pd.read_csv(f"{trainOutPath}/{mf_file}.csv", index_col=0)
terms_count = pd.read_csv(f"{trainOutPath}/{terms_count_file}.csv", index_col=0)

In [None]:
# replace NaN with 0
distance_matrix_bp = distance_matrix_bp.fillna(0)
distance_matrix_cc = distance_matrix_cc.fillna(0)
distance_matrix_mf = distance_matrix_mf.fillna(0)

In [None]:
# reindex distance matrix
index_bp = distance_matrix_bp.index
index_cc = distance_matrix_cc.index
index_mf = distance_matrix_mf.index
index = index_bp.union(index_cc).union(index_mf)
distance_matrix_bp_ = distance_matrix_bp.reindex(index=index, columns=index, fill_value=0)
distance_matrix_cc_ = distance_matrix_cc.reindex(index=index, columns=index, fill_value=0)
distance_matrix_mf_ = distance_matrix_mf.reindex(index=index, columns=index, fill_value=0)

In [None]:
# make a array of distance matrix for each ontology
distance_matrix = []

distance_matrix.append(distance_matrix_bp_)
distance_matrix.append(distance_matrix_cc_)
distance_matrix.append(distance_matrix_mf_)

#### 4.2 Weighted Sum

In [None]:
# Weight each ontology's distance matrix by its share of the term counts.
weight = np.asarray([count for count in terms_count["count"]], dtype=float)
weight = weight / np.sum(weight)
# True wherever an ontology has a real (non-NaN) distance for a gene pair.
masks = np.array([~np.isnan(distance_matrix[i].values) for i in range(3)])

valid_weights = np.array([weight[i] for i in range(3)])[:, None, None] * masks

weight_sums = valid_weights.sum(axis=0)

# `where=` without `out=` leaves the skipped entries uninitialised; give the
# division a zero-filled output so pairs with no valid ontology get weight 0.
normalized_weights = np.divide(
    valid_weights,
    weight_sums,
    out=np.zeros(valid_weights.shape, dtype=float),
    where=weight_sums != 0,
)
weighted_sum = sum(
    np.nan_to_num(distance_matrix[i].values) * normalized_weights[i] for i in range(3)
)


weighted_sum_dataframe = pd.DataFrame(weighted_sum, index=index, columns=index)

weighted_sum_dataframe.head()

In [None]:
cluster_result_weighted = hierarchical_clustering(
    weighted_sum_dataframe,
    range_min=2,
    range_max=4,
    cluster_number=3,
    out_path=f"{trainOutPath}/hierarchical_clustering_weighted_sum.png",
)

In [None]:
cluster_result_weighted.head()

#### 4.3 Simple average

In [None]:
# Simple average: every ontology gets the same weight wherever it has a value.
weight = [1, 1, 1]
masks = np.array([~np.isnan(distance_matrix[i].values) for i in range(3)])
valid_weights = np.array([weight[i] for i in range(3)])[:, None, None] * masks
weight_sums = valid_weights.sum(axis=0)
# `where=` without `out=` leaves the skipped entries uninitialised; give the
# division a zero-filled float output so pairs with no valid ontology get weight 0.
normalized_weights = np.divide(
    valid_weights,
    weight_sums,
    out=np.zeros(valid_weights.shape, dtype=float),
    where=weight_sums != 0,
)
weighted_sum = sum(
    np.nan_to_num(distance_matrix[i].values) * normalized_weights[i] for i in range(3)
)
simple_sum_dataframe = pd.DataFrame(weighted_sum, index=index, columns=index)
simple_sum_dataframe.head()

In [None]:
cluster_result_simple = hierarchical_clustering(
    simple_sum_dataframe,
    range_min=2,
    range_max=4,
    cluster_number=3,
    out_path=f"{trainOutPath}/hierarchical_clustering_simple_sum.png",
)

In [None]:
cluster_result_simple.head()

#### 4.4 Consensus clustering 

In [None]:
cluster_bp = hierarchical_clustering(
    distance_matrix_bp, out_path=f"{trainOutPath}/hierarchical_clustering_bp.png"
)
cluster_cc = hierarchical_clustering(
    distance_matrix_cc, out_path=f"{trainOutPath}/hierarchical_clustering_cc.png"
)
cluster_mf = hierarchical_clustering(
    distance_matrix_mf, out_path=f"{trainOutPath}/hierarchical_clustering_mf.png"
)

In [None]:
cluster_bp.columns = ["gene", "cluster_bp"]
cluster_cc.columns = ["gene", "cluster_cc"]
cluster_mf.columns = ["gene", "cluster_mf"]
cluster_bp_cc = pd.merge(cluster_bp, cluster_cc, on="gene", how="outer")
cluster_go = pd.merge(cluster_bp_cc, cluster_mf, on="gene", how="outer")
cluster_go = cluster_go.fillna(-1)
print(cluster_go.shape)
cluster_go.head()

In [None]:
# Consensus matrix: for every gene pair, the number of ontologies (BP, CC, MF)
# that put both genes in the same cluster.
num_genes = cluster_go.shape[0]
agreement = np.zeros((num_genes, num_genes))

# One broadcast comparison per ontology replaces the per-element `.iloc` double
# loop; the off-diagonal counts are identical. The diagonal is now 3 (a gene always
# agrees with itself) rather than the 6 the old loop plus `+= .T` produced.
for cluster_column in ("cluster_bp", "cluster_cc", "cluster_mf"):
    labels = cluster_go[cluster_column].to_numpy()
    agreement += labels[:, None] == labels[None, :]

consensus_matrix = pd.DataFrame(
    agreement, index=cluster_go["gene"], columns=cluster_go["gene"]
)

# Turn agreement into a distance in [0, 1]. The diagonal is zeroed on the plain
# array before wrapping it, rather than writing through `DataFrame.values`.
consensus_distance = 1 - agreement / 3
np.fill_diagonal(consensus_distance, 0)
distance_matrix_consensus = pd.DataFrame(
    consensus_distance, index=cluster_go["gene"], columns=cluster_go["gene"]
)


distance_matrix_consensus.head()

In [None]:
cluster_result_consensus = hierarchical_clustering(
    distance_matrix_consensus,
    range_min=2,
    range_max=4,
    cluster_number=4,
    out_path=f"{trainOutPath}/hierarchical_clustering_consensus.png",
)

In [None]:
cluster_result_consensus.head()

#### 4.5 Compare 

In [None]:
from utils.clustering_helper import hierarchical_clustering_compare

# Compare the three distance matrices in one figure; the titles are given in
# the same order as the matrices.
_compare_matrices = [
    weighted_sum_dataframe,
    simple_sum_dataframe,
    distance_matrix_consensus,
]
_compare_titles = ["Weighted Average", "Simple Average", "Consensus"]
hierarchical_clustering_compare(
    _compare_matrices,
    _compare_titles,
    out_path=f"{trainOutPath}/hierarchical_clustering_compare.png",
)

In [None]:
# Copy the current row index of dbeta into an explicit "ID" column (in place).
dbeta["ID"] = dbeta.index

In [None]:
# Keep only the dbeta rows whose "gene" value appears in the index of the
# corresponding distance matrix.
_dbeta_gene = dbeta["gene"]
weighted_dbeta = dbeta.loc[_dbeta_gene.isin(weighted_sum_dataframe.index)]
simple_dbeta = dbeta.loc[_dbeta_gene.isin(simple_sum_dataframe.index)]
consensus_dbeta = dbeta.loc[_dbeta_gene.isin(distance_matrix_consensus.index)]

In [None]:
# Attach each clustering result to its dbeta subset (join on "gene") and write
# one CSV per method, named after the dbeta file plus a method suffix.
for _suffix, _subset, _clusters in (
    ("weighted", weighted_dbeta, cluster_result_weighted),
    ("simple", simple_dbeta, cluster_result_simple),
    ("consensus", consensus_dbeta, cluster_result_consensus),
):
    _merged = _subset.merge(_clusters, on="gene")
    _merged.to_csv(f"{trainOutPath}/{dbeta_path}_{_suffix}.csv", index=False)

### Sec. 5 Combination

#### 5.1 Preparation

In [None]:
import pandas as pd

In [None]:
# The dbeta file name (without extension) comes from [combination.hyper] in
# the config; the CSV itself lives in the training output folder.
_combination_hyper = config["combination"]["hyper"]
dbeta_path = _combination_hyper["dbeta_file"]
_dbeta_csv = f"{trainOutPath}/{dbeta_path}.csv"
dbeta = pd.read_csv(_dbeta_csv)
dbeta

In [None]:
# Read the selected feature set and save the dbeta rows whose gene is in it.
gene_list = pd.read_csv(f"{trainOutPath}/selected_feature_set.csv")
_in_selected_set = dbeta["gene"].isin(gene_list["gene"])
_dbeta_selected = dbeta[_in_selected_set]
_dbeta_selected.to_csv(f"{trainOutPath}/{dbeta_path}_selected.csv", index=False)

In [None]:
# Load the train/test tables using the file names configured in [init.hyper]
# (training_set_path / test_set_path), as the SimpleModel section does.
# The previously hard-coded "all_beta_normalized_0.csv" does not match the
# file stored in the oversample training folder, so the cell broke whenever
# is_oversample was enabled.
train_df = pd.read_csv(f"{trainOutPath}/{training_set_path}.csv")
test_df = pd.read_csv(f"{testOutPath}/{test_set_path}.csv")

#### 5.2 Training

In [None]:
from utils.combination_helper import CombinationHelper

In [None]:
# Bundle the four inputs the combination helper is constructed from.
_helper_inputs = {
    "train_df": train_df,
    "test_df": test_df,
    "dbeta": dbeta,
    "gene_list": gene_list,
}
helper = CombinationHelper(**_helper_inputs)

In [None]:
# Run the helper's setup steps, in this order, before training.
helper.setup_dbeta()
helper.setup_train_test()
helper.setup_combinations()
helper.setup_estimators()
helper.setup_grids()
helper.setup_grid_estimator()
# NOTE(review): setup_grid_estimator() is called a second time here — this
# looks like an accidental duplicate (or a different setup_* step was
# intended); confirm against CombinationHelper before removing.
helper.setup_grid_estimator()


In [None]:
# Preview the first rows of cluster_result_weighted.
cluster_result_weighted.head()

#### 4.3 Simple average

In [None]:
# Simple (equal-weight) average of the three distance matrices, computed
# entry by entry over the matrices that actually have a value there.
weight = [1, 1, 1]

# masks[i] is True where distance_matrix[i] has a non-NaN entry.
masks = np.array([~np.isnan(distance_matrix[i].values) for i in range(3)])

# Zero out the weight of a matrix wherever its entry is NaN, then renormalise
# so the remaining weights at each position sum to 1.
valid_weights = np.array([weight[i] for i in range(3)])[:, None, None] * masks
weight_sums = valid_weights.sum(axis=0)

# `where=` without `out=` leaves the skipped positions as uninitialised
# memory; an explicit zero-filled float buffer makes positions with no valid
# matrix deterministically contribute 0 instead of arbitrary (possibly NaN)
# values.
normalized_weights = np.divide(
    valid_weights,
    weight_sums,
    out=np.zeros(valid_weights.shape, dtype=float),
    where=weight_sums != 0,
)

# NaN entries are replaced by 0 so they drop out of the sum (their weight is
# already 0 as well).
weighted_sum = sum(
    np.nan_to_num(distance_matrix[i].values) * normalized_weights[i] for i in range(3)
)
simple_sum_dataframe = pd.DataFrame(weighted_sum, index=index, columns=index)
simple_sum_dataframe.head()

In [None]:
# Cluster on the simple-average distance matrix and save the figure.
# NOTE(review): range_min/range_max presumably bound the cluster counts the
# helper evaluates and cluster_number the count finally used — confirm in
# utils.clustering_helper.
_simple_plot_path = f"{trainOutPath}/hierarchical_clustering_simple_sum.png"
cluster_result_simple = hierarchical_clustering(
    simple_sum_dataframe,
    out_path=_simple_plot_path,
    cluster_number=3,
    range_max=4,
    range_min=2,
)

In [None]:
# Preview the first rows of cluster_result_simple.
cluster_result_simple.head()

#### 4.4 Consensus clustering 

In [None]:
# Cluster the bp, cc and mf distance matrices one after another (helper
# defaults), writing one figure per matrix into the training output folder.
cluster_bp, cluster_cc, cluster_mf = (
    hierarchical_clustering(
        _matrix, out_path=f"{trainOutPath}/hierarchical_clustering_{_tag}.png"
    )
    for _tag, _matrix in (
        ("bp", distance_matrix_bp),
        ("cc", distance_matrix_cc),
        ("mf", distance_matrix_mf),
    )
)

In [None]:
# Give every per-matrix result the same key column ("gene") and its own
# label column so the three tables can be joined side by side.
for _frame, _label in (
    (cluster_bp, "cluster_bp"),
    (cluster_cc, "cluster_cc"),
    (cluster_mf, "cluster_mf"),
):
    _frame.columns = ["gene", _label]

# Outer joins keep genes that occur in only some of the tables; the labels
# they are missing are replaced by -1.
cluster_bp_cc = cluster_bp.merge(cluster_cc, on="gene", how="outer")
cluster_go = cluster_bp_cc.merge(cluster_mf, on="gene", how="outer").fillna(-1)
print(cluster_go.shape)
cluster_go.head()

In [None]:
# Build a gene-by-gene consensus matrix: entry (i, j) counts in how many of
# the three label columns (cluster_bp, cluster_cc, cluster_mf) genes i and j
# carry the same label, so values range from 0 to 3.
# NOTE(review): labels that were filled with -1 compare equal to each other,
# so two genes that are both missing from a result count as "agreeing" there
# — confirm this is intended.
num_genes = cluster_go.shape[0]
_agreement_counts = np.zeros((num_genes, num_genes))

# One broadcast comparison per label column replaces the Python double loop
# over .iloc (quadratic number of pandas row look-ups). The result is already
# symmetric, so no "+= transpose" step is needed and the diagonal is 3 rather
# than the double-counted 6 the loop-plus-transpose version produced.
for _label_column in ("cluster_bp", "cluster_cc", "cluster_mf"):
    _labels = cluster_go[_label_column].to_numpy()
    _agreement_counts += _labels[:, None] == _labels[None, :]

consensus_matrix = pd.DataFrame(
    _agreement_counts, index=cluster_go["gene"], columns=cluster_go["gene"]
)

# Distance = fraction of label columns on which two genes disagree.
# The diagonal is zeroed on the NumPy array *before* wrapping it in a
# DataFrame: writing through DataFrame.values is not reliable (the array is
# read-only when pandas Copy-on-Write is enabled).
_consensus_distance = 1 - _agreement_counts / 3
np.fill_diagonal(_consensus_distance, 0)
distance_matrix_consensus = pd.DataFrame(
    _consensus_distance, index=cluster_go["gene"], columns=cluster_go["gene"]
)

distance_matrix_consensus.head()

In [None]:
# Cluster on the consensus distance matrix and save the figure.
# NOTE(review): range_min/range_max presumably bound the cluster counts the
# helper evaluates and cluster_number the count finally used — confirm in
# utils.clustering_helper.
_consensus_plot_path = f"{trainOutPath}/hierarchical_clustering_consensus.png"
cluster_result_consensus = hierarchical_clustering(
    distance_matrix_consensus,
    out_path=_consensus_plot_path,
    cluster_number=4,
    range_max=4,
    range_min=2,
)

In [None]:
# Preview the first rows of cluster_result_consensus.
cluster_result_consensus.head()

#### 4.5 Compare 

In [None]:
from utils.clustering_helper import hierarchical_clustering_compare

# Compare the three distance matrices in one figure; the titles are given in
# the same order as the matrices.
_compare_matrices = [
    weighted_sum_dataframe,
    simple_sum_dataframe,
    distance_matrix_consensus,
]
_compare_titles = ["Weighted Average", "Simple Average", "Consensus"]
hierarchical_clustering_compare(
    _compare_matrices,
    _compare_titles,
    out_path=f"{trainOutPath}/hierarchical_clustering_compare.png",
)

In [None]:
# Copy the current row index of dbeta into an explicit "ID" column (in place).
dbeta["ID"] = dbeta.index

In [None]:
# Keep only the dbeta rows whose "gene" value appears in the index of the
# corresponding distance matrix.
_dbeta_gene = dbeta["gene"]
weighted_dbeta = dbeta.loc[_dbeta_gene.isin(weighted_sum_dataframe.index)]
simple_dbeta = dbeta.loc[_dbeta_gene.isin(simple_sum_dataframe.index)]
consensus_dbeta = dbeta.loc[_dbeta_gene.isin(distance_matrix_consensus.index)]

In [None]:
# Attach each clustering result to its dbeta subset (join on "gene") and write
# one CSV per method, named after the dbeta file plus a method suffix.
for _suffix, _subset, _clusters in (
    ("weighted", weighted_dbeta, cluster_result_weighted),
    ("simple", simple_dbeta, cluster_result_simple),
    ("consensus", consensus_dbeta, cluster_result_consensus),
):
    _merged = _subset.merge(_clusters, on="gene")
    _merged.to_csv(f"{trainOutPath}/{dbeta_path}_{_suffix}.csv", index=False)

### Sec. 9 $\frac{3}{4}$ SimpleModel Training

#### Sec. 9 $\frac{3}{4}$.1 Load Data

In [None]:
from utils.train_helper import read_selected_features_json
import pandas as pd

In [None]:
# Train and test tables are read from the file names configured in the
# notebook's init settings (training_set_path / test_set_path).
_train_csv = f"{trainOutPath}/{training_set_path}.csv"
_test_csv = f"{testOutPath}/{test_set_path}.csv"
train_df, test_df = pd.read_csv(_train_csv), pd.read_csv(_test_csv)

In [None]:
# Load the trimmed selected-feature JSON from the training "sfs" folder.
# NOTE(review): used below via .items(), so this is presumably a mapping of
# name -> gene list — confirm against read_selected_features_json.
gene_dict = read_selected_features_json(f"{trainOutPath}/sfs/trimmed_selected_features.json")

In [None]:
# The dbeta file name (without extension) comes from [combination.hyper] in
# the config; the CSV itself lives in the training output folder.
_combination_hyper = config["combination"]["hyper"]
dbeta_path = _combination_hyper["dbeta_file"]
_dbeta_info_csv = f"{trainOutPath}/{dbeta_path}.csv"
dbeta_info = pd.read_csv(_dbeta_info_csv)

#### Sec. 9 $\frac{3}{4}$.2 Training

In [None]:
from utils.train_helper import set_parameters
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

import json
from sklearn.ensemble import VotingClassifier


# Per-model parameter settings, keyed by model name, stored as JSON in the
# training output folder.
with open(f"{trainOutPath}/training_param.json", "r") as _param_file:
    training_param = json.load(_param_file)


def _fresh_estimators():
    """Return new, unfitted (name, estimator) pairs, each built with random_state=42."""
    return [
        ("XGBoost", XGBClassifier(random_state=42)),
        ("RandomForest", RandomForestClassifier(random_state=42)),
        ("SVM", SVC(random_state=42, probability=True)),
        ("DecisionTree", DecisionTreeClassifier(random_state=42)),
    ]


# Each base estimator is passed through set_parameters together with the
# settings stored under its name in training_param.
_grids = {
    _name: set_parameters(_estimator, training_param[_name])
    for _name, _estimator in _fresh_estimators()
}
xgb_grid = _grids["XGBoost"]
rf_grid = _grids["RandomForest"]
svm_grid = _grids["SVM"]
dt_grid = _grids["DecisionTree"]

# The soft-voting ensemble gets its own fresh estimator instances (not the
# set_parameters results above).
voting = VotingClassifier(estimators=_fresh_estimators(), voting="soft")

# comment out the model you don't want to use
models = {
    "XGBoost": dict(is_grid_search=True, model=xgb_grid),
    "RandomForest": dict(is_grid_search=True, model=rf_grid),
    "SVM": dict(is_grid_search=True, model=svm_grid),
    "DecisionTree": dict(is_grid_search=True, model=dt_grid),
    "Voting": dict(is_grid_search=False, model=voting),
}

In [None]:
from utils.simple_model import SimpleModel

# Make sure both result folders exist; exist_ok=True replaces the separate
# "exists?" check and is a no-op when the folder is already there.
os.makedirs(f"{trainOutPath}/sfs/", exist_ok=True)
os.makedirs(f"{testOutPath}/sfs/", exist_ok=True)

# Train every configured model on every selected-feature list in gene_dict.
# The gene_dict key now has its own name: both loops previously bound
# `model_name`, so the inner loop silently shadowed the outer key.
# NOTE(review): train() still receives only the inner model name and the same
# two output folders for every feature set, so results for different
# gene_dict entries may overwrite each other — confirm against
# SimpleModel.train and, if so, include feature_set_name in the output name.
for feature_set_name, gene_list in gene_dict.items():
    for model_name, model_config in models.items():
        # A fresh SimpleModel per (feature set, model) pair, prepared with the
        # same three setup steps before training.
        model = SimpleModel(
            train_df=train_df,
            test_df=test_df,
            gene_list=gene_list,
            dbeta_info=dbeta_info,
        )
        model.setup_dbeta()
        model.setup_train_test()
        model.setup_combinations()
        model.train(
            model_name,
            model_config["model"],
            f"{trainOutPath}/sfs/",
            f"{testOutPath}/sfs/",
            model_config["is_grid_search"],
        )

### Sec. 5 Combination (RFE)

#### 5.1 Preparation

In [None]:
import pandas as pd

In [None]:
# The dbeta file name (without extension) comes from [combination.hyper] in
# the config; the CSV itself lives in the training output folder.
_combination_hyper = config["combination"]["hyper"]
dbeta_path = _combination_hyper["dbeta_file"]
_dbeta_csv = f"{trainOutPath}/{dbeta_path}.csv"
dbeta = pd.read_csv(_dbeta_csv)
dbeta

In [None]:
# Read the selected feature set and save the dbeta rows whose gene is in it.
gene_list = pd.read_csv(f"{trainOutPath}/selected_feature_set.csv")
_in_selected_set = dbeta["gene"].isin(gene_list["gene"])
_dbeta_selected = dbeta[_in_selected_set]
_dbeta_selected.to_csv(f"{trainOutPath}/{dbeta_path}_selected.csv", index=False)

In [None]:
# Load the train/test tables using the file names configured in [init.hyper]
# (training_set_path / test_set_path), as the SimpleModel section does.
# The previously hard-coded "all_beta_normalized_0.csv" does not match the
# file stored in the oversample training folder, so the cell broke whenever
# is_oversample was enabled.
train_df = pd.read_csv(f"{trainOutPath}/{training_set_path}.csv")
test_df = pd.read_csv(f"{testOutPath}/{test_set_path}.csv")

#### 5.2 Training

In [None]:
from utils.combination_helper import CombinationHelper

In [None]:
# Bundle the four inputs the combination helper is constructed from.
_helper_inputs = {
    "train_df": train_df,
    "test_df": test_df,
    "dbeta": dbeta,
    "gene_list": gene_list,
}
helper = CombinationHelper(**_helper_inputs)

In [None]:
# Run the helper's setup steps, in this order, before training.
helper.setup_dbeta()
helper.setup_train_test()
helper.setup_combinations()
helper.setup_estimators()
helper.setup_grids()
helper.setup_grid_estimator()
# NOTE(review): setup_grid_estimator() is called a second time here — this
# looks like an accidental duplicate (or a different setup_* step was
# intended); confirm against CombinationHelper before removing.
helper.setup_grid_estimator()


In [None]:
# Train with results written under the train/test output folders, using
# "combination" as the file name passed to the helper.
_train_options = dict(
    train_folder=trainOutPath,
    test_folder=testOutPath,
    filename="combination",
    discard_overfitting=True,
)
helper.train(**_train_options)