In [None]:
import itertools
import os

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Load the full table once and drop exact duplicate rows; every task below works on a copy of it.
dataset = pd.read_csv("dataset.csv").drop_duplicates()
print(dataset)

In [None]:
def process_data(data, under_sample_factor=None, over_sample_factor=None):
    """Turn a labelled table into a stratified 80/20 train/test split.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with an ``ID_REF`` column of integer-convertible class labels;
        every other column is cast to float and used as a feature.
    under_sample_factor, over_sample_factor : number in (0, 1], optional
        ``sampling_strategy`` ratio for ``RandomUnderSampler`` /
        ``RandomOverSampler``. ``None`` or an out-of-range value disables the
        corresponding resampler.
        NOTE(review): imbalanced-learn accepts a float ratio for binary targets
        only - confirm before enabling resampling on the multi-class task.

    Returns
    -------
    x_train, x_test, y_train, y_test : numpy.ndarray
    feature_names : pandas.Index
        Names of the feature columns, in the column order of ``x_*``.

    Resampling is applied to the training split only. Resampling before the
    split would let duplicated (oversampled) rows land in both train and test
    and inflate the test accuracy.
    """
    feature_frame = data.drop(columns=["ID_REF"])
    x = np.array(feature_frame).astype('float')
    y = np.array(data["ID_REF"]).astype('int')
    # Derived from the same frame as x, so it stays correct wherever ID_REF sits.
    feature_names = feature_frame.columns

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0, stratify=y)

    def _is_usable(factor):
        # bool is an int subclass, so it is excluded explicitly.
        return isinstance(factor, (int, float)) and not isinstance(factor, bool) and 0 < factor <= 1

    if _is_usable(under_sample_factor):
        under_sampler = RandomUnderSampler(sampling_strategy=float(under_sample_factor), random_state=0)
        x_train, y_train = under_sampler.fit_resample(x_train, y_train)

    if _is_usable(over_sample_factor):
        over_sampler = RandomOverSampler(sampling_strategy=float(over_sample_factor), random_state=0)
        x_train, y_train = over_sampler.fit_resample(x_train, y_train)

    return x_train, x_test, y_train, y_test, feature_names

In [None]:
def support_vector_machine(x_train, y_train, feature_num):
    """Grid-search a SelectKBest -> linear SVC pipeline and return the fitted search.

    ``feature_num`` is the number of features kept by the ANOVA F-test
    selector. The regularisation strength ``C`` is tuned over four values with
    10-fold cross-validated accuracy.
    """
    selector = SelectKBest(f_classif, k=feature_num)
    classifier = SVC()
    svm_pipeline = Pipeline([('skb', selector), ('estimator', classifier)])

    search_space = dict(
        skb__k=[feature_num],
        estimator__C=[0.25, 0.5, 0.75, 1],
        estimator__kernel=['linear'],
    )

    svm_grid_search = GridSearchCV(svm_pipeline, search_space, scoring='accuracy', cv=10, n_jobs=-1)
    svm_grid_search.fit(x_train, y_train)
    return svm_grid_search

def get_svm_metrics(svm_grid_search, x_test, y_test):
    """Print the scores of a fitted SVM search; returns None.

    Prints ``best_score_`` (labelled "Training Accuracy"; for a GridSearchCV
    this is the mean cross-validated score of the winning setting) and
    ``best_params_``, followed by the accuracy and confusion matrix of the
    search's predictions on ``x_test``.
    """
    cv_accuracy, chosen_params = svm_grid_search.best_score_, svm_grid_search.best_params_
    print("Training Accuracy:", cv_accuracy)
    print("Best Parameters:", chosen_params)

    predictions = svm_grid_search.predict(x_test)
    test_confusion = confusion_matrix(y_test, predictions)
    print("Testing Accuracy:", accuracy_score(y_test, predictions))
    print("Confusion Matrix:")
    print(test_confusion)

def get_top_svm_features(svm_grid_search, feature_names, top_feature_num):
    """Rank the selected features by the magnitude of their linear-SVM weights.

    Parameters
    ----------
    svm_grid_search : fitted search whose ``best_estimator_`` is a pipeline
        with an ``'skb'`` selector step and a linear ``'estimator'`` step
        exposing ``coef_``.
    feature_names : pandas.Index
        Names of all input features, indexable by the selector's boolean mask.
    top_feature_num : int
        Maximum number of features to return.

    Returns
    -------
    (magnitudes, names) : two lists sorted by decreasing magnitude (ties are
        broken by name, descending), truncated to ``top_feature_num``.

    ``coef_`` has one row per pairwise classifier. The rows' absolute values
    are averaged so that every class pair contributes to the ranking. For a
    binary problem there is a single row, so this equals ``abs(coef_[0])``.
    """
    fitted_steps = svm_grid_search.best_estimator_.named_steps
    weights = np.atleast_2d(fitted_steps['estimator'].coef_)
    magnitudes = np.abs(weights).mean(axis=0)

    selected_mask = fitted_steps['skb'].get_support()
    selected_names = feature_names[selected_mask].tolist()

    ranked = sorted(zip(magnitudes, selected_names), reverse=True)[:top_feature_num]

    # Built by comprehension so an empty selection yields two empty lists.
    return [magnitude for magnitude, _ in ranked], [name for _, name in ranked]

In [None]:
def random_forest(x_train, y_train, feature_num):
    """Grid-search a SelectKBest -> RandomForest pipeline and return the fitted search.

    ``feature_num`` is the number of features kept by the ANOVA F-test
    selector. The forest size, split criterion and ``max_features`` rule are
    tuned with 10-fold cross-validated accuracy.
    """
    pipe = Pipeline([('skb', SelectKBest(f_classif, k=feature_num)), ('estimator', RandomForestClassifier(random_state=0))])

    pipe_parameters = {'skb__k': [feature_num],
                       'estimator__n_estimators': [100, 500],
                       # 'auto' is omitted: for classifiers it was an alias of 'sqrt',
                       # and scikit-learn 1.3 removed it, so those fits would fail.
                       'estimator__max_features': ['sqrt', 'log2'],
                       'estimator__criterion': ['gini', 'entropy']
                       }

    rf_grid_search = GridSearchCV(pipe, pipe_parameters, scoring='accuracy', cv=10, n_jobs=-1)
    rf_grid_search.fit(x_train, y_train)

    return rf_grid_search

def get_rf_metrics(rf_grid_search, x_test, y_test):
    """Print the scores of a fitted random-forest search; returns None.

    Prints ``best_score_`` (labelled "Training Accuracy"; for a GridSearchCV
    this is the mean cross-validated score of the winning setting) and
    ``best_params_``, followed by the accuracy and confusion matrix of the
    search's predictions on ``x_test``.
    """
    search_summary = (
        ("Training Accuracy:", rf_grid_search.best_score_),
        ("Best Parameters:", rf_grid_search.best_params_),
    )
    for caption, value in search_summary:
        print(caption, value)

    predicted_labels = rf_grid_search.predict(x_test)
    confusion = confusion_matrix(y_test, predicted_labels)
    held_out_accuracy = accuracy_score(y_test, predicted_labels)
    print("Testing Accuracy:", held_out_accuracy)
    print("Confusion Matrix:")
    print(confusion)

def get_top_rf_features(rf_grid_search, feature_names, top_feature_num):
    """Rank the selected features by random-forest importance.

    Reads ``feature_importances_`` from the best pipeline's ``'estimator'``
    step and the boolean selection mask from its ``'skb'`` step. Returns
    ``(importances, names)`` as two lists sorted by decreasing importance
    (ties broken by name, descending), truncated to ``top_feature_num``.
    """
    fitted_steps = rf_grid_search.best_estimator_.named_steps
    scores = fitted_steps['estimator'].feature_importances_
    kept_mask = fitted_steps['skb'].get_support()
    kept_names = feature_names[kept_mask].tolist()

    scored_names = list(zip(scores, kept_names))
    scored_names.sort(reverse=True)
    ordered_scores, ordered_names = zip(*scored_names)

    return list(ordered_scores)[:top_feature_num], list(ordered_names)[:top_feature_num]

In [None]:
def gradient_boosting(x_train, y_train, feature_num):
    """Grid-search a SelectKBest -> GradientBoosting pipeline and return the fitted search.

    ``feature_num`` is the number of features kept by the ANOVA F-test
    selector. The learning rate is tuned (50 boosting stages) with 10-fold
    cross-validated accuracy.
    """
    # random_state is fixed so repeated runs give the same model and feature
    # ranking, matching the seeded random-forest pipeline. k is set explicitly
    # like the sibling pipelines; the grid below pins the same value.
    pipe = Pipeline([('skb', SelectKBest(f_classif, k=feature_num)), ('estimator', GradientBoostingClassifier(random_state=0))])

    pipe_parameters = {'skb__k': [feature_num],
                       'estimator__learning_rate': [0.5, 1],
                       'estimator__n_estimators': [50],
                       }

    gb_grid_search = GridSearchCV(pipe, pipe_parameters, scoring='accuracy', cv=10, n_jobs=-1)
    gb_grid_search.fit(x_train, y_train)

    return gb_grid_search

def get_gb_metrics(gb_grid_search, x_test, y_test):
    """Print the scores of a fitted gradient-boosting search; returns None.

    Prints ``best_score_`` (labelled "Training Accuracy"; for a GridSearchCV
    this is the mean cross-validated score of the winning setting) and
    ``best_params_``, followed by the accuracy and confusion matrix of the
    search's predictions on ``x_test``.
    """
    mean_cv_score = gb_grid_search.best_score_
    winning_params = gb_grid_search.best_params_
    print("Training Accuracy:", mean_cv_score)
    print("Best Parameters:", winning_params)

    test_predictions = gb_grid_search.predict(x_test)
    outcome_table = confusion_matrix(y_test, test_predictions)
    print("Testing Accuracy:", accuracy_score(y_test, test_predictions))
    print("Confusion Matrix:", outcome_table, sep="\n")

def get_top_gb_features(gb_grid_search, feature_names, top_feature_num):
    """Rank the selected features by gradient-boosting importance.

    Reads ``feature_importances_`` from the best pipeline's ``'estimator'``
    step and the boolean selection mask from its ``'skb'`` step. Returns
    ``(importances, names)`` as two lists sorted by decreasing importance
    (ties broken by name, descending), truncated to ``top_feature_num``.
    """
    best_pipeline = gb_grid_search.best_estimator_
    importances = best_pipeline.named_steps['estimator'].feature_importances_
    selection_mask = best_pipeline.named_steps['skb'].get_support()
    selected_names = feature_names[selection_mask].tolist()

    ranking = sorted(zip(importances, selected_names), reverse=True)
    ranked_importances, ranked_names = map(list, zip(*ranking))

    return ranked_importances[:top_feature_num], ranked_names[:top_feature_num]

In [None]:
def get_top_features(svm_top_features, rf_top_features, gb_top_features, top_feature_num):
    """Merge three ranked feature lists into one consensus ranking.

    Each argument is a sequence of hashable feature names ordered best-first.
    A feature's consensus score is the mean of its 0-based positions in the
    three lists (lower is better). A feature missing from a list is scored as
    if it came right after that list's last entry, rather than raising.

    Returns a tuple with the ``top_feature_num`` best features, ordered by
    (score, name) ascending. Empty input yields an empty tuple.
    """
    ranked_lists = (svm_top_features, rf_top_features, gb_top_features)

    # One position lookup per list replaces repeated list.index scans.
    # setdefault keeps the first occurrence, matching list.index on duplicates.
    lookups = []
    for ranked in ranked_lists:
        positions = {}
        for position, feature in enumerate(ranked):
            positions.setdefault(feature, position)
        lookups.append((positions, len(ranked)))

    # dict.fromkeys removes duplicates while keeping first-seen order.
    candidates = dict.fromkeys([*svm_top_features, *rf_top_features, *gb_top_features])

    scored = sorted(
        (sum(positions.get(feature, fallback) for positions, fallback in lookups) / 3, feature)
        for feature in candidates
    )

    return tuple(feature for _, feature in scored[:top_feature_num])

In [None]:
# Number of features kept by SelectKBest in every pipeline (passed as `feature_num`).
feature_selection_num = 2565
# Number of consensus features kept per binary task by get_top_features.
feature_importance_num = 10

# Resampling ratios forwarded to process_data; None disables the corresponding resampler.
under_sample = None
over_sample = None

Lung Cancer Dataset

In [None]:
# Binary task: "Lung Cancer" -> 1, "No Cancer" -> 0; rows with any other label are dropped.
lung_cancer_dataset = dataset.copy()

for label_name, label_code in (("Lung Cancer", 1), ("No Cancer", 0)):
    lung_cancer_dataset.loc[lung_cancer_dataset["ID_REF"] == label_name, "ID_REF"] = label_code
lung_cancer_dataset = lung_cancer_dataset[lung_cancer_dataset["ID_REF"].isin([0, 1])]

# Count each class once and reuse the counts in the report below.
lung_label_counts = lung_cancer_dataset["ID_REF"].value_counts()

print(lung_cancer_dataset, "\n")
print("Lung Cancer Sample Number:", lung_label_counts[1])
print("Non-Cancer Sample Number:", lung_label_counts[0])
print("All Sample Number:", lung_label_counts[0] + lung_label_counts[1], "\n")

x_train_lung, x_test_lung, y_train_lung, y_test_lung, feature_names_lung = process_data(
    lung_cancer_dataset,
    under_sample_factor=under_sample,
    over_sample_factor=over_sample,
)

In [None]:
# Grid-search the linear-SVM pipeline on the lung training split.
svm_grid_search_lung = support_vector_machine(x_train_lung, y_train_lung, feature_selection_num)

In [None]:
# Report the SVM scores, then rank the selected features by coefficient magnitude.
# feature_selection_num is the cut-off, so the full ranking is kept for the consensus step.
get_svm_metrics(svm_grid_search_lung, x_test_lung, y_test_lung)
svm_top_coef_lung, svm_top_features_lung = get_top_svm_features(svm_grid_search_lung, feature_names_lung, feature_selection_num)
print(svm_top_features_lung)
print(svm_top_coef_lung)

In [None]:
# Grid-search the random-forest pipeline on the lung training split.
rf_grid_search_lung = random_forest(x_train_lung, y_train_lung, feature_selection_num)

In [None]:
# Report the random-forest scores, then rank the selected features by importance (full ranking).
get_rf_metrics(rf_grid_search_lung, x_test_lung, y_test_lung)
rf_top_importance_lung, rf_top_features_lung = get_top_rf_features(rf_grid_search_lung, feature_names_lung, feature_selection_num)
print(rf_top_features_lung)
print(rf_top_importance_lung)

In [None]:
# Grid-search the gradient-boosting pipeline on the lung training split.
gb_grid_search_lung = gradient_boosting(x_train_lung, y_train_lung, feature_selection_num)

In [None]:
# Report the gradient-boosting scores, then rank the selected features by importance (full ranking).
get_gb_metrics(gb_grid_search_lung, x_test_lung, y_test_lung)
gb_top_importance_lung, gb_top_features_lung = get_top_gb_features(gb_grid_search_lung, feature_names_lung, feature_selection_num)
print(gb_top_features_lung)
print(gb_top_importance_lung)

In [None]:
# Combine the three model rankings and keep the feature_importance_num best consensus features.
top_features_lung = get_top_features(svm_top_features_lung, rf_top_features_lung, gb_top_features_lung, feature_importance_num)
print(top_features_lung, len(top_features_lung))

Colorectal Cancer Dataset

In [None]:
# Binary task: "Colorectal Cancer" -> 1, "No Cancer" -> 0; rows with any other label are dropped.
colorectal_cancer_dataset = dataset.copy()

for label_name, label_code in (("Colorectal Cancer", 1), ("No Cancer", 0)):
    colorectal_cancer_dataset.loc[colorectal_cancer_dataset["ID_REF"] == label_name, "ID_REF"] = label_code
colorectal_cancer_dataset = colorectal_cancer_dataset[colorectal_cancer_dataset["ID_REF"].isin([0, 1])]

# Count each class once and reuse the counts in the report below.
colorectal_label_counts = colorectal_cancer_dataset["ID_REF"].value_counts()

print(colorectal_cancer_dataset, "\n")
print("Colorectal Cancer Sample Number:", colorectal_label_counts[1])
print("Non-Cancer Sample Number:", colorectal_label_counts[0])
print("All Sample Number:", colorectal_label_counts[0] + colorectal_label_counts[1], "\n")

x_train_colorectal, x_test_colorectal, y_train_colorectal, y_test_colorectal, feature_names_colorectal = process_data(
    colorectal_cancer_dataset,
    under_sample_factor=under_sample,
    over_sample_factor=over_sample,
)

In [None]:
# Grid-search the linear-SVM pipeline on the colorectal training split.
svm_grid_search_colorectal = support_vector_machine(x_train_colorectal, y_train_colorectal, feature_selection_num)

In [None]:
# Report the SVM scores, then rank the selected features by coefficient magnitude (full ranking).
get_svm_metrics(svm_grid_search_colorectal, x_test_colorectal, y_test_colorectal)
svm_top_coef_colorectal, svm_top_features_colorectal = get_top_svm_features(svm_grid_search_colorectal, feature_names_colorectal, feature_selection_num)
print(svm_top_features_colorectal)
print(svm_top_coef_colorectal)

In [None]:
# Grid-search the random-forest pipeline on the colorectal training split.
rf_grid_search_colorectal = random_forest(x_train_colorectal, y_train_colorectal, feature_selection_num)

In [None]:
# Report the random-forest scores, then rank the selected features by importance (full ranking).
get_rf_metrics(rf_grid_search_colorectal, x_test_colorectal, y_test_colorectal)
rf_top_importance_colorectal, rf_top_features_colorectal = get_top_rf_features(rf_grid_search_colorectal, feature_names_colorectal, feature_selection_num)
print(rf_top_features_colorectal)
print(rf_top_importance_colorectal)

In [None]:
# Grid-search the gradient-boosting pipeline on the colorectal training split.
gb_grid_search_colorectal = gradient_boosting(x_train_colorectal, y_train_colorectal, feature_selection_num)

In [None]:
# Report the gradient-boosting scores, then rank the selected features by importance (full ranking).
get_gb_metrics(gb_grid_search_colorectal, x_test_colorectal, y_test_colorectal)
gb_top_importance_colorectal, gb_top_features_colorectal = get_top_gb_features(gb_grid_search_colorectal, feature_names_colorectal, feature_selection_num)
print(gb_top_features_colorectal)
print(gb_top_importance_colorectal)

In [None]:
# Combine the three model rankings and keep the feature_importance_num best consensus features.
top_features_colorectal = get_top_features(svm_top_features_colorectal, rf_top_features_colorectal, gb_top_features_colorectal, feature_importance_num)
print(top_features_colorectal, len(top_features_colorectal))

Gastric Cancer Dataset

In [None]:
# Binary task: "Gastric Cancer" -> 1, "No Cancer" -> 0; rows with any other label are dropped.
gastric_cancer_dataset = dataset.copy()

for label_name, label_code in (("Gastric Cancer", 1), ("No Cancer", 0)):
    gastric_cancer_dataset.loc[gastric_cancer_dataset["ID_REF"] == label_name, "ID_REF"] = label_code
gastric_cancer_dataset = gastric_cancer_dataset[gastric_cancer_dataset["ID_REF"].isin([0, 1])]

# Count each class once and reuse the counts in the report below.
gastric_label_counts = gastric_cancer_dataset["ID_REF"].value_counts()

print(gastric_cancer_dataset, "\n")
print("Gastric Cancer Sample Number:", gastric_label_counts[1])
print("Non-Cancer Sample Number:", gastric_label_counts[0])
print("All Sample Number:", gastric_label_counts[0] + gastric_label_counts[1], "\n")

x_train_gastric, x_test_gastric, y_train_gastric, y_test_gastric, feature_names_gastric = process_data(
    gastric_cancer_dataset,
    under_sample_factor=under_sample,
    over_sample_factor=over_sample,
)

In [None]:
# Grid-search the linear-SVM pipeline on the gastric training split.
svm_grid_search_gastric = support_vector_machine(x_train_gastric, y_train_gastric, feature_selection_num)

In [None]:
# Report the SVM scores, then rank the selected features by coefficient magnitude (full ranking).
get_svm_metrics(svm_grid_search_gastric, x_test_gastric, y_test_gastric)
svm_top_coef_gastric, svm_top_features_gastric = get_top_svm_features(svm_grid_search_gastric, feature_names_gastric, feature_selection_num)
print(svm_top_features_gastric)
print(svm_top_coef_gastric)

In [None]:
# Grid-search the random-forest pipeline on the gastric training split.
rf_grid_search_gastric = random_forest(x_train_gastric, y_train_gastric, feature_selection_num)

In [None]:
# Report the random-forest scores, then rank the selected features by importance (full ranking).
get_rf_metrics(rf_grid_search_gastric, x_test_gastric, y_test_gastric)
rf_top_importance_gastric, rf_top_features_gastric = get_top_rf_features(rf_grid_search_gastric, feature_names_gastric, feature_selection_num)
print(rf_top_features_gastric)
print(rf_top_importance_gastric)

In [None]:
# Grid-search the gradient-boosting pipeline on the gastric training split.
gb_grid_search_gastric = gradient_boosting(x_train_gastric, y_train_gastric, feature_selection_num)

In [None]:
# Report the gradient-boosting scores, then rank the selected features by importance (full ranking).
get_gb_metrics(gb_grid_search_gastric, x_test_gastric, y_test_gastric)
gb_top_importance_gastric, gb_top_features_gastric = get_top_gb_features(gb_grid_search_gastric, feature_names_gastric, feature_selection_num)
print(gb_top_features_gastric)
print(gb_top_importance_gastric)

In [None]:
# Combine the three model rankings and keep the feature_importance_num best consensus features.
top_features_gastric = get_top_features(svm_top_features_gastric, rf_top_features_gastric, gb_top_features_gastric, feature_importance_num)
print(top_features_gastric, len(top_features_gastric))

Prostate Cancer Dataset

In [None]:
# Binary task: "Prostate Cancer" -> 1, "No Cancer" -> 0; rows with any other label are dropped.
prostate_cancer_dataset = dataset.copy()

for label_name, label_code in (("Prostate Cancer", 1), ("No Cancer", 0)):
    prostate_cancer_dataset.loc[prostate_cancer_dataset["ID_REF"] == label_name, "ID_REF"] = label_code
prostate_cancer_dataset = prostate_cancer_dataset[prostate_cancer_dataset["ID_REF"].isin([0, 1])]

# Count each class once and reuse the counts in the report below.
prostate_label_counts = prostate_cancer_dataset["ID_REF"].value_counts()

print(prostate_cancer_dataset, "\n")
print("Prostate Cancer Sample Number:", prostate_label_counts[1])
print("Non-Cancer Sample Number:", prostate_label_counts[0])
print("All Sample Number:", prostate_label_counts[0] + prostate_label_counts[1], "\n")

x_train_prostate, x_test_prostate, y_train_prostate, y_test_prostate, feature_names_prostate = process_data(
    prostate_cancer_dataset,
    under_sample_factor=under_sample,
    over_sample_factor=over_sample,
)

In [None]:
# Grid-search the linear-SVM pipeline on the prostate training split.
svm_grid_search_prostate = support_vector_machine(x_train_prostate, y_train_prostate, feature_selection_num)

In [None]:
# Report the SVM scores, then rank the selected features by coefficient magnitude (full ranking).
get_svm_metrics(svm_grid_search_prostate, x_test_prostate, y_test_prostate)
svm_top_coef_prostate, svm_top_features_prostate = get_top_svm_features(svm_grid_search_prostate, feature_names_prostate, feature_selection_num)
print(svm_top_features_prostate)
print(svm_top_coef_prostate)

In [None]:
# Grid-search the random-forest pipeline on the prostate training split.
rf_grid_search_prostate = random_forest(x_train_prostate, y_train_prostate, feature_selection_num)

In [None]:
# Report the random-forest scores, then rank the selected features by importance (full ranking).
get_rf_metrics(rf_grid_search_prostate, x_test_prostate, y_test_prostate)
rf_top_importance_prostate, rf_top_features_prostate = get_top_rf_features(rf_grid_search_prostate, feature_names_prostate, feature_selection_num)
print(rf_top_features_prostate)
print(rf_top_importance_prostate)

In [None]:
# Grid-search the gradient-boosting pipeline on the prostate training split.
gb_grid_search_prostate = gradient_boosting(x_train_prostate, y_train_prostate, feature_selection_num)

In [None]:
# Report the gradient-boosting scores, then rank the selected features by importance (full ranking).
get_gb_metrics(gb_grid_search_prostate, x_test_prostate, y_test_prostate)
gb_top_importance_prostate, gb_top_features_prostate = get_top_gb_features(gb_grid_search_prostate, feature_names_prostate, feature_selection_num)
print(gb_top_features_prostate)
print(gb_top_importance_prostate)

In [None]:
# Combine the three model rankings and keep the feature_importance_num best consensus features.
top_features_prostate = get_top_features(svm_top_features_prostate, rf_top_features_prostate, gb_top_features_prostate, feature_importance_num)
print(top_features_prostate, len(top_features_prostate))

Multi-Class

In [None]:
# Multi-class task: the four cancer types are coded 1-4; rows with any other label are dropped.
multi_class_dataset = dataset.copy()

multi_class_codes = (
    ("Lung Cancer", 1),
    ("Colorectal Cancer", 2),
    ("Gastric Cancer", 3),
    ("Prostate Cancer", 4),
)
for label_name, label_code in multi_class_codes:
    multi_class_dataset.loc[multi_class_dataset["ID_REF"] == label_name, "ID_REF"] = label_code
multi_class_dataset = multi_class_dataset[multi_class_dataset["ID_REF"].isin([1, 2, 3, 4])]

# Count each class once and reuse the counts in the report below.
multi_label_counts = multi_class_dataset["ID_REF"].value_counts()

print(multi_class_dataset, "\n")
print("Lung Cancer Sample Number:", multi_label_counts[1])
print("Colorectal Cancer Sample Number:", multi_label_counts[2])
print("Gastric Cancer Sample Number:", multi_label_counts[3])
print("Prostate Cancer Sample Number:", multi_label_counts[4])
print("All Sample Number:", multi_label_counts[1] + multi_label_counts[2] + multi_label_counts[3] + multi_label_counts[4], "\n")

x_train_multi, x_test_multi, y_train_multi, y_test_multi, feature_names_multi = process_data(
    multi_class_dataset,
    under_sample_factor=under_sample,
    over_sample_factor=over_sample,
)

In [None]:
# Grid-search the linear-SVM pipeline on the multi-class training split.
svm_grid_search_multi = support_vector_machine(x_train_multi, y_train_multi, feature_selection_num)

In [None]:
# Report the multi-class SVM scores and keep the feature_importance_num highest-ranked features.
# The cut-off comes from the config cell instead of a hard-coded 10.
get_svm_metrics(svm_grid_search_multi, x_test_multi, y_test_multi)
svm_top_coef_multi, svm_top_features_multi = get_top_svm_features(svm_grid_search_multi, feature_names_multi, feature_importance_num)
print(svm_top_features_multi)
print(svm_top_coef_multi)

In [None]:
# Grid-search the random-forest pipeline on the multi-class training split.
rf_grid_search_multi = random_forest(x_train_multi, y_train_multi, feature_selection_num)

In [None]:
# Report the multi-class random-forest scores and keep the feature_importance_num most important features.
# The cut-off comes from the config cell instead of a hard-coded 10.
get_rf_metrics(rf_grid_search_multi, x_test_multi, y_test_multi)
rf_top_importance_multi, rf_top_features_multi = get_top_rf_features(rf_grid_search_multi, feature_names_multi, feature_importance_num)
print(rf_top_features_multi)
print(rf_top_importance_multi)

In [None]:
# Grid-search the gradient-boosting pipeline on the multi-class training split.
gb_grid_search_multi = gradient_boosting(x_train_multi, y_train_multi, feature_selection_num)

In [None]:
# Report the multi-class gradient-boosting scores and keep the feature_importance_num most important features.
# The cut-off comes from the config cell instead of a hard-coded 10.
get_gb_metrics(gb_grid_search_multi, x_test_multi, y_test_multi)
gb_top_importance_multi, gb_top_features_multi = get_top_gb_features(gb_grid_search_multi, feature_names_multi, feature_importance_num)
print(gb_top_features_multi)
print(gb_top_importance_multi)

In [None]:
# The multi-class task uses the gradient-boosting ranking directly; no consensus ranking is built here.
top_features_multi = gb_top_features_multi
print(top_features_multi, len(top_features_multi))

Analysis

In [None]:
def create_network(top_features_list, all_features_list, correlation_threshold_factor, cancer_dataset):
    """Build a feature-correlation graph for one binary cancer task.

    Parameters
    ----------
    top_features_list : sequence of column names that become the graph nodes.
    all_features_list : sequence supporting ``.count``; a node's ``sides``
        attribute is the number of times its feature occurs here, plus one.
    correlation_threshold_factor : float
        Minimum absolute Pearson correlation (computed on the cancer rows,
        ``ID_REF == 1``) for an edge to be added.
    cancer_dataset : pandas.DataFrame with ``ID_REF`` coded 1 (cancer) / 0 (control).

    Returns
    -------
    networkx.Graph
        Edges carry ``weight`` (absolute correlation) and ``sign`` (1 if the
        correlation is positive, else 0). Nodes carry ``sides`` and
        ``comparison`` (1 if the cancer mean is >= the control mean, 0 if it
        is lower, 0.5 if the means cannot be compared).

    Also prints the nodes and their degrees, highest degree first.
    """
    cancer_subset = cancer_dataset[cancer_dataset["ID_REF"] == 1]
    control_subset = cancer_dataset[cancer_dataset["ID_REF"] == 0]

    edges = []
    for node1, node2 in itertools.combinations(top_features_list, 2):
        correlation = cancer_subset[node1].corr(cancer_subset[node2], method="pearson")
        # A NaN correlation fails this comparison, so no edge is added for it.
        if abs(correlation) >= correlation_threshold_factor:
            edges.append((node1, node2, {'weight': abs(correlation), 'sign': 1 if correlation > 0 else 0}))

    nodes = []
    for feature in top_features_list:
        # Each group mean is computed once per feature.
        cancer_mean = cancer_subset[feature].mean()
        control_mean = control_subset[feature].mean()
        if cancer_mean >= control_mean:
            comparison = 1
        elif cancer_mean < control_mean:
            comparison = 0
        else:
            # Both comparisons are False only when a mean is NaN.
            comparison = 0.5
        nodes.append((feature, {'sides': all_features_list.count(feature) + 1, "comparison": comparison}))

    graph = nx.Graph()
    graph.add_nodes_from(nodes)
    graph.add_edges_from(edges)

    degrees = dict(graph.degree)

    # Built without zip(*...) unpacking so an empty graph prints two empty tuples instead of raising.
    ranked_degrees = sorted(zip(degrees.values(), degrees.keys()), reverse=True)
    network_degrees_nodes = tuple(node for _, node in ranked_degrees)
    network_degrees_values = tuple(value for value, _ in ranked_degrees)
    print(network_degrees_nodes)
    print(network_degrees_values)

    return graph

In [None]:
def create_bar_charts(top_features_list, full_dataset, path_name):
    """Save one bar chart per feature comparing the five diagnostic groups.

    Parameters
    ----------
    top_features_list : iterable of column names to plot.
    full_dataset : pandas.DataFrame whose ``ID_REF`` column holds the original
        string labels ("Lung Cancer", ..., "No Cancer").
    path_name : str
        Sub-directory of ``Bar Charts/`` that receives the images; it is
        created if missing.

    Each chart shows the group means with error bars of two standard errors
    and is written to ``Bar Charts/<path_name>/<feature_name>``. Returns None.
    The global font size is set to 18, and matplotlib's interactive mode is
    restored to its previous state on exit.
    """
    labels = ['Lung', 'Colorectal', 'Gastric', 'Prostate', 'No Cancer']
    group_names = ["Lung Cancer", "Colorectal Cancer", "Gastric Cancer", "Prostate Cancer", "No Cancer"]
    cancer_dataset = [full_dataset[full_dataset["ID_REF"] == group_name] for group_name in group_names]

    # savefig does not create missing directories.
    output_dir = "Bar Charts/" + path_name
    os.makedirs(output_dir, exist_ok=True)

    was_interactive = plt.isinteractive()
    plt.ioff()
    try:
        plt.rcParams.update({'font.size': 18})

        for feature_name in top_features_list:
            fig, ax = plt.subplots(figsize=(9, 6))

            means = [data[feature_name].mean() for data in cancer_dataset]
            errors = [data[feature_name].sem() * 2 for data in cancer_dataset]
            ax.bar(labels, means, yerr=errors, error_kw={'elinewidth': 10, 'ecolor': 'k'}, capsize=15)

            ax.set_title(feature_name)
            ax.set_ylabel('Signal Value')
            fig.savefig(output_dir + "/" + feature_name, dpi=200, bbox_inches='tight')
            # Close each figure once saved; otherwise one figure per feature stays open.
            plt.close(fig)
    finally:
        if was_interactive:
            plt.ion()

In [None]:
# Pool the consensus features of the four binary tasks. Duplicates are kept in `all_features`
# (create_network counts occurrences) and removed, order-preserving, in `single_features`.
all_features = []
all_features.extend(top_features_lung)
all_features.extend(top_features_colorectal)
all_features.extend(top_features_gastric)
all_features.extend(top_features_prostate)
single_features = list(dict.fromkeys(all_features))

# Minimum absolute Pearson correlation for an edge in the networks below.
correlation_threshold = 0.5

In [None]:
# Build the lung correlation network over its consensus features and export it as GEXF.
graph_lung = create_network(top_features_lung, all_features, correlation_threshold, lung_cancer_dataset)
nx.write_gexf(graph_lung, "lung_new.gexf")

In [None]:
# Build the colorectal correlation network over its consensus features and export it as GEXF.
graph_colorectal = create_network(top_features_colorectal, all_features, correlation_threshold, colorectal_cancer_dataset)
nx.write_gexf(graph_colorectal, "colorectal_new.gexf")

In [None]:
# Build the gastric correlation network over its consensus features and export it as GEXF.
graph_gastric = create_network(top_features_gastric, all_features, correlation_threshold, gastric_cancer_dataset)
nx.write_gexf(graph_gastric, "gastric_new.gexf")

In [None]:
# Build the prostate correlation network over its consensus features and export it as GEXF.
graph_prostate = create_network(top_features_prostate, all_features, correlation_threshold, prostate_cancer_dataset)
nx.write_gexf(graph_prostate, "prostate_new.gexf")

In [None]:
# Save per-feature bar charts for each task; charts use the unfiltered `dataset`,
# so every chart compares all five diagnostic groups.
create_bar_charts(top_features_lung, dataset, "Lung")
create_bar_charts(top_features_colorectal, dataset, "Colorectal")
create_bar_charts(top_features_gastric, dataset, "Gastric")
create_bar_charts(top_features_prostate, dataset, "Prostate")
create_bar_charts(top_features_multi, dataset, "Multi")