Necessary imports

In [None]:
# Standard-library and third-party dependencies.
import os, sys
import pandas as pd

# Append the directory two levels above the working directory to sys.path
# (presumably so the project-local `packages` package below can be imported).
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)

# Project-local modules; keep these after the sys.path adjustment above.
from packages.bicpy import bicpy
from packages.mylib import myproject
from packages.tcgahandler import LayerDataset
from packages.pydge import pydge
import statistics
import seaborn as sns

# Data directory handed to LayerDataset, relative to the working directory.
DATA_DIR = "../../data/"

Define necessary variables

In [2]:
# Human-readable labels for classifier and metric identifiers (cosmetic only).
rename_dict = dict(
    logistic_regression="Logistic Regression",
    random_forest="Random Forest",
    svm="Support Vector Machine",
    accuracy="Accuracy",
    recall="Recall",
    precision="Precision",
    f1_score="F1 Score",
)

# Biclustering parameterization.
# Defaults that are not tuned in this work.
default_param = dict(
    symmetries=False,
    normalization="column",
    discretization="normal_distribution",
    noise_relaxation="optional",
    filling_criteria="remove",
    pattern_type="constant",
    orientation="rows",
    # Original note: has no effect when only one iteration is run.
    # NOTE(review): params below sets nr_iterations to 10 - confirm the note still applies.
    remove_percentage=0.1,
    balancing=False,
    to_posthandle=True,
)
# Parameters that are relevant for this work.
params = dict(
    min_biclusters=10,
    min_columns=3,
    min_lift=1.2,
    nr_iterations=10,
    nr_labels=11,
)

# Evaluation settings.
# Classifiers to test (original note: DatasetEvaluator.available_classifiers()).
classifier_list = [
    'logistic_regression',
    'svm',
    'random_forest',
]
# Metrics used to score the classifiers.
metric_list = [
    'accuracy',
    'recall',
    'precision',
    'f1_score',
]
# Number of cross-validation folds.
n_folds = 10
# Balance and normalize the data before classification.
balancing = True
normalization = True

# Pattern-based transformation settings.
# Filtering step.
filtering = "q10"
filter_by = "lift"
# Distance.
distance = "norm_euclidean"

# Other settings.
pvalue = 1
target = "vital_status"

Function to train and test classifiers

In [3]:
def prediction_results(dataset, target, pvalue, classifier_list, metric_list, parameterization, normalization,
                       n_folds, filtering, filter_by, balancing, distance="euclidean", verbose=0):
    """Trains and tests classifiers in gene- and pattern-centric data, returns obtained results in defined metrics

    Parameters
    ----------
    dataset : tcgahandler.LayerDataset
        Abstraction for data corresponding to an omics layer of a given TCGA project
    target : str
        Target variable for the prediction task
    pvalue : float
        p-value to use for DEG Analysis, if equal to 1 doesn't perform it
    classifier_list : list of str
        Classifiers to train and test
    metric_list : list of str
        Classification metrics used to evaluate the classifiers
    parameterization : dict
        Parameterization for the biclustering algorithm
    normalization : bool
        Flag to normalize data before classification
    n_folds : int
        Number of folds to use in the cross-validation
    filtering : str
        Type of filtering to apply in the pattern-based transformation
    filter_by : str
        Metric to filter by in the pattern-based transformation
    balancing : bool
        Flag to balance data before classification
    distance : str
        Distance used to calculate new values of pattern-centric data
    verbose : int
        Verbose flag; forwarded to the evaluation helpers, and when non-zero the
        mean pattern-based and gene-based dimensionalities are printed

    Returns
    -------
    tuple(pandas.DataFrame, pandas.DataFrame)
        A DataFrame with the results for each combination of dataset, classifier, and metric
        (sorted by 'dataset' and 'Metric'), and a DataFrame with the average dimensionalities
        for each dataset

    """

    # get pd.DataFrame with omics data and target variable
    data = dataset.get_data_with_target(data_type='default', target=target)

    if dataset.layer != 'protein':
        # any non-protein layer (miRNA or mRNA in this notebook): filter on counts, log transform the data
        counts = dataset.get_data_with_target(data_type='counts', target=target)
        filtered_genes = pydge.deg_filtering(counts, target, filter_only=True)
        data = dataset.log_transform_data(data, target, data_type="default")
    else:
        # protein layer: no counts are fetched and every feature column is kept
        counts = None
        filtered_genes = data.drop(columns=[target]).columns.tolist()

    # list() makes the concatenation safe even if the filtering helper returns a
    # non-list sequence (e.g. a pandas Index), where `+` would not append.
    pattern_columns = list(filtered_genes) + [target]

    # pattern-based evaluation runs on the filtered features only,
    # gene-based evaluation receives the full data plus the raw counts
    pattern_results = myproject.transform_and_evaluate_validation(parameterization, data[pattern_columns], target,
                                                                  classifier_list, metric_list, normalization, n_folds,
                                                                  filtering, filter_by, balancing, distance,
                                                                  verbose=verbose)
    gene_results = myproject.evaluate_validation(data, counts, target, pvalue, classifier_list, metric_list,
                                                 normalization, n_folds, balancing, verbose=verbose)

    pattern_results['dataset'] = 'Pattern-based'
    gene_results['dataset'] = 'Gene-based'

    # each "shape" entry is indexed as (rows, columns); one column is subtracted
    # (presumably the target column - confirm against the myproject helpers)
    pattern_dimension = statistics.mean([shape[1] - 1 for shape in pattern_results["shape"]])
    gene_dimension = statistics.mean([shape[1] - 1 for shape in gene_results["shape"]])
    dimensionalities = pd.DataFrame({"dataset": ["Gene-based", "Pattern-based"],
                                     "dimensions": [gene_dimension, pattern_dimension]})
    if verbose:
        # diagnostic output only; silent when verbose is 0
        print(pattern_dimension, gene_dimension)

    results = pd.concat([pattern_results, gene_results], ignore_index=True)
    results = results.sort_values(by=['dataset', 'Metric'], ascending=True)
    return results, dimensionalities

## TCGA-LGG


#### miRNA


Obtain results

In [4]:
project, layer = "TCGA-LGG", "mirna"
# create LayerDataset for layer miRNA of project TCGA-LGG
dataset = LayerDataset(DATA_DIR, project, layer)
# set p-value to 1 so that there is no DEG filtering
pvalue = 1

# build the biclustering parameterization from the relevant and default parameters
parameterization = bicpy.add_default_parameterization(params, default_param)
# use the `distance` configured with the other transformation parameters
# instead of repeating the literal here
results, dimensionalities = prediction_results(dataset, target, pvalue, classifier_list, metric_list, parameterization,
                                normalization, n_folds, filtering, filter_by, balancing,
                                distance, verbose=1)
print(dimensionalities)

No file with values to replace. Continuing.
No file with values to replace. Continuing.
pre-transformation pattern data shape:  (510, 352)
Pattern dimensions:  [27, 23, 26, 30, 26, 23, 21, 29, 24, 28] Mean:  25.7
DGE dimensions:  [349, 352, 349, 351, 353, 349, 353, 353, 349, 351] Mean:  350.9
24.7 350.9
         dataset  dimensions
0     Gene-based       350.9
1  Pattern-based        24.7


Transform result table for display

In [5]:
# Aggregate Score into mean and std for every (dataset, classifier, Metric)
# group, round to three decimals and render each cell as "mean+-std".
results_show2 = (
    results
    .drop(columns=["shape"])
    .groupby(['dataset', 'classifier', 'Metric'], as_index=False)
    .agg(Score_mean=('Score', 'mean'), Score_std=('Score', 'std'))
    .round(decimals=3)
    .assign(Score=lambda stats: stats["Score_mean"].astype("str") + "+-" + stats["Score_std"].astype(str))
    .drop(columns=["Score_mean", "Score_std"])
    # one row per classifier, columns keyed by (Metric, dataset)
    .pivot(index=['classifier'], columns=['Metric', 'dataset'], values='Score')
)
# fix the order in which the metric column groups are shown
results_show2 = results_show2[['accuracy', 'recall', 'precision', 'f1_score']]
display(results_show2)

Metric,accuracy,accuracy,recall,recall,precision,precision,f1_score,f1_score
dataset,Gene-based,Pattern-based,Gene-based,Pattern-based,Gene-based,Pattern-based,Gene-based,Pattern-based
classifier,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
logistic_regression,0.702+-0.049,0.788+-0.029,0.609+-0.068,0.614+-0.038,0.604+-0.067,0.749+-0.089,0.603+-0.067,0.627+-0.055
random_forest,0.782+-0.05,0.775+-0.044,0.601+-0.062,0.602+-0.061,0.723+-0.181,0.723+-0.134,0.608+-0.09,0.61+-0.079
svm,0.769+-0.039,0.778+-0.038,0.566+-0.046,0.586+-0.057,0.701+-0.18,0.717+-0.166,0.559+-0.069,0.585+-0.085


#### protein


Obtain results

In [6]:
project, layer = "TCGA-LGG", "protein"
# create LayerDataset for layer protein of project TCGA-LGG
dataset = LayerDataset(DATA_DIR, project, layer)
# set p-value to 1 so that there is no DEG filtering
pvalue = 1

# build the biclustering parameterization from the relevant and default parameters
parameterization = bicpy.add_default_parameterization(params, default_param)
# use the `distance` configured with the other transformation parameters
# instead of repeating the literal here
results, dimensionalities = prediction_results(dataset, target, pvalue, classifier_list, metric_list, parameterization,
                                normalization, n_folds, filtering, filter_by, balancing,
                                distance, verbose=1)
print(dimensionalities)

No file with values to replace. Continuing.
pre-transformation pattern data shape:  (428, 457)
Pattern dimensions:  [19, 34, 30, 30, 34, 28, 30, 23, 24, 38] Mean:  29.0
28 456
         dataset  dimensions
0     Gene-based         456
1  Pattern-based          28


Transform result table for display

In [7]:
# Aggregate Score into mean and std for every (dataset, classifier, Metric)
# group, round to three decimals and render each cell as "mean+-std".
results_show2 = (
    results
    .drop(columns=["shape"])
    .groupby(['dataset', 'classifier', 'Metric'], as_index=False)
    .agg(Score_mean=('Score', 'mean'), Score_std=('Score', 'std'))
    .round(decimals=3)
    .assign(Score=lambda stats: stats["Score_mean"].astype("str") + "+-" + stats["Score_std"].astype(str))
    .drop(columns=["Score_mean", "Score_std"])
    # one row per classifier, columns keyed by (Metric, dataset)
    .pivot(index=['classifier'], columns=['Metric', 'dataset'], values='Score')
)
# fix the order in which the metric column groups are shown
results_show2 = results_show2[['accuracy', 'recall', 'precision', 'f1_score']]
display(results_show2)

Metric,accuracy,accuracy,recall,recall,precision,precision,f1_score,f1_score
dataset,Gene-based,Pattern-based,Gene-based,Pattern-based,Gene-based,Pattern-based,Gene-based,Pattern-based
classifier,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
logistic_regression,0.745+-0.044,0.755+-0.022,0.633+-0.074,0.562+-0.04,0.636+-0.063,0.619+-0.127,0.632+-0.069,0.557+-0.06
random_forest,0.781+-0.034,0.799+-0.027,0.579+-0.064,0.622+-0.038,0.691+-0.157,0.745+-0.08,0.578+-0.088,0.638+-0.051
svm,0.778+-0.031,0.799+-0.03,0.556+-0.063,0.612+-0.054,0.641+-0.215,0.723+-0.154,0.539+-0.097,0.622+-0.082


#### mRNA


Obtain results

In [8]:
project, layer = "TCGA-LGG", "mrna"
# create LayerDataset for layer mRNA of project TCGA-LGG
dataset = LayerDataset(DATA_DIR, project, layer)
# p-value of 0.05 is passed on for the DEG analysis (a value of 1 would skip it)
# NOTE(review): every other cell in this notebook uses pvalue = 1 - confirm 0.05 is intended here
pvalue = 0.05

# build the biclustering parameterization from the relevant and default parameters
parameterization = bicpy.add_default_parameterization(params, default_param)
# use the `distance` configured with the other transformation parameters
# instead of repeating the literal here
results, dimensionalities = prediction_results(dataset, target, pvalue, classifier_list, metric_list, parameterization,
                                normalization, n_folds, filtering, filter_by, balancing,
                                distance, verbose=1)
print(dimensionalities)

No file with values to replace. Continuing.
No file with values to replace. Continuing.
pre-transformation pattern data shape:  (514, 24027)
Pattern dimensions:  [19, 20, 26, 34, 29, 22, 24, 74, 23, 27] Mean:  29.8
DGE dimensions:  [12737, 11993, 11759, 12077, 12055, 12370, 12660, 11939, 12149, 11668] Mean:  12140.7
28.8 12140.7
         dataset  dimensions
0     Gene-based     12140.7
1  Pattern-based        28.8


Transform result table for display

In [9]:
# Aggregate Score into mean and std for every (dataset, classifier, Metric)
# group, round to three decimals and render each cell as "mean+-std".
results_show2 = (
    results
    .drop(columns=["shape"])
    .groupby(['dataset', 'classifier', 'Metric'], as_index=False)
    .agg(Score_mean=('Score', 'mean'), Score_std=('Score', 'std'))
    .round(decimals=3)
    .assign(Score=lambda stats: stats["Score_mean"].astype("str") + "+-" + stats["Score_std"].astype(str))
    .drop(columns=["Score_mean", "Score_std"])
    # one row per classifier, columns keyed by (Metric, dataset)
    .pivot(index=['classifier'], columns=['Metric', 'dataset'], values='Score')
)
# fix the order in which the metric column groups are shown
results_show2 = results_show2[['accuracy', 'recall', 'precision', 'f1_score']]
display(results_show2)

Metric,accuracy,accuracy,recall,recall,precision,precision,f1_score,f1_score
dataset,Gene-based,Pattern-based,Gene-based,Pattern-based,Gene-based,Pattern-based,Gene-based,Pattern-based
classifier,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
logistic_regression,0.776+-0.049,0.78+-0.035,0.758+-0.061,0.627+-0.054,0.72+-0.053,0.719+-0.095,0.727+-0.053,0.639+-0.064
random_forest,0.771+-0.041,0.794+-0.032,0.609+-0.052,0.627+-0.066,0.695+-0.091,0.769+-0.094,0.62+-0.064,0.637+-0.083
svm,0.782+-0.028,0.79+-0.025,0.612+-0.03,0.619+-0.046,0.742+-0.099,0.76+-0.091,0.625+-0.039,0.631+-0.056


## TCGA-COAD


#### miRNA


Obtain results

In [10]:
project, layer = "TCGA-COAD", "mirna"
# create LayerDataset for layer miRNA of project TCGA-COAD
dataset = LayerDataset(DATA_DIR, project, layer)
# set p-value to 1 so that there is no DEG filtering
pvalue = 1

# build the biclustering parameterization from the relevant and default parameters
parameterization = bicpy.add_default_parameterization(params, default_param)
# use the `distance` configured with the other transformation parameters
# instead of repeating the literal here
results, dimensionalities = prediction_results(dataset, target, pvalue, classifier_list, metric_list, parameterization,
                                normalization, n_folds, filtering, filter_by, balancing,
                                distance, verbose=1)
print(dimensionalities)

No file with values to replace. Continuing.
No file with values to replace. Continuing.
pre-transformation pattern data shape:  (442, 293)
Pattern dimensions:  [62, 49, 44, 62, 50, 48, 33, 42, 39, 35] Mean:  46.4
DGE dimensions:  [294, 294, 291, 291, 293, 290, 291, 291, 293, 291] Mean:  291.9
45.4 291.9
         dataset  dimensions
0     Gene-based       291.9
1  Pattern-based        45.4


Transform result table for display

In [11]:
# Aggregate Score into mean and std for every (dataset, classifier, Metric)
# group, round to three decimals and render each cell as "mean+-std".
results_show2 = (
    results
    .drop(columns=["shape"])
    .groupby(['dataset', 'classifier', 'Metric'], as_index=False)
    .agg(Score_mean=('Score', 'mean'), Score_std=('Score', 'std'))
    .round(decimals=3)
    .assign(Score=lambda stats: stats["Score_mean"].astype("str") + "+-" + stats["Score_std"].astype(str))
    .drop(columns=["Score_mean", "Score_std"])
    # one row per classifier, columns keyed by (Metric, dataset)
    .pivot(index=['classifier'], columns=['Metric', 'dataset'], values='Score')
)
# fix the order in which the metric column groups are shown
results_show2 = results_show2[['accuracy', 'recall', 'precision', 'f1_score']]
display(results_show2)

Metric,accuracy,accuracy,recall,recall,precision,precision,f1_score,f1_score
dataset,Gene-based,Pattern-based,Gene-based,Pattern-based,Gene-based,Pattern-based,Gene-based,Pattern-based
classifier,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
logistic_regression,0.679+-0.042,0.751+-0.035,0.542+-0.064,0.504+-0.03,0.537+-0.064,0.491+-0.123,0.537+-0.064,0.468+-0.046
random_forest,0.774+-0.014,0.774+-0.013,0.509+-0.022,0.512+-0.031,0.487+-0.215,0.471+-0.185,0.454+-0.041,0.46+-0.056
svm,0.772+-0.006,0.772+-0.006,0.5+-0.0,0.5+-0.0,0.386+-0.003,0.386+-0.003,0.436+-0.002,0.436+-0.002


#### protein


Obtain results

In [12]:
project, layer = "TCGA-COAD", "protein"
# create LayerDataset for layer protein of project TCGA-COAD
dataset = LayerDataset(DATA_DIR, project, layer)
# set p-value to 1 so that there is no DEG filtering
pvalue = 1

# build the biclustering parameterization from the relevant and default parameters
parameterization = bicpy.add_default_parameterization(params, default_param)
# use the `distance` configured with the other transformation parameters
# instead of repeating the literal here
results, dimensionalities = prediction_results(dataset, target, pvalue, classifier_list, metric_list, parameterization,
                                normalization, n_folds, filtering, filter_by, balancing,
                                distance, verbose=1)
print(dimensionalities)

No file with values to replace. Continuing.
pre-transformation pattern data shape:  (347, 456)
Pattern dimensions:  [24, 34, 26, 38, 25, 39, 26, 44, 30, 32] Mean:  31.8
30.8 455
         dataset  dimensions
0     Gene-based       455.0
1  Pattern-based        30.8


Transform result table for display

In [13]:
# Aggregate Score into mean and std for every (dataset, classifier, Metric)
# group, round to three decimals and render each cell as "mean+-std".
results_show2 = (
    results
    .drop(columns=["shape"])
    .groupby(['dataset', 'classifier', 'Metric'], as_index=False)
    .agg(Score_mean=('Score', 'mean'), Score_std=('Score', 'std'))
    .round(decimals=3)
    .assign(Score=lambda stats: stats["Score_mean"].astype("str") + "+-" + stats["Score_std"].astype(str))
    .drop(columns=["Score_mean", "Score_std"])
    # one row per classifier, columns keyed by (Metric, dataset)
    .pivot(index=['classifier'], columns=['Metric', 'dataset'], values='Score')
)
# fix the order in which the metric column groups are shown
results_show2 = results_show2[['accuracy', 'recall', 'precision', 'f1_score']]
display(results_show2)

Metric,accuracy,accuracy,recall,recall,precision,precision,f1_score,f1_score
dataset,Gene-based,Pattern-based,Gene-based,Pattern-based,Gene-based,Pattern-based,Gene-based,Pattern-based
classifier,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
logistic_regression,0.726+-0.066,0.784+-0.033,0.581+-0.076,0.528+-0.052,0.601+-0.088,0.621+-0.252,0.583+-0.076,0.501+-0.078
random_forest,0.769+-0.014,0.775+-0.031,0.493+-0.013,0.515+-0.035,0.389+-0.005,0.535+-0.213,0.435+-0.005,0.478+-0.059
svm,0.781+-0.013,0.781+-0.013,0.5+-0.0,0.5+-0.0,0.391+-0.006,0.391+-0.006,0.439+-0.004,0.439+-0.004


#### mRNA


Obtain results

In [14]:
project, layer = "TCGA-COAD", "mrna"
# create LayerDataset for layer mRNA of project TCGA-COAD
dataset = LayerDataset(DATA_DIR, project, layer)
# set p-value to 1 so that there is no DEG filtering
pvalue = 1

# build the biclustering parameterization from the relevant and default parameters
parameterization = bicpy.add_default_parameterization(params, default_param)
# use the `distance` configured with the other transformation parameters
# instead of repeating the literal here
results, dimensionalities = prediction_results(dataset, target, pvalue, classifier_list, metric_list, parameterization,
                                normalization, n_folds, filtering, filter_by, balancing,
                                distance, verbose=1)
print(dimensionalities)

No file with values to replace. Continuing.
No file with values to replace. Continuing.
pre-transformation pattern data shape:  (456, 21541)
Pattern dimensions:  [37, 40, 62, 42, 48, 32, 45, 34, 31, 43] Mean:  41.4
DGE dimensions:  [21490, 21546, 21537, 21529, 21613, 21632, 21466, 21451, 21424, 21536] Mean:  21522.4
40.4 21522.4
         dataset  dimensions
0     Gene-based     21522.4
1  Pattern-based        40.4


Transform result table for display

In [15]:
# Aggregate Score into mean and std for every (dataset, classifier, Metric)
# group, round to three decimals and render each cell as "mean+-std".
results_show2 = (
    results
    .drop(columns=["shape"])
    .groupby(['dataset', 'classifier', 'Metric'], as_index=False)
    .agg(Score_mean=('Score', 'mean'), Score_std=('Score', 'std'))
    .round(decimals=3)
    .assign(Score=lambda stats: stats["Score_mean"].astype("str") + "+-" + stats["Score_std"].astype(str))
    .drop(columns=["Score_mean", "Score_std"])
    # one row per classifier, columns keyed by (Metric, dataset)
    .pivot(index=['classifier'], columns=['Metric', 'dataset'], values='Score')
)
# fix the order in which the metric column groups are shown
results_show2 = results_show2[['accuracy', 'recall', 'precision', 'f1_score']]
display(results_show2)

Metric,accuracy,accuracy,recall,recall,precision,precision,f1_score,f1_score
dataset,Gene-based,Pattern-based,Gene-based,Pattern-based,Gene-based,Pattern-based,Gene-based,Pattern-based
classifier,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
logistic_regression,0.627+-0.061,0.759+-0.042,0.603+-0.055,0.519+-0.051,0.577+-0.045,0.581+-0.228,0.566+-0.056,0.494+-0.075
random_forest,0.759+-0.019,0.772+-0.011,0.495+-0.023,0.497+-0.006,0.437+-0.107,0.388+-0.004,0.447+-0.036,0.436+-0.003
svm,0.776+-0.008,0.776+-0.008,0.5+-0.0,0.5+-0.0,0.388+-0.004,0.388+-0.004,0.437+-0.003,0.437+-0.003
