Import necessary packages

In [2]:
import pandas as pd
import scipy
import knnie
import numpy as np
import dcor
from scipy import stats
from scipy.stats import entropy
import seaborn as sns
import math

Subsetting function to identify k-cases of genes

In [3]:
def subsetting(k_val, gene_index, metabolite_index, model):
    """Return the gene names selected by the requested k-case.

    Parameters
    ----------
    k_val : int
        Case selector:
        -1 returns ``gene_index`` unchanged;
        0 returns the names of genes in ``model.genes`` that are also in
        ``gene_index``;
        1 returns the names of genes in ``gene_index`` that have at least one
        reaction involving a metabolite whose base id is in
        ``metabolite_index``.
    gene_index : container
        Gene names to consider (only ``in`` tests are performed on it).
    metabolite_index : container
        Metabolite base ids to consider (only used when ``k_val == 1``).
    model : object
        Must expose ``genes`` (items with ``name`` and ``reactions``) and
        ``metabolites`` (items with ``id``); each reaction must expose a
        ``metabolites`` mapping.  Presumably a COBRApy model -- confirm
        against the caller.

    Returns
    -------
    ``gene_index`` itself for ``k_val == -1``, otherwise a list of gene names
    in ``model.genes`` order.

    Raises
    ------
    ValueError
        If ``k_val`` is not -1, 0 or 1.
    """
    if k_val not in (-1, 0, 1):
        raise ValueError("K_val is incorrect")

    if k_val == -1:
        return gene_index

    if k_val == 0:
        return [gene.name for gene in model.genes if gene.name in gene_index]

    def _base_id(met):
        # Metabolite id with its last "_"-separated token removed.
        return "_".join(met.id.split("_")[:-1])

    # Unique base ids present in both the model and metabolite_index.
    wanted_mets = set()
    for met in model.metabolites:
        base = _base_id(met)
        if base not in wanted_mets and base in metabolite_index:
            wanted_mets.add(base)
    print(len(wanted_mets))

    selected = []
    for gene in model.genes:
        name = gene.name
        if name not in gene_index or len(gene.reactions) == 0:
            continue
        if name in selected:
            # Each gene name is reported at most once.
            continue
        touches_wanted = any(
            _base_id(met) in wanted_mets
            for rxn in gene.reactions
            for met in rxn.metabolites.keys()
        )
        if touches_wanted:
            selected.append(name)
    return selected


Average duplicate genes

In [4]:
def averageDuplicates(dataframe):
    """Average rows that share the same index label.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table whose index holds gene identifiers; duplicated labels are
        collapsed into a single row.

    Returns
    -------
    pandas.DataFrame
        One row per distinct index label containing the column-wise mean of
        the rows that carried that label.  The result index is named
        ``"Gene"``.

    Notes
    -----
    The input frame is left untouched: the helper ``"Gene"`` column is added
    to a copy (via ``assign``) rather than written into the caller's object.
    """
    keyed = dataframe.assign(Gene=dataframe.index)
    return keyed.groupby("Gene").mean()

Calculate distance correlation metric between two vectors

In [6]:
def calcOverlap(x, y, metric):
    """Compute the overlap score between two tables using ``metric``.

    The two inputs are oriented so that they have the same number of rows
    before being handed to ``dcor.distance_correlation``; whichever input
    needs it is transposed.

    Parameters
    ----------
    x, y : pandas.DataFrame
        Tables to compare.  When their row counts differ, ``.columns`` is
        consulted to find a shared dimension.
    metric : str
        Only ``"DC"`` (distance correlation) is supported.

    Returns
    -------
    The value returned by ``dcor.distance_correlation``.

    Raises
    ------
    ValueError
        If ``metric`` is not supported (previously this silently returned
        ``None``), or if ``x`` and ``y`` share no dimension.
    """
    if metric != "DC":
        # Fail loudly instead of returning None, which would otherwise end up
        # in downstream result tables unnoticed.
        raise ValueError("Unsupported metric: {!r}".format(metric))

    # To recreate Székely et al paper use distance_correlation_sqr rather than distance_correlation
    n_x, n_y = len(x), len(y)
    if n_x == n_y:
        return dcor.distance_correlation(x, y)
    if len(x.columns) == len(y.columns):
        return dcor.distance_correlation(x.transpose(), y.transpose())
    if n_x == len(y.columns):
        return dcor.distance_correlation(x, y.transpose())
    if n_y == len(x.columns):
        return dcor.distance_correlation(x.transpose(), y)
    raise ValueError("At least one dimension must be identical")

Permutation of samples

In [None]:
def permuteDataframe(x, y, n_permutations, metric):
    """Score ``x`` against ``y`` after randomly shuffling the columns of ``x``.

    For each permutation the columns of ``x`` are reordered at random, the
    result is transposed, and ``calcOverlap`` is evaluated against ``y``.

    Parameters
    ----------
    x : pandas.DataFrame
        Table whose columns are shuffled.
    y : pandas.DataFrame
        Table passed unchanged to ``calcOverlap``.
    n_permutations : int
        Controls the number of shuffles; ``n_permutations - 1`` are performed.
    metric : str
        Forwarded to ``calcOverlap``.

    Returns
    -------
    pandas.DataFrame
        One row per permutation holding the computed coefficient.  Empty when
        no permutation is performed (previously this case raised
        ``UnboundLocalError`` because the frame was only created inside the
        loop).
    """
    coeffs = []
    # NOTE(review): the loop runs n_permutations - 1 times, as before. If the
    # intent is exactly n_permutations shuffles this is an off-by-one --
    # confirm with the analysis that consumes the result before changing it.
    for _ in range(n_permutations - 1):
        shuffled_columns = np.random.permutation(x.columns)
        x_perm = x.reindex(shuffled_columns, axis="columns").transpose()
        coeffs.append(calcOverlap(x_perm, y, metric))
    # Build the frame once, after the loop, instead of on every iteration.
    return pd.DataFrame(data=coeffs)

Sampling test with randomly drawn genes

In [None]:
def geneSampling(x,y, included_index, n_samples, metric):
    """Score randomly drawn row subsets of ``x`` against ``y``.

    The rows listed in ``included_index`` are removed from ``x`` first; each
    draw then samples ``len(included_index)`` of the remaining rows and
    evaluates ``calcOverlap`` on them.

    Parameters
    ----------
    x : pandas.DataFrame
        Table to sample rows from.
    y : pandas.DataFrame
        Table passed unchanged to ``calcOverlap``.
    included_index : sequence
        Row labels of ``x`` to exclude from sampling; its length is also the
        size of every draw.
    n_samples : int
        Controls the number of draws; ``n_samples - 1`` are performed.
    metric : str
        Forwarded to ``calcOverlap``.

    Returns
    -------
    pandas.DataFrame
        One row per draw.  Column ``'Coeffs'`` comes first and holds the
        score; columns ``0 .. len(included_index) - 1`` hold the labels of
        the sampled rows.

    Notes
    -----
    The table is assembled in one step from per-draw rows.  Growing it column
    by column and then appending the scores as an extra row before
    transposing forced object dtype on every column, including the numeric
    scores.
    """
    # Remove genes that are also in genes of interest
    background = x.drop(included_index)
    draw_size = len(included_index)

    sampled_labels = []
    coeffs = []
    # NOTE(review): n_samples - 1 draws are made, as before -- confirm this
    # is intended before changing the count.
    for _ in range(n_samples - 1):
        x_sampled = background.sample(draw_size, axis = 0)
        sampled_labels.append(list(x_sampled.index))
        coeffs.append(calcOverlap(x_sampled, y, metric))

    samplingData = pd.DataFrame(sampled_labels, columns = list(range(draw_size)))
    samplingData.insert(0, 'Coeffs', coeffs)

    return samplingData

Calculate partial Distance Correlation

In [9]:
def calcPDC(x,y,z):
    """Return ``dcor.partial_distance_correlation`` evaluated on ``x``, ``y`` and ``z``.

    The three arguments are forwarded positionally and unchanged; the result
    of the dcor call is returned as is.
    """
    partial_coeff = dcor.partial_distance_correlation(x, y, z)
    return partial_coeff