In [1]:
# Python file to recreate correlation calculation.
import os
import sys
import pandas as pd
import numpy as np    
import networkx as nx
import scipy
from scipy import stats
from joblib import Parallel, delayed
import multiprocessing
num_cores = multiprocessing.cpu_count()
import matplotlib
import matplotlib.pyplot as plt
import plotnine
from plotnine import ggplot, aes, geom_line, geom_bar
from plotnine import *

def calculate_spearman(col1, col2):
    """Return the p-value of the Spearman rank correlation of two sequences.

    Parameters
    ----------
    col1, col2 : array-like
        The two samples, passed unchanged to ``scipy.stats.spearmanr``.

    Returns
    -------
    The ``pvalue`` attribute of the ``spearmanr`` result; the correlation
    coefficient itself is discarded.
    """
    spearman_result = stats.spearmanr(col1, col2)
    return spearman_result.pvalue

def calculate_pvalues(df):
    """Compute Spearman p-values for every pair of numeric columns in parallel.

    Rows containing any missing value are dropped first, then only the
    columns pandas reports as numeric are kept.
    NOTE(review): ``_get_numeric_data`` is an underscore-prefixed (private)
    pandas method and may change between pandas releases.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table.

    Returns
    -------
    list
        One p-value per column pair ``(i, j)`` with ``i <= j`` (each column is
        also paired with itself), ordered by ``i`` and then by ``j``. The
        list carries no column labels.
    """
    numeric = df.dropna()._get_numeric_data()
    n_columns = len(numeric.columns)
    # Upper triangle of the column-by-column grid, diagonal included.
    index_pairs = [
        (first, second)
        for first in range(n_columns)
        for second in range(first, n_columns)
    ]
    # One joblib task per pair, spread over `num_cores` workers.
    tasks = (
        delayed(calculate_spearman)(numeric.iloc[:, first], numeric.iloc[:, second])
        for first, second in index_pairs
    )
    return Parallel(n_jobs=num_cores)(tasks)

def calc_correlations(numericmatrix, strong_threshold = 0.5, alpha = 0.05):
    """Compute pairwise Spearman correlations between the rows of a matrix.

    Parameters
    ----------
    numericmatrix : pandas.DataFrame
        Numeric table. It is transposed before being passed to
        ``scipy.stats.spearmanr``, so each *row* is treated as one variable
        and the row index supplies the labels used in the output.
        ``spearmanr`` returns scalars instead of matrices when given only two
        variables, so at least three rows are required.
    strong_threshold : float, optional
        A pair counts as a strong connection when the absolute value of its
        correlation coefficient is greater than this value.
    alpha : float, optional
        Family-wise significance level fed into the Sidak correction.

    Returns
    -------
    combined_graph : pandas.DataFrame
        Long-format edge list with columns ``mag2``, ``mag1``, ``corrcoef``
        and ``pval``: one row per entry of the correlation matrix whose
        p-value is not NaN.
    filtered_strong_connections : pandas.DataFrame
        The rows of ``combined_graph`` whose p-value is below the
        Sidak-corrected threshold AND whose absolute correlation exceeds
        ``strong_threshold``.
    """
    by_variable = numericmatrix.T
    labels = by_variable.columns
    correlation_spearman, pvalues = stats.spearmanr(by_variable)
    correlation_spearman = pd.DataFrame(correlation_spearman, columns = labels)
    pvalues = pd.DataFrame(pvalues, columns = labels)

    # Both matrices are square and share row/column order, so the column
    # labels also label the rows; store them as a column for melting.
    correlation_spearman["mag2"] = labels
    pvalues["mag2"] = labels
    pvals_melted = pvalues.melt(id_vars = "mag2", var_name = "mag1", value_name = "pval")
    corr_melted = correlation_spearman.melt(id_vars = "mag2", var_name = "mag1", value_name = "corrcoef")

    # Discard pairs whose p-value is NaN; the right join then keeps only the
    # correlation rows that still have a p-value.
    pvals_melted = pvals_melted[pvals_melted.pval.notna()]
    combined_graph = pd.merge(corr_melted, pvals_melted, how = "right", on = ["mag2", "mag1"])

    n_tests = len(combined_graph.index)
    if n_tests == 0:
        # No usable p-values: nothing to correct for and nothing can be
        # strong, and 1 / n_tests below would raise ZeroDivisionError.
        return combined_graph, combined_graph.copy()

    # Sidak correction.
    # NOTE(review): n_tests counts every kept matrix entry, i.e. self-pairs
    # and both orderings of each pair, so the threshold is stricter than a
    # correction over unique pairs would be -- confirm this is intended.
    sidak_threshold = 1 - (1 - alpha) ** (1 / n_tests)
    filtered_graph = combined_graph[combined_graph.pval < sidak_threshold]

    # Strong connections are taken from the significance-filtered graph, so
    # a large but non-significant coefficient is not reported as strong.
    filtered_strong_connections = filtered_graph[filtered_graph.corrcoef.abs() > strong_threshold]
    return combined_graph, filtered_strong_connections

# TPM table: read it, drop the rows whose Genome is "unmapped", index the
# remaining rows by Genome and discard the leftover "Unnamed: 0" column.
datamatrix_tpm = (
    pd.read_csv(os.path.join("..", "input", "MAG_tpm_new_approach.csv"))
    .loc[lambda table: table["Genome"] != "unmapped"]
    .set_index("Genome")
    .drop(columns=["Unnamed: 0"])
)

# Keep only the columns whose total over all rows is positive.
qualify_cols = datamatrix_tpm.sum(axis="index")
numericmatrix_tpm = datamatrix_tpm.loc[:, qualify_cols.gt(0)]

# Label the full table with its method; numericmatrix_tpm was selected before
# this point and therefore does not receive the "Method" column.
datamatrix_tpm = datamatrix_tpm.assign(Method="TPM")

# Correlate the rows and write the complete (unfiltered) edge list to disk.
combined_graph_tpm, filtered_strong_connections_tpm = calc_correlations(numericmatrix_tpm)
combined_graph_tpm.to_csv(os.path.join("..", "input", "combined_graph_tpm_fullpval.csv"))

In [2]:
# RPKM table: read it, drop the rows whose new_mag_name is "unmapped", index
# the remaining rows by new_mag_name and discard the two bookkeeping columns.
datamatrix_rpkm = (
    pd.read_csv(os.path.join("..", "input", "MAG_rpkm_new_approach.csv"))
    .loc[lambda table: table["new_mag_name"] != "unmapped"]
    .set_index("new_mag_name")
    .drop(columns=["Unnamed: 0", "old_mag_name"])
)

# Keep only the columns whose total over all rows is positive.
qualify_cols = datamatrix_rpkm.sum(axis="index")
numericmatrix_rpkm = datamatrix_rpkm.loc[:, qualify_cols.gt(0)]

# Label the full table with its method; numericmatrix_rpkm was selected before
# this point and therefore does not receive the "Method" column.
datamatrix_rpkm = datamatrix_rpkm.assign(Method="RPKM")

# Correlate the rows and write the complete (unfiltered) edge list to disk.
combined_graph_rpkm, filtered_strong_connections_rpkm = calc_correlations(numericmatrix_rpkm)
combined_graph_rpkm.to_csv(os.path.join("..", "input", "combined_graph_rpkm_fullpval.csv"))