In [210]:
#
# Import Libraries
#

import os
import pandas as pd
import pickle
import sys
import networkx as nx
import numpy as np
from scipy.stats import poisson
import itertools

from scipy.stats import pearsonr, spearmanr, kendalltau

from functools import reduce

from plotly import graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly

import re

palette = plotly.colors.qualitative.Plotly

from pyvis.network import Network

import subprocess

# Make the project's helper directory importable (presumably where the
# PCA_UMAP module imported further down lives -- TODO confirm).
# The membership test keeps sys.path free of duplicates when the cell is re-run.
utilsPath = r'S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\utils'
if utilsPath not in sys.path:
    sys.path.append(utilsPath)

In [211]:
#
# Paths & Constants
#

# All inputs live under one data root and belong to one dataset; both are
# stated once so that switching dataset or drive is a one-line change.
DATA_ROOT = r'S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data'
DATASET = 'ALDH4'


def _data_path(*parts):
    """Join *parts* onto DATA_ROOT with the backslash separator used by the original paths."""
    return '\\'.join((DATA_ROOT,) + parts)


# Pickled correlation objects and their p-values
corr_path = _data_path('Analysis', '02-Correlations', DATASET, 'corr_all.pkl')
pval_path = _data_path('Analysis', '02-Correlations', DATASET, 'pvals_all.pkl')

# Quantitative matrices (metabolomics / proteomics)
xm_path = _data_path('Metabolomics', DATASET, 'WorkingFiles', 'Xm_norm.tsv')
xq_path = _data_path('Proteomics', DATASET, 'WorkingFiles', 'Xq_minus_X_norm.tsv')

# Feature annotation tables
m2i_path = _data_path('Metabolomics', DATASET, 'WorkingFiles', 'f2i_TP.tsv')
q2i_path = _data_path('Proteomics', DATASET, 'WorkingFiles', 'q2info.tsv')

# Sample metadata
mdata_path = _data_path('Metadata', DATASET, 'WorkingFiles', 'main_metadata.tsv')

In [212]:
# Every input table is tab-separated
_tsv = {'sep': '\t'}

# Quantitative matrices, indexed by their first column
xq = pd.read_csv(xq_path, index_col=0, **_tsv)
xm = pd.read_csv(xm_path, index_col=0, **_tsv)

# Feature annotation tables, indexed by feature id ('fid')
q2i = pd.read_csv(q2i_path, index_col='fid', **_tsv)
m2i = pd.read_csv(m2i_path, index_col='fid', **_tsv)

# Sample metadata keeps its default integer index
mdata = pd.read_csv(mdata_path, **_tsv)

In [213]:
#
# Read raw data from which the graph will be built
#

def _unpickle(path):
    """Return the object stored in the pickle file at *path*."""
    # NOTE(review): pickle.load can execute arbitrary code; only use trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Each file holds a two-element sequence
corr, corrSL = _unpickle(corr_path)
pv, adpv = _unpickle(pval_path)

Analizar orden y tamaño del grafo usando diferentes tipos de correlaciones y umbrales de FDR

In [214]:
#
# Order (nodes with at least one edge) and size (edges) of every graph as a
# function of the correlation type and of the cut-off applied to `adpv`
#

ctypes = ['rpc', 'psk']            # evaluated on the intra-omic graphs (qq, mm)
ctypes2 = ['rcca', 'psk', 'cca']   # evaluated on the inter-omic graph (qm)

pvrange = [0,0.005, 0.01,0.05, 0.1] #np.arange(0,0.2,0.05)


def _graph_counts(adjacency):
    """Build a graph from a square adjacency frame and return
    (number of nodes with degree > 0, number of edges)."""
    graph = nx.from_pandas_adjacency(adjacency)
    connected = (np.array(list(dict(nx.degree(graph)).values())) > 0).sum()
    return connected, graph.size()


def _bipartite_adjacency(qm):
    """Embed a rectangular q x m weight frame into the square block matrix
    [[0, qm], [qm.T, 0]] that nx.from_pandas_adjacency expects."""
    # Zero blocks are created numerically: an empty frame followed by
    # fillna(0) only becomes numeric through pandas' deprecated silent downcasting.
    zeros_q = pd.DataFrame(0, index=qm.index, columns=qm.index)
    zeros_m = pd.DataFrame(0, index=qm.columns, columns=qm.columns)
    return pd.concat([zeros_q.join(qm), qm.T.join(zeros_m)])


plot = {}

for ctype in ctypes:
    plot[ctype] = {}
    for pair in ('qq', 'mm'):
        # The matrices do not depend on the cut-off: fetch them once per type.
        weights = getattr(getattr(corr, ctype), pair).a
        padj = getattr(adpv[ctype], pair).a
        # The comprehension variable stays local, so the raw p-values loaded
        # into `pv` earlier are no longer overwritten by this scan.
        counts = [_graph_counts(weights[padj <= cutoff].fillna(0)) for cutoff in pvrange]
        plot[ctype][pair] = {
            'nodes': [nodes for nodes, _ in counts],
            'edges': [edges for _, edges in counts],
        }

for ctype in ctypes2:
    weights = getattr(corr, ctype).qm.a
    padj = adpv[ctype].qm.a
    # NOTE(review): strict `<` here versus `<=` for qq/mm above, kept as in the
    # original analysis -- confirm the asymmetry is intended.
    counts = [
        _graph_counts(_bipartite_adjacency(weights[padj < cutoff].fillna(0)))
        for cutoff in pvrange
    ]
    plot.setdefault(ctype, {})['qm'] = {
        'nodes': [nodes for nodes, _ in counts],
        'edges': [edges for _, edges in counts],
    }

In [215]:
# Nodes/edges versus cut-off, one row per graph (qq, mm, qm).
# make_subplots, go and palette come from the import cell at the top.

file = 'Plots/FDR_CorrType.html'
# Make sure the output directory exists before writing into it
os.makedirs(os.path.dirname(file), exist_ok=True)

fig = make_subplots(rows=3, cols=2, subplot_titles=['qq - Nodes', 'qq - Edges','mm - Nodes', 'mm - Edges', 'qm - Nodes', 'qm - Edges'])

# One colour and one legend group per correlation type. Indexing by position in
# `ctypes` / `ctypes2` made 'rpc' and 'rcca' share both colour and legend group.
_all_types = list(dict.fromkeys([*ctypes, *ctypes2]))
_colour_of = {ctype: palette[k % len(palette)] for k, ctype in enumerate(_all_types)}

_panels = [(1, 'qq', ctypes), (2, 'mm', ctypes), (3, 'qm', ctypes2)]
_in_legend = set()  # each correlation type appears only once in the legend

for _row, _pair, _types in _panels:
    for ctype in _types:
        for _col, _measure in enumerate(('nodes', 'edges'), start=1):
            fig.add_trace(go.Scatter(
                x=pvrange,
                y=plot[ctype][_pair][_measure],
                name=ctype, marker_color=_colour_of[ctype],
                legendgroup=ctype, showlegend=ctype not in _in_legend
            ), row=_row, col=_col)
            _in_legend.add(ctype)

fig.update_layout(title='Number of nodes and edges per correlation type', width=1000, height=700)
fig.show()

# 'w' replaces any previous report, equivalent to the former remove-then-append
with open(file, 'w') as f:
    f.write(fig.to_html(full_html=False, include_plotlyjs='cdn', default_height='50%', default_width='80%'))


Construcción de grafo usando Graphical Lasso y rCCA con FDR <0.05

In [7]:
# Checkpoint: restore the matrices and graphs saved in sourceData.pkl.
# The context manager closes the file handle, which the bare open() never did.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('sourceData.pkl', 'rb') as f:
    qqc,qqp,qqg,mmc,mmp,mmg,qmc,qmp,qmg = pickle.load(f)

In [216]:
# Cut-offs applied to the `adpv` matrices
pv_rpc = 0.05    # intra-omic graphs (applied to psk below, despite the name)
pv_rcca = 0.01   # protein-metabolite (rcca) graph

# Proteomics graph: psk weights kept where adpv <= pv_rpc, everything else 0
qqc = corr.psk.qq.a.copy() # Get psk in this case
qqp = adpv.psk.qq.a.copy()
qqg = nx.from_pandas_adjacency(qqc[qqp<=pv_rpc].fillna(0))

# Metabolomics graph, built the same way
mmc = corr.psk.mm.a.copy() # Get psk in this case
mmp = adpv.psk.mm.a.copy()
mmg = nx.from_pandas_adjacency(mmc[mmp<=pv_rpc].fillna(0))

# Protein-metabolite graph from rcca (strict `<` cut-off, as in the original)
qmc = corr.rcca.qm.a.copy()
qmp = adpv.rcca.qm.a.copy()
qm_kept = qmc[qmp<pv_rcca].fillna(0)

# Embed the rectangular q x m frame into the square block matrix
# [[0, qm], [qm.T, 0]]. The zero blocks are created numerically: an empty frame
# followed by fillna(0) only becomes numeric through pandas' deprecated silent
# downcasting of object columns.
qm_adj = pd.concat([
    pd.DataFrame(0, index=qm_kept.index, columns=qm_kept.index).join(qm_kept),
    qm_kept.T.join(pd.DataFrame(0, index=qm_kept.columns, columns=qm_kept.columns))
])
# Node ids use '-' where the rcca labels carry '.'
qm_adj.index, qm_adj.columns = [i.replace('.', '-') for i in qm_adj.index], [i.replace('.', '-') for i in qm_adj.columns]
qmg = nx.from_pandas_adjacency(qm_adj)

In [48]:
# Checkpoint: persist the matrices and graphs so later sessions can skip the build.
# The context manager flushes and closes the file even if pickling fails.
with open('sourceData.pkl', 'wb') as f:
    pickle.dump([qqc,qqp,qqg,mmc,mmp,mmg,qmc,qmp,qmg], f)

In [217]:
#
# Random reference graphs: same order and size as the observed graphs
#
n = 100


def _random_like(graph, count):
    """Return *count* G(n, m) random graphs matching *graph*, seeded 0..count-1."""
    n_nodes, n_edges = graph.order(), graph.size()
    return [nx.gnm_random_graph(n_nodes, n_edges, seed=seed) for seed in range(count)]


qqA = _random_like(qqg, n)
mmA = _random_like(mmg, n)
# qmA = [nx.gnm_random_graph(qmg.order(), qmg.size(), seed=i) for i in range(n)]

In [50]:
#
# Basic graph characteristics, compared with the random reference graphs
#

def _vs_random(values):
    """Format mean/std of a statistic measured on the random graphs."""
    return f'(random graph: mean = {round(np.mean(values),5)}, std = {round(np.std(values),5)})'


def _n_components(graph, min_order=1):
    """Number of connected components of *graph* with at least *min_order* nodes."""
    return len([c for c in nx.connected_components(graph) if len(c) >= min_order])


for graph, randoms, name in [(qqg, qqA, 'Proteomics'), (mmg, mmA, 'Metabolomics')]:#, (qmg, qmA, 'Proteomics-Metabolomics')]:

    print()
    print(name)
    print()

    print(f'Order: {graph.order()}')
    print(f'Size: {graph.size()}')
    print(f'Density: {round(nx.density(graph), 5)}')

    rand_clust = [nx.average_clustering(r) for r in randoms]
    print(f'Average clustering: {round(nx.average_clustering(graph), 5)} {_vs_random(rand_clust)}')
    print(f'Average degree: {round(np.mean(list(dict(graph.degree).values())),5)}')

    rand_cc = [_n_components(r) for r in randoms]
    print(f'Number of connected components: {round(_n_components(graph), 5)} {_vs_random(rand_cc)}')

    rand_cc2 = [_n_components(r, min_order=2) for r in randoms]
    print(f'Number of connected components (order g.t. 1): {round(_n_components(graph, min_order=2), 5)} {_vs_random(rand_cc2)}')

# Degree distribution of each graph against the Poisson expectation of a random
# graph with the same order and size (mean degree 2*size/order).
fig = make_subplots(rows=1, cols=2, subplot_titles=['Proteomics', 'Metabolomics'])

for col, (graph, colour) in enumerate([(qqg, palette[0]), (mmg, palette[1])], start=1):
    hist = nx.degree_histogram(graph)
    degrees = np.arange(0, len(hist))
    fig.add_trace(go.Bar(
        x=degrees,
        y=hist,
        offsetgroup=1, marker_color=colour
    ), row=1, col=col)
    # The reference uses THIS panel's graph. The leftover loop variable `G`
    # (always the metabolomics graph) was used for both panels before.
    fig.add_trace(go.Bar(
        x=degrees,
        y=graph.order()*poisson.pmf(degrees, 2*graph.size()/graph.order()),
        offsetgroup=1, marker_color='black', opacity=0.2
    ), row=1, col=col)

fig.update_layout(bargap=0.2, title='Degree Distribution', showlegend=False)


Proteomics

Order: 500
Size: 13213
Density: 0.10592
Average clustering: 0.46533 (random graph: mean = 0.10589, std = 0.00061)
Average degree: 52.852
Number of connected components: 1 (random graph: mean = 1.0, std = 0.0)
Number of connected components (order g.t. 1): 1 (random graph: mean = 1.0, std = 0.0)

Metabolomics

Order: 500
Size: 17743
Density: 0.14223
Average clustering: 0.53805 (random graph: mean = 0.14221, std = 0.00053)
Average degree: 70.972
Number of connected components: 1 (random graph: mean = 1.0, std = 0.0)
Number of connected components (order g.t. 1): 1 (random graph: mean = 1.0, std = 0.0)


In [52]:
#
# Number of connected components of each order, per graph
#

# Three graphs are drawn, so three titles are needed (the third was missing).
fig = make_subplots(rows=1, cols=3, subplot_titles=['Proteomics', 'Metabolomics', 'Proteomics-Metabolomics'])

# Dedicated loop names: the notebook-level `n`, `G` and `plot` are left untouched.
for col, graph in enumerate([qqg, mmg, qmg], start=1):
    order_counts = pd.Series([len(c) for c in nx.connected_components(graph)]).value_counts().sort_index()
    fig.add_trace(go.Bar(
        x = [str(order) for order in order_counts.index],
        y = order_counts.values,
        width=0.5,
        text=order_counts.values, textposition='auto',
    ), row=1, col=col)

fig.update_layout(bargap=1)
fig.update_layout(title='Number of connected components with different order', showlegend=False)
fig.show()

In [53]:
#
# Number of maximal cliques of each order, per graph
#

fig = make_subplots(rows=1, cols=3, subplot_titles=['Proteomics', 'Metabolomics', 'Proteomics-Metabolomics'])#go.Figure()

# Dedicated loop names: the notebook-level `n` (number of random graphs /
# iterations), `G` and `plot` are no longer overwritten by this cell.
for col, graph in enumerate([qqg, mmg, qmg], start=1):
    clique_counts = pd.Series([len(c) for c in nx.find_cliques(graph)]).value_counts().sort_index()
    fig.add_trace(go.Bar(
        x = [str(order) for order in clique_counts.index],
        y = clique_counts.values,
        width=0.5,
        text=clique_counts.values, textposition='auto',
    ), row=1, col=col)

# Layout is set once, after all traces have been added
fig.update_layout(bargap=1)
fig.update_layout(title='Number of cliques with different order', showlegend=False)
fig.show()

In [218]:
#
# Merge the three graphs and annotate every node with its omic type and a title
#

g = nx.compose_all([qqg, mmg, qmg])

# 'group': 'q' for ids listed in q2i, 'm' for ids listed in m2i
# (an id present in both tables ends up as 'm', since m2i is applied last)
attr = {
    **{fid: 'q' for fid in q2i.index},
    **{fid: 'm' for fid in m2i.index},
}
nx.set_node_attributes(g, attr, 'group')

# 'title': protein description / metabolite TP_ID
attr = {
    **{fid: q2i.loc[fid, 'qdesc'] for fid in q2i.index},
    **{fid: m2i.loc[fid, 'TP_ID'] for fid in m2i.index},
}
nx.set_node_attributes(g, attr, 'title')
# attr = {}
# _ = [attr.update({i:'blue'}) for i in q2i.index]
# _ = [attr.update({i:'red'}) for i in m2i.index]
# nx.set_node_attributes(g, attr, 'color')

OBTAIN CLUSTERS/COMMUNITIES - En ALDH4 considerando todos los elementos no obtenemos aristas en los grafos intraómicos con GLASSO. Usamos PSK
en su lugar 

In [19]:
#
# Detect communities (clusters) 
# Louvain & Leiden; Finally we use Leiden

# comm = nx_comm.louvain_communities(g, resolution=1, seed=0)

In [219]:
# Get clustering consensus 

import leidenalg
import igraph as ig
import networkx.algorithms.community as nx_comm

# Scratch values used while developing get_communities (see the commented-out
# calls at the end of this cell); `n` and `thr` are reassigned in the next cell.
G= mmg
n = 4
thr=10


def jaccard(list1, list2):
    """Jaccard index |A & B| / |A | B| of two collections of hashable items.

    Both inputs are reduced to sets first, so repeated items no longer inflate
    the union (the previous `len(list1) + len(list2) - intersection` did).

    Parameters
    ----------
    list1, list2 : iterable of hashable items

    Returns
    -------
    float
        Value in [0, 1]; 0.0 when both inputs are empty (previously a
        ZeroDivisionError).
    """
    set1, set2 = set(list1), set(list2)
    union = len(set1 | set2)
    if union == 0:
        return 0.0
    return float(len(set1 & set2)) / union

def get_communities(G, n=5, thr=5):
    """Consensus communities of *G* across repeated Leiden partitions.

    A first Leiden partition (modularity, ``seed=-1``) provides the starting
    communities. Then, for every ``seed`` in ``range(n)``, a new partition is
    computed and each current community is replaced by its intersection with
    the new community it overlaps most (highest Jaccard index). Each
    intersection is split into the connected components of the subgraph it
    induces in *G*, and only components with more than *thr* nodes survive.

    Parameters
    ----------
    G : networkx graph
    n : int
        Maximum number of refinement rounds.
    thr : int
        A community is kept only if it has strictly more than *thr* nodes.

    Returns
    -------
    list
        ``result[0]`` is the initial partition filtered by size and
        ``result[k]`` the communities left after *k* refinement rounds.
        Refinement stops as soon as a round leaves no community, so the list
        has at most ``n + 1`` entries.
    """
    # The igraph version of G is the same for every run: convert it once
    # instead of once per partition.
    ig_graph = ig.Graph.from_networkx(G)

    def _leiden(seed):
        # One Leiden run, returned as a list of node-name lists.
        # n_iterations=-1: iterate until no further improvement (leidenalg docs).
        partition = leidenalg.find_partition(
            ig_graph, leidenalg.ModularityVertexPartition,
            n_iterations=-1, seed=seed
        )
        return [list(sub.to_networkx().nodes) for sub in partition.subgraphs()]

    comm = _leiden(-1)

    # The size filter is applied to the reported list only; the unfiltered
    # partition is what gets refined in the first round.
    commL = [[c for c in comm if len(c)>thr]]

    for seed in range(n):
        commi = _leiden(seed)

        # Intersect every community with its best match in the new partition
        comm = [
            np.intersect1d(j, commi[np.argmax([jaccard(j,k) for k in commi])]) for j in comm
        ]

        # An intersection may be disconnected in G: keep its large components
        comm = [list(k) for j in comm for k in nx.connected_components(G.subgraph(j)) if len(k)>thr]
        commL.append(comm)
        if len(comm)==0: return commL

    return commL


#qqcom = get_communities(qqg, n=n, thr=10)
#mmcom = get_communities(mmg, n=n, thr=10)

In [220]:
#
# Run get_communities on the random graphs to choose the number of iterations:
# the smallest iteration at which fewer than `pvThr` (5%) of the random graphs
# still contain a community of order > thr, i.e. the probability of obtaining
# a community under randomness.

thr = 10 # Minimum size of a community
pvThr = 0.05 # Maximum fraction of random graph with a community greater than thr
n = 100


def _consensus_on(graphs):
    """Apply get_communities, with the current `n` and `thr`, to every graph."""
    return [get_communities(random_graph, n=n, thr=thr) for random_graph in graphs]


commLqq = _consensus_on(qqA)
commLmm = _consensus_on(mmA)

In [221]:
# Fraction of random graphs that still contain communities after x iterations

def _fraction_with_communities(runs, n_iter):
    """For each index 0..n_iter, fraction of runs with at least one community.

    *runs* holds one get_communities result per random graph; a run that
    stopped early counts as having no community at the missing indices.
    """
    sizes = [[len(run[k]) if len(run)>k else 0 for k in range(n_iter+1)] for run in runs]
    return (np.array(sizes)>0).sum(axis=0)/len(runs)


def _first_index_below(fractions, limit, label):
    """Index of the first entry of *fractions* strictly below *limit*.

    Raises ValueError (instead of an opaque IndexError) when no entry qualifies,
    i.e. when more iterations are needed.
    """
    hits = np.argwhere(fractions < limit)
    if len(hits) == 0:
        raise ValueError(
            f'{label}: fraction of random graphs with communities never drops '
            f'below {limit}; increase the number of iterations'
        )
    return hits[0][0]


commLqqn = _fraction_with_communities(commLqq, n)
commLmmn = _fraction_with_communities(commLmm, n)

pvThr = 0.05
qqThr = _first_index_below(commLqqn, pvThr, 'Proteomics')
mmThr = _first_index_below(commLmmn, pvThr, 'Metabolomics')

print(f"Protoemics minimum iteration: {qqThr+1}")
print(f"Metabolomics minimum iteration: {mmThr+1}")

Protoemics minimum iteration: 4
Metabolomics minimum iteration: 4


In [224]:
i = 40  # number of iterations shown

# Start a fresh report; later cells append further figures to the same file
file = 'Plots/ClusterConsensus.html'
if os.path.exists(file):
    os.remove(file)

fig = go.Figure()
for label, fractions in (('Proteomics', commLqqn), ('Metabolomics', commLmmn)):
    fig.add_trace(go.Scatter(
        x=np.arange(1, i+1),
        y=fractions[:i],
        mode='lines+markers',
        name=label
    ))

# Reference line at the 0.05 fraction
fig.add_hline(y=0.05,line_width=1, line_dash="dot", line_color="black")
fig.update_layout(title='Fraction of Random Networks with communities vs N. Iterations')
fig.update_xaxes(title='Iterations')
fig.update_yaxes(title='Fraction of Random Networks')

fig.show()
with open(file, 'a') as f:
    f.write(fig.to_html(full_html=False, include_plotlyjs='cdn', default_height='50%', default_width='80%'))

In [223]:
# Consensus communities of the observed graphs, with the same n and thr as the random ones
qqcoms, mmcoms = (get_communities(graph, n=n, thr=thr) for graph in (qqg, mmg))

In [60]:
i = 40  # number of iterations shown
iterations = np.arange(1,i+1)

fig = make_subplots(rows=1, cols=2, subplot_titles=['Number of communities', 'Number of features'])

series = (
    ('Proteomics', qqcoms, palette[0]),
    ('Metabolomics', mmcoms, palette[1]),
)
# Column 1: communities per iteration; column 2: features covered by them
measures = (len, lambda level: sum([len(community) for community in level]))

for col, measure in enumerate(measures, start=1):
    for label, levels, colour in series:
        fig.add_trace(go.Scatter(
            x=iterations,
            y=[measure(level) for level in levels][:i],
            mode='lines+markers',
            name=label, marker_color=colour
        ), col=col, row=1)

# Iterations selected from the random-graph analysis
fig.add_vline(x=qqThr+1,line_width=1, line_dash="dot", line_color="black",
              annotation_text='Prot. Thr.', annotation_position='bottom right')
fig.add_vline(x=mmThr+1,line_width=1, line_dash="dot", line_color="black",
              annotation_text='Metab. Thr.', annotation_position='top right')

fig.update_xaxes(title='Iterations')

fig.show()
with open(file, 'a') as f:
    f.write(fig.to_html(full_html=False, include_plotlyjs='cdn', default_height='50%', default_width='80%'))

In [61]:
# Communities retained at the iteration chosen from the random-graph analysis
qqcom, mmcom = qqcoms[qqThr], mmcoms[mmThr]

In [62]:
# Number of features per cluster

fig = make_subplots(rows=1, cols=2, subplot_titles=['Proteomics', 'Metabolomics'])

# (communities, bar width) for each panel, left to right
for col, (communities, bar_width) in enumerate(((qqcom, 0.1), (mmcom, 0.2)), start=1):
    fig.add_trace(go.Bar(
        x=[str(k) for k in range(len(communities))],
        y=[len(community) for community in communities],
        width=bar_width, showlegend=False
    ), row=1, col=col)

fig.update_xaxes(title='Cluster')
fig.update_yaxes(title='N. Features')

fig.show()
with open(file, 'a') as f:
    f.write(fig.to_html(full_html=False, include_plotlyjs='cdn', default_height='50%', default_width='80%'))

Module Eigenvector

In [63]:
from sklearn.decomposition import PCA

# One single-component PCA per community, fitted on that community's features
qqcomPCA = [PCA(n_components=1).fit(xq[com]) for com in qqcom]
mmcomPCA = [PCA(n_components=1).fit(xm[com]) for com in mmcom]

fig = make_subplots(rows=1, cols=2, subplot_titles=['Proteomics', 'Metabolomics'])

# (fitted models, bar width) for each panel, left to right
for col, (models, bar_width) in enumerate(((qqcomPCA, 0.1), (mmcomPCA, 0.2)), start=1):
    fig.add_trace(go.Bar(
        x=[str(k) for k in range(len(models))],
        y=[model.explained_variance_ratio_[0] for model in models],
        width=bar_width, showlegend=False
    ), row=1, col=col)

fig.update_yaxes(range=(0,1))
fig.update_xaxes(title='Cluster')
fig.update_layout(title='Ratio of Explained Variance in the 1st PC')

In [64]:
def _first_pc_scores(x, communities):
    """Frame with one column per community: sample scores on the first PC of
    that community's features (rows follow x.index)."""
    scores = [PCA(n_components=1).fit_transform(x[com])[:,0] for com in communities]
    return pd.DataFrame(np.array(scores).T, index=x.index)


qqcomE = _first_pc_scores(xq, qqcom)
mmcomE = _first_pc_scores(xm, mmcom)

In [66]:
# PCA_Var is a project helper; what it returns is not visible here. Judging
# from the displayed qqCP it yields one row per component with '%Var PCA'
# followed by one value per metadata variable -- TODO confirm.
from PCA_UMAP import PCA_Var

# Categorical / continuous metadata variables handed to PCA_Var
catVars = ['Group', 'Control', 'Ig']
conVars = []

# Proteomics: one PCA_Var row per community, then transposed
qqCP = pd.concat([
    PCA_Var(xq[com], mdata, conVars, catVars, n_comp=1)
    for com in qqcom
]).reset_index(drop=True).T

# NOTE(review): metabolomics requests 2 components and keeps the row labelled 1,
# whereas proteomics uses n_comp=1 -- confirm this asymmetry is intended.
mmCP = pd.concat([
    PCA_Var(xm[com], mdata, conVars, catVars, n_comp=2).loc[[1]]
    for com in mmcom
]).reset_index(drop=True).T


# Apply FDR B-H correction

from statsmodels.stats.multitest import multipletests

# Drop the first row of qqCP and transpose; the correction is then applied
# column by column of the transposed frame.
qqCPF = qqCP.iloc[1:, :].copy().T
for i in qqCPF.columns:
    qqCPF[i] = multipletests(qqCPF.loc[:,i], method='fdr_bh')[1]

# Same correction for the metabolomics table
mmCPF = mmCP.iloc[1:, :].copy().T
for i in mmCPF.columns:
    mmCPF[i] = multipletests(mmCPF.loc[:,i], method='fdr_bh')[1]

In [225]:
# Display the proteomics table (one column per community); the commented lines
# below are alternative views of the FDR-corrected tables.
qqCP

#mmCPF.T.iloc[:, 20:]
# qqCPF.T

Unnamed: 0,0,1,2,3
%Var PCA,55.780651,67.535087,64.72639,55.785986
Group,0.000284,0.003468,0.004295,0.000342
Control,3.9e-05,0.005651,0.557395,0.087869
Ig,0.059316,0.960181,0.028921,4.5e-05


In [94]:
# Metadata indexed by sample id, restricted to and ordered like the rows of xm
mdatam = mdata.set_index('Seqn').loc[xm.index.tolist()]

# Per-feature difference of group means (D minus C), shown for one community.
# NOTE(review): groups 'D'/'C' differ from the 'PBS'/'B1-8'/'A12' labels used in
# the ANOVA cell -- confirm which labels this dataset actually has.
samples_d = mdatam.index[mdatam.Group == 'D'].tolist()
samples_c = mdatam.index[mdatam.Group == 'C'].tolist()

(xm.loc[samples_d].mean() - xm.loc[samples_c].mean())[mmcom[6]]


C18N146   -0.015548
C18N91    -0.069683
C18N112    0.022390
C18N136   -0.053705
C18N39    -0.012509
C18N61     0.074186
C18N63     0.071671
C18N96    -0.044834
C18N6     -0.044128
C18N4     -0.070231
C18N1     -0.023281
C18N193   -0.069115
dtype: float64

In [92]:
# Quick check: positional indices of the proteomics eigenvector columns
list(range(qqcomE.shape[1]))

[0, 1, 2, 3]

In [97]:
# Re-bind the default Plotly qualitative palette (also set in the import cell)
palette = px.colors.qualitative.Plotly

In [147]:
from sklearn.feature_selection import f_classif

from scipy.stats import f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [227]:
# Sample ids per experimental group
g2s = {group: sub.Seqn.tolist() for group, sub in mdata.groupby('Group')}

iicomE = mmcomE

# Groups compared, in plotting order
anova_groups = ['PBS', 'B1-8', 'A12']

fig = make_subplots(rows=1, cols=iicomE.shape[1], subplot_titles=[f'Com. {i}' for i in iicomE.columns])

# Dedicated loop names: the composed graph `g` and the notebook-level `n` are
# no longer overwritten by this cell.
for col, pcom in enumerate(iicomE.columns, start=1):

    myanova = {}

    for k, group in enumerate(anova_groups):
        # Only samples of this group that have an eigenvector value
        gsn = [s for s in g2s[group] if s in iicomE.index]
        values = iicomE.loc[gsn,pcom]

        fig.add_trace(go.Scatter(
            y = values,
            x = [group]*len(gsn), mode='markers', marker_color=palette[k], showlegend=False
        ), row=1, col=col)

        myanova[group] = values.tolist()

    # One-way ANOVA across groups, then Tukey HSD pairwise comparisons
    print(f_oneway(*list(myanova.values())))
    endog = [v for group in anova_groups for v in myanova[group]]
    labels = [group for group in anova_groups for _ in myanova[group]]
    tukey = pairwise_tukeyhsd(endog=endog,
                            groups=labels,
                            alpha=0.05)
    print(tukey.groupsunique)
    print(tukey.pvalues)

fig.show()

F_onewayResult(statistic=7.774958490600925, pvalue=0.004820689782333268)
['A12' 'B1-8' 'PBS']
[0.00878626 0.01168916 0.98882558]
F_onewayResult(statistic=8.294220782897856, pvalue=0.003751655404945847)
['A12' 'B1-8' 'PBS']
[0.00282888 0.25557629 0.07247088]
F_onewayResult(statistic=10.01116675604755, pvalue=0.0017302001573930588)
['A12' 'B1-8' 'PBS']
[0.00234586 0.81704632 0.00795044]
F_onewayResult(statistic=8.08934773911927, pvalue=0.004137613220081679)
['A12' 'B1-8' 'PBS']
[0.00358056 0.45295436 0.04119814]
F_onewayResult(statistic=7.605656159608667, pvalue=0.005240980135628799)
['A12' 'B1-8' 'PBS']
[0.75135193 0.00590987 0.02512763]
F_onewayResult(statistic=8.506759729835563, pvalue=0.0033937807385095767)
['A12' 'B1-8' 'PBS']
[0.00249053 0.08414492 0.20400916]
F_onewayResult(statistic=7.396207606339429, pvalue=0.005819579462735506)
['A12' 'B1-8' 'PBS']
[0.37580638 0.06973173 0.00467534]


In [131]:
# Annotation rows of the first metabolomics community. The explicit copy makes
# `aa` an independent frame, so adding a column cannot trigger
# SettingWithCopyWarning or touch m2i.
aa = m2i.loc[mmcom[0],:].copy()

# Samples of each group that have an eigenvector value
_a12 = [s for s in g2s['A12'] if s in mmcomE.index]
_b18 = [s for s in g2s['B1-8'] if s in mmcomE.index]

# Difference of group means, A12 minus B1-8 (a log fold change if xm is on a
# log scale -- TODO confirm)
aa['LFC'] = xm.loc[_a12, mmcom[0]].mean() - xm.loc[_b18, mmcom[0]].mean()

In [116]:
# Hand-picked protein ids, one per line
myset = '''O35459
Q9QXD1
O88833
Q8VDQ8
O35728
P32020
Q3TCN2
P55096'''.strip().split('\n')

# Samples of each group that have an eigenvector value
samples_a12 = [s for s in g2s['A12'] if s in qqcomE.index]
samples_b18 = [s for s in g2s['B1-8'] if s in qqcomE.index]

# Difference of group means (A12 minus B1-8) for the selected proteins
xq.loc[samples_a12, myset].mean() - xq.loc[samples_b18, myset].mean()

O35459   -0.478770
Q9QXD1    1.168928
O88833   -1.084654
Q8VDQ8    1.365193
O35728   -0.730446
P32020    1.325906
Q3TCN2   -1.344831
P55096    1.307934
dtype: float64

In [228]:
# NOTE(review): the lookup below is evaluated but never displayed, because it
# is not the last expression of the cell.
q2i.loc[qqcom[0]]
# Print the ids of the first proteomics community, one per line
for i in qqcom[0]:
    print(i)

O35459
Q91VC9
Q3USD5
P09925
Q9QXD1
Q6P6M5
P58710
Q8BTY8
O88833
P70266
Q99K95
P84075
Q91ZX7
Q9CQW1
P18581
P14685
Q3U2P1
Q64FW2
Q8VDQ8
V9GZG9
O35728
Q8VC19
Q9JJW6-2
P36536
O55137
P32020
O88668
Q3U1J4
Q3TCN2
P55096


In [50]:
# atable = q2i.loc[qqcom[0]]
# atable = m2i.loc[mmcom[9], ['ID', 'TP_ID']]

In [43]:
# The protein description is used as display name as-is; the commented-out
# expression would extract the name between 'HUMAN ' and ' OS=' instead.
q2i['qName'] = q2i.qdesc #[re.search(r'HUMAN (.*) OS=', i).groups()[0] for i in q2i.qdesc]

In [48]:
# Export the edge lists (display name, display name, weight), one sheet per graph.
# The context manager saves and closes the workbook even if a sheet fails,
# matching how the cluster workbooks are written further down.
with pd.ExcelWriter('Clusters/DCA.xlsx', engine='openpyxl', mode='w') as writer:

    pd.DataFrame(
        [(q2i.loc[i0, 'qName'],q2i.loc[i1, 'qName'],w['weight']) for i0, i1, w in qqg.edges.data()]
        ).to_excel(writer, sheet_name='q')

    pd.DataFrame(
        [(m2i.loc[i0,'TP_ID'],m2i.loc[i1,'TP_ID'],w['weight']) for i0, i1, w in mmg.edges.data()]
        ).to_excel(writer, sheet_name='m')

    # assumes every qm edge is reported as (protein, metabolite) -- TODO confirm
    pd.DataFrame(
        [(q2i.loc[i0, 'qName'],m2i.loc[i1,'TP_ID'],w['weight']) for i0, i1, w in qmg.edges.data()]
        ).to_excel(writer, sheet_name='qm')

Pertenencia de cada nodo a su cluster

In [204]:
# Per-feature statistics tables (tab-separated, first column as index) that
# are joined onto the annotation tables in the next cell
qstats = pd.read_csv(r"S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Analysis\01-BasicStats\ALDH4\Xq_stats.tsv", sep='\t', index_col=0)
mstats = pd.read_csv(r"S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Analysis\01-BasicStats\ALDH4\Xm_stats.tsv", sep='\t', index_col=0)
# m2iC

In [205]:
def _membership(eigen, x, communities):
    """One row per community member: Pearson r and p-value between the feature
    and its community eigenvector ('ComCorr', 'ComPval') plus the community
    number ('Com')."""
    per_feature = {}
    for number, community in enumerate(communities):
        for feature in community:
            r, p = pearsonr(eigen[number], x[feature])
            per_feature[feature] = {'ComCorr': r, 'ComPval': p, 'Com': number}
    return pd.DataFrame(per_feature).T


# Annotation + community membership + basic statistics; left joins keep
# every annotated feature, whether or not it belongs to a community
q2iC = q2i.join(_membership(qqcomE, xq, qqcom), how='left').join(qstats, how='left')

m2iC = m2i.join(_membership(mmcomE, xm, mmcom), how='left').join(mstats, how='left')#.loc[:, ['TP_ID', 'ComCorr', 'ComPval', 'Com']]

In [43]:
# One TSV per cluster with the annotation of its features
# (features without a community are skipped)

for number, members in q2iC[q2iC['Com'].notna()].groupby('Com'):
    members.to_csv(f'Clusters/qq_{int(number)}.tsv', sep='\t')

for number, members in m2iC[m2iC['Com'].notna()].groupby('Com'):
    members.to_csv(f'Clusters/mm_{int(number)}.tsv', sep='\t')

In [206]:
# One workbook per omic, one sheet per cluster
# (features without a community are skipped)

with pd.ExcelWriter('Clusters/qq_allClusters.xlsx') as writer:
    for number, members in q2iC[q2iC['Com'].notna()].groupby('Com'):
        members.to_excel(writer, sheet_name=f'qq_{int(number)}')

with pd.ExcelWriter('Clusters/mm_allClusters.xlsx') as writer:
    for number, members in m2iC[m2iC['Com'].notna()].groupby('Com'):
        members.to_excel(writer, sheet_name=f'mm_{int(number)}')
    

Conexión entre comunidades de proteómica y metabolómica

In [197]:
# Meta-network: correlations between the eigenvectors of the clusters

import itertools


def _pairwise(left, right, pairs):
    """[([a, b], [r, p]), ...] with the Pearson test of left[a] versus right[b]."""
    return [([a, b], [*pearsonr(left[a], right[b])]) for a, b in pairs]


# Within-omic pairs
metaqq = _pairwise(qqcomE, qqcomE, itertools.combinations(qqcomE.columns, 2))
metamm = _pairwise(mmcomE, mmcomE, itertools.combinations(mmcomE.columns, 2))

# Between-omic pairs, restricted to the samples present in both matrices
cidx = np.intersect1d(qqcomE.index, mmcomE.index)
metaqm = _pairwise(
    qqcomE.loc[cidx], mmcomE.loc[cidx],
    itertools.product(qqcomE.columns, mmcomE.columns)
)

# Benjamini-Hochberg correction of the between-omic p-values
qm_pvalues = [stats[1] for _, stats in metaqm]
qm_qvalues = multipletests(qm_pvalues, method='fdr_bh')[1]

qmfdr = pd.DataFrame(
    [
        [f'q{pair[0]}', f'm{pair[1]}', *stats, qv, -np.log(qv)]
        for (pair, stats), qv in zip(metaqm, qm_qvalues)
    ],
    columns=['q', 'm', 'r', 'pv', 'qv','Lqv']
)

# Significant pairs only, in the var1/var2/value layout of the chord diagram input
significant = qmfdr.loc[qmfdr.qv < 0.05, ['q', 'm', 'Lqv']]
significant.rename(columns={'q':'var1', 'm':'var2', 'Lqv':'value'}).to_csv('ChordDiagram/meta-network_WGCNA.tsv', sep='\t', index=False)

qmfdr

Unnamed: 0,q,m,r,pv,qv,Lqv
0,q0,m0,-0.892448,3e-06,9.4e-05,9.275961
1,q0,m1,-0.720417,0.001644,0.015348,4.176778
2,q0,m2,0.490598,0.053675,0.136627,1.990502
3,q0,m3,-0.468171,0.067412,0.145195,1.929679
4,q0,m4,0.477907,0.061166,0.14272,1.946872
5,q0,m5,0.309541,0.243348,0.367084,1.002166
6,q0,m6,0.123364,0.648984,0.698906,0.358239
7,q1,m0,0.278187,0.296828,0.406251,0.900785
8,q1,m1,0.305982,0.249092,0.367084,1.002166
9,q1,m2,-0.676523,0.004004,0.022425,3.797601


Distribución de Protein-Metabolite por proteína y clase de metabolito

In [229]:
# Working copy of the protein x metabolite rcca matrix
rcca = corr.rcca.qm.a.copy()

In [248]:
# Use '-' instead of '.' in the protein ids, as done for the qm graph nodes.
# Mutates rcca in place; re-running is harmless (no '.' left to replace).
rcca.index = [i.replace('.','-') for i in rcca.index]

In [235]:
# Preview of the long format: one row per (protein, metabolite) pair
pd.melt(rcca.reset_index(), 'index', rcca.columns)

Unnamed: 0,index,variable,value
0,Q543F6,P687,-0.261660
1,Q5H8C4,P687,-0.284663
2,P18581,P687,-0.621841
3,O08547,P687,0.749938
4,Q99KU0,P687,-0.171353
...,...,...,...
249995,Q8VC12,P2517,0.548403
249996,B2RY19,P2517,-0.408234
249997,Q91ZJ5,P2517,0.117186
249998,P11531,P2517,-0.282072


In [249]:
# Long-format rcca values, one row per (protein q, metabolite m) pair
rcca_long = (
    pd.melt(rcca.reset_index(), 'index', rcca.columns)
    .rename(columns={'index':'q', 'variable':'m'})
)

# Attach the protein annotation, then the metabolite TP_ID; left joins keep
# every pair even when an id has no annotation
aa = (
    rcca_long
    .merge(q2i.reset_index(), left_on='q', right_on='fid', how='left')
    .merge(m2i.loc[:, ['TP_ID']].reset_index(), left_on='m', right_on='fid', how='left')
)


In [269]:
# Export the annotated long-format rcca table to the working directory
aa.to_excel('rcca.xlsx')

In [50]:
# Metabolite annotation restricted to, and ordered like, the columns of xm
m2im = m2i.loc[xm.columns]

In [None]:
# 

In [252]:
# NOTE(review): leftover inspection of a loop variable; its value depends on
# which cell ran last (hidden kernel state).
i

'Q99P30'

In [273]:
# Proteins of interest: (accession, panel title)
apos = [
    ('Q99P30', 'Peroxisomal coenzyme A diphosphatase NUDT7'), # PI
    ('Q9CRB3', '5-hydroxyisourate hydrolase'), # PI
    ('Q3UEJ8', 'Pipecolic acid oxidase'), # PI
    ('D6REH1', 'Peroxisomal membrane protein 2'), # PI

    ('O35728', 'CP4AE_MOUSE Cytochrome P450 4A14'), # TG # DG # PC
    ('Q9ET01', 'Glycogen phosphorylase, liver form') # TG # DG # PC

]

# Pattern that marks a metabolite annotation (TP_ID) as a triglyceride
lre = r"LMSD{ TG "

fig = make_subplots(rows=2, cols=4, subplot_titles=[title for _, title in apos], shared_yaxes=True, vertical_spacing=0.1)

# The TG columns do not depend on the protein: select them once
_tp_ids = m2i.loc[rcca.columns, 'TP_ID']
_tg_cols = _tp_ids[[bool(re.search(lre, str(tp))) for tp in _tp_ids]].index

# Dedicated loop names keep the notebook-level `n` and `i` untouched
for _k, (_prot, _title) in enumerate(apos):
    _row, _col = 1+_k//4, 1+_k%4

    # All metabolites, jittered horizontally around x=1
    fig.add_trace(go.Scatter(
        y = rcca.loc[_prot], x = np.random.uniform(0.5,1.5, rcca.shape[1]),
        mode='markers', marker_size=2, marker_color=palette[0], showlegend=False
    ), row=_row, col=_col)

    # TG metabolites only, jittered around x=3. One x per plotted point:
    # previously x had rcca.shape[1] entries for a much shorter y.
    _tg_values = rcca.loc[_prot, _tg_cols]
    fig.add_trace(go.Scatter(
        y = _tg_values,
        x = np.random.uniform(2.8,3.2, len(_tg_values)),
        mode='markers', marker_size=3, marker_color=palette[1], showlegend=False
    ), row=_row, col=_col)

fig.update_layout(height=800)
fig.update_xaxes(tickvals=[1,3], ticktext=['All', 'TG'], range=(0,4))
#fig.update_yaxes(range=(-0.3,0.3))

fig.show()




In [62]:
# Proteins of interest: (accession, panel title)
apos = [
    ('P00738', 'HPT'),
    ('P01833', 'PIGR'),
    ('P05546', 'HEP2'),
    ('P08519', 'LPA'),
    ('P01877', 'IGHA2'),
    ('P01834', 'IGKC'),
]

# Lipid classes: (pattern searched in TP_ID, axis label).
# The LPC entry previously repeated the LPE pattern (copy-paste), so the 'LPC'
# column showed LPE metabolites.
lre = [
    (r"LMSD{ PE [0-9]", 'PE'),
    (r"LMSD{ PC [0-9]", 'PC'),
    (r"LMSD{ LPE [0-9]", 'LPE'),
    (r"LMSD{ LPC [0-9]", 'LPC')
]

fig = make_subplots(rows=2, cols=3, subplot_titles=[title for _, title in apos], shared_yaxes=True, vertical_spacing=0.1)

# Class membership does not depend on the protein: select the columns once.
# Only metabolites present in rcca are considered, and str() guards against
# missing (non-string) TP_ID values.
_tp_ids = m2i.loc[rcca.columns, 'TP_ID']
_class_cols = [
    _tp_ids[[bool(re.search(pattern, str(tp))) for tp in _tp_ids]].index
    for pattern, _label in lre
]

# Dedicated loop names keep the notebook-level `n`, `m` and `i` untouched
for _k, (_prot, _title) in enumerate(apos):
    if _prot not in rcca.index: continue
    _row, _col = 1+_k//3, 1+_k%3

    # All metabolites, jittered horizontally around x=1
    fig.add_trace(go.Scatter(
        y = rcca.loc[_prot], x = np.random.uniform(0.5,1.5, rcca.shape[1]),
        mode='markers', marker_size=1, marker_color=palette[0], showlegend=False
    ), row=_row, col=_col)

    # One jittered column per lipid class at x = 2, 3, 4, 5
    for _m, _cols in enumerate(_class_cols):
        _values = rcca.loc[_prot, _cols]
        fig.add_trace(go.Scatter(
            y = _values,
            x = np.random.uniform(_m+1.8,_m+2.2, len(_values)),
            mode='markers', marker_size=3, marker_color=palette[_m+1], showlegend=False
        ), row=_row, col=_col)

fig.update_layout(height=800)
fig.update_xaxes(tickvals=[1,2,3,4,5], ticktext=['All', *[label for _, label in lre]], range=(0,6))
fig.update_yaxes(range=(-0.3,0.3))

fig.show()





In [66]:
# Metadata indexed by sample id, restricted to and ordered like the rows of xm
mdatam = mdata.set_index('Seqn').loc[xm.index.tolist()]

In [67]:
# Lipid classes: (pattern searched in TP_ID, axis label).
# The LPC entry previously repeated the LPE pattern (copy-paste), so the 'LPC'
# column showed LPE metabolites.
lre = [
    (r"LMSD{ PE [0-9]", 'PE'),
    (r"LMSD{ PC [0-9]", 'PC'),
    (r"LMSD{ LPE [0-9]", 'LPE'),
    (r"LMSD{ LPC [0-9]", 'LPC')
]

fig = go.Figure()

# Per-metabolite difference of group means (D minus C).
# NOTE(review): groups 'D'/'C' differ from the 'PBS'/'B1-8'/'A12' labels used in
# the ANOVA cell -- confirm which labels this dataset actually has.
_samples_d = mdatam.loc[mdatam.Group == 'D'].index
_samples_c = mdatam.loc[mdatam.Group == 'C'].index
_difference = xm.loc[_samples_d, :].mean() - xm.loc[_samples_c, :].mean()

# All metabolites, jittered horizontally around x=1
points = _difference
fig.add_trace(go.Scatter(
    y = points,
    x = np.random.uniform(0.5,1.5, len(points)),
    mode='markers', marker_size=1, marker_color=palette[0], showlegend=False
))

# Only metabolites measured in xm are classified; str() guards against
# missing (non-string) TP_ID values.
_tp_ids = m2i.loc[xm.columns, 'TP_ID']

# One jittered column per lipid class at x = 2, 3, 4, 5
for _m, (_pattern, _label) in enumerate(lre):
    myL = _tp_ids[[bool(re.search(_pattern, str(tp))) for tp in _tp_ids]].index

    # Column means are independent, so the class values are a subset of the
    # overall difference
    points = _difference[myL]

    fig.add_trace(go.Scatter(
        y = points,
        x = np.random.uniform(_m+1.8,_m+2.2, len(points)),
        mode='markers', marker_size=3, marker_color=palette[_m+1], showlegend=False
    ))

fig.update_layout(width=700)
fig.update_xaxes(tickvals=[1,2,3,4,5], ticktext=['All', *[label for _, label in lre]], range=(0,6))
#fig.update_yaxes(range=(-0.3,0.3))

fig.show()


