In [25]:
#
# Import Libraries
#

import os
import pandas as pd
import pickle
import sys
import networkx as nx
import numpy as np
from scipy.stats import poisson
import itertools

from scipy.stats import pearsonr, spearmanr, kendalltau

from functools import reduce

from plotly import graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly

import re

palette = plotly.colors.qualitative.Plotly

from pyvis.network import Network

import subprocess

utilsPath = r'S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\utils'
if utilsPath not in sys.path:
    sys.path.append(utilsPath)

In [26]:
#
# Paths & Constants
#
# All inputs are absolute paths on the S: drive; the notebook cannot run
# unless that drive is available.

# Pickled correlation and p-value objects (loaded with pickle further below)
corr_path = r'S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Analysis\02-Correlations\PESA_V2\corr.pkl'
pval_path = r'S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Analysis\02-Correlations\PESA_V2\pvals.pkl'

# Tab-separated feature matrices: xm from the Metabolomics folder, xq from the Proteomics folder
xm_path = r'S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Metabolomics\PESA_V2\WorkingFiles\Xm_norm.tsv'
xq_path = r'S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Proteomics\PESA_V2\WorkingFiles\Xq_minus_X_norm.tsv'

# Tab-separated feature annotation tables (read with index_col='fid')
m2i_path = r'S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Metabolomics\PESA_V2\WorkingFiles\f2i.tsv'
q2i_path = r'S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Proteomics\PESA_V2\WorkingFiles\q2info.tsv'

# Tab-separated sample metadata table
mdata_path = r'S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Metadata\PESA_V2\WorkingFiles\main_metadata.tsv'

In [27]:
# Feature matrices, indexed by their sample-identifier column.
# NOTE(review): the identifier column is spelled 'seqn' in one file and
# 'Seqn' in the other; confirm this matches the files on disk.
xq = pd.read_csv(
    xq_path,
    index_col='seqn',
    sep='\t',
)
xm = pd.read_csv(
    xm_path,
    index_col='Seqn',
    sep='\t',
)

# Feature annotation tables, indexed by feature id.
q2i = pd.read_csv(
    q2i_path,
    index_col='fid',
    sep='\t',
)
m2i = pd.read_csv(
    m2i_path,
    index_col='fid',
    sep='\t',
)

# Sample metadata (kept with its default integer index).
mdata = pd.read_csv(
    mdata_path,
    sep='\t',
)

In [28]:
#
# Read raw data from which the graph will be built
#

# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load pickles produced by this project's own pipeline.
with open(corr_path, 'rb') as f:
    # The pickle holds a two-element sequence, unpacked as (corr, corrSL).
    corr, corrSL = pickle.load(f)

with open(pval_path, 'rb') as f:
    # Two-element sequence unpacked as (pv, adpv); presumably raw and adjusted
    # p-values - confirm. Note that the name `pv` is rebound further down
    # (`pv = 0.05`), so the object loaded here is not kept under that name.
    pv, adpv = pickle.load(f)

Analizar orden y tamaño del grafo usando diferentes tipos de correlaciones y umbrales de FDR

In [29]:
#
# Sweep FDR thresholds for every correlation type and record, for each omic
# pair (qq, mm, qm), the number of non-isolated nodes and the number of edges
# of the resulting graph. Results are stored in `plot[ctype][omic]`.
#

ctypes = ['rpc', 'psk']              # correlation types evaluated on qq and mm
ctypes2 = ['rcca', 'psk', 'cca']     # correlation types evaluated on qm

pvrange = np.arange(0,0.2,0.05)


def _nodes_and_edges(graph):
    """Return (number of nodes with degree > 0, number of edges) of `graph`."""
    connected = sum(1 for _, degree in graph.degree() if degree > 0)
    return connected, graph.size()


def _bipartite_adjacency(block):
    """Embed a rectangular weight block into a square adjacency matrix.

    The result is labelled by the rows of `block` followed by its columns,
    with zero diagonal blocks and `block` / `block.T` off the diagonal.
    """
    rows, cols = block.index, block.columns
    # The zero blocks are created directly as numeric frames instead of
    # calling fillna(0) on an empty object-dtype frame, which depends on
    # pandas silently downcasting object columns.
    top = pd.DataFrame(0.0, index=rows, columns=rows).join(block)
    bottom = block.T.join(pd.DataFrame(0.0, index=cols, columns=cols))
    return pd.concat([top, bottom])


plot = {}

# Within-omic graphs. The sweep variable is called `fdr` so that the
# notebook-level name `pv` is not overwritten by the loop.
for ctype in ctypes:
    plot[ctype] = {}
    for omic in ('qq', 'mm'):
        counts = {'nodes': [], 'edges': []}
        plot[ctype][omic] = counts

        # These matrices do not depend on the threshold: fetch them once.
        weights = getattr(getattr(corr, ctype), omic).dc
        pvalues = getattr(adpv[ctype], omic).dc

        for fdr in pvrange:
            # Boolean-frame indexing returns a new frame, so no copy is needed
            graph = nx.from_pandas_adjacency(weights[pvalues <= fdr].fillna(0))
            n_nodes, n_edges = _nodes_and_edges(graph)
            counts['nodes'].append(n_nodes)
            counts['edges'].append(n_edges)

# Cross-omic (qm) graphs.
for ctype in ctypes2:
    counts = {'nodes': [], 'edges': []}
    plot.setdefault(ctype, {})['qm'] = counts

    weights = getattr(corr, ctype).qm.dc
    pvalues = adpv[ctype].qm.dc

    for fdr in pvrange:
        # NOTE(review): qm uses a strict '<' while qq/mm use '<='; kept as in
        # the original - confirm whether the difference is intended.
        block = weights[pvalues < fdr].fillna(0)
        graph = nx.from_pandas_adjacency(_bipartite_adjacency(block))
        n_nodes, n_edges = _nodes_and_edges(graph)
        counts['nodes'].append(n_nodes)
        counts['edges'].append(n_edges)

In [30]:
from plotly.subplots import make_subplots
from plotly import graph_objects as go
import plotly.express as px

palette = px.colors.qualitative.Plotly

file = 'Plots/FDR_CorrType.html'
# Make sure the output folder exists and start from a clean file
os.makedirs(os.path.dirname(file), exist_ok=True)
if os.path.exists(file):
    os.remove(file)

fig = make_subplots(rows=3, cols=2, subplot_titles=['qq - Nodes', 'qq - Edges','mm - Nodes', 'mm - Edges', 'qm - Nodes', 'qm - Edges'])

# Colour and legend group are keyed by the correlation type NAME. Keying them
# by list position made different types from `ctypes` and `ctypes2` share a
# colour and a legend group.
_all_ctypes = list(dict.fromkeys([*ctypes, *ctypes2]))
_ctype_colour = {ctype: palette[k % len(palette)] for k, ctype in enumerate(_all_ctypes)}
_in_legend = set()


def _add_count_trace(ctype, omic, metric, row, col):
    """Add the `metric` ('nodes' or 'edges') curve of `ctype`/`omic` to a subplot."""
    fig.add_trace(go.Scatter(
        x=pvrange,
        y=plot[ctype][omic][metric],
        name=ctype, marker_color=_ctype_colour[ctype], legendgroup=ctype,
        # show each correlation type only once in the legend
        showlegend=ctype not in _in_legend,
    ), row=row, col=col)
    _in_legend.add(ctype)


# Rows 1-2: within-omic graphs (qq, mm); columns: nodes, edges
for ctype in ctypes:
    for row, omic in enumerate(('qq', 'mm'), start=1):
        for col, metric in enumerate(('nodes', 'edges'), start=1):
            _add_count_trace(ctype, omic, metric, row, col)

# Row 3: cross-omic graph (qm)
for ctype in ctypes2:
    for col, metric in enumerate(('nodes', 'edges'), start=1):
        _add_count_trace(ctype, 'qm', metric, 3, col)

fig.update_layout(title='Number of nodes and edges per correlation type', width=700, height=500)
fig.show()

with open(file, 'a') as f:
    f.write(fig.to_html(full_html=False, include_plotlyjs='cdn', default_height='50%', default_width='80%'))


Construcción de grafo usando Graphical Lasso y rCCA con FDR <0.05

In [31]:
# Checkpoint: restore the objects written by the checkpoint-dump cell, when a
# checkpoint file is available. On a fresh run the file does not exist yet
# (it is written further below), so the load is skipped and the following
# cell builds the objects from scratch.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
if os.path.exists('sourceData.pkl'):
    with open('sourceData.pkl', 'rb') as f:
        qqc,qqp,qqg,mmc,mmp,mmg,qmc,qmp,qmg = pickle.load(f)

In [32]:
# Threshold on the adjusted p-values used to keep an edge
pv = 0.05

# Proteomics graph: weights from corr.rpc, kept where adpv.rpc <= pv
qqc = corr.rpc.qq.dc.copy()
qqp = adpv.rpc.qq.dc.copy()
qqg = nx.from_pandas_adjacency(qqc[qqp<=pv].fillna(0))

# Metabolomics graph, built the same way
mmc = corr.rpc.mm.dc.copy()
mmp = adpv.rpc.mm.dc.copy()
mmg = nx.from_pandas_adjacency(mmc[mmp<=pv].fillna(0))

# Cross-omic graph: weights from corr.rcca, kept where adpv.rcca < pv
# NOTE(review): strict '<' here versus '<=' above; kept as in the original.
qmc = corr.rcca.qm.dc.copy()
qmp = adpv.rcca.qm.dc.copy()
qmg = qmc[qmp<pv].fillna(0)

# Embed the rectangular block into a square adjacency matrix labelled by its
# rows followed by its columns. The zero diagonal blocks are created directly
# as numeric frames instead of calling fillna(0) on an empty object-dtype
# frame, which depends on pandas silently downcasting object columns.
qmg = pd.concat([
    pd.DataFrame(0.0, index=qmg.index, columns=qmg.index).join(qmg),
    qmg.T.join(pd.DataFrame(0.0, index=qmg.columns, columns=qmg.columns))
])

# Node labels: replace '.' by '-' on both axes
qmg.index = [i.replace('.', '-') for i in qmg.index]
qmg.columns = [i.replace('.', '-') for i in qmg.columns]
qmg = nx.from_pandas_adjacency(qmg)

In [33]:
# Checkpoint: persist the matrices and graphs built above.
# The context manager guarantees the file is flushed and closed even if
# pickling fails part-way.
with open('sourceData.pkl', 'wb') as f:
    pickle.dump([qqc,qqp,qqg,mmc,mmp,mmg,qmc,qmp,qmg], f)

In [34]:
#
# Generate random graphs
#
# For each omic, n random graphs with the same number of nodes and edges as
# the observed graph; the loop counter is used as the seed of each graph.
n = 100

qqA, mmA = [], []
for seed in range(n):
    qqA.append(nx.gnm_random_graph(qqg.order(), qqg.size(), seed=seed))
    mmA.append(nx.gnm_random_graph(mmg.order(), mmg.size(), seed=seed))

In [35]:
#
# Basic graph characteristics
#
# For each omic graph, print summary statistics next to the mean and standard
# deviation of the same statistic over the matching random graphs, then plot
# the degree distribution against a Poisson reference.

for G, GA, name in [(qqg, qqA, 'Proteomics'), (mmg, mmA, 'Metabolomics')]:

    print()
    print(name)
    print()

    print(f'Order: {G.order()}')
    print(f'Size: {G.size()}')
    print(f'Density: {round(nx.density(G), 5)}')

    rnd = [nx.average_clustering(r) for r in GA]
    print(f'Average clustering: {round(nx.average_clustering(G), 5)} (random graph: mean = {round(np.mean(rnd),5)}, std = {round(np.std(rnd),5)})')
    print(f'Average degree: {round(np.mean(list(dict(G.degree).values())),5)}')

    rnd = [len(list(nx.connected_components(r))) for r in GA]
    print(f'Number of connected components: {round(len(list(nx.connected_components(G))), 5)} (random graph: mean = {round(np.mean(rnd), 5)}, std = {round(np.std(rnd), 5)})')

    # Same count, ignoring isolated nodes (components with a single node)
    rnd = [len([c for c in nx.connected_components(r) if len(c)>1]) for r in GA]
    print(f'Number of connected components (order g.t. 1): {round(len([c for c in nx.connected_components(G) if len(c)>1]), 5)} (random graph: mean = {round(np.mean(rnd),5)}, std = {round(np.std(rnd),5)})')

fig = make_subplots(rows=1, cols=2, subplot_titles=['Proteomics', 'Metabolomics'])

for col, graph in enumerate([qqg, mmg], start=1):
    plot = nx.degree_histogram(graph)
    degrees = np.arange(0, len(plot))

    # Observed degree histogram
    fig.add_trace(go.Bar(
        x=degrees,
        y=plot,
        offsetgroup=1, marker_color=palette[col-1]
    ), row=1, col=col)

    # Poisson reference with mean 2*size/order (the average degree), computed
    # from the graph shown in THIS panel. Previously both panels used the
    # leftover loop variable `G`, i.e. the metabolomics graph.
    fig.add_trace(go.Bar(
        x=degrees,
        y=graph.order()*poisson.pmf(degrees, 2*graph.size()/graph.order()),
        offsetgroup=1, marker_color='black', opacity=0.2
    ), row=1, col=col)

# Last expression of the cell: the returned figure is displayed
fig.update_layout(bargap=0.2, title='Degree Distribution', showlegend=False)


Proteomics

Order: 249
Size: 2546
Density: 0.08246
Average clustering: 0.25056 (random graph: mean = 0.08267, std = 0.0023)
Average degree: 20.4498
Number of connected components: 3 (random graph: mean = 1.0, std = 0.0)
Number of connected components (order g.t. 1): 1 (random graph: mean = 1.0, std = 0.0)

Metabolomics

Order: 2062
Size: 6134
Density: 0.00289
Average clustering: 0.3258 (random graph: mean = 0.0028, std = 0.00057)
Average degree: 5.94956
Number of connected components: 904 (random graph: mean = 5.84, std = 2.48081)
Number of connected components (order g.t. 1): 83 (random graph: mean = 1.04, std = 0.19596)


In [36]:
#
# Number of connected components with different sizes
#
# One bar chart per omic: x = component order (number of nodes),
# y = how many connected components have that order.

fig = make_subplots(rows=1, cols=2, subplot_titles=['Proteomics', 'Metabolomics'])

for n, G in enumerate([qqg, mmg]):
    orders = [len(component) for component in nx.connected_components(G)]
    plot = pd.Series(orders).value_counts().sort_index()
    bars = go.Bar(
        x=list(map(str, plot.index)),
        y=plot.values,
        width=0.5,
        text=plot.values,
        textposition='auto',
    )
    fig.add_trace(bars, row=1, col=n+1)

fig.update_layout(
    bargap=1,
    title='Number of connected components with different order',
    showlegend=False,
)
fig.show()

In [37]:
#
# Number of cliques with different orders
#
# One bar chart per omic: x = clique order (number of nodes),
# y = how many of the cliques returned by nx.find_cliques have that order.

fig = make_subplots(rows=1, cols=2, subplot_titles=['Proteomics', 'Metabolomics'])

for n, G in enumerate([qqg, mmg]):
    orders = [len(clique) for clique in nx.find_cliques(G)]
    plot = pd.Series(orders).value_counts().sort_index()
    bars = go.Bar(
        x=list(map(str, plot.index)),
        y=plot.values,
        width=0.5,
        text=plot.values,
        textposition='auto',
    )
    fig.add_trace(bars, row=1, col=n+1)

fig.update_layout(
    bargap=1,
    title='Number of cliques with different order',
    showlegend=False,
)
fig.show()

In [38]:
#
# Build complete graph adding omic type to each node
#

g = nx.compose_all([qqg, mmg, qmg])

# 'group' attribute: 'q' for features listed in q2i, 'm' for features listed
# in m2i (an id present in both tables ends up as 'm').
attr = {feature: 'q' for feature in q2i.index}
attr.update({feature: 'm' for feature in m2i.index})
nx.set_node_attributes(g, attr, 'group')

# 'title' attribute: the qdesc column for q2i features, TP_ID for m2i features.
attr = {feature: q2i.loc[feature, 'qdesc'] for feature in q2i.index}
attr.update({feature: m2i.loc[feature, 'TP_ID'] for feature in m2i.index})
nx.set_node_attributes(g, attr, 'title')

# Disabled: explicit node colours
# attr = {}
# _ = [attr.update({i:'blue'}) for i in q2i.index]
# _ = [attr.update({i:'red'}) for i in m2i.index]
# nx.set_node_attributes(g, attr, 'color')

OBTAIN CLUSTERS/COMMUNITIES

In [39]:
#
# Detect communities (clusters) 
# Louvain & Leiden; Finally we use Leiden

# comm = nx_comm.louvain_communities(g, resolution=1, seed=0)

In [40]:
# Get clustering consensus 

import leidenalg
import igraph as ig
import networkx.algorithms.community as nx_comm

# Scratch values left over from interactive development.
# `n` and `thr` are assigned again (n = 100, thr = 10) in a later cell before
# get_communities is first called; `G` is not read by the functions below,
# which take the graph as a parameter.
G= mmg
n = 4
thr=10


def jaccard(list1, list2):
    """Jaccard similarity between two collections of hashable items.

    Parameters
    ----------
    list1, list2 : sized iterables of hashable items
        Each collection is expected to hold unique items: the union size is
        derived from ``len(list1) + len(list2)``, so duplicates would inflate it.

    Returns
    -------
    float
        ``|intersection| / |union|``. When both collections are empty the
        union is empty as well and 0.0 is returned instead of dividing by zero.
    """
    intersection = len(set(list1).intersection(list2))
    union = (len(list1) + len(list2)) - intersection
    if union == 0:
        # Both inputs are empty: no overlap can be measured
        return 0.0
    return float(intersection) / union

def get_communities(G, n=5, thr=5):
    """Consensus communities from repeated Leiden partitions of `G`.

    An initial Leiden partition (modularity) is refined over up to `n` rounds.
    In each round a new partition is computed with the round number as seed;
    every current community is intersected with the new community it overlaps
    most (highest Jaccard index), the result is split into the connected
    components it induces in `G`, and only components with more than `thr`
    nodes are kept.

    Parameters
    ----------
    G : networkx.Graph
        Graph to partition.
    n : int
        Maximum number of refinement rounds.
    thr : int
        Communities must have strictly more than `thr` nodes to be kept.

    Returns
    -------
    list of list of list
        ``result[0]`` holds the initial communities larger than `thr`;
        ``result[k]`` holds the communities after round ``k``. The list is
        shorter than ``n + 1`` when a round leaves no community, in which
        case its last element is empty.
    """
    # The igraph conversion does not depend on the seed: do it once instead
    # of once per partition.
    graph = ig.Graph.from_networkx(G)

    def leiden_partition(seed):
        """Node lists of the Leiden communities obtained with `seed`."""
        partition = leidenalg.find_partition(
            graph, leidenalg.ModularityVertexPartition,
            n_iterations=-1, seed=seed
        )
        return [list(sub.to_networkx().nodes) for sub in partition.subgraphs()]

    # NOTE(review): the initial partition is seeded with -1; confirm how
    # leidenalg interprets a negative seed.
    comm = leiden_partition(-1)

    # Only the recorded snapshot is filtered by size; the unfiltered
    # partition is what enters the first refinement round.
    commL = [[c for c in comm if len(c)>thr]]

    for i in range(n):
        commi = leiden_partition(i)

        # Keep, for every current community, the part shared with its
        # best-matching community in the new partition
        comm = [
            np.intersect1d(j, commi[np.argmax([jaccard(j,k) for k in commi])]) for j in comm
        ]

        # An intersection may be disconnected in G: split it into connected
        # components and drop the small ones
        comm = [list(k) for j in comm for k in nx.connected_components(G.subgraph(j)) if len(k)>thr]
        commL.append(comm)
        if len(comm)==0: return commL

    return commL


#qqcom = get_communities(qqg, n=n, thr=10)
#mmcom = get_communities(mmg, n=n, thr=10)

In [41]:
#
# Apply get_communities to the random graphs to choose the number of
# consensus iterations.
# The following cells look for the first iteration at which the fraction of
# random graphs that still contain a community with more than thr nodes falls
# below pvThr (0.05, i.e. 5%).
#

thr = 10 # Communities must have more than thr nodes to be kept
pvThr = 0.05 # Maximum fraction of random graphs with a community larger than thr
n = 100 # Maximum number of consensus iterations

# One get_communities run per random graph
commLqq = [get_communities(i, n=n, thr=thr) for i in qqA]
commLmm = [get_communities(i, n=n, thr=thr) for i in mmA]

In [42]:
# Get fraction of random graph containing communities after x iterations

def _fraction_with_communities(runs, n_steps):
    """Fraction of runs that still hold at least one community at each step.

    `runs` is a list of get_communities results (one per random graph). A run
    that stopped before step k counts as having no community at step k.
    Returns an array of length `n_steps`.
    """
    counts = [
        [len(run[step]) if len(run)>step else 0 for step in range(n_steps)]
        for run in runs
    ]
    return (np.array(counts)>0).sum(axis=0)/len(runs)


def _first_step_below(fractions, threshold, label):
    """Index of the first step whose fraction is strictly below `threshold`.

    Raises ValueError with an explicit message when no step qualifies,
    instead of an IndexError raised from indexing an empty array.
    """
    below = np.argwhere(fractions < threshold)
    if below.size == 0:
        raise ValueError(
            f'{label}: the fraction of random graphs with communities never '
            f'drops below {threshold}; increase n or revise thr'
        )
    return below[0][0]


# Step 0 is the initial partition, steps 1..n are the consensus iterations
commLqqn = _fraction_with_communities(commLqq, n+1)
commLmmn = _fraction_with_communities(commLmm, n+1)

pvThr = 0.05
qqThr = _first_step_below(commLqqn, pvThr, 'Proteomics')
mmThr = _first_step_below(commLmmn, pvThr, 'Metabolomics')

print(f"Protoemics minimum iteration: {qqThr+1}")
print(f"Metabolomics minimum iteration: {mmThr+1}")

Protoemics minimum iteration: 5
Metabolomics minimum iteration: 5


In [43]:
# Number of steps shown on the x axis
i = 40

# Output file: later cells append their figures to this same file
file = 'Plots/ClusterConsensus.html'
if os.path.exists(file):
    os.remove(file)


fig = go.Figure()
for label, fractions in (('Proteomics', commLqqn), ('Metabolomics', commLmmn)):
    curve = go.Scatter(
        x=np.arange(1, i+1),
        y=fractions[:i],
        mode='lines+markers',
        name=label,
    )
    fig.add_trace(curve)

# Reference line at the 0.05 fraction
fig.add_hline(y=0.05,line_width=1, line_dash="dot", line_color="black")
fig.update_layout(title='Fraction of Random Networks with communities vs N. Iterations')
fig.update_xaxes(title='Iterations')
fig.update_yaxes(title='Fraction of Random Networks')

fig.show()
with open(file, 'a') as f:
    f.write(fig.to_html(full_html=False, include_plotlyjs='cdn', default_height='50%', default_width='80%'))

In [44]:
# Consensus communities of the observed graphs, using the same n and thr as
# for the random graphs. Each result holds one list of communities per step.
qqcoms = get_communities(qqg, n=n, thr=thr)
mmcoms = get_communities(mmg, n=n, thr=thr)

In [45]:
# Number of steps shown on the x axis
i = 40


fig = make_subplots(rows=1, cols=2, subplot_titles=['Number of communities', 'Number of features'])

steps = np.arange(1,i+1)
series = [
    ('Proteomics', qqcoms, palette[0]),
    ('Metabolomics', mmcoms, palette[1]),
]

# Left panel: number of communities at each step
for label, coms, colour in series:
    n_communities = [len(step) for step in coms]
    fig.add_trace(go.Scatter(
        x=steps,
        y=n_communities[:i],
        mode='lines+markers',
        name=label, marker_color=colour
    ), col=1, row=1)

# Right panel: total number of features inside communities at each step
for label, coms, colour in series:
    n_features = [sum(len(community) for community in step) for step in coms]
    fig.add_trace(go.Scatter(
        x=steps,
        y=n_features[:i],
        mode='lines+markers',
        name=label, marker_color=colour
    ), col=2, row=1)

# Iteration selected for each omic from the random-graph analysis
fig.add_vline(x=qqThr+1,line_width=1, line_dash="dot", line_color="black",
              annotation_text='Prot. Thr.', annotation_position='bottom right')
fig.add_vline(x=mmThr+1,line_width=1, line_dash="dot", line_color="black",
              annotation_text='Metab. Thr.', annotation_position='top right')

fig.update_xaxes(title='Iterations')

fig.show()
# Appended to the file opened by the previous plotting cell
with open(file, 'a') as f:
    f.write(fig.to_html(full_html=False, include_plotlyjs='cdn', default_height='50%', default_width='80%'))

In [46]:
# Final communities: the step selected from the random-graph analysis.
# NOTE(review): get_communities returns early when a step leaves no
# community, so qqcoms / mmcoms may be shorter than qqThr + 1 / mmThr + 1,
# in which case this indexing raises IndexError.
qqcom = qqcoms[qqThr]
mmcom = mmcoms[mmThr]

In [73]:
# Number of features per cluster

fig = make_subplots(rows=1, cols=2, subplot_titles=['Proteomics', 'Metabolomics'])

# Proteomics: every cluster
fig.add_trace(go.Bar(
    x=[str(k) for k in range(len(qqcom))],
    y=[len(community) for community in qqcom],
    width=0.1, showlegend=False
), row=1, col=1)

# Metabolomics: only the hand-picked clusters listed in mmcl
mmcl = [4,6,20,21]
shown = [k for k in range(len(mmcom)) if k in mmcl]
fig.add_trace(go.Bar(
    x=[str(k) for k in shown],
    y=[len(mmcom[k]) for k in shown],
    width=0.2, showlegend=False
), row=1, col=2)

fig.update_xaxes(title='Cluster')
fig.update_yaxes(title='N. Features')

fig.show()
# Appended to the file opened by an earlier plotting cell
with open(file, 'a') as f:
    f.write(fig.to_html(full_html=False, include_plotlyjs='cdn', default_height='50%', default_width='80%'))

In [48]:
# import socket
# import json

# json_data = json.dumps(qqcom)
# s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
# s.connect(('127.0.0.1', 8080))
# s.send(json_data.encode())

Module Eigenvector

In [71]:
from sklearn.decomposition import PCA

# One single-component PCA per cluster, fitted on the columns of that cluster
qqcomPCA = [PCA(n_components=1).fit(xq[com]) for com in qqcom]
mmcomPCA = [PCA(n_components=1).fit(xm[com]) for com in mmcom]

fig = make_subplots(rows=1, cols=2, subplot_titles=['Proteomics', 'Metabolomics'])

# Bars: variance ratio explained by the first component of each cluster
panels = [
    (qqcomPCA, 0.1, 1),
    (mmcomPCA, 0.2, 2),
]
for models, bar_width, col in panels:
    fig.add_trace(go.Bar(
        x=[str(k) for k in range(len(models))],
        y=[model.explained_variance_ratio_[0] for model in models],
        width=bar_width, showlegend=False
    ), row=1, col=col)

fig.update_yaxes(range=(0,1))
fig.update_xaxes(title='Cluster')
# Last expression of the cell: the returned figure is displayed
fig.update_layout(title='Ratio of Explained Variance in the 1st PC')

In [50]:
# Projection of every sample on the first principal component of each
# cluster: one column per cluster, one row per sample.
qqcomE = pd.DataFrame(
    np.column_stack([
        PCA(n_components=1).fit_transform(xq[com])[:,0] for com in qqcom
    ]),
    index=xq.index
)

mmcomE = pd.DataFrame(
    np.column_stack([
        PCA(n_components=1).fit_transform(xm[com])[:,0] for com in mmcom
    ]),
    index=xm.index
)

In [51]:
from PCA_UMAP import PCA_Var

# Metadata variables passed to PCA_Var: categorical and continuous
catVars = ['Group', 'Smoke_dummy']
conVars = ['Calcium_Score', 'HDL', 'LDL', 'Total_Cholesterol','Ox-LDL','Lipoprotein a',
           'CRP', 'Plaque_thickness','Framingham 10y','Framingham 30y','Systolic Blood Pressure',
           'Diastolic Blood Pressure']

# One PCA_Var result per cluster, stacked and transposed so that each column
# is a cluster. PCA_Var is a project helper whose output is not visible here;
# presumably one row per component with one value per metadata variable -
# TODO confirm.
qqCP = pd.concat([
    PCA_Var(xq[com], mdata, conVars, catVars+['Cohort'], n_comp=1)
    for com in qqcom
]).reset_index(drop=True).T

# NOTE(review): metabolomics uses n_comp=2 and keeps only the row labelled 1,
# while proteomics uses n_comp=1 and keeps everything; confirm both select the
# same component.
mmCP = pd.concat([
    PCA_Var(xm[com], mdata, conVars, catVars+['Cohort'], n_comp=2).loc[[1]]
    for com in mmcom
]).reset_index(drop=True).T


# Apply FDR B-H correction

from statsmodels.stats.multitest import multipletests

# The first row of qqCP is dropped before the correction (presumably it is
# not a p-value - TODO confirm). After the transpose each column is one
# variable, so the correction is applied per variable across clusters.
qqCPF = qqCP.iloc[1:, :].copy().T
for i in qqCPF.columns:
    qqCPF[i] = multipletests(qqCPF.loc[:,i], method='fdr_bh')[1]

# Same procedure for the metabolomics clusters
mmCPF = mmCP.iloc[1:, :].copy().T
for i in mmCPF.columns:
    mmCPF[i] = multipletests(mmCPF.loc[:,i], method='fdr_bh')[1]

In [69]:
# Display the adjusted values of the metabolomics clusters from column
# position 10 onwards (rows: variables, columns: clusters)
mmCPF.T.iloc[:, 10:]
# qqCPF.T

Unnamed: 0,10,11,12,13,14,15,16,17,18,19,20,21
Calcium_Score,0.984459,0.224568,0.121376,0.755498,0.755498,0.755498,0.755498,0.719862,0.559008,0.559008,0.01142668,0.1028507
HDL,0.001193,0.454819,0.501365,0.0027,0.937431,0.937431,0.501063,0.797933,0.428724,0.937431,0.02806867,3.325185e-25
LDL,0.295739,0.997812,0.59163,0.000403,0.560855,0.050137,2e-05,0.213308,0.955782,0.087568,0.3470535,0.5952995
Total_Cholesterol,0.965364,0.965364,0.733944,0.000217,0.610684,0.038207,2.3e-05,0.610684,0.930227,0.129676,0.965364,0.6106845
Ox-LDL,0.48843,0.629136,0.908805,0.066926,0.656726,0.057917,0.066926,0.629136,0.48843,0.48843,0.6291359,0.003407194
Lipoprotein a,0.987602,0.987602,0.987602,0.987602,0.987602,0.987602,0.987602,0.987602,0.987602,0.987602,0.9876022,0.9876022
CRP,0.198141,0.337973,0.091609,0.39153,0.741461,0.337973,0.002784,0.337973,0.337973,0.337973,0.00993138,0.09160925
Plaque_thickness,0.732485,0.919827,0.812775,0.812775,0.732485,0.91631,0.812775,0.812775,0.177882,0.603756,0.0004460482,0.01647297
Framingham 10y,0.188924,0.408702,0.408702,8e-06,0.915262,0.175559,0.002556,0.623296,0.096635,0.188924,3.685622e-06,1.363802e-15
Framingham 30y,0.22126,0.219753,0.220848,7.5e-05,0.477548,0.12953,0.001829,0.11828,0.09772,0.219753,5.207454e-08,5.615226000000001e-17


In [53]:
# The PC1 of qq1 and qq3 separates the groups: draw a violin plot of the projection

# Metadata rows of the samples in qqcomE, split by the 'Group' column
tmp = {
    group: rows
    for group, rows in mdata.set_index('Seqn').loc[qqcomE.index].reset_index(names='Seqn').groupby('Group')
}
# Long format: one row per (sample, cluster) with the PC1 projection as value
tmp2 = pd.melt(qqcomE.reset_index(names='Seqn'), id_vars='Seqn', value_vars=qqcomE.columns).set_index('Seqn')

fig = go.Figure()
# Split violins: group C on the left half, group D on the right half
for group, side, colour in (('C', 'negative', palette[0]), ('D', 'positive', palette[1])):
    rows = tmp2.loc[tmp[group].Seqn]
    fig.add_trace(go.Violin(
        x=rows.variable,
        y=rows.value,
        legendgroup=group, scalegroup=group, name=group,
        side=side,
        line_color=colour,
    ))

fig.update_traces(box_visible=False, meanline_visible=True)
fig.update_layout(violinmode='overlay', title='PC1 Projection Distribution per Group')
fig.update_yaxes(title='PC1', range=(-12,12))
fig.update_xaxes(title='Protein Cluster')
fig.show()

In [54]:
# Violin plot of the PC1 projection of selected metabolomics clusters, split
# by group (C on the left half, D on the right half).

# Hand-picked metabolomics clusters (column positions of mmcomE)
clusters = [4,6,20,21]
mmcomE_tmp = mmcomE.iloc[:, clusters]

# Metadata rows of the samples in mmcomE_tmp, split by the 'Group' column
tmp = dict(list(mdata.set_index('Seqn').loc[mmcomE_tmp.index].reset_index(names='Seqn').groupby('Group')))
# Long format: one row per (sample, cluster) with the PC1 projection as value
tmp2 = pd.melt(mmcomE_tmp.reset_index(names='Seqn'), id_vars='Seqn', value_vars=mmcomE_tmp.columns).set_index('Seqn')

fig = go.Figure()
for group, side, colour in (('C', 'negative', palette[0]), ('D', 'positive', palette[1])):
    rows = tmp2.loc[tmp[group].Seqn]
    fig.add_trace(go.Violin(
        # cluster ids as strings so the x axis is categorical
        x=[str(i) for i in rows.variable],
        y=rows.value,
        legendgroup=group, scalegroup=group, name=group,
        side=side,
        line_color=colour,
    ))

fig.update_traces(box_visible=False, meanline_visible=True)
fig.update_layout(violinmode='overlay', title='PC1 Projection Distribution per Group')
fig.update_yaxes(title='PC1', range=(-12,12))
# This figure shows metabolomics clusters; the label was copied from the
# proteomics cell and read 'Protein Cluster'.
fig.update_xaxes(title='Metabolite Cluster')
fig.show()

In [55]:
# Scratch inspection of the annotation of one cluster.
# NOTE(review): the first assignment is overwritten immediately by the
# second one, so only the metabolomics table (cluster 9, TP_ID column) remains.
atable = q2i.loc[qqcom[0]]
atable = m2i.loc[mmcom[9], ['TP_ID']]

In [62]:
def _protein_name(description):
    """Extract the text between 'HUMAN ' and ' OS=' from a protein description.

    When the description is not a string or does not match the pattern, it
    is returned unchanged, so a single unexpected entry no longer aborts the
    whole column with an AttributeError on None.
    """
    if not isinstance(description, str):
        return description
    match = re.search(r'HUMAN (.*) OS=', description)
    return match.group(1) if match else description


# Short protein name derived from the qdesc column
q2i['qName'] = [_protein_name(i) for i in q2i.qdesc]

In [63]:
# Export the edge lists of the three graphs to one workbook, one sheet per
# graph: (first node name, second node name, edge weight).
os.makedirs('Clusters', exist_ok=True)

# The context manager saves and closes the workbook even if one of the sheets
# fails to be written.
with pd.ExcelWriter('Clusters/DCA.xlsx', engine='openpyxl', mode='w') as writer:

    # Proteomics edges, labelled with the qName column of q2i
    pd.DataFrame(
        [(q2i.loc[i0, 'qName'],q2i.loc[i1, 'qName'],w['weight']) for i0, i1, w in qqg.edges.data()]
        ).to_excel(writer, sheet_name='q')

    # Metabolomics edges, labelled with the TP_ID column of m2i
    pd.DataFrame(
        [(m2i.loc[i0,'TP_ID'],m2i.loc[i1,'TP_ID'],w['weight']) for i0, i1, w in mmg.edges.data()]
        ).to_excel(writer, sheet_name='m')

    # Cross-omic edges.
    # NOTE(review): assumes the first node of every qmg edge is in q2i and
    # the second in m2i - confirm.
    pd.DataFrame(
        [(q2i.loc[i0, 'qName'],m2i.loc[i1,'TP_ID'],w['weight']) for i0, i1, w in qmg.edges.data()]
        ).to_excel(writer, sheet_name='qm')

Pertenencia de cada nodo a su cluster

In [74]:
def _community_membership(eigen, data, communities):
    """Per-feature correlation with the eigenvector of its own community.

    Returns a frame indexed by feature with columns ComCorr (Pearson r),
    ComPval (its p-value) and Com (community number).
    """
    stats = {}
    for n, com in enumerate(communities):
        for feature in com:
            r, p = pearsonr(eigen[n], data[feature])
            stats[feature] = {'ComCorr': r, 'ComPval': p, 'Com': n}
    return pd.DataFrame(stats).T


# Annotation tables extended with the membership columns; features outside
# every community get missing values (left join).
q2iC = q2i.join(_community_membership(qqcomE, xq, qqcom), how='left')

m2iC = m2i.join(_community_membership(mmcomE, xm, mmcom), how='left')

In [43]:
# Write feature info tables separated by cluster (one TSV file per cluster)

for com, table in q2iC[q2iC['Com'].notna()].groupby('Com'):
    table.to_csv(f'Clusters/qq_{int(com)}.tsv', sep='\t')

for com, table in m2iC[m2iC['Com'].notna()].groupby('Com'):
    table.to_csv(f'Clusters/mm_{int(com)}.tsv', sep='\t')

In [56]:
# Write feature info tables separated by cluster (one workbook per omic,
# one sheet per cluster)

with pd.ExcelWriter('Clusters/qq_allClusters_V2.xlsx') as writer:
    for com, table in q2iC[q2iC['Com'].notna()].groupby('Com'):
        table.to_excel(writer, sheet_name=f'qq_{int(com)}')

with pd.ExcelWriter('Clusters/mm_allClusters_V2.xlsx') as writer:
    for com, table in m2iC[m2iC['Com'].notna()].groupby('Com'):
        table.to_excel(writer, sheet_name=f'mm_{int(com)}')

Ilustración de clústeres

In [64]:
# One scatter plot per proteomics cluster: correlation of every feature pair
# in corr.rpc.qq.c (x) against corr.rpc.qq.d (y); pairs whose adpv.rpc.qq.dc
# value is below 0.05 are drawn again in a second colour.
file='Plots/qqClusterCorrDif.html'
if os.path.exists(file):
    os.remove(file)

for i, members in enumerate(qqcom):
    # Positions of the strict upper triangle: each unordered pair once
    upper = np.triu_indices(len(members), 1)

    significant = adpv.rpc.qq.dc.loc[members, members].to_numpy()[upper] < 0.05
    rc = corr.rpc.qq.c.loc[members, members].to_numpy()[upper]
    rd = corr.rpc.qq.d.loc[members, members].to_numpy()[upper]

    fig = go.Figure()

    # All pairs
    fig.add_trace(go.Scatter(
        x=rc, y=rd,
        mode='markers', marker_size=2.5, showlegend=False
    ))
    # Significant pairs on top
    fig.add_trace(go.Scatter(
        x=rc[significant], y=rd[significant],
        mode='markers', marker_size=2.5, marker_color=palette[1], showlegend=False
    ))

    fig.update_xaxes(range=(-0.2,0.5), title='Control correlation')
    fig.update_yaxes(range=(-0.2,0.5), title='Disease correlation')
    # Identity line
    fig.add_shape(
        type='line',
        x0=-1, x1=1, y0=-1, y1=1,
        line=dict(width=0.5)
    )
    fig.update_layout(width=600, height=600, title=f'Proteomics Cluster {i}')

    with open(file, 'a') as f:
        f.write(fig.to_html(full_html=False, include_plotlyjs='cdn', default_height='50%', default_width='80%'))

In [65]:
# One scatter plot per metabolomics cluster: correlation of every feature pair
# in corr.rpc.mm.c (x) against corr.rpc.mm.d (y); pairs whose adpv.rpc.mm.dc
# value is below 0.05 are drawn again in a second colour.
file='Plots/mmClusterCorrDif.html'
if os.path.exists(file):
    os.remove(file)

for i, members in enumerate(mmcom):
    # Positions of the strict upper triangle: each unordered pair once
    upper = np.triu_indices(len(members), 1)

    significant = adpv.rpc.mm.dc.loc[members, members].to_numpy()[upper] < 0.05
    rc = corr.rpc.mm.c.loc[members, members].to_numpy()[upper]
    rd = corr.rpc.mm.d.loc[members, members].to_numpy()[upper]

    fig = go.Figure()

    # All pairs
    fig.add_trace(go.Scatter(
        x=rc, y=rd,
        mode='markers', marker_size=2.5, showlegend=False
    ))
    # Significant pairs on top
    fig.add_trace(go.Scatter(
        x=rc[significant], y=rd[significant],
        mode='markers', marker_size=2.5, marker_color=palette[1], showlegend=False
    ))

    fig.update_xaxes(range=(-0.2,0.5), title='Control correlation')
    fig.update_yaxes(range=(-0.2,0.5), title='Disease correlation')
    # Identity line
    fig.add_shape(
        type='line',
        x0=-1, x1=1, y0=-1, y1=1,
        line=dict(width=0.5)
    )
    fig.update_layout(width=600, height=600, title=f'Metabolomics Cluster {i}')

    with open(file, 'a') as f:
        f.write(fig.to_html(full_html=False, include_plotlyjs='cdn', default_height='50%', default_width='80%'))

In [66]:
# R interpreter and script paths (not used in this cell), and output folder
# of the per-cluster tables
Rengine = r"C:\Users\rbarreror\AppData\Local\Programs\R\R-4.2.2\bin\Rscript.exe"
Rpath = r"S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\utils\ChordDiagram.R"
wpath = r"S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Analysis\03-Network\PESA\ChordDiagram"

os.makedirs(wpath, exist_ok=True)


def _upper_triangle_long(matrix, value_name):
    """Melt a square matrix to long format with its lower triangle zeroed.

    The diagonal and lower triangle are set to 0 by building a new array with
    np.triu instead of writing into the array returned by
    DataFrame.to_numpy(), which is only effective when that array is a
    writable view of the frame.
    Returns a frame with columns ['var1', 'var2', value_name].
    """
    upper = pd.DataFrame(
        np.triu(matrix.to_numpy(), k=1),
        index=matrix.index, columns=matrix.columns
    )
    # NOTE(review): id_vars='index' assumes the matrix index is unnamed
    long = pd.melt(upper.reset_index(), id_vars='index', value_vars=upper.columns)
    long.columns = ['var1', 'var2', value_name]
    return long


for omic, coms in [('qq',qqcom), ('mm',mmcom)]:
    for n, comL in enumerate(coms):

        # Adjusted p-values of every ordered pair, in the same melt order as
        # the correlation tables below
        tmp_pv = adpv.rpc[omic].dc.loc[comL, comL]
        tmp_pv = pd.melt(tmp_pv.reset_index(), id_vars='index', value_vars=tmp_pv.columns)
        significant = tmp_pv.value<0.05

        # Correlations taken from .c and .d, restricted to significant pairs
        tmp_c = _upper_triangle_long(corr.rpc[omic].c.loc[comL, comL], 'c')[significant]
        tmp_d = _upper_triangle_long(corr.rpc[omic].d.loc[comL, comL], 'd')[significant]

        tmp = pd.merge(
            tmp_c, tmp_d,
            how='inner', on=['var1', 'var2']
        )

        tmp.to_csv(f'{wpath}/{omic}_{n}.tsv', sep='\t', index=False)


Meta-Network WGCNA

In [75]:
# Compute the meta-network from correlations between the cluster eigenvectors.
# Every entry is ([cluster_a, cluster_b], [Pearson r, p-value]).

import itertools


# Proteomics cluster vs proteomics cluster
metaqq = []
for i0, i1 in itertools.combinations(qqcomE.columns, 2):
    metaqq.append(([i0, i1], list(pearsonr(qqcomE[i0], qqcomE[i1]))))

# Metabolomics cluster vs metabolomics cluster
metamm = []
for i0, i1 in itertools.combinations(mmcomE.columns, 2):
    metamm.append(([i0, i1], list(pearsonr(mmcomE[i0], mmcomE[i1]))))

# Samples present in both omics
cidx = np.intersect1d(qqcomE.index, mmcomE.index)

# Proteomics cluster vs metabolomics cluster, on the shared samples only
metaqm = []
for i0, i1 in itertools.product(qqcomE.columns, mmcomE.columns):
    metaqm.append(([i0, i1], list(pearsonr(qqcomE.loc[cidx, i0], mmcomE.loc[cidx, i1]))))

In [76]:
# Benjamini-Hochberg correction of the p-values of all qm cluster pairs
multipletests([vals[1] for _, vals in metaqm], method='fdr_bh')

(array([False, False, False, False, False, False, False,  True, False,
        False, False, False, False, False, False, False,  True, False,
        False, False, False, False,  True, False, False, False, False,
        False, False, False, False, False, False, False, False,  True,
        False, False, False, False, False, False, False, False,  True,
        False, False, False,  True, False, False, False, False, False,
        False, False, False, False, False, False,  True, False, False,
        False, False, False, False, False, False, False,  True, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False,  True, False,
        False, False, False, False, False,  True,  True, False, False,
        False, False,  True, False, False, False,  True, False, False,
         True,  True]),
 array([5.24897071e-01, 7.49436221e-01, 2.46572634e-01, 1.18169401e-01,
        5.64375252e-01, 3.64710637e-01, 4.92008715e-

In [69]:
metag = nx.Graph()

# One node per cluster.
# NOTE(review): the attribute is spelled 'sizee'; confirm whether 'size'
# (the attribute pyvis uses to scale nodes) was intended.
for i in qqcomE.columns:
    metag.add_node(f'q{i}', group='q', sizee=1-np.log(qqCP.loc['Group',i]))
for i in mmcomE.columns:
    metag.add_node(f'm{i}', group='m', sizee=1-np.log(mmCP.loc['Group',i]))

# Edges: the [2:] slice keeps only the cross-omic list (metaqm).
# NOTE(review): the filter uses the unadjusted p-value (vals[1] < 0.05).
for metaxx, xx in [(metaqq, 'qq'), (metamm, 'mm'), (metaqm, 'qm')][2:]:
    for nodes, vals in metaxx:
        if vals[1]<0.05:
            metag.add_edge(f'{xx[0]}{nodes[0]}', f'{xx[1]}{nodes[1]}', r=vals[0], p=vals[1], omic=xx)

nt = Network(height='500px', width='100%', select_menu=True, filter_menu=True)
nt.force_atlas_2based()
nt.from_nx(metag)

# Node colour by omic
for node in nt.nodes:
    node.update({'color': palette[1] if node['group']=='m' else palette[0]})

# Every edge is grey whatever the sign of r; width grows as p shrinks
for edge in nt.edges:
    edge.update({'color': 'grey'})
    edge.update({'width': -np.log(edge['p'])})

nt.show_buttons(filter_='physics')
nt.show('NT_MetaNetwork_WGCNA.html', notebook=False)

NT_MetaNetwork_WGCNA.html


In [80]:
res = []


# Cross-omic cluster pairs with p-value below 0.01, written as
# (protein cluster, metabolite cluster, Pearson r)
rows = []
for pair, stats in metaqm:
    if stats[1]<0.01:
        rows.append((f'q{pair[0]}', f'm{pair[1]}', stats[0]))

pd.DataFrame(
    rows,
    columns=['var1', 'var2', 'value']
).to_csv('ChordDiagram/meta-network_WGCNA.tsv', sep='\t', index=False)


#res.to_csv('ChordDiagram/meta-network_WGCNA.tsv', sep='\t', index=False)
#res

Conexiones entre qq y mm mediante Binomial, Poisson y Hypergeom

In [81]:
#
# Binomial, Poisson and hypergeometric tests for an excess of qm edges
# between every (protein cluster, metabolite cluster) pair
#
from scipy.stats import binom, poisson, hypergeom

# Positions of the metabolomics clusters whose adjusted 'Group' value is
# below 0.1 (an empty list when none qualifies)
mmclusters = np.flatnonzero((mmCPF.Group<0.1).to_numpy()).tolist()

# Quantities shared by every cluster pair
total_pairs = xq.shape[1]*xm.shape[1]   # all possible protein-metabolite pairs
n_edges = qmg.size()                    # edges present in the qm graph

# The probability estimation for edge existence is the ratio of existing edges
p = n_edges/total_pairs

binRes = []
for qi, mj in itertools.product(range(len(qqcom)), range(len(mmcom))):

    # The number of tests is the total number of possible edges between both clusters
    n = len(qqcom[qi])*len(mmcom[mj])

    # The x value is the number of existing edges between clusters
    x = qmg.subgraph(
        qqcom[qi] + mmcom[mj]
    ).size()

    # P(X >= x) under each model; sf(x-1) equals 1-cdf(x-1) but keeps
    # precision for very small probabilities
    binRes.append(
        [
        (qi, mj), x, n, p,
        binom.sf(x-1, n, p),
        poisson.sf(x-1, n*p),
        hypergeom.sf(x-1, total_pairs, n, n_edges)
        ]
    )

binRes = pd.DataFrame(binRes,
             columns=['iqm', 'x', 'n', 'p', 'Bin', 'Poi', 'Hyp'])

# Benjamini-Hochberg correction of EACH test's own p-values. Previously all
# three *_adj columns were computed from the 'Bin' column.
for i in ['Bin', 'Poi', 'Hyp']:
    binRes[i+'_adj'] = multipletests(binRes[i], method='fdr_bh')[1]

# Keep only the pairs that involve one of the selected metabolomics clusters
binRes = binRes[[i1 in mmclusters for i0,i1 in binRes.iqm]]

In [82]:
res = []

# Correlations with p < 0.05; the masked-out entries become 0 and are ignored below
tmp = qmc[qmp<0.05].fillna(0)
tmp.index = [label.replace('.', '-') for label in tmp.index]

enriched_pairs = binRes[binRes.Hyp_adj<0.01].iqm.tolist()
for pair in enriched_pairs:
    members = itertools.product(qqcom[pair[0]], mmcom[pair[1]])
    pair_corrs = [tmp.loc[q, m] for q,m in list(members)]

    # Median absolute correlation over the non-zero entries of the cluster pair
    r = np.median([abs(value) for value in pair_corrs if value!=0])
    res.append([*pair, r])

res = pd.DataFrame(
    [[f"q{row[0]}", f"m{row[1]}", row[2]] for row in res],
    columns=['var1', 'var2', 'value']
)

res.to_csv('ChordDiagram/meta-network.tsv', sep='\t', index=False)
res


Unnamed: 0,var1,var2,value
0,q0,m4,0.090639
1,q0,m6,0.077262
2,q0,m20,0.082182
3,q0,m21,0.093195
4,q1,m4,0.07624
5,q1,m6,0.091892
6,q1,m20,0.107085
7,q1,m21,0.119917
8,q2,m4,0.075368
9,q2,m6,0.072789


In [21]:
# Meta-network from the enrichment test: one node per cluster and an edge for
# every cluster pair whose 'Hyp_adj' is below pv.
pv = 0.01
metag = nx.Graph()

for cluster in range(len(qqcom)):
    metag.add_node(f'q{cluster}', group='q')
for cluster in range(len(mmcom)):
    metag.add_node(f'm{cluster}', group='m')

for qi, mj in binRes.iqm[binRes['Hyp_adj'] < pv]:
    metag.add_edge(f'q{qi}', f'm{mj}')

In [318]:
# Interactive rendering of the enrichment meta-network
nt = Network(height='500px', width='100%', select_menu=True, filter_menu=True)
nt.force_atlas_2based()
nt.from_nx(metag)

# Colour nodes by omic; all edges are drawn grey (these edges carry no 'p' to scale width with)
for node in nt.nodes:
    node.update({'color': palette[1] if node['group']=='m' else palette[0]})
for edge in nt.edges:
    edge.update({'color': 'grey'})

nt.show_buttons(filter_='physics')
nt.show('NT_MetaNetwork_Hyper.html', notebook=False)

NT_MetaNetwork_Hyper.html


In [87]:
# Intersection WGCNA & Hypergeometric
# Reload both exported edge lists, indexed by the (var1, var2) module pair, so the
# pairs found by each approach can be compared.
# NOTE(review): these absolute paths presumably point at the same files written by the
# relative 'ChordDiagram/...' exports in this notebook — confirm the working directory.
wgcna = pd.read_csv(r"S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Analysis\03-Network\PESA_V2\ChordDiagram\meta-network_WGCNA.tsv", sep='\t', index_col=['var1', 'var2'])
hyper = pd.read_csv(r"S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Analysis\03-Network\PESA_V2\ChordDiagram\meta-network.tsv", sep='\t', index_col=['var1', 'var2'])

In [92]:
# (var1, var2) module pairs present in both the WGCNA and the hypergeometric edge lists
np.intersect1d(
wgcna.index,
hyper.index
)

array([('q2', 'm4'), ('q4', 'm20'), ('q4', 'm21')], dtype=object)

Calcular proteínas más conectadas con metabolitos

In [100]:
# Proteins with more q-m edges than expected (hypergeometric upper tail).
# Population: all possible q-m pairs; successes: the xm.shape[1] pairs involving the
# protein; draws: the edges present in qmg; x: edges of the protein.
# NOTE(review): assumes the first endpoint of every qmg edge is the protein — confirm
q_edge_counts = pd.Series([edge[0] for edge in qmg.edges.data()]).value_counts()

qSig = pd.DataFrame([
    # sf(x-1) == P(X >= x), without the precision loss of 1-cdf(x-1) for tiny p-values
    (q, x, hypergeom.sf(x-1, xq.shape[1]*xm.shape[1], xm.shape[1], qmg.size()))
    for q,x in q_edge_counts.to_dict().items()
], columns=['q', 'x', 'pv']).set_index('q')

qSig['pv_adj'] = multipletests(qSig['pv'], method='bonferroni')[1]

In [101]:
# Add the q2i columns to the per-protein results (index-on-index join).
# NOTE(review): the cell overwrites qSig, so running it twice presumably fails on
# overlapping column names — confirm before re-running.
qSig = qSig.join(q2i)

In [102]:
# Write the annotated per-protein table as TSV into the current working directory
qSig.to_csv('qSig.tsv', sep='\t')

Metabolitos más conectados con proteínas

In [103]:
# Metabolites with more q-m edges than expected (hypergeometric upper tail).
# Population: all possible q-m pairs; successes: the xq.shape[1] pairs involving the
# metabolite; draws: the edges present in qmg; x: edges of the metabolite.
# NOTE(review): assumes the second endpoint of every qmg edge is the metabolite — confirm
m_edge_counts = pd.Series([edge[1] for edge in qmg.edges.data()]).value_counts()

mSig = pd.DataFrame([
    # sf(x-1) == P(X >= x), without the precision loss of 1-cdf(x-1) for tiny p-values
    (m, x, hypergeom.sf(x-1, xq.shape[1]*xm.shape[1], xq.shape[1], qmg.size()))
    for m,x in m_edge_counts.to_dict().items()
], columns=['m', 'x', 'pv']).set_index('m')

mSig['pv_adj'] = multipletests(mSig['pv'], method='bonferroni')[1]

In [105]:
# The bare `mSig` below is not the last statement of the cell, so it displays nothing.
mSig
# Per-metabolite results joined with m2i, keeping the counts, p-values and the TP_ID column
tmp = mSig.join(m2i).loc[:, ['x','pv', 'pv_adj', 'TP_ID']]

Explorar proteínas más asociadas a metabolitos de interés

In [106]:
# Proteins linked to one metabolite of interest, with the edge weight and the q2i columns
metabolite = 'C18N71'

metabolite_edges = [
    (edge[0], edge[2]['weight'])
    for edge in qmg.edges.data()
    if edge[1] == metabolite
]

tmp2 = pd.DataFrame(metabolite_edges, columns=['q', 'r'])
tmp2 = tmp2.set_index('q').join(q2i)

In [109]:
# (feature id, lipid name) pairs to plot
midList = [
    ('C18N756', 'PE(38:4)'),
    ('HILP1984', 'PE(38:4)'),
    ('C18N687', 'PE(36:4)'),
    ('HILP833', 'PC(38:4)'),
    ('C18P829', 'LPE(20:4)'),
    ('HILP658', 'LPE(20:4)'),
    ('C18N301', 'LPE(18:1)'),
    ('HILP632', 'LPE(18:1)'),
    ('C18N217', 'LPC(18:2)'),
    ('C18P834', 'LPC(18:2)'),
    ('HILP669', 'LPC(18:2)'),
]

# Alternative selection:
# midList = [
#     ('HILP520', 'Arginine'),
#     ('HILP78', 'Biliverdin'),
#     ('C18P619', 'Biliverdin'),
#     ('C18P380', 'Atorvastatin'),
# ]

file = 'Plots/AlessiaLipids/PESA_V2_Lipids_Ig_Apo_C.html'

# (regex applied to the protein description, subplot title), one subplot column each
qgroups = [
    ('chain C region', 'Inmunoglobulin C Region'),
    ('Apolipoprotein', 'Apolipoprotein'),
    ('Complement', 'Complement'),
]

# Opening once in 'w' mode replaces any previous report, so re-running the cell
# never appends duplicated figures.
with open(file, 'w') as f:
    for mid in midList:
        fig = make_subplots(rows=1, cols=len(qgroups), subplot_titles=[group[1] for group in qgroups])

        for col, group in enumerate(qgroups):
            fig = plot_qgroup(fig, r=0, c=col, mid=mid[0], regex=group[0], title='', cortype='rcca')
        fig.update_layout(title=f'{mid[1]} | {mid[0]}')

        f.write(fig.to_html(full_html=False, include_plotlyjs='cdn', default_height='50%', default_width='100%'))


# fig.show()

In [107]:
def plot_qgroup(fig, r, c, mid, regex='', title='q Group', cortype='rcca'):
    """Draw control-vs-disease correlations between one metabolite and a group of proteins.

    Collects every edge of ``qmg`` whose second endpoint is ``mid``, keeps the
    proteins whose ``q2i.qdesc`` matches ``regex`` and, in subplot (r, c) of
    ``fig``, draws one line per protein joining its control correlation (x=0)
    to its disease correlation (x=1), plus one box per condition.

    Parameters
    ----------
    fig : figure created with ``make_subplots``; traces are added to it.
    r, c : zero-based row / column of the target subplot.
    mid : metabolite id, matched against the second endpoint of ``qmg`` edges.
    regex : pattern searched in the protein description ('' keeps every protein).
    title : x-axis title of the subplot.
    cortype : attribute of ``corr`` / key of ``adpv`` selecting the correlation type.

    Returns
    -------
    The same ``fig`` with the new traces.
    """
    atable = pd.DataFrame(
    [
        (
        q2i.qdesc[i],
        i,
        k['weight'],
        getattr(corr, cortype).qm.c.loc[i.replace('-','.'), mid],
        getattr(corr, cortype).qm.d.loc[i.replace('-','.'), mid],
        adpv[cortype].qm.c.loc[i.replace('-','.'), mid],
        adpv[cortype].qm.d.loc[i.replace('-','.'), mid]
        )
        for i,j,k in qmg.edges.data() if j==mid
    ],
    columns=['qdesc', 'fid', 'dc_corr', 'c_corr', 'd_corr', 'c_pv', 'd_pv']
    )

    # Explicit boolean row mask: with a plain (possibly empty) list, atable[[]] would
    # select zero columns and the attribute accesses below would fail.
    keep = np.array([bool(re.search(fr'{regex}', str(i))) for i in atable.qdesc], dtype=bool)
    tmp = atable.loc[keep]

    y0 = tmp.c_corr.to_numpy()
    y1 = tmp.d_corr.to_numpy()

    cpv, dpv = tmp.c_pv.to_numpy(), tmp.d_pv.to_numpy()

    # Horizontal jitter around x=0 (control) and x=1 (disease)
    x0 = np.array([0]*len(y0)) + np.random.uniform(-0.05,0.05,len(y0))
    x1 = np.array([1]*len(y1)) + np.random.uniform(-0.05,0.05,len(y1))

    for x_start,x_end,y_start,y_end,name,cpvi,dpvi in zip(x0,x1,y0,y1,tmp.qdesc, cpv, dpv):
        # Descriptions may be missing (NaN); never hand a float to re / plotly
        name = '' if pd.isna(name) else str(name)

        # Label with the gene name. The former pattern required one extra character
        # after the name, which cut its last letter when 'GN=...' ended the description.
        gene = re.search(r'GN=(\S+)', name)
        text = gene.group(1) if gene else ''

        fig.add_trace(
            go.Scatter(
                x=[x_start,x_end],
                y=[y_start,y_end],
                mode='lines+markers+text',
                # Blue marker when the corresponding p-value is below 0.05, grey otherwise
                marker=dict(color=['rgba(48, 39, 245, 0.9)' if cpvi<0.05 else 'grey', 'rgba(48, 39, 245, 0.9)' if dpvi<0.05 else 'grey']),
                line=dict(color="rgba(100,100,100,0.5)"),
                name='',
                hovertemplate=name,
                text=['', text], textposition='middle left',
                showlegend=False,
                line_width=0.8,
                marker_size=4
            ), row=r+1, col=c+1
        )

    fig.add_trace(go.Box(
        y=y0, x=len(y0)*[0],
        name='Control',hoverinfo='skip', width=0.2, showlegend=False,
        marker_color=palette[1], boxpoints = False), row=r+1, col=c+1
    )
    fig.add_trace(go.Box(
        y=y1,  x=len(y1)*[1],
        name='Disease',hoverinfo='skip', width=0.2, showlegend=False,
        marker_color=palette[2], boxpoints = False), row=r+1, col=c+1
    )

    fig.update_xaxes(tickvals=[0,1], ticktext=['Control', 'Disease'], range=(-0.2,1.2), title=title, row=r+1, col=c+1)
    return fig

Explorar proteínas y grupos de lípidos del meta-network

In [110]:
# Exploration table for a single protein: one row per qmg edge whose first endpoint is qid.
qid = 'P06396'

atable = pd.DataFrame(
[
    (
    m2i.TP_ID[j], 
    j,
    k['weight'],
    corr.rcca.qm.c.loc[qid, j], 
    corr.rcca.qm.d.loc[qid, j],
    adpv.rcca.qm.c.loc[qid, j],
    adpv.rcca.qm.d.loc[qid, j]
    )
    # i, j, k = first endpoint, second endpoint, edge attribute dict
    for i,j,k in qmg.edges.data() if i==qid
],
# dc_corr is the edge weight; c_* / d_* come from the .c / .d tables of corr and adpv
# (presumably control / disease, as labelled in the plots — confirm)
columns=['TP_ID', 'fid', 'dc_corr', 'c_corr', 'd_corr', 'c_pv', 'd_pv']
)

In [111]:
def plot_qmclass(fig, r, c, qid, regex='', title='q vs lipid class', cortype='rcca'):
    """Draw control-vs-disease correlations between one protein and a class of metabolites.

    Collects every edge of ``qmg`` whose first endpoint is ``qid``, keeps the
    metabolites whose ``m2i.TP_ID`` matches ``regex`` and, in subplot (r, c) of
    ``fig``, draws one line per metabolite joining its control correlation (x=0)
    to its disease correlation (x=1), plus one box per condition.

    Parameters
    ----------
    fig : figure created with ``make_subplots``; traces are added to it.
    r, c : zero-based row / column of the target subplot.
    qid : protein id, matched against the first endpoint of ``qmg`` edges.
    regex : pattern searched in the metabolite TP_ID ('' keeps every metabolite).
    title : x-axis title of the subplot.
    cortype : attribute of ``corr`` / key of ``adpv`` selecting the correlation type.

    Returns
    -------
    The same ``fig`` with the new traces.
    """
    atable = pd.DataFrame(
    [
        (
        m2i.TP_ID[j],
        j,
        k['weight'],
        getattr(corr, cortype).qm.c.loc[qid, j],
        getattr(corr, cortype).qm.d.loc[qid, j],
        adpv[cortype].qm.c.loc[qid, j],
        adpv[cortype].qm.d.loc[qid, j]
        )
        for i,j,k in qmg.edges.data() if i==qid
    ],
    columns=['TP_ID', 'fid', 'dc_corr', 'c_corr', 'd_corr', 'c_pv', 'd_pv']
    )

    # Explicit boolean row mask: with a plain (possibly empty) list, atable[[]] would
    # select zero columns and the attribute accesses below would fail for a protein
    # without edges.
    keep = np.array([bool(re.search(fr'{regex}', str(i))) for i in atable.TP_ID], dtype=bool)
    tmp = atable.loc[keep]

    y0 = tmp.c_corr.to_numpy()
    y1 = tmp.d_corr.to_numpy()

    cpv, dpv = tmp.c_pv.to_numpy(), tmp.d_pv.to_numpy()

    # Horizontal jitter around x=0 (control) and x=1 (disease)
    x0 = np.array([0]*len(y0)) + np.random.uniform(-0.05,0.05,len(y0))
    x1 = np.array([1]*len(y1)) + np.random.uniform(-0.05,0.05,len(y1))

    for x_start,x_end,y_start,y_end,name,cpvi,dpvi in zip(x0,x1,y0,y1,tmp.TP_ID, cpv, dpv):
        # TP_ID may be missing (NaN); slicing a float would raise
        name = '' if pd.isna(name) else str(name)
        text = name[:5]
        fig.add_trace(
            go.Scatter(
                x=[x_start,x_end],
                y=[y_start,y_end],
                mode='lines+markers',
                # Blue marker when the corresponding p-value is below 0.05, grey otherwise
                marker=dict(color=['rgba(48, 39, 245, 0.9)' if cpvi<0.05 else 'grey', 'rgba(48, 39, 245, 0.9)' if dpvi<0.05 else 'grey']),
                line=dict(color="rgba(100,100,100,0.5)"),
                name='',
                hovertemplate=name,
                text=text, textposition='top left',
                showlegend=False,
                line_width=0.8,
                marker_size=4
            ), row=r+1, col=c+1
        )

    fig.add_trace(go.Box(
        y=y0, x=len(y0)*[0],
        name='Control',hoverinfo='skip', width=0.2, showlegend=False,
        marker_color=palette[1], boxpoints = False), row=r+1, col=c+1
    )
    fig.add_trace(go.Box(
        y=y1,  x=len(y1)*[1],
        name='Disease',hoverinfo='skip', width=0.2, showlegend=False,
        marker_color=palette[2], boxpoints = False), row=r+1, col=c+1
    )

    fig.update_xaxes(tickvals=[0,1], ticktext=['Control', 'Disease'], range=(-0.2,1.2), title=title, row=r+1, col=c+1)
    return fig

In [124]:
cortype='rcca'

# (protein id, label) pairs; one HTML report is written per active entry
myProts = [
    # ('P04114', 'ApoB-100'),
    # ('P02647', 'ApoA-I'),
    # ('P02652', 'ApoA-II'),
    # ('P02649', 'ApoE'),
    # ('P02654', 'ApoC-I'),
    # ('P02655', 'ApoC-II'),
    # ('P01031', 'C5'),
    # ('P01834', 'Ig kappa chain C region'),
    # ('P01876', 'Ig alpha-1 chain C region'),
    # ('P01857', 'Ig gamma-1 chain C region'),
    # ('P01859', 'Ig gamma-2 chain C region'),
    # ('P01860', 'Ig gamma-3 chain C region'),
    # ('P0CG05', 'Ig lambda-2 chain C region'),
    ('P00738', 'Haptoglobin'),
    ('P05546', 'Hep2'),
    ('P01877', 'IGHA2'),
    ('P08519', 'APOA')
]

# (regex, label) per lipid class: the abbreviation at the start of the TP_ID or after
# whitespace, followed by whitespace. Raw strings keep '\s' a regex token instead of
# an invalid string escape.
lipidClass = [
    (rf'(^|\s){lipid}\s', lipid)
    for lipid in ['DG', 'PC', 'LPC', 'PE', 'LPE', 'PS', 'LPS', 'SM', 'CAR', 'FA', 'ST']
]

# Subplot grid; it must offer at least one cell per lipid class
r=3; c=5

for qid, qdesc in myProts:
    fig = make_subplots(rows=r, cols=c, vertical_spacing=0.05)

    # Fill the grid row by row, one lipid class per subplot
    for n, (regex, lipid) in enumerate(lipidClass):
        fig = plot_qmclass(fig, n//c, n%c, qid,  regex, lipid, cortype)

    # 'w' replaces any previous report for this protein
    file = f'Plots/q2LipidClass/{cortype}/{qid}_{qdesc}.html'
    with open(file, 'w') as f:
        f.write(fig.to_html(full_html=False, include_plotlyjs='cdn', default_height='150%', default_width='100%'))


In [121]:
cortype = 'rcca'

# Candidate protein panels as (protein id, label); only the one assigned to myProts is plotted
igProts = [
    ('P01834', 'Ig kappa chain C region'),
    ('P01876', 'Ig alpha-1 chain C region'),
    ('P01857', 'Ig gamma-1 chain C region'),
    ('P01859', 'Ig gamma-2 chain C region'),
    ('P01860', 'Ig gamma-3 chain C region'),
    ('P0CG05', 'Ig lambda-2 chain C region')
]

apoProts = [
    ('P04114', 'ApoB-100'),
    ('P02647', 'ApoA-I'),
    ('P02652', 'ApoA-II'),
    ('P02649', 'ApoE'),
    ('P02654', 'ApoC-I'),
    ('P02655', 'ApoC-II'),
    #('P05546', 'Hep2'),
    # ('P00738', 'Haptoglobin')
]

complProts = [
    ('P01031', 'C5'),
    ('P00751', 'CFB'),
    ('P07360', 'CO8G'),
    ('P08603', 'CFH'),
    ('P0C0L4', 'C4A'),
    ('P0C0L5', 'C4B'),
    ('P13671', 'C6'),
    ('P05546', 'Hep2'),
    ('P00738', 'Haptoglobin')
]

# Panel matching the output file name below
myProts = complProts

# (regex, label) per lipid class: the abbreviation at the start of the TP_ID or after
# whitespace, followed by whitespace. Raw strings keep '\s' a regex token instead of
# an invalid string escape.
lipidClass = [
    (rf'(^|\s){lipid}\s', lipid)
    for lipid in ['DG', 'PC', 'LPC', 'PE', 'LPE', 'PS', 'LPS', 'SM', 'CAR', 'FA', 'ST']
]

file = f'Plots/q2LipidClass/{cortype}/Compl_LipidClass.html'

# One row per lipid class, one column per protein
r=len(lipidClass); c=len(myProts)

# The protein labels are repeated on every row of the grid
fig = make_subplots(rows=r, cols=c, vertical_spacing=0.03, subplot_titles=r*[prot[1] for prot in myProts])

for ni, (regex, lipid) in enumerate(lipidClass):
    for nj, (qid, qdesc) in enumerate(myProts):
        fig = plot_qmclass(fig, ni, nj, qid,  regex, lipid, cortype)

# 'w' replaces any previous report
with open(file, 'w') as f:
    f.write(fig.to_html(full_html=False, include_plotlyjs='cdn', default_height='500%', default_width='100%'))


Representar correlaciones diferenciales en proteínas significativas por Limma

In [59]:
# Read results from Limma
# NOTE(review): these files live under ...\01-BasicStats\PESA while the rest of the
# notebook reads PESA_V2 folders — confirm this is the intended version.

qlim = pd.read_csv(r"S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Analysis\01-BasicStats\PESA\qLimma.tsv", sep='\t')
mlim = pd.read_csv(r"S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Analysis\01-BasicStats\PESA\mLimma.tsv", sep='\t')

# Protein ids use '.' in the Limma table and '-' in q2i
# (assumes read_csv produced a string index, i.e. the file carries row names — TODO confirm)
qlim.index = [i.replace('.', '-') for i in qlim.index]

# Right joins keep every Limma row and add the annotation columns where available
qlim = q2i.join(qlim, how='right')
mlim = m2i.loc[:, ['ID', 'TP_ID']].join(mlim, how='right')

In [80]:
# Get Limma significant proteins
pv = 0.1
cortype='rcca'

significant = qlim['adj.P.Val']<pv

# (protein id, gene name) for every significant protein; the protein id is used as
# label when the description is missing or carries no 'GN=' field.
# r'GN=(\S+)' replaces '[A-z0-9]+', whose A-z range also matched '[', '\', ']', '^',
# '_' and '`' and which stopped at the first '-' of a gene name.
myProts = []
for qid, qdesc in zip(qlim.index[significant], qlim.qdesc[significant]):
    gene = re.search(r'GN=(\S+)', str(qdesc))
    myProts.append((qid, gene.group(1) if gene else qid))

# This report covers the proteins from position 12 onwards (see the file name)
myProts = myProts[12:]

file = f'Plots/q2LipidClass/{cortype}/Limma_LipidClass_12-24.html'

# One row per lipid class (lipidClass comes from a previous cell), one column per protein
r=len(lipidClass); c=len(myProts)

# The gene labels are repeated on every row of the grid
fig = make_subplots(rows=r, cols=c, vertical_spacing=0.03, subplot_titles=r*[prot[1] for prot in myProts])

for ni, (regex, lipid) in enumerate(lipidClass):
    for nj, (qid, qdesc) in enumerate(myProts):
        fig = plot_qmclass(fig, ni, nj, qid,  regex, lipid, cortype)

# 'w' replaces any previous report
with open(file, 'w') as f:
    f.write(fig.to_html(full_html=False, include_plotlyjs='cdn', default_height='500%', default_width='100%'))


Representar distribución de valores de proteínas en control y AS

In [93]:
# Proteins to plot, as (protein id, label); commented entries are alternative selections
myProts = [
    ('P04114', 'ApoB-100'),
    ('P02647', 'ApoA-I'),
    ('P02652', 'ApoA-II'),
    ('P02649', 'ApoE'),
    ('P02654', 'ApoC-I'),
    ('P02655', 'ApoC-II'),
    # ('P01031', 'C5'),
    # ('P00738', 'Haptoglobin'),
    # ('P05546', 'Hep2')
    # ('P01834', 'Ig kappa chain C region'),
    # ('P01876', 'Ig alpha-1 chain C region'),
    # ('P01857', 'Ig gamma-1 chain C region'),
    # ('P01859', 'Ig gamma-2 chain C region'),
    # ('P01860', 'Ig gamma-3 chain C region'),
    # ('P0CG05', 'Ig lambda-2 chain C region'),
]


# Limma proteins
# (alternative: build myProts from the proteins with adj.P.Val < pv in qlim)

# pv=0.1
# myProts = [
#     (i,re.search(r'GN=([A-z0-9]+)',j))
#     for i,j in zip(
#         qlim.index[qlim['adj.P.Val']<pv],
#         qlim.qdesc[qlim['adj.P.Val']<pv]
#     )
# ][:]

# myProts = [
# (i, j.groups()[0]) if j else
# (i, i)
# for i,j in myProts
# ]

In [92]:
# Split violins of protein values: control on the left half, disease on the right half.

# Samples present in both the proteomics matrix and the metadata
cidx = np.intersect1d(xq.index, mdata.Seqn).tolist()
qmdata = mdata.set_index('Seqn').loc[cidx]

c = qmdata.index[qmdata.Group=='C'].tolist()
d = qmdata.index[qmdata.Group=='D'].tolist()

# (sample ids, legend/scale group, trace name, violin side, line colour)
violin_halves = [
    (c, 'Yes', 'Control', 'negative', palette[1]),
    (d, 'No', 'Disease', 'positive', palette[2]),
]

fig = go.Figure()

for qid, qdesc in myProts:
    for samples, group, label, side, colour in violin_halves:
        fig.add_trace(go.Violin(x=len(samples)*[qdesc],
                                y=xq.loc[samples, qid],
                                legendgroup=group, scalegroup=group, name=label,
                                side=side, showlegend=False,
                                line_color=colour, hoverinfo='skip')
                    )

fig.update_traces(meanline_visible=True)
fig.update_xaxes(tickangle=20, range=(-0.5, len(myProts)))
fig.update_layout(violingap=0, violinmode='overlay')
fig.show()


Explorar proteínas del paper de PESA no exploradas antes

In [138]:
# Sub-network between protein cluster 3 and metabolite cluster 11, restricted to qmg edges
sg = nx.Graph()
sg.add_nodes_from(qqcom[3], group='q')
sg.add_nodes_from(mmcom[11], group='m')

# Keep the qmg edges whose two endpoints belong to the selected clusters
sg.add_edges_from([
    (u, v, {'weight': attrs['weight']})
    for u, v, attrs in qmg.edges.data()
    if u in sg.nodes and v in sg.nodes
])

nt = Network(height='500px', width='100%', select_menu=True, filter_menu=True)
nt.from_nx(sg)

for node in nt.nodes:
    node.update({'color': palette[1] if node['group']=='m' else palette[0]})

# Colour edges by the sign of their width
# NOTE(review): presumably from_nx copies the networkx 'weight' into 'width' — confirm
for edge in nt.edges:
    edge.update({'color': palette[3] if edge['width']>0 else palette[4]})

nt.show_buttons(filter_='physics')
nt.force_atlas_2based()
nt.show('NT_qq3_mm11.html', notebook=False)

NT_qq3_mm11.html


In [50]:
# Control vs disease correlations between one protein and the metabolites of one
# cluster it is connected to in g.
myprotein = 'P00738'
mmcom_ni = 9

file = f'Plots/{myprotein}_mm{mmcom_ni}.html'

# TP_ID of the cluster metabolites linked to the protein (index: metabolite id)
tmp = m2i.loc[[j for i,j,k in g.edges.data() if i==myprotein and j in mmcom[mmcom_ni]], 'TP_ID']

x = [f'{j}-{i}' for i,j in zip(tmp, tmp.keys())]

# Only correlations with adjusted p < 0.05 in the same condition are drawn;
# the remaining ones are masked to NaN.
control_corr = corr.rcca.qm.c[adpv.rcca.qm.c<0.05].loc[myprotein,tmp.keys()]
disease_corr = corr.rcca.qm.d[adpv.rcca.qm.d<0.05].loc[myprotein,tmp.keys()]

fig = go.Figure(data=[
    go.Bar(name='Control', x=x, y=control_corr, opacity=0.8),
    go.Bar(name='Disease', x=x, y=disease_corr, opacity=0.8),
])

fig.update_layout(barmode='group', title=f'Correlation Difference | {myprotein} & Cluster {mmcom_ni}')
fig.show()

# 'w' so that re-running the cell replaces the figure instead of appending a duplicate
with open(file, 'w') as f:
    f.write(fig.to_html(full_html=False, include_plotlyjs='cdn', default_height='50%', default_width='80%'))

GRAPH VISUALIZATION

In [123]:
# Alternative (disabled): first Leiden partition of g
# G = leidenalg.find_partition(
#     ig.Graph.from_networkx(g), leidenalg.ModularityVertexPartition, 
#     n_iterations=-1, seed=0
# ).subgraphs()[0]
#G = g.subgraph(G.to_networkx().nodes)

# Take the community at position 1 of the Louvain partition (resolution=2, seed=0)
# and keep it as a subgraph view of g.
G = nx_comm.louvain_communities(g, resolution=2, seed=0)[1]
G = g.subgraph(G)

In [142]:
# Overrides the G of the previous cell with the subgraph of protein cluster 3
G = g.subgraph(qqcom[3])

In [151]:


# Generic rendering of the current `metag` into nx.html.
# NOTE(review): the edge styling reads the 'r' and 'p' edge attributes, which only the
# WGCNA version of metag sets; with the hypergeometric metag (edges without attributes)
# this presumably raises KeyError — confirm which metag is live before running.
nt = Network(height='500px', width='100%', select_menu=True, filter_menu=True)
nt.force_atlas_2based()
nt.from_nx(metag)
#nt.from_nx(g.subgraph(mmcom[1]))
#nt.from_nx(qqA[0].subgraph(list(get_communities(qqA[0], n=n)[0])))
# Nodes coloured by omic group
_ = [i.update({'color': palette[1]}) if i['group']=='m' else i.update({'color': palette[0]}) for i in nt.nodes]
# Both branches assign grey, so the sign of r does not affect the colour
_ = [i.update({'color': 'grey'}) if i['r']>0 else i.update({'color': 'grey'}) for i in nt.edges]
# Edge width grows as the p-value shrinks
_ = [i.update({'width': -np.log(i['p'])}) for i in nt.edges]
nt.show_buttons(filter_='physics')
nt.show('nx.html', notebook=False)

nx.html


STRING ENRICHMENT

In [271]:
from STRING import Uniprot2String, FunctionalEnrichment

# Enrichment background: the second field of every Uniprot2String result
# for all proteins listed in q2i.
background = Uniprot2String(q2i.index.tolist())
background = [mapped[1] for mapped in background]

In [None]:
import requests
from io import StringIO

# Columns and categories kept from the STRING enrichment table
col = ['category', 'number_of_genes', 'number_of_genes_in_background', 'fdr', 'description']
categories = ['Process', 'KEGG', 'Component']

for comm_ni in range(len(qqcom)):

    # Genes of the current protein cluster. The loop previously queried the qSig
    # selection on every iteration, so all sheets held the same table.
    my_genes = Uniprot2String(qqcom[comm_ni])
    my_genes = [i[1] for i in my_genes]

    data = FunctionalEnrichment(my_genes, background, species=9606)
    data = pd.read_csv(
        StringIO(data),
        sep='\t'
    )

    atable = data[np.isin(data['category'], categories)].sort_values('fdr').loc[:, col]

    # mode='a' adds one sheet per cluster to an existing workbook
    # NOTE(review): presumably fails if the workbook is missing or the sheet already exists — confirm
    with pd.ExcelWriter('Clusters/qq_allClusters.xlsx', mode='a') as writer:
        atable.to_excel(writer, sheet_name=f'qq_{comm_ni}_string', index=False)

    # String network: link to the STRING page of the cluster
    res = requests.get(
    f'https://string-db.org/api/tsv/get_link?identifiers={"%0d".join(my_genes)}&species=9606'
    )

    print(f'Proteomics Cluster {comm_ni}')
    print(res.content.decode('ascii'))


In [276]:
import requests
from io import StringIO

# STRING functional enrichment of the proteins with pv_adj < 0.01 in qSig
selected_proteins = qSig.index[qSig.pv_adj<0.01].tolist()
my_genes = [mapped[1] for mapped in Uniprot2String(selected_proteins)]

data = FunctionalEnrichment(my_genes, background, species=9606)
data = pd.read_csv(StringIO(data), sep='\t')

# Keep the Process / KEGG / Component terms, most significant first
col = ['category', 'number_of_genes', 'number_of_genes_in_background', 'fdr', 'description']
categories = ['Process', 'KEGG', 'Component']
in_categories = np.isin(data['category'], categories)
atable = data[in_categories].sort_values('fdr').loc[:, col]

# String network: link to the STRING page of the selection
link_url = f'https://string-db.org/api/tsv/get_link?identifiers={"%0d".join(my_genes)}&species=9606'
res = requests.get(link_url)

print(res.content.decode('ascii'))


url
https://string-db.org/cgi/link?to=AE6938C979F29147

