In [1]:
#
# Import Libraries
#

import os
import pandas as pd
import pickle
import sys
import networkx as nx
import numpy as np
from scipy.stats import poisson
import itertools

from scipy.stats import pearsonr, spearmanr, kendalltau

from functools import reduce

from plotly import graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly

import re

palette = plotly.colors.qualitative.Plotly

from pyvis.network import Network

import subprocess

# Add the project's utils directory to the module search path (a later cell
# imports PCA_UMAP, presumably from this directory).
# NOTE(review): absolute path on the S: drive; the notebook only runs where
# that location is reachable.
utilsPath = r'S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\utils'
if utilsPath not in sys.path:
    sys.path.append(utilsPath)

In [5]:
#
# Paths & Constants
#

# Root of the project data tree; every input file below lives underneath it.
# Paths are built by plain string concatenation so the resulting strings are
# byte-identical to the previously hard-coded ones on any host OS.
data_root = r'S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data'

# Pickled correlation results and their p-values.
corr_path = data_root + r'\Analysis\02-Correlations\PESA_V2\corr.pkl'
pval_path = data_root + r'\Analysis\02-Correlations\PESA_V2\pvals.pkl'

# Quantification tables (metabolomics / proteomics).
xm_path = data_root + r'\Metabolomics\PESA_V2\WorkingFiles\Xm_norm.tsv'
xq_path = data_root + r'\Proteomics\PESA_V2\WorkingFiles\Xq_minus_X_norm.tsv'

# Feature annotation tables (metabolomics / proteomics).
m2i_path = data_root + r'\Metabolomics\PESA_V2\WorkingFiles\f2i.tsv'
q2i_path = data_root + r'\Proteomics\PESA_V2\WorkingFiles\q2info.tsv'

# Sample metadata table.
mdata_path = data_root + r'\Metadata\PESA_V2\WorkingFiles\main_metadata.tsv'

In [8]:
def _load_tsv(path, **options):
    """Read a tab-separated file into a DataFrame, forwarding extra read_csv options."""
    return pd.read_csv(path, sep='\t', **options)


# Quantification tables; note the index column is spelled 'seqn' in the
# proteomics file and 'Seqn' in the metabolomics one.
xq = _load_tsv(xq_path, index_col='seqn')
xm = _load_tsv(xm_path, index_col='Seqn')

# Feature annotation tables, keyed by the 'fid' column.
q2i = _load_tsv(q2i_path, index_col='fid')
m2i = _load_tsv(m2i_path, index_col='fid')

# Metadata keeps the default integer index; later cells call set_index('Seqn').
mdata = _load_tsv(mdata_path)

In [9]:
#
# Read raw data from which the graph will be built
#

# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# pickles produced by this project.
with open(corr_path, 'rb') as f:
    # The pickle unpacks into two objects; the cells visible here only use `corr`.
    corr, corrSL = pickle.load(f)

with open(pval_path, 'rb') as f:
    # Presumably raw and adjusted p-values (TODO confirm). `pv` is reassigned
    # to a scalar threshold by later cells, so only `adpv` survives.
    pv, adpv = pickle.load(f)

Analizar orden y tamaño del grafo usando diferentes tipos de correlaciones y umbrales de FDR

In [10]:
#
# 
#

def _n_connected_nodes(graph):
    """Return how many nodes of `graph` have degree greater than zero."""
    return sum(1 for _, degree in graph.degree() if degree > 0)


def _bipartite_adjacency(block):
    """Expand a rectangular (rows x columns) weight block into a square adjacency.

    The result is indexed by rows followed by columns on both axes, with `block`
    and its transpose as off-diagonal blocks. The diagonal blocks are created
    directly as numeric zeros: the previous `DataFrame(...).fillna(0)` on an
    all-NaN object frame relies on deprecated object-dtype downcasting.
    """
    zero_rows = pd.DataFrame(0, index=block.index, columns=block.index)
    zero_cols = pd.DataFrame(0, index=block.columns, columns=block.columns)
    return pd.concat([zero_rows.join(block), block.T.join(zero_cols)])


# Correlation types evaluated for the single-omic (qq / mm) graphs.
ctypes = ['rpc', 'psk']

# Thresholds applied to the adjusted p-values.
pvrange = [0, 0.005, 0.01, 0.05, 0.1]

# plot[ctype][omic] -> {'nodes': [...], 'edges': [...]}, one entry per threshold.
plot = {}

for ctype in ctypes:
    plot[ctype] = {}
    for omic in ('qq', 'mm'):
        # Both matrices are threshold-independent, so fetch them once per type.
        weights = getattr(getattr(corr, ctype), omic).a
        adj_pvals = getattr(adpv[ctype], omic).a
        counts = {'nodes': [], 'edges': []}
        # The threshold variable is deliberately not called `pv`, so the
        # p-value object loaded from the pickle is not overwritten.
        for fdr in pvrange:
            graph = nx.from_pandas_adjacency(weights[adj_pvals <= fdr].fillna(0))
            counts['nodes'].append(_n_connected_nodes(graph))
            counts['edges'].append(graph.size())
        plot[ctype][omic] = counts


# Correlation types evaluated for the cross-omic (qm) graph.
ctypes2 = ['rcca', 'psk', 'cca']

for ctype in ctypes2:
    weights = getattr(corr, ctype).qm.a
    adj_pvals = adpv[ctype].qm.a
    counts = {'nodes': [], 'edges': []}
    for fdr in pvrange:
        # NOTE(review): strict '<' here versus '<=' for the qq/mm graphs above;
        # kept as in the original, confirm which one is intended.
        kept = weights[adj_pvals < fdr].fillna(0)
        graph = nx.from_pandas_adjacency(_bipartite_adjacency(kept))
        counts['nodes'].append(_n_connected_nodes(graph))
        counts['edges'].append(graph.size())
    plot.setdefault(ctype, {})['qm'] = counts

In [12]:
from plotly.subplots import make_subplots
from plotly import graph_objects as go
import plotly.express as px

palette = px.colors.qualitative.Plotly

# Start from a clean output file; the figure is appended below.
file = 'Plots/FDR_CorrType.html'
if os.path.exists(file):
    os.remove(file)

fig = make_subplots(rows=3, cols=2, subplot_titles=['qq - Nodes', 'qq - Edges','mm - Nodes', 'mm - Edges', 'qm - Nodes', 'qm - Edges'])

# Give every correlation type its own colour and legend group across all rows.
# Enumerating `ctypes` and `ctypes2` separately made 'rpc' and 'rcca' share
# colour and legend group because both sit at position 0 of their list.
all_ctypes = list(dict.fromkeys([*ctypes, *ctypes2]))
ctype_color = {ctype: palette[k % len(palette)] for k, ctype in enumerate(all_ctypes)}
in_legend = set()

# (subplot row, omic key in `plot`, correlation types drawn in that row)
panels = [(1, 'qq', ctypes), (2, 'mm', ctypes), (3, 'qm', ctypes2)]

for row, omic, row_ctypes in panels:
    for ctype in row_ctypes:
        for col, metric in enumerate(('nodes', 'edges'), start=1):
            fig.add_trace(go.Scatter(
                x=pvrange,
                y=plot[ctype][omic][metric],
                name=ctype, marker_color=ctype_color[ctype],
                legendgroup=ctype,
                # One legend entry per correlation type instead of one per trace.
                showlegend=ctype not in in_legend,
            ), row=row, col=col)
            in_legend.add(ctype)

fig.update_layout(title='Number of nodes and edges per correlation type', width=700, height=500)
fig.show()

with open(file, 'a') as f:
    f.write(fig.to_html(full_html=False, include_plotlyjs='cdn', default_height='50%', default_width='80%'))


Construcción de grafo usando Graphical Lasso y rCCA con FDR <0.05

In [7]:
# Checkpoint: restore the matrices and graphs previously written to sourceData.pkl.
# NOTE(review): pickle.load can execute arbitrary code; only load a file
# written by this notebook.
with open('sourceData.pkl', 'rb') as f:  # context manager closes the handle
    qqc,qqp,qqg,mmc,mmp,mmg,qmc,qmp,qmg = pickle.load(f)

In [13]:
# Threshold on the adjusted p-values used to keep an edge.
pv = 0.01

# Protein-protein graph ('rpc' correlations).
qqc = corr.rpc.qq.a.copy()
qqp = adpv.rpc.qq.a.copy()
qqg = nx.from_pandas_adjacency(qqc[qqp<=pv].fillna(0))

# Metabolite-metabolite graph ('rpc' correlations).
mmc = corr.rpc.mm.a.copy()
mmp = adpv.rpc.mm.a.copy()
mmg = nx.from_pandas_adjacency(mmc[mmp<=pv].fillna(0))

# Protein-metabolite graph ('rcca' correlations).
# NOTE(review): strict '<' here versus '<=' above; kept as in the original.
qmc = corr.rcca.qm.a.copy()
qmp = adpv.rcca.qm.a.copy()
qm_kept = qmc[qmp<pv].fillna(0)

# Square adjacency indexed by rows then columns of the qm block. The zero
# diagonal blocks are created numerically: the previous
# `DataFrame(...).fillna(0)` on an all-NaN object frame depends on deprecated
# object-dtype downcasting.
qm_adj = pd.concat([
    pd.DataFrame(0, index=qm_kept.index, columns=qm_kept.index).join(qm_kept),
    qm_kept.T.join(pd.DataFrame(0, index=qm_kept.columns, columns=qm_kept.columns)),
])

# Node labels of the cross-omic graph use '-' where the matrix labels had '.'.
# NOTE(review): qqg and mmg keep their original labels; verify they match
# when the three graphs are composed.
qm_adj.index = [label.replace('.', '-') for label in qm_adj.index]
qm_adj.columns = [label.replace('.', '-') for label in qm_adj.columns]
qmg = nx.from_pandas_adjacency(qm_adj)

In [18]:
# Checkpoint: persist the matrices and graphs so they can be reloaded without
# recomputing them.
with open('sourceData.pkl', 'wb') as f:  # context manager flushes and closes the file
    pickle.dump([qqc,qqp,qqg,mmc,mmp,mmg,qmc,qmp,qmg], f)

In [14]:
#
# Generate random graphs
#
# Number of random replicates generated per observed graph.
n = 100


def _random_replicates(graph, count):
    """Return `count` G(n, m) random graphs with the order and size of `graph`.

    Replicate k is generated with seed=k, so the list is reproducible.
    """
    n_nodes, n_edges = graph.order(), graph.size()
    return [nx.gnm_random_graph(n_nodes, n_edges, seed=seed) for seed in range(count)]


qqA = _random_replicates(qqg, n)
mmA = _random_replicates(mmg, n)

In [15]:
#
# Basic graph characteristics
#

def _vs_random(values):
    """Format the mean and std of a statistic measured on the random graphs."""
    return f'(random graph: mean = {round(np.mean(values),5)}, std = {round(np.std(values),5)})'


# Summary statistics of each observed graph next to its random replicates.
for G, GA, name in [(qqg, qqA, 'Proteomics'), (mmg, mmA, 'Metabolomics')]:

    print()
    print(name)
    print()

    print(f'Order: {G.order()}')
    print(f'Size: {G.size()}')
    print(f'Density: {round(nx.density(G), 5)}')

    rand_clustering = [nx.average_clustering(rg) for rg in GA]
    print(f'Average clustering: {round(nx.average_clustering(G), 5)} {_vs_random(rand_clustering)}')
    print(f'Average degree: {round(np.mean(list(dict(G.degree).values())),5)}')

    rand_components = [len(list(nx.connected_components(rg))) for rg in GA]
    print(f'Number of connected components: {len(list(nx.connected_components(G)))} {_vs_random(rand_components)}')

    # Same count, ignoring isolated nodes (components of a single node).
    rand_nontrivial = [len([c for c in nx.connected_components(rg) if len(c)>1]) for rg in GA]
    n_nontrivial = len([c for c in nx.connected_components(G) if len(c)>1])
    print(f'Number of connected components (order g.t. 1): {n_nontrivial} {_vs_random(rand_nontrivial)}')

# Degree histogram of each graph with the Poisson degree distribution expected
# for a random graph of the same order and size.
fig = make_subplots(rows=1, cols=2, subplot_titles=['Proteomics', 'Metabolomics'])

for col, (graph, color) in enumerate([(qqg, palette[0]), (mmg, palette[1])], start=1):
    degree_hist = nx.degree_histogram(graph)
    degrees = np.arange(0, len(degree_hist))
    # The Poisson reference must use the graph drawn in this panel. Previously
    # both panels used the variable left over from the loop above (mmg).
    mean_degree = 2*graph.size()/graph.order()

    fig.add_trace(go.Bar(
        x=degrees,
        y=degree_hist,
        offsetgroup=1, marker_color=color
    ), row=1, col=col)
    fig.add_trace(go.Bar(
        x=degrees,
        y=graph.order()*poisson.pmf(degrees, mean_degree),
        offsetgroup=1, marker_color='black', opacity=0.2
    ), row=1, col=col)

fig.update_layout(bargap=0.2, title='Degree Distribution', showlegend=False)


Proteomics

Order: 249
Size: 1820
Density: 0.05895
Average clustering: 0.2575 (random graph: mean = 0.05895, std = 0.00237)
Average degree: 14.61847
Number of connected components: 16 (random graph: mean = 1.0, std = 0.0)
Number of connected components (order g.t. 1): 4 (random graph: mean = 1.0, std = 0.0)

Metabolomics

Order: 2062
Size: 9052
Density: 0.00426
Average clustering: 0.34628 (random graph: mean = 0.00415, std = 0.00044)
Average degree: 8.77983
Number of connected components: 780 (random graph: mean = 1.29, std = 0.57088)
Number of connected components (order g.t. 1): 70 (random graph: mean = 1.0, std = 0.0)


In [16]:
#
# Number of connected components with different sizes
#

# One panel per omic: how many connected components exist for each component order.
fig = make_subplots(rows=1, cols=2, subplot_titles=['Proteomics', 'Metabolomics'])

for col, graph in enumerate((qqg, mmg), start=1):
    component_orders = [len(members) for members in nx.connected_components(graph)]
    order_counts = pd.Series(component_orders).value_counts().sort_index()
    bars = go.Bar(
        x=list(map(str, order_counts.index)),   # string labels -> categorical axis
        y=order_counts.values,
        width=0.5,
        text=order_counts.values, textposition='auto',
    )
    fig.add_trace(bars, row=1, col=col)

fig.update_layout(
    bargap=1,
    title='Number of connected components with different order',
    showlegend=False,
)
fig.show()

In [None]:
#
# Number of cliques with diferent orders
#

# One panel per omic: how many maximal cliques exist for each clique order.
fig = make_subplots(rows=1, cols=2, subplot_titles=['Proteomics', 'Metabolomics'])

for col, graph in enumerate((qqg, mmg), start=1):
    clique_orders = [len(members) for members in nx.find_cliques(graph)]
    order_counts = pd.Series(clique_orders).value_counts().sort_index()
    bars = go.Bar(
        x=list(map(str, order_counts.index)),   # string labels -> categorical axis
        y=order_counts.values,
        width=0.5,
        text=order_counts.values, textposition='auto',
    )
    fig.add_trace(bars, row=1, col=col)

fig.update_layout(
    bargap=1,
    title='Number of cliques with different order',
    showlegend=False,
)
fig.show()

In [17]:
#
# Build complete graph adding omic type to each node
#

# Union of the protein, metabolite and protein-metabolite graphs.
g = nx.compose_all([qqg, mmg, qmg])

# 'group' attribute: 'q' for ids listed in q2i, 'm' for ids listed in m2i
# (m2i wins if an id appears in both tables, as before).
group_by_node = {**dict.fromkeys(q2i.index, 'q'), **dict.fromkeys(m2i.index, 'm')}
nx.set_node_attributes(g, group_by_node, 'group')

# 'title' attribute: protein description or metabolite TP_ID. The dicts are
# built directly instead of through list comprehensions run for side effects.
attr = {**q2i['qdesc'].to_dict(), **m2i['TP_ID'].to_dict()}
nx.set_node_attributes(g, attr, 'title')
# attr = {}
# _ = [attr.update({i:'blue'}) for i in q2i.index]
# _ = [attr.update({i:'red'}) for i in m2i.index]
# nx.set_node_attributes(g, attr, 'color')

OBTAIN CLUSTERS/COMMUNITIES

In [19]:
#
# Detect communities (clusters) 
# Louvain & Leiden; Finally we use Leiden

# comm = nx_comm.louvain_communities(g, resolution=1, seed=0)

In [18]:
# Get clustering consensus 

import leidenalg
import igraph as ig
import networkx.algorithms.community as nx_comm

# Scratch values left from interactive use: `n` and `thr` are reassigned in a
# later cell before get_communities is called, and `G` is not read by the
# functions below (they take the graph as an argument).
G= mmg
n = 4
thr=10


def jaccard(list1, list2):
    """Return the Jaccard similarity of two collections, |A & B| / |A | B|.

    Both arguments are treated as sets, so repeated elements do not inflate
    the union (the previous length-based union did). Two empty collections
    have similarity 0.0 instead of raising ZeroDivisionError.

    Parameters
    ----------
    list1, list2 : iterable of hashable items

    Returns
    -------
    float in [0, 1]
    """
    set1, set2 = set(list1), set(list2)
    union = set1 | set2
    if not union:
        return 0.0
    return len(set1 & set2) / len(union)

def get_communities(G, n=5, thr=5):
    """Consensus communities of `G` across repeated Leiden partitions.

    An initial Leiden partition (modularity) is computed, then refined `n`
    times: each current community is intersected with its best-matching
    community (highest Jaccard index) of a new partition computed with
    seed 0..n-1, split into connected components of `G`, and only pieces with
    more than `thr` nodes are kept.

    Parameters
    ----------
    G : networkx graph
    n : int
        Maximum number of refinement runs.
    thr : int
        Communities must contain strictly more than `thr` nodes to be kept.

    Returns
    -------
    list of list of list
        commL[0] holds the communities (order > thr) of the initial partition;
        commL[k] the consensus after k refinement runs. The list stops early,
        right after the first empty consensus.
    """
    # The igraph conversion does not depend on the seed, so do it once
    # instead of once per run.
    ig_graph = ig.Graph.from_networkx(G)

    def _leiden_partition(seed):
        """Return one Leiden partition as lists of networkx node names."""
        partition = leidenalg.find_partition(
            ig_graph, leidenalg.ModularityVertexPartition,
            n_iterations=-1, seed=seed
        )
        return [list(sub.to_networkx().nodes) for sub in partition.subgraphs()]

    # NOTE(review): seed=-1 is kept from the original for the initial partition;
    # confirm how leidenalg interprets a negative seed.
    comm = _leiden_partition(-1)

    commL = [[members for members in comm if len(members)>thr]]

    for run in range(n):
        commi = _leiden_partition(run)

        # Keep, for each current community, only the nodes it shares with its
        # most similar community in the new partition.
        comm = [
            np.intersect1d(j, commi[np.argmax([jaccard(j, k) for k in commi])]) for j in comm
        ]

        # An intersection may be disconnected in G: split it into connected
        # pieces and drop those that are too small.
        comm = [list(k) for j in comm for k in nx.connected_components(G.subgraph(j)) if len(k)>thr]
        commL.append(comm)
        if len(comm)==0: return commL

    return commL


#qqcom = get_communities(qqg, n=n, thr=10)
#mmcom = get_communities(mmg, n=n, thr=10)

In [19]:
#
# Null model: run the consensus procedure on every random graph to decide how
# many iterations are needed. The iteration count is chosen (next cell) as the
# first one at which the fraction of random graphs still holding a community
# of order > thr falls below pvThr.
#

thr = 10      # a community must have more than `thr` nodes
pvThr = 0.05  # tolerated fraction of random graphs with a surviving community
n = 100       # number of consensus iterations explored

commLqq = [get_communities(random_graph, n=n, thr=thr) for random_graph in qqA]
commLmm = [get_communities(random_graph, n=n, thr=thr) for random_graph in mmA]

In [20]:
def _fraction_with_communities(histories, n_steps):
    """Fraction of runs that still hold a community at each step 0..n_steps.

    `histories` is a list of get_communities outputs; a history that stopped
    before a given step counts as having no community at that step.
    """
    has_community = np.array([
        [step < len(history) and len(history[step]) > 0 for step in range(n_steps + 1)]
        for history in histories
    ])
    return has_community.sum(axis=0) / len(histories)


def _first_step_below(fractions, limit, label):
    """Index of the first entry of `fractions` below `limit`.

    Raises ValueError with an explanatory message when no entry qualifies,
    instead of the bare IndexError produced by indexing an empty argwhere.
    """
    hits = np.argwhere(fractions < limit)
    if len(hits) == 0:
        raise ValueError(
            f'{label}: the fraction of random graphs with communities never '
            f'drops below {limit} within {len(fractions) - 1} iterations'
        )
    return hits[0][0]


# Fraction of random graphs containing communities after each iteration.
commLqqn = _fraction_with_communities(commLqq, n)
commLmmn = _fraction_with_communities(commLmm, n)

pvThr = 0.05
qqThr = _first_step_below(commLqqn, pvThr, 'Proteomics')
mmThr = _first_step_below(commLmmn, pvThr, 'Metabolomics')

# Thresholds are 0-based positions in the history; printed 1-based.
print(f"Proteomics minimum iteration: {qqThr+1}")
print(f"Metabolomics minimum iteration: {mmThr+1}")

Protoemics minimum iteration: 6
Metabolomics minimum iteration: 4


In [22]:
# Number of iterations shown on the x axis.
n_shown = 40

# Start from a clean output file; this and the following figures are appended to it.
file = 'Plots/ClusterConsensus.html'
if os.path.exists(file):
    os.remove(file)


fig = go.Figure()
for label, fractions in [('Proteomics', commLqqn), ('Metabolomics', commLmmn)]:
    fig.add_trace(go.Scatter(
        x=np.arange(1, n_shown+1),
        y=fractions[:n_shown],
        mode='lines+markers',
        name=label
    ))

# Draw the reference line at the threshold actually used to pick the iteration
# (it was a hard-coded 0.05 that could drift from pvThr).
fig.add_hline(y=pvThr,line_width=1, line_dash="dot", line_color="black")
fig.update_layout(title='Fraction of Random Networks with communities vs N. Iterations')
fig.update_xaxes(title='Iterations'); fig.update_yaxes(title='Fraction of Random Networks')

fig.show()
with open(file, 'a') as f:
    f.write(fig.to_html(full_html=False, include_plotlyjs='cdn', default_height='50%', default_width='80%'))

In [23]:
# Consensus history of the observed graphs, using the same `n` and `thr` as
# the random-graph runs so the iteration thresholds are comparable.
qqcoms = get_communities(qqg, n=n, thr=thr)
mmcoms = get_communities(mmg, n=n, thr=thr)

In [24]:
# Number of iterations shown on the x axis.
max_iter = 40
iterations = np.arange(1, max_iter+1)

# (legend name, consensus history, colour) for each omic.
series = [
    ('Proteomics', qqcoms, palette[0]),
    ('Metabolomics', mmcoms, palette[1]),
]

fig = make_subplots(rows=1, cols=2, subplot_titles=['Number of communities', 'Number of features'])

# Left panel: communities remaining after each iteration.
for label, history, colour in series:
    n_communities = [len(step) for step in history]
    fig.add_trace(go.Scatter(
        x=iterations,
        y=n_communities[:max_iter],
        mode='lines+markers',
        name=label, marker_color=colour
    ), col=1, row=1)

# Right panel: total features inside those communities.
for label, history, colour in series:
    n_features = [sum(len(members) for members in step) for step in history]
    fig.add_trace(go.Scatter(
        x=iterations,
        y=n_features[:max_iter],
        mode='lines+markers',
        name=label, marker_color=colour
    ), col=2, row=1)

# Mark the iteration selected from the random-graph runs for each omic.
fig.add_vline(x=qqThr+1,line_width=1, line_dash="dot", line_color="black",
              annotation_text='Prot. Thr.', annotation_position='bottom right')
fig.add_vline(x=mmThr+1,line_width=1, line_dash="dot", line_color="black",
              annotation_text='Metab. Thr.', annotation_position='top right')

fig.update_xaxes(title='Iterations')

fig.show()
with open(file, 'a') as f:
    f.write(fig.to_html(full_html=False, include_plotlyjs='cdn', default_height='50%', default_width='80%'))

In [25]:
# Keep the consensus at the iteration index selected from the random graphs.
# NOTE(review): get_communities stops early once no community is left, so the
# history can be shorter than the threshold index — this would raise IndexError.
qqcom = qqcoms[qqThr]
mmcom = mmcoms[mmThr]

In [26]:
# Bar chart of community sizes, one panel per omic.

fig = make_subplots(rows=1, cols=2, subplot_titles=['Proteomics', 'Metabolomics'])

# (communities, bar width) for the left and right panel respectively.
for col, (communities, bar_width) in enumerate([(qqcom, 0.1), (mmcom, 0.2)], start=1):
    fig.add_trace(go.Bar(
        x=[str(k) for k in range(len(communities))],
        y=list(map(len, communities)),
        width=bar_width, showlegend=False
    ), row=1, col=col)

fig.update_xaxes(title='Cluster')
fig.update_yaxes(title='N. Features')

fig.show()
with open(file, 'a') as f:
    f.write(fig.to_html(full_html=False, include_plotlyjs='cdn', default_height='50%', default_width='80%'))

Module Eigenvector

In [27]:
from sklearn.decomposition import PCA

def _fit_first_pc(data, communities):
    """Fit a one-component PCA on the columns of `data` belonging to each community."""
    return [PCA(n_components=1).fit(data[members]) for members in communities]


qqcomPCA = _fit_first_pc(xq, qqcom)
mmcomPCA = _fit_first_pc(xm, mmcom)

fig = make_subplots(rows=1, cols=2, subplot_titles=['Proteomics', 'Metabolomics'])

# (fitted PCAs, bar width) for the left and right panel respectively.
for col, (fits, bar_width) in enumerate([(qqcomPCA, 0.1), (mmcomPCA, 0.2)], start=1):
    fig.add_trace(go.Bar(
        x=[str(k) for k in range(len(fits))],
        y=[fit.explained_variance_ratio_[0] for fit in fits],
        width=bar_width, showlegend=False
    ), row=1, col=col)

fig.update_yaxes(range=(0,1))
fig.update_xaxes(title='Cluster')
fig.update_layout(title='Ratio of Explained Variance in the 1st PC')

In [28]:
def _first_pc_scores(data, communities):
    """Project `data` on the first principal component of each community.

    Returns a DataFrame indexed like `data` with one integer-labelled column
    per community, in the order of `communities`.
    """
    scores = [
        PCA(n_components=1).fit_transform(data[members])[:, 0]
        for members in communities
    ]
    return pd.DataFrame(np.array(scores).T, index=data.index)


qqcomE = _first_pc_scores(xq, qqcom)

mmcomE = _first_pc_scores(xm, mmcom)

In [33]:
# PCA_Var is a project helper (module PCA_UMAP). From its use here it returns
# one row per principal component relating it to the listed metadata
# variables — TODO confirm the exact contents of the rows and columns.
from PCA_UMAP import PCA_Var

# Metadata variables passed to PCA_Var: categorical and continuous.
catVars = ['Group', 'Smoke_dummy']
conVars = ['Calcium_Score', 'HDL', 'LDL', 'Total_Cholesterol','Ox-LDL','Lipoprotein a',
           'CRP', 'Plaque_thickness','Framingham 10y','Framingham 30y','Systolic Blood Pressure',
           'Diastolic Blood Pressure']

# One PCA_Var call per protein community; after reset_index + .T the columns
# are community numbers and the rows are the fields returned by PCA_Var.
qqCP = pd.concat([
    PCA_Var(xq[com], mdata, conVars, catVars+['Cohort'], n_comp=1)
    for com in qqcom
]).reset_index(drop=True).T

# NOTE(review): metabolomics asks for 2 components and keeps only the row
# labelled 1, whereas proteomics uses n_comp=1 and keeps every row — confirm
# the asymmetry is intended.
mmCP = pd.concat([
    PCA_Var(xm[com], mdata, conVars, catVars+['Cohort'], n_comp=2).loc[[1]]
    for com in mmcom
]).reset_index(drop=True).T


# Apply FDR B-H correction

from statsmodels.stats.multitest import multipletests

# The first row of the transposed table is dropped (presumably not a p-value —
# verify). After .T each column is one variable and each row one community,
# so the correction runs across communities, separately per variable.
qqCPF = qqCP.iloc[1:, :].copy().T
for i in qqCPF.columns:
    qqCPF[i] = multipletests(qqCPF.loc[:,i], method='fdr_bh')[1]

# Same correction for the metabolite communities.
mmCPF = mmCP.iloc[1:, :].copy().T
for i in mmCPF.columns:
    mmCPF[i] = multipletests(mmCPF.loc[:,i], method='fdr_bh')[1]

In [47]:
# Display the adjusted p-values (variables as rows) for metabolite communities
# from column position 20 onwards.
mmCPF.T.iloc[:, 20:]
# qqCPF.T

Unnamed: 0,20,21,22,23,24,25,26
Calcium_Score,0.705968,0.56114,0.844143,0.899737,0.01402366,0.1229169,0.172749
HDL,0.803443,0.412954,0.000827,0.51684,0.0275845,4.0809090000000003e-25,0.239873
LDL,0.125047,0.918246,0.012796,0.06442,0.3407435,0.6210057,0.584602
Total_Cholesterol,0.409302,0.832272,0.014868,0.002528,0.9456099,0.4280639,0.163252
Ox-LDL,0.592801,0.346262,0.203693,0.551642,0.5516425,0.004181557,0.070367
Lipoprotein a,0.991453,0.991453,0.991453,0.991453,0.9914532,0.9914532,0.991453
CRP,0.466807,0.428123,0.715413,0.540044,0.009919688,0.09837584,0.357072
Plaque_thickness,0.887347,0.173884,0.952892,0.887347,0.0002737114,0.01345252,0.000241
Framingham 10y,0.66994,0.081856,0.065669,0.081856,4.523264e-06,1.673757e-15,0.007375
Framingham 30y,0.104415,0.084322,0.073397,0.112169,6.85002e-08,6.891413e-17,4.3e-05


In [94]:
# Metadata re-indexed by 'Seqn' and restricted/ordered to the rows of xm.
mdatam = mdata.set_index('Seqn').copy().loc[xm.index.tolist()]

# Row labels of each group, then the per-feature mean of xm within each group.
group_ids = {label: mdatam.index[mdatam.Group == label].tolist() for label in ('D', 'C')}
group_means = {label: xm.loc[ids].mean() for label, ids in group_ids.items()}

# Mean difference (D minus C) for the features of metabolite community 6.
(group_means['D'] - group_means['C'])[mmcom[6]]

#xm[mmcom[8]].loc[mdata[(mdata.set_index('Seqn')['Group']=='C').tolist()].Seqn]


C18N146   -0.015548
C18N91    -0.069683
C18N112    0.022390
C18N136   -0.053705
C18N39    -0.012509
C18N61     0.074186
C18N63     0.071671
C18N96    -0.044834
C18N6     -0.044128
C18N4     -0.070231
C18N1     -0.023281
C18N193   -0.069115
dtype: float64

In [46]:
# PC1 of protein clusters qq1 and qq3 separates the groups: draw a violin plot
# of the projection per group.

# reset_index(names='Seqn') is used instead of reset_index('Seqn'): the index
# of qqcomE comes from xq, whose index column is 'seqn', so a level called
# 'Seqn' does not exist. This also matches the metabolite version of the plot.
group_meta = dict(list(
    mdata.set_index('Seqn').loc[qqcomE.index].reset_index(names='Seqn').groupby('Group')
))
long_scores = pd.melt(
    qqcomE.reset_index(names='Seqn'), id_vars='Seqn', value_vars=qqcomE.columns
).set_index('Seqn')

fig = go.Figure()
# Split violins: group C on the left half, group D on the right half.
for group, side, colour in [('C', 'negative', palette[0]), ('D', 'positive', palette[1])]:
    rows = long_scores.loc[group_meta[group].Seqn]
    fig.add_trace(go.Violin(x=[str(cluster) for cluster in rows.variable],  # categorical axis
                            y=rows.value,
                            legendgroup=group, scalegroup=group, name=group,
                            side=side,
                            line_color=colour)
                 )

fig.update_traces(box_visible=False, meanline_visible=True)
fig.update_layout(violinmode='overlay', title='PC1 Projection Distribution per Group')
fig.update_yaxes(title='PC1', range=(-12,12))
fig.update_xaxes(title='Protein Cluster')
fig.show()

In [97]:
# Violin plot of the PC1 projection of every metabolite cluster, split by group.

# Column positions of the metabolite clusters to draw (all of them).
clusters = list(range(len(mmcom)))
mmcomE_tmp = mmcomE.iloc[:, clusters]

tmp = dict(list(mdata.set_index('Seqn').loc[mmcomE_tmp.index].reset_index(names='Seqn').groupby('Group')))
tmp2 = pd.melt(mmcomE_tmp.reset_index(names='Seqn'), id_vars='Seqn', value_vars=mmcomE_tmp.columns).set_index('Seqn')

fig = go.Figure()
# Split violins: group C on the left half, group D on the right half.
for group, side, colour in [('C', 'negative', palette[0]), ('D', 'positive', palette[1])]:
    rows = tmp2.loc[tmp[group].Seqn]
    fig.add_trace(go.Violin(x=[str(cluster) for cluster in rows.variable],
                            y=rows.value,
                            legendgroup=group, scalegroup=group, name=group,
                            side=side,
                            line_color=colour)
                 )

fig.update_traces(box_visible=False, meanline_visible=True)
fig.update_layout(violinmode='overlay', title='PC1 Projection Distribution per Group')
fig.update_yaxes(title='PC1', range=(-12,12))
# This figure shows metabolite clusters; the label was copied from the protein plot.
fig.update_xaxes(title='Metabolite Cluster')
fig.show()

In [50]:
# atable = q2i.loc[qqcom[0]]
# atable = m2i.loc[mmcom[9], ['ID', 'TP_ID']]

In [43]:
# Add a display-name column to q2i (in place). It currently copies the full
# description; the commented-out expression would instead extract the text
# between 'HUMAN ' and ' OS='.
q2i['qName'] = q2i.qdesc #[re.search(r'HUMAN (.*) OS=', i).groups()[0] for i in q2i.qdesc]

In [48]:
def _edge_table(graph, source_names, target_names):
    """Return one row per edge of `graph`: (source name, target name, weight).

    `source_names` / `target_names` are Series mapping node id -> display name
    for the first and second endpoint of each edge respectively.
    """
    return pd.DataFrame([
        (source_names.loc[u], target_names.loc[v], data['weight'])
        for u, v, data in graph.edges.data()
    ])


# The context manager saves and closes the workbook even if building one of
# the sheets fails (same pattern as the per-cluster Excel export).
with pd.ExcelWriter('Clusters/DCA.xlsx', engine='openpyxl', mode='w') as writer:
    _edge_table(qqg, q2i['qName'], q2i['qName']).to_excel(writer, sheet_name='q')
    _edge_table(mmg, m2i['TP_ID'], m2i['TP_ID']).to_excel(writer, sheet_name='m')
    # NOTE(review): assumes every qmg edge is reported protein-first and that
    # its node labels exist in q2i / m2i — confirm.
    _edge_table(qmg, q2i['qName'], m2i['TP_ID']).to_excel(writer, sheet_name='qm')

Pertenencia de cada nodo a su cluster

In [44]:
def _community_membership(eigengenes, data, communities):
    """Correlate every community member with its community's PC1 scores.

    Returns a frame indexed by feature id with columns 'ComCorr' (Pearson r),
    'ComPval' (its p-value) and 'Com' (position of the community in
    `communities`).
    """
    stats = {}
    for com_id, members in enumerate(communities):
        for feature in members:
            r_value, p_value = pearsonr(eigengenes[com_id], data[feature])
            stats[feature] = {'ComCorr': r_value, 'ComPval': p_value, 'Com': com_id}
    return pd.DataFrame(stats).T


# Annotation tables extended with the membership columns; features outside any
# community keep NaN in them (left join).
q2iC = q2i.join(_community_membership(qqcomE, xq, qqcom), how='left')

m2iC = m2i.join(_community_membership(mmcomE, xm, mmcom), how='left')

In [43]:
# One TSV per community with the annotation rows of its members.

q_assigned = q2iC[~q2iC['Com'].isna()]
for com_id, members in q_assigned.groupby('Com'):
    members.to_csv(f'Clusters/qq_{int(com_id)}.tsv', sep='\t')

m_assigned = m2iC[~m2iC['Com'].isna()]
for com_id, members in m_assigned.groupby('Com'):
    members.to_csv(f'Clusters/mm_{int(com_id)}.tsv', sep='\t')

In [45]:
# One workbook per omic, with one sheet per community.

q_assigned = q2iC[~q2iC['Com'].isna()]
with pd.ExcelWriter(f'Clusters/qq_allClusters.xlsx') as writer:
    for com_id, members in q_assigned.groupby('Com'):
        members.to_excel(writer, sheet_name=f'qq_{int(com_id)}')

m_assigned = m2iC[~m2iC['Com'].isna()]
with pd.ExcelWriter(f'Clusters/mm_allClusters.xlsx') as writer:
    for com_id, members in m_assigned.groupby('Com'):
        members.to_excel(writer, sheet_name=f'mm_{int(com_id)}')
    

Distribucion de Protein-Metabolite por proteina y clase de metabolito

In [49]:
# Working copy of the rCCA 'qm' matrix; the plots below index it by protein id
# (rows) and metabolite feature id (columns).
rcca = corr.rcca.qm.a.copy()

In [50]:
# Annotation rows for the features present in xm, in xm's column order.
# NOTE(review): the cells visible below keep using `m2i`, not `m2im` — confirm
# which table was intended.
m2im = m2i.loc[xm.columns]

In [64]:
# (protein accession, display name) of the apolipoproteins to plot.
apos = [
    ('P04114', 'ApoB-100'),
    ('P02649', 'ApoE'),
    ('P02654', 'ApoC-I'),
    ('P02655', 'ApoC-II'),
    ('P02647', 'ApoA-I'),
    ('P02652', 'ApoA-II'),
    ('P05090', 'ApoD'),
    ('Q13790', 'ApoF'),
]

# Pattern selecting diglyceride (DG) annotations in TP_ID.
lre = r"LMSD{ DG "

fig = make_subplots(rows=2, cols=4, subplot_titles=list(zip(*apos))[1], shared_yaxes=True, vertical_spacing=0.1)

# The DG selection does not depend on the protein, so compute it once.
dg_ids = m2i.index[[bool(re.search(lre, tp_id)) for tp_id in m2i.TP_ID]]

for n, (protein_id, _name) in enumerate(apos):
    # Skip proteins without correlations (same guard as the lipid-class plot
    # below); their panel stays empty.
    if protein_id not in rcca.index: continue
    row, col = 1+n//4, 1+n%4

    # All metabolites, jittered around x=1.
    all_corr = rcca.loc[protein_id]
    fig.add_trace(go.Scatter(
        y = all_corr, x = np.random.uniform(0.5,1.5, len(all_corr)),
        mode='markers', marker_size=1, marker_color=palette[0], showlegend=False
    ), row=row, col=col)

    # DG metabolites only, jittered around x=3. The jitter vector is sized to
    # the subset (it used to be sized to every metabolite).
    dg_corr = rcca.loc[protein_id, dg_ids]
    fig.add_trace(go.Scatter(
        y = dg_corr,
        x = np.random.uniform(2.8,3.2, len(dg_corr)),
        mode='markers', marker_size=3, marker_color=palette[1], showlegend=False
    ), row=row, col=col)

fig.update_layout(height=800)
fig.update_xaxes(tickvals=[1,3], ticktext=['All', 'DG'], range=(0,4))
fig.update_yaxes(range=(-0.3,0.3))

fig.show()




In [62]:
# (protein accession, display name) of the proteins to plot.
apos = [
    ('P00738', 'HPT'),
    ('P01833', 'PIGR'),
    ('P05546', 'HEP2'),
    ('P08519', 'LPA'),
    ('P01877', 'IGHA2'),
    ('P01834', 'IGKC'),
]

# (TP_ID pattern, axis label) per lipid class. The LPC entry previously reused
# the LPE pattern, so the 'LPC' column was plotting LPE features.
lre = [
    (r"LMSD{ PE [0-9]", 'PE'),
    (r"LMSD{ PC [0-9]", 'PC'),
    (r"LMSD{ LPE [0-9]", 'LPE'),
    (r"LMSD{ LPC [0-9]", 'LPC')
]

fig = make_subplots(rows=2, cols=3, subplot_titles=list(zip(*apos))[1], shared_yaxes=True, vertical_spacing=0.1)

# Feature ids of each lipid class; independent of the protein, so computed once.
class_ids = [
    m2i.index[[bool(re.search(pattern, tp_id)) for tp_id in m2i.TP_ID]]
    for pattern, _label in lre
]

for n, (protein_id, _name) in enumerate(apos):
    # Proteins without correlations leave their panel empty.
    if protein_id not in rcca.index: continue
    row, col = 1+n//3, 1+n%3

    # All metabolites, jittered around x=1.
    all_corr = rcca.loc[protein_id]
    fig.add_trace(go.Scatter(
        y = all_corr, x = np.random.uniform(0.5,1.5, len(all_corr)),
        mode='markers', marker_size=1, marker_color=palette[0], showlegend=False
    ), row=row, col=col)

    # One jittered column per lipid class at x = 2, 3, 4, 5. The jitter vector
    # is sized to the subset (it used to be sized to every metabolite).
    for m, ids in enumerate(class_ids):
        class_corr = rcca.loc[protein_id, ids]
        fig.add_trace(go.Scatter(
            y = class_corr,
            x = np.random.uniform(m+1.8,m+2.2, len(class_corr)),
            mode='markers', marker_size=3, marker_color=palette[m+1], showlegend=False
        ), row=row, col=col)

fig.update_layout(height=800)
fig.update_xaxes(tickvals=[1,2,3,4,5], ticktext=['All', *list(zip(*lre))[1]], range=(0,6))
fig.update_yaxes(range=(-0.3,0.3))

fig.show()





In [66]:
# Metadata indexed by 'Seqn', restricted and ordered to match the rows of xm.
mdatam = mdata.set_index('Seqn').copy()
mdatam = mdatam.loc[xm.index.tolist()]

In [67]:
# (TP_ID pattern, axis label) per lipid class. The LPC entry previously reused
# the LPE pattern, so the 'LPC' column was plotting LPE features.
lre = [
    (r"LMSD{ PE [0-9]", 'PE'),
    (r"LMSD{ PC [0-9]", 'PC'),
    (r"LMSD{ LPE [0-9]", 'LPE'),
    (r"LMSD{ LPC [0-9]", 'LPC')
]

fig = go.Figure()

# Per-feature difference of group means (D minus C), computed once for all
# features and sliced per lipid class below.
case_ids = mdatam.loc[mdatam.Group == 'D'].index
control_ids = mdatam.loc[mdatam.Group == 'C'].index
mean_diff = xm.loc[case_ids, :].mean() - xm.loc[control_ids, :].mean()

# All features, jittered around x=1.
points = mean_diff
fig.add_trace(go.Scatter(
    y = points,
    x = np.random.uniform(0.5,1.5, len(points)),
    mode='markers', marker_size=1, marker_color=palette[0], showlegend=False
))

# One jittered column per lipid class at x = 2, 3, 4, 5.
for m,(lre_i, lre_j) in enumerate(lre):
    myL = m2i.TP_ID[[bool(re.search(lre_i, tp_id)) for tp_id in m2i.TP_ID]].index

    points = mean_diff.loc[myL]

    fig.add_trace(go.Scatter(
        y = points,
        x = np.random.uniform(m+1.8,m+2.2, len(points)),
        mode='markers', marker_size=3, marker_color=palette[m+1], showlegend=False
    ))

fig.update_layout(width=700)
fig.update_xaxes(tickvals=[1,2,3,4,5], ticktext=['All', *list(zip(*lre))[1]], range=(0,6))

fig.show()


