In [15]:
#
# Import Libraries
#

import numpy as np
import os
import pandas as pd
import sys
from dotmap import DotMap

from plotly.subplots import make_subplots
import plotly.graph_objects as go
from plotly.colors import n_colors

# Folder with project-local helper modules (presumably where the PlotEDA,
# myComBat and PCA_UMAP modules imported later live -- TODO confirm).
utilsPath = r'S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\utils'

# Register the folder only once so re-running the cell does not pile up duplicates.
if sys.path.count(utilsPath) == 0:
    sys.path.append(utilsPath)


In [16]:
#
# Constants
#

# Short codes of the acquisition modes handled by this notebook.
modes = ['cp', 'cn', 'hp']

# Root folder holding the original metabolomics files.
bpath = r"S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Metabolomics\PESA_V2\OriginalFiles"

# Folder with the shift-corrected tables of each mode. Every mode uses the same
# layout below `bpath`; only the platform folder name changes.
xm_path = DotMap({
    mode: os.path.join(bpath, 'LOESS', platform, 'statTarget/shiftCor/After_shiftCor/')
    for mode, platform in (('cp', 'C18P'), ('cn', 'C18N'), ('hp', 'HILP'))
})

# Tab-separated sample metadata table.
mdata_path = r'S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Metadata\PESA_V2\WorkingFiles\main_metadata.tsv'

In [17]:
#
# Read QC and plot values distribution
#

# Shift-corrected QC table of every mode; the first CSV column is discarded.
qc = DotMap({
    i: pd.read_csv(os.path.join(xm_path[i], 'shift_QC_cor.csv')).iloc[:, 1:]
    for i in modes
})

file = 'Plots/LOESS_QC.html'
# The report folder is not guaranteed to exist on a fresh checkout; without it
# the open() below raises FileNotFoundError.
os.makedirs(os.path.dirname(file), exist_ok=True)

# Opening the report once in write mode discards any previous version and
# collects the figures of all modes in a single file.
with open(file, 'w', encoding='utf-8') as report:
    for i in modes:
        batches = list(qc[i].groupby('batch'))

        # One colour per batch, interpolated between the two end colours.
        colors = n_colors('rgb(5, 200, 200)', 'rgb(200, 10, 10)', len(batches), colortype='rgb')

        fig = make_subplots(rows=1, cols=2, subplot_titles=[f'QC batch distribution | Mode: {i}', 'All batches'])

        # Left panel: one half-violin per batch. `.iloc[:, 2:]` skips the first
        # two columns (presumably the 'sample' and 'batch' labels -- TODO confirm).
        for (batch, batch_qc), color in zip(batches, colors):
            fig.add_trace(go.Violin(
                x=batch_qc.iloc[:, 2:].to_numpy().flatten()/1000,
                side='positive', line_color=color, points=False, width=3, name=batch
            ), row=1, col=1)

        # Right panel: all batches pooled together.
        fig.add_trace(go.Violin(
            x=qc[i].iloc[:, 2:].to_numpy().flatten()/1000,
            side='positive', points=False, line_color='black', showlegend=False, name=''
        ), row=1, col=2)

        # Reference line at 1, i.e. a raw value of 1000 after the /1000 rescaling.
        fig.add_vline(x=1, line_width=0.2, line_dash='dash')
        fig.update_xaxes(range=(0, 2))

        #fig.show()
        report.write(fig.to_html(full_html=False, include_plotlyjs='cdn', default_height='50%', default_width='80%'))

In [18]:
#
# Read data per platform and adapt
#

# Sample metadata; only the Seqn <-> MetaboCode mapping is needed for the merge.
mdata = pd.read_csv(mdata_path, sep='\t')
seqn_lookup = mdata[['Seqn', 'MetaboCode']]

# One intensity matrix per mode, indexed by Seqn and rescaled by 1/1000.
xm = DotMap()
for mode in modes:
    raw = pd.read_csv(os.path.join(xm_path[mode], 'shift_sample_cor.csv'))

    # Keep the second '_'-separated token of the sample label; it is matched
    # against MetaboCode below.
    raw['sample'] = [parts[1] for parts in raw['sample'].str.split('_')]

    merged = pd.merge(
        seqn_lookup,
        raw,
        how='inner', left_on='MetaboCode', right_on='sample'
    )
    merged = merged.drop(['MetaboCode', 'sample', 'class'], axis=1)
    xm[mode] = merged.set_index('Seqn')/1000

In [19]:
# for i in modes:
#     tmp = [i.split('_')[1] for i in xm[i]['sample']]
#     tmp = [i if i[0]!='0' else i[1:] for i in tmp]
#     xm[i].index = mdata.set_index('Name').loc[tmp]['Seqn']
#     xm[i] = xm[i].drop(['sample', 'class'], axis=1)/1000


# Change column names
# Feature annotation tables, one workbook sheet per mode. The workbook is
# parsed once and the three sheets are returned together (a list passed as
# `sheet_name` yields a dict keyed by sheet name) instead of re-reading the
# same file for every mode. The HILN sheet ('hn') is currently not loaded.
f2i_sheets = {'cp': 'C18P', 'cn': 'C18N', 'hp': 'HILP'}
f2i_book = pd.read_excel('../OriginalFiles/RBR_f2i.xlsx', sheet_name=list(f2i_sheets.values()))
f2i = DotMap({mode: f2i_book[sheet] for mode, sheet in f2i_sheets.items()})

for i in modes:
    # Replace the feature names used as column labels by their 'fid'.
    xm[i].columns = f2i[i].set_index('Name').loc[
        xm[i].columns
    ]['fid']

    # Keep only the annotation rows of the features present in the matrix,
    # indexed by 'fid' and in the same order as the matrix columns.
    f2i[i] = f2i[i].set_index('fid').loc[xm[i].columns]

In [20]:
#
# PCA Quality Control
#

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from plotly import graph_objects as go

file = 'Plots/PCA_QC.html'
# The report folder is not guaranteed to exist on a fresh checkout.
os.makedirs(os.path.dirname(file), exist_ok=True)

# A single write-mode handle replaces the previous report and receives the
# figure of every mode.
with open(file, 'w', encoding='utf-8') as report:
    for i in modes:

        # QC table transposed to features x QC injections, on the same /1000
        # scale as the sample matrices.
        qc_features = qc[i].drop('batch', axis=1).set_index('sample').T/1000

        # Relabel the QC features from 'Name' to 'fid' so they line up with
        # the columns of xm[i].
        qc_features.index = f2i[i].reset_index(names='fid').set_index('Name').loc[qc_features.index]['fid']

        # Rows: study samples followed by QC injections; only features present
        # in both tables are kept.
        combined = xm[i].T.join(
            qc_features,
            how='inner'
        ).T

        # log2 transform, then centre and scale every feature.
        scaled = pd.DataFrame(
            StandardScaler().fit_transform(np.log2(combined)),
            index=combined.index, columns=combined.columns
        )

        pca = PCA(n_components=2)
        pca.fit(scaled.to_numpy())
        scores = pd.DataFrame(
            pca.transform(scaled.to_numpy()),
            index=scaled.index
        )

        fig = go.Figure()

        # One scatter trace per batch of study samples.
        for b, s in mdata.groupby('batch'):
            fig.add_trace(go.Scatter(
                x = scores.loc[s.Seqn, 0],
                y = scores.loc[s.Seqn, 1],
                mode='markers', marker=dict(size=3), name=b,
            ))

        # QC injections as a separate trace.
        fig.add_trace(go.Scatter(
            x = scores.loc[qc[i]['sample'], 0],
            y = scores.loc[qc[i]['sample'], 1],
            mode='markers', marker=dict(size=3), name='QC'
        ))

        fig.update_xaxes(title=f'PCA 1 ({round(pca.explained_variance_ratio_[0], 4)})')
        fig.update_yaxes(title=f'PCA 2 ({round(pca.explained_variance_ratio_[1], 4)})')
        fig.update_layout(width=700, title=f'{i}')

        report.write(fig.to_html(full_html=False, include_plotlyjs='cdn', default_height='50%', default_width='80%'))


In [21]:
#
# Missing values | They were imputed using KNN so it must be 0
#

# Print, for every mode, the total number of NaN cells in its matrix.
for mode in modes:
    n_missing = xm[mode].isna().sum().sum()
    print(f"{mode}: {n_missing}")

cp: 0
cn: 0
hp: 0


In [22]:
#
# Number of observations & features per mode
#

# (rows, columns) of every per-mode matrix, collected once.
shape_by_mode = [(mode, xm[mode].shape) for mode in modes]

# First the number of rows (observations) of each mode ...
for mode, dims in shape_by_mode:
    print(f'{mode} | N. observations {dims[0]}')
print()
# ... then the number of columns (features).
for mode, dims in shape_by_mode:
    print(f'{mode} | N. features {dims[1]}')

cp | N. observations 440
cn | N. observations 440
hp | N. observations 440

cp | N. features 411
cn | N. features 387
hp | N. features 1264


In [23]:
#
# What was lost in C18P? (Cohort/Group counts: full metadata vs. samples kept in C18P)
#

# Cohort/Group labels of every sample, indexed by Seqn.
design = mdata.set_index('Seqn')[['Cohort', 'Group']]
group_keys = ['Cohort', 'Group']

# Counts over the whole metadata table ...
print(design.groupby(group_keys).size())
print()
# ... and over the samples that are present in the C18P matrix.
print(design.loc[xm.cp.index].groupby(group_keys).size())

Cohort  Group
2       C        60
        D        60
3       C        58
        D        60
4       C        51
        D        51
5       C        51
        D        49
dtype: int64

Cohort  Group
2       C        60
        D        60
3       C        58
        D        60
4       C        51
        D        51
5       C        51
        D        49
dtype: int64


In [24]:
#
# Join the three matrices
#

from functools import reduce

# Inner-join the per-mode matrices on their index into one wide matrix.
# The result is stored back in `xm`, which turns it from the per-mode container
# into a DataFrame; without the guard a second run of this cell would fail on
# xm['cp'] with a KeyError. With the guard a re-run is a no-op.
if not isinstance(xm, pd.DataFrame):
    xm = reduce(lambda l, r: l.join(r, how='inner'), [xm[i] for i in modes])

In [25]:
# Hex colours used by the plots below (they appear to match the first eight
# colours of Plotly's default qualitative palette -- TODO confirm).
palette = [
    '#636EFA',
    '#EF553B',
    '#00CC96',
    '#AB63FA',
    '#FFA15A',
    '#19D3F3',
    '#FF6692',
    '#B6E880',
]

Plot data distribution per platform

In [26]:
#
# Plot data distribution per platform
#

from sklearn.preprocessing import StandardScaler

# Report that plotDataDist() appends to.
file = 'Plots/DataDistribution.html'

# The report is opened in append mode later on, so the folder must exist and
# any report left over from a previous run has to be removed first.
os.makedirs(os.path.dirname(file), exist_ok=True)
if os.path.exists(file):
    os.remove(file)

def plotDataDist(f, i):
    """Append a three-panel histogram of the selected features to the report.

    The panels show the values of the notebook-level matrix `xm` restricted to
    the columns `f`: as they are, after log2, and after log2 followed by
    StandardScaler. The figure is appended as an HTML fragment to the
    notebook-level path `file`.

    Parameters
    ----------
    f : column labels of `xm` to include.
    i : figure title.
    """
    selected = xm.loc[:, f]
    logged = np.log2(selected)

    # Flattened values of each panel, in left-to-right order.
    panel_values = [
        selected.to_numpy().flatten(),
        logged.to_numpy().flatten(),
        StandardScaler().fit_transform(logged).flatten(),
    ]

    fig = make_subplots(rows=1, cols=3, shared_yaxes=True, subplot_titles=['LOESS', 'LOESS+log2', 'LOESS+log2+CenterScal'])

    for column, values in enumerate(panel_values, start=1):
        fig.add_trace(go.Histogram(
            x=values,
            xbins={'size':0.1}, opacity=0.7, marker_color=palette[0],showlegend=False, histnorm='probability density'
        ), row=1, col=column)

    fig.update_xaxes(range=(-5,5))
    fig.update_layout(title=i)
    #fig.show()

    # A separate name for the handle keeps the parameter `f` intact.
    with open(file, 'a') as report:
        report.write(fig.to_html(full_html=False, include_plotlyjs='cdn', default_height='50%', default_width='80%'))


# One figure per mode (restricted to that mode's features) plus one figure
# with every column of the joined matrix.
panels = [(f2i[mode].index, mode) for mode in modes]
panels.append((xm.columns, 'All'))

for features, title in panels:
    plotDataDist(features, title)

In [27]:
#
# Normalize data
# 

# log2 transform followed by per-column centring/scaling; the row and column
# labels of `xm` are restored because the scaler returns a plain array.
log_intensities = np.log2(xm)
scaled_values = StandardScaler().fit_transform(log_intensities)
xmn = pd.DataFrame(scaled_values, index=xm.index, columns=xm.columns)

In [28]:
#
# Check Batch Effect by Cohort
#

from PlotEDA import PlotEDA
# Report for the normalised (not yet batch-corrected) matrix; a report left
# over from a previous run is deleted first.
file = 'Plots/CohortBatchEffect.html'
if os.path.exists(file):
    os.remove(file)

# Same plot for each grouping variable; a new PlotEDA object is built per
# grouping, as in the original cell.
for grouping in ('batch', 'Cohort', 'Group'):
    plotEDA = PlotEDA(xmn, mdata, file=file)
    plotEDA.plotByGroup(grouping, vl1=[0], vl2=[1], r1=(-5,5), r2=(-5,5), binsize=0.01, plotN=True)

In [29]:
# Categorical metadata variables passed to myComBat and PCA_Var.
catVars = [
    'Group',
    'Smoke_dummy',
]

# Continuous metadata variables passed to myComBat and PCA_Var.
conVars = [
    'Calcium_Score',
    'HDL',
    'LDL',
    'Total_Cholesterol',
    'Ox-LDL',
    'Lipoprotein a',
    'CRP',
    'Plaque_thickness',
]

In [30]:
#
# Adjust Cohort Batch Effect
#

from myComBat import myComBat

# Correct the normalised matrix for 'Cohort', passing the categorical and
# continuous variables declared above. The cell output shows the R 'sva'
# package being loaded; 'myRData' is handed over as Rpath
# (NOTE(review): presumably a scratch folder for the R exchange -- confirm).
xmnb = myComBat(
    xmn, mdata, 'Cohort', catVars, conVars,
    Rpath='myRData',
)

Loading required package: mgcv
Loading required package: nlme
This is mgcv 1.8-41. For overview type 'help("mgcv-package")'.
Loading required package: genefilter
Loading required package: BiocParallel
1: package 'sva' was built under R version 4.2.1 
2: package 'mgcv' was built under R version 4.2.2 
3: package 'nlme' was built under R version 4.2.2 
4: package 'genefilter' was built under R version 4.2.2 
5: package 'BiocParallel' was built under R version 4.2.2 
Found4batches
Adjusting for10covariate(s) or covariate level(s)
Standardizing Data across genes
Fitting L/S model and finding priors
Finding nonparametric adjustments
Adjusting the Data




In [31]:
#
# Check Batch Effect by Cohort after ComBat correction
#

from PlotEDA import PlotEDA
# Report for the ComBat-corrected matrix; a report left over from a previous
# run is deleted first.
file = 'Plots/CohortBatchEffectComBat.html'
if os.path.exists(file):
    os.remove(file)

# Every view of this report uses the corrected matrix `xmnb`. The 'batch' view
# used to be built from the uncorrected `xmn`, which merely repeated the plot
# already written to the pre-correction report.
for grouping in ('batch', 'Cohort', 'Group'):
    plotEDA = PlotEDA(xmnb, mdata, file=file)
    plotEDA.plotByGroup(grouping, vl1=[0], vl2=[1], r1=(-5,5), r2=(-5,5), binsize=0.01, plotN=True)

In [32]:
#
# Write Xm normalised
#

# Save the batch-corrected matrix as a tab-separated file in the working folder.
xmnb.to_csv(path_or_buf='Xm_norm.tsv', sep='\t')

In [33]:
#
# Dimensionality Reduction
#

from PCA_UMAP import PCA_UMAP, PCA_Var

# Report with the projections before and after batch correction; a report
# left over from a previous run is deleted first.
file = 'Plots/PCA.html'
if os.path.exists(file):
    os.remove(file)

# (matrix, extra keyword arguments): the uncorrected matrix is plotted with
# the default title, the corrected one with an explicit title label.
reduction_runs = (
    (xmn, {}),
    (xmnb, {'titleLabel': '- Batch Corrected'}),
)

for matrix, title_kwargs in reduction_runs:
    pcaumap = PCA_UMAP(matrix, mdata, file=file)
    for variable in ('Cohort', 'Group', 'batch'):
        pcaumap.plotReduction(variable, pcacomp=[0,1], **title_kwargs)

In [34]:
# Per-component summary for the matrix before batch correction, rounded to
# four decimals (the displayed table holds '%Var PCA' plus one column per
# variable; the per-variable values look like p-values -- TODO confirm).
pca_categorical = catVars + ['Cohort', 'batch']
PCA_Var(xmn, mdata, conVars, pca_categorical, n_comp=10).round(4)

Unnamed: 0,%Var PCA,Calcium_Score,HDL,LDL,Total_Cholesterol,Ox-LDL,Lipoprotein a,CRP,Plaque_thickness,Group,Smoke_dummy,Cohort,batch
1,6.1619,0.9735,0.0794,0.4393,0.0771,0.5253,0.507,0.4709,0.2913,0.2457,0.9534,0.0055,0.0
2,5.7384,0.1041,0.2297,0.2229,0.2237,0.3594,0.5957,0.0041,0.9481,0.1812,0.0019,0.0104,0.0
3,4.4225,0.7059,0.031,0.0279,0.001,0.1753,0.9814,0.0592,0.7504,0.3727,0.5277,0.0002,0.0
4,3.5413,0.0677,0.0,0.0344,0.0,0.1487,0.0711,0.0499,0.9869,0.9171,0.4852,0.1815,0.0
5,3.1277,0.0008,0.0,0.4117,0.0016,0.0023,0.1273,0.0003,0.0,0.0,0.0,0.0,0.0
6,2.6267,0.6673,0.0046,0.0137,0.0008,0.3907,0.7804,0.4205,0.9663,0.2793,0.8034,0.0,0.0
7,2.3642,0.0653,0.1138,0.002,0.0057,0.6764,0.5665,0.1856,0.6089,0.7577,0.0001,0.0001,0.0
8,2.0458,0.1162,0.7958,0.0214,0.156,0.1286,0.773,0.1527,0.085,0.0368,0.0,0.0,0.0
9,1.8339,0.4325,0.0,0.0263,0.0182,0.0108,0.9246,0.0093,0.3333,0.8795,0.2356,0.0,0.0
10,1.7141,0.9256,0.7452,0.2573,0.8695,0.4045,0.4531,0.8144,0.7779,0.9308,0.285,0.0151,0.0


In [35]:
# Reload the batch-corrected matrix from its tab-separated checkpoint,
# using the 'Seqn' column as index.
xmnb = pd.read_table('Xm_norm.tsv', index_col='Seqn')

In [36]:
from PCA_UMAP import PCA_Var
# Same per-component summary for the batch-corrected matrix, shown transposed
# (one column per component).
pca_categorical = catVars + ['Cohort', 'batch']
PCA_Var(xmnb, mdata, conVars, pca_categorical, n_comp=10).T

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
%Var PCA,6.054992,5.390977,4.446299,3.672196,3.137663,2.618348,2.2502,1.9581,1.857443,1.691875
Calcium_Score,0.132082,0.8410555,0.1749749,0.2208568,0.0009772334,0.073354,0.443294,0.5044842,0.756365,0.1635421
HDL,0.0834842,0.1157649,0.1716224,1.398601e-12,4.777742e-07,0.917339,1.367418e-06,0.5667187,0.841255,0.02482104
LDL,0.1555875,0.2340389,0.07220988,0.02199731,0.5559692,3.7e-05,0.6121263,0.37259,0.048886,0.3471312
Total_Cholesterol,0.04755602,0.01456454,0.01446423,1.189936e-05,0.0014982,3.2e-05,0.9742737,0.9853762,0.343291,0.1438837
Ox-LDL,0.4528728,0.4509465,0.2968889,0.2384064,0.001961735,0.366572,0.2141657,0.1301176,0.353665,0.3813029
Lipoprotein a,0.8644367,0.4450226,0.8437207,0.06551741,0.2231576,0.678526,0.7974871,0.5522727,0.691924,0.1895765
CRP,0.01240818,0.6445078,0.06702184,0.00735127,5.821384e-05,0.326823,0.6737758,0.2685443,0.637951,0.9587967
Plaque_thickness,0.8387849,0.6013457,0.3872929,0.9635614,2.642998e-11,0.30688,0.2684278,0.83203,0.962564,0.04462682
Group,0.112085,0.8229713,0.4049291,0.8735814,3.087774e-07,0.743692,0.645808,0.6471211,0.902013,0.4547843
