In [15]:
#
# Import Libraries
#

import numpy as np
import os
import pandas as pd
import sys
from dotmap import DotMap

from plotly.subplots import make_subplots
import plotly.graph_objects as go
from plotly.colors import n_colors

# Folder with project-local helper modules; it is appended to the import path
# only once so that re-running this cell does not create duplicate entries.
# NOTE(review): presumably PlotEDA, myComBat and PCA_UMAP live here - confirm.
utilsPath = r'S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\utils'
already_registered = utilsPath in sys.path
if not already_registered:
    sys.path.append(utilsPath)


In [16]:
#
# Constants
#

# Short keys used throughout the notebook to address the four datasets.
modes = ['cp', 'cn', 'hp', 'hn']

# Root folder of the original files; each dataset has its own LOESS sub-folder.
bpath = r"S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Metabolomics\PESA\OriginalFiles"

# Mode key -> name of the sub-folder under <bpath>/LOESS/.
_mode_folders = {'cp': 'C18P', 'cn': 'C18N', 'hp': 'HILP', 'hn': 'HILN'}

# Mode key -> directory that contains shift_QC_cor.csv and shift_sample_cor.csv.
xm_path = DotMap({
    mode: os.path.join(bpath, 'LOESS', folder, 'statTarget/shiftCor/After_shiftCor/')
    for mode, folder in _mode_folders.items()
})

# Tab-separated metadata table (read later with sep='\t').
mdata_path = r'S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Metadata\PESA\WorkingFiles\main_metadata.tsv'

In [17]:
#
# Read QC and plot values distribution
#

# QC table of every mode; the first CSV column is discarded, leaving
# 'sample', 'batch' and the feature columns (both names are used below / later).
qc = DotMap({
    i: pd.read_csv(os.path.join(xm_path[i], 'shift_QC_cor.csv')).iloc[:, 1:]
    for i in modes
})

file = 'Plots/LOESS_QC.html'
# Create the output folder if needed: open() fails when 'Plots/' is missing.
os.makedirs(os.path.dirname(file), exist_ok=True)

# Opening once in 'w' mode replaces any previous report, so there is no need
# to delete the file first and re-open it in append mode for every figure.
with open(file, 'w') as f:
    for i in modes:
        # One (batch value, sub-frame) pair per batch.
        batches = list(qc[i].groupby('batch'))

        # One colour per batch, interpolated between the two RGB end points.
        colors = n_colors('rgb(5, 200, 200)', 'rgb(200, 10, 10)', len(batches), colortype='rgb')

        fig = make_subplots(rows=1, cols=2, subplot_titles=[f'QC batch distribution | Mode: {i}', 'All batches'])

        # Left panel: one half-violin per batch with all feature values pooled.
        # Columns 0 and 1 are 'sample' and 'batch', hence the 2: slice.
        # NOTE(review): values are divided by 1000 and a guide line is drawn at 1,
        # so presumably 1000 is the reference level - confirm.
        for (batch, batch_df), color in zip(batches, colors):
            fig.add_trace(go.Violin(
                x=batch_df.iloc[:, 2:].to_numpy().flatten()/1000,
                side='positive', line_color=color, points=False, width=3, name=batch
            ), row=1, col=1)

        # Right panel: the same distribution over all batches together.
        fig.add_trace(go.Violin(
            x=qc[i].iloc[:, 2:].to_numpy().flatten()/1000,
            side='positive', points=False, line_color='black', showlegend=False, name=''
        ), row=1, col=2)

        fig.add_vline(x=1, line_width=0.2, line_dash='dash')
        fig.update_xaxes(range=(0,2))

        f.write(fig.to_html(full_html=False, include_plotlyjs='cdn', default_height='50%', default_width='80%'))

In [18]:
#
# Read data per platform and adapt
#

# Corrected sample table of every mode.
xm = DotMap({
    mode: pd.read_csv(os.path.join(xm_path[mode], 'shift_sample_cor.csv'))
    for mode in modes
})

# Drop repeated value
xm.hn = xm.hn[xm.hn['sample'] != 'H_1506_3nrrr']

mdata = pd.read_csv(mdata_path, sep='\t')
# Indexed once here instead of on every loop iteration.
mdata_by_name = mdata.set_index('Name')

for mode in modes:
    # The metadata 'Name' is the second '_'-separated token of the sample label,
    # without one leading zero.
    sample_names = [label.split('_')[1] for label in xm[mode]['sample']]
    sample_names = [name[1:] if name.startswith('0') else name for name in sample_names]

    # Re-index the observations by their 'Seqn' identifier from the metadata.
    xm[mode].index = mdata_by_name.loc[sample_names]['Seqn']
    # Keep only feature columns; values are divided by 1000 as in the QC plots.
    xm[mode] = xm[mode].drop(['sample', 'class'], axis=1)/1000


# Change column names
# The workbook is read once; a list of sheet names yields a dict of DataFrames.
_sheet_of_mode = {'cp': 'C18P', 'cn': 'C18N', 'hp': 'HILP', 'hn': 'HILN'}
_f2i_sheets = pd.read_excel('../OriginalFiles/RBR_f2i.xlsx', sheet_name=list(_sheet_of_mode.values()))
f2i = DotMap({mode: _f2i_sheets[sheet] for mode, sheet in _sheet_of_mode.items()})

for mode in modes:
    # Translate feature names ('Name') into feature ids ('fid') ...
    xm[mode].columns = f2i[mode].set_index('Name').loc[xm[mode].columns]['fid']
    # ... and keep the annotation table indexed by 'fid', in the column order of xm.
    f2i[mode] = f2i[mode].set_index('fid').loc[xm[mode].columns]

In [19]:
#
# PCA Quality Control
#

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from plotly import graph_objects as go

file = 'Plots/PCA_QC.html'
# Create the output folder if needed: open() fails when 'Plots/' is missing.
os.makedirs(os.path.dirname(file), exist_ok=True)

# 'w' mode replaces any previous report; the file is opened once for all modes.
with open(file, 'w') as f:
    for i in modes:

        # QC values as a (feature x QC sample) table, scaled like the samples.
        qc_by_feature = qc[i].drop('batch', axis=1).set_index('sample').T/1000

        # Rename the QC features from 'Name' to 'fid' so they match xm[i].columns.
        qc_by_feature.index = f2i[i].reset_index(names='fid').set_index('Name').loc[qc_by_feature.index]['fid']

        # Stack observations and QC samples as rows, keeping only shared features.
        combined = xm[i].T.join(
            qc_by_feature,
            how='inner'
        ).T

        # log2 transform, then centre and scale every feature.
        scaled = pd.DataFrame(
            StandardScaler().fit_transform(np.log2(combined)),
            index=combined.index, columns=combined.columns
        )

        # Fit and project in one step.
        pca = PCA(n_components=2)
        scores = pd.DataFrame(
            pca.fit_transform(scaled.to_numpy()),
            index=scaled.index
        )

        fig = go.Figure()
        fig.add_trace(go.Scatter(
            x = scores.loc[xm[i].index, 0],
            y = scores.loc[xm[i].index, 1],
            mode='markers', marker=dict(size=3), name='Observations'
        ))

        fig.add_trace(go.Scatter(
            x = scores.loc[qc[i]['sample'], 0],
            y = scores.loc[qc[i]['sample'], 1],
            mode='markers', marker=dict(size=3), name='QC'
        ))

        # Axis titles carry the explained variance ratio of each component.
        fig.update_xaxes(title=f'PCA 1 ({round(pca.explained_variance_ratio_[0], 4)})')
        fig.update_yaxes(title=f'PCA 2 ({round(pca.explained_variance_ratio_[1], 4)})')
        fig.update_layout(width=700, title=f'{i}')

        f.write(fig.to_html(full_html=False, include_plotlyjs='cdn', default_height='50%', default_width='80%'))


In [20]:
#
# Missing values | They were imputed using KNN so it must be 0
#

# Total number of NaN cells in each per-mode matrix.
for mode in modes:
    n_missing = xm[mode].isna().to_numpy().sum()
    print(f"{mode}: {n_missing}")

cp: 0
cn: 0
hp: 0
hn: 0


In [21]:
#
# Number of observations & features per mode
#

# Rows are observations and columns are features of each per-mode matrix.
print('\n'.join(f'{mode} | N. observations {xm[mode].shape[0]}' for mode in modes))
print()
print('\n'.join(f'{mode} | N. features {xm[mode].shape[1]}' for mode in modes))

cp | N. observations 384
cn | N. observations 444
hp | N. observations 444
hn | N. observations 443

cp | N. features 482
cn | N. features 224
hp | N. features 539
hn | N. features 581


In [22]:
#
# What was lost in C18P?
#

# Cohort/Group sizes in the whole metadata table versus the observations
# that are present in the 'cp' matrix.
cohort_group = mdata.set_index('Seqn').loc[:, ['Cohort', 'Group']]

print(cohort_group.groupby(['Cohort', 'Group']).size())
print()
print(cohort_group.loc[xm.cp.index].groupby(['Cohort', 'Group']).size())

Cohort  Group
2       C        60
        D        60
3       C        60
        D        60
4       C        51
        D        51
5       C        51
        D        51
dtype: int64

Cohort  Group
2       C        60
        D        60
3       C        30
        D        30
4       C        51
        D        51
5       C        51
        D        51
dtype: int64


In [23]:
#
# Join the per-mode matrices
#

from functools import reduce

# Inner joins on the row index: only observations present in every mode remain.
# NOTE: this rebinds `xm` from the per-mode container to one joined table, so the
# cells above that index `xm` by mode must be re-run before this one is re-run.
per_mode_tables = [xm[mode] for mode in modes]
xm = reduce(lambda joined, nxt: joined.join(nxt, how='inner'), per_mode_tables)

In [24]:
# Hex colours available to the plots below (plotDataDist uses the first one).
palette = [
    '#636EFA', '#EF553B', '#00CC96', '#AB63FA',
    '#FFA15A', '#19D3F3', '#FF6692', '#B6E880',
]

Plot Data distribution per platform

In [25]:
#
# Plot data distribution per platform
#

from sklearn.preprocessing import StandardScaler

# Report written by plotDataDist; a previous version is deleted first because
# the figures are appended to the file one after another.
file = 'Plots/DataDistribution.html'
report_already_there = os.path.exists(file)
if report_already_there:
    os.remove(file)

def plotDataDist(f, i):
    """Append a three-panel histogram figure for the selected features to `file`.

    The panels show the pooled values of the selected columns of the global
    `xm` table: as they are ('LOESS'), after log2 ('LOESS+log2'), and after
    log2 followed by per-column standard scaling ('LOESS+log2+CenterScal').

    Parameters
    ----------
    f : column labels of `xm` to include (anything accepted by `xm.loc[:, f]`).
    i : figure title.

    Notes
    -----
    Reads the module-level names `xm`, `palette` and `file`; the figure is
    appended to `file` as an HTML fragment and nothing is returned.
    """
    fig = make_subplots(rows=1, cols=3, shared_yaxes=True, subplot_titles=['LOESS', 'LOESS+log2', 'LOESS+log2+CenterScal'])

    selected = xm.loc[:, f]
    selected_log2 = np.log2(selected)  # computed once, used by panels 2 and 3

    # Flattened values of each panel, in column order.
    panels = [
        selected.to_numpy().flatten(),
        selected_log2.to_numpy().flatten(),
        StandardScaler().fit_transform(selected_log2).flatten(),
    ]

    for col, values in enumerate(panels, start=1):
        fig.add_trace(go.Histogram(
            x=values,
            xbins={'size':0.1}, opacity=0.7, marker_color=palette[0],showlegend=False, histnorm='probability density'
        ), row=1, col=col)

    fig.update_xaxes(range=(-5,5))
    fig.update_layout(title=i)

    # The handle is named `out` so that it does not shadow the parameter `f`.
    with open(file, 'a') as out:
        out.write(fig.to_html(full_html=False, include_plotlyjs='cdn', default_height='50%', default_width='80%'))


# One figure per mode, restricted to that mode's features (index of f2i[mode]) ...
for mode in modes:
    plotDataDist(f2i[mode].index, mode)

# ... and one figure with every column of the joined table.
plotDataDist(xm.columns, 'All')

In [26]:
#
# Normalize data
# 

# log2 transform followed by per-column centring and scaling; the row and
# column labels of `xm` are carried over to the result.
xm_log2 = np.log2(xm)
xmn = pd.DataFrame(
    StandardScaler().fit_transform(xm_log2),
    index=xm.index,
    columns=xm.columns,
)

In [27]:
#
# Check Batch Effect by Cohort
#

from PlotEDA import PlotEDA
# Start from a clean report file.
file = 'Plots/CohortBatchEffect.html'
if os.path.exists(file):
    os.remove(file)

# Same plot for two metadata columns; a fresh PlotEDA object is built for each
# one, exactly as before, and every call gets its own argument objects.
for grouping in ('Cohort', 'Group'):
    plotEDA = PlotEDA(xmn, mdata, file=file)
    plotEDA.plotByGroup(grouping, vl1=[0], vl2=[1], r1=(-5,5), r2=(-5,5), binsize=0.01, plotN=True)

In [28]:
#
# Adjust Cohort Batch Effect
#

from myComBat import myComBat

# Metadata columns handed to myComBat next to the batch column 'Cohort'.
# NOTE(review): presumably categorical / continuous covariates - confirm in myComBat.
catVars = ['Group', 'Smoke']
conVars = [
    'Calcium_Score', 'HDL', 'LDL', 'Total_Cholesterol',
    'Ox-LDL', 'Lipoprotein(a)', 'CRP', 'Plaque_thickness',
]

# os.path.join with a single component returns it unchanged, so the literal is used.
xmnb = myComBat(xmn, mdata, 'Cohort', catVars, conVars, Rpath='myRData')

Loading required package: mgcv
Loading required package: nlme
This is mgcv 1.8-41. For overview type 'help("mgcv-package")'.
Loading required package: genefilter
Loading required package: BiocParallel
1: package 'sva' was built under R version 4.2.1 
2: package 'mgcv' was built under R version 4.2.2 
3: package 'nlme' was built under R version 4.2.2 
4: package 'genefilter' was built under R version 4.2.2 
5: package 'BiocParallel' was built under R version 4.2.2 
Found4batches
Adjusting for12covariate(s) or covariate level(s)
Standardizing Data across genes
Fitting L/S model and finding priors
Finding nonparametric adjustments
Adjusting the Data




In [29]:
#
# Check Batch Effect by Cohort after 
#

from PlotEDA import PlotEDA
# Start from a clean report file.
file = 'Plots/CohortBatchEffectComBat.html'
if os.path.exists(file):
    os.remove(file)

# Same plots as for `xmn`, now on the ComBat output `xmnb`; a fresh PlotEDA
# object is built for each metadata column, exactly as before.
for grouping in ('Cohort', 'Group'):
    plotEDA = PlotEDA(xmnb, mdata, file=file)
    plotEDA.plotByGroup(grouping, vl1=[0], vl2=[1], r1=(-5,5), r2=(-5,5), binsize=0.01, plotN=True)

In [30]:
#
# Write Xm normalised
#

# Save the myComBat output as a tab-separated file; the path is relative,
# so it is written to the notebook's current working directory.
xmnb.to_csv('Xm_norm.tsv', sep='\t')

In [31]:
#
# Dimensionality Reduction
#

from PCA_UMAP import PCA_UMAP, PCA_Var

# Start from a clean report file.
file = 'Plots/PCA.html'
if os.path.exists(file):
    os.remove(file)

# First the matrix before batch correction (default title), then the ComBat
# output with a title suffix; each matrix is coloured by the same three columns.
for matrix, extra_kwargs in ((xmn, {}), (xmnb, {'titleLabel': '- Batch Corrected'})):
    pcaumap = PCA_UMAP(matrix, mdata, file=file)
    for column in ('Cohort', 'Group', 'Global_Batch'):
        pcaumap.plotReduction(column, pcacomp=[0,1], **extra_kwargs)

In [32]:
# Table for the first 10 components of `xmn` (before batch correction); the
# cell's last expression is displayed as its output.
factor_columns = catVars + ['Cohort', 'Global_Batch']
PCA_Var(xmn, mdata, conVars, factor_columns, n_comp=10)

Unnamed: 0,%Var PCA,Calcium_Score,HDL,LDL,Total_Cholesterol,Ox-LDL,Lipoprotein(a),CRP,Plaque_thickness,Group,Smoke,Cohort,Global_Batch
1,18.94805,0.000487,0.947608,0.095744,0.05194668,0.014225,0.175684,0.442047,0.453544,0.771446,0.000363,3.634427e-35,6.045926e-57
2,10.814844,0.154564,0.225289,0.547787,0.7030451,0.918278,0.289946,0.078505,0.461123,0.737581,0.789435,8.799230999999999e-21,1.257853e-28
3,8.013299,0.220777,0.822286,0.86876,0.2737797,0.016182,0.054504,0.238337,0.319419,0.579951,0.457446,5.443139e-09,2.024975e-31
4,4.228183,0.203252,0.853574,0.30323,0.1308258,0.27102,0.347748,0.996057,0.695923,0.941174,0.105424,0.0006762682,4.341786e-09
5,2.945379,0.594556,0.375965,0.012316,0.07531185,0.164919,0.975956,0.007982,0.460445,0.463339,0.000719,0.004512441,4.823968e-12
6,2.545051,0.195102,0.031316,0.109794,0.02192162,0.020028,0.921231,0.006244,0.057099,0.111491,0.02416,2.0588079999999997e-19,5.132410999999999e-19
7,2.267414,0.546839,0.002752,6e-06,9.218464e-08,0.010053,0.121086,0.938044,0.72067,0.141667,0.005054,9.098985e-12,2.185473e-22
8,2.019447,0.750676,0.249598,0.993795,0.2279789,0.001699,0.77727,0.12504,0.906429,0.304174,0.800952,3.022956e-16,1.162664e-27
9,1.643907,0.544214,0.003736,0.047753,0.0004536424,0.001507,0.696814,0.87604,0.920741,0.584738,0.004807,0.006413428,2.446406e-15
10,1.582284,0.034947,0.693296,0.035358,0.9978111,0.362421,0.082759,0.122714,0.335164,0.239952,0.158227,2.352222e-10,4.205377e-24


In [33]:
from PCA_UMAP import PCA_Var
# Same table for the ComBat output `xmnb`, transposed for display.
factor_columns_corrected = catVars + ['Cohort', 'Global_Batch']
PCA_Var(xmnb, mdata, conVars, factor_columns_corrected, n_comp=10).T

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
%Var PCA,16.07774,9.574435,6.85971,4.858675,3.110627,2.637547,2.156443,1.877944,1.758652,1.690287
Calcium_Score,0.6514403,0.382271,0.945677,0.727433,0.958318,0.8448039,0.5224111,0.6982222,0.05698165,0.7382257
HDL,0.3915076,0.5442188,0.43941,0.406529,0.464553,0.1111152,0.154475,0.001902807,0.0004612747,0.1552116
LDL,0.2972413,0.7080762,0.59015,0.344007,8e-06,0.04891596,0.1577936,0.1867115,0.0006721212,0.8729262
Total_Cholesterol,0.2226638,0.2860956,0.061977,0.172931,5e-06,0.001959799,0.4643226,0.0004084374,0.1058784,0.6465067
Ox-LDL,0.1137094,0.08483475,0.005335,0.485734,0.000834,0.1412619,0.4710056,0.0005056097,0.3943946,0.2602866
Lipoprotein(a),0.7300236,0.06395253,0.447477,0.188648,0.489785,0.2626993,0.8727635,0.9307521,0.1340043,0.2829695
CRP,0.7190947,0.03052226,0.627532,0.646929,0.00128,0.5166661,0.270868,0.5149672,0.7698739,0.7325355
Plaque_thickness,0.9436547,0.5040391,0.698078,0.442831,0.246617,0.8247717,0.6940899,0.9828264,0.02606999,0.6318636
Group,0.7136674,0.4029861,0.770794,0.618094,0.384392,0.3455566,0.5301467,0.4709976,0.004107001,0.8242696
