In [1]:
#
# Import Libraries
#

import numpy as np
import os
import pandas as pd
import sys
from dotmap import DotMap

from plotly.subplots import make_subplots
import plotly.graph_objects as go
from plotly.colors import n_colors

# Folder with the project-local helper modules; presumably where PlotEDA,
# myComBat and PCA_UMAP (imported further down) live -- TODO confirm.
# NOTE(review): absolute network-drive path; the notebook only runs where S: is mounted.
utilsPath = r'S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\utils'
# Register the folder once so re-running the cell does not duplicate the entry.
if sys.path.count(utilsPath) == 0:
    sys.path.append(utilsPath)


In [2]:
#
# Constants
#

# Short keys of the four acquisition modes handled throughout the notebook.
modes = ['cp', 'cn', 'hp', 'hn']

# Root folder of the original metabolomics files.
bpath = r"S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Metabolomics\PESA\OriginalFiles"

# Per-mode folder holding the shift-corrected CSV files read below.
# Each mode key maps to its sub-folder name under LOESS/.
xm_path = DotMap({
    mode: os.path.join(bpath, 'LOESS', folder, 'statTarget/shiftCor/After_shiftCor/')
    for mode, folder in {'cp': 'C18P', 'cn': 'C18N', 'hp': 'HILP', 'hn': 'HILN'}.items()
})

# Tab-separated metadata table (read with sep='\t' further down).
mdata_path = r'S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Metadata\PESA\WorkingFiles\main_metadata.tsv'

In [3]:
#
# Read QC and plot values distribution
#

# QC table of every mode; the first CSV column is dropped on load.
qc = DotMap({
    mode: pd.read_csv(os.path.join(xm_path[mode], 'shift_QC_cor.csv')).iloc[:, 1:]
    for mode in modes
})

# One HTML report collects the figures of all modes; start from a clean file.
file = 'Plots/LOESS_QC.html'
os.makedirs(os.path.dirname(file), exist_ok=True)  # a fresh checkout has no Plots/ folder
if os.path.exists(file):
    os.remove(file)

for i in modes:
    # (batch label, batch sub-frame) pairs
    batches = list(qc[i].groupby('batch'))

    # n_colors interpolates with a step of 1/(n-1), so a single batch would
    # divide by zero; fall back to the start colour in that case.
    if len(batches) > 1:
        colors = n_colors('rgb(5, 200, 200)', 'rgb(200, 10, 10)', len(batches), colortype='rgb')
    else:
        colors = ['rgb(5, 200, 200)']

    fig = make_subplots(rows=1, cols=2, subplot_titles=[f'QC batch distribution | Mode: {i}', 'All batches'])

    # Left panel: one half-violin per batch.
    # iloc[:, 2:] skips the first two columns (assumed to be the sample/batch
    # identifiers -- TODO confirm); values are divided by 1000 before plotting.
    for (batch, batch_df), color in zip(batches, colors):
        fig.add_trace(go.Violin(
            x=batch_df.iloc[:, 2:].to_numpy().flatten()/1000,
            side='positive', line_color=color, points=False, width=3, name=batch
        ), row=1, col=1)

    # Right panel: all batches pooled.
    fig.add_trace(go.Violin(
        x=qc[i].iloc[:, 2:].to_numpy().flatten()/1000,
        side='positive', points=False, line_color='black', showlegend=False, name=''
    ), row=1, col=2)

    fig.add_vline(x=1, line_width=0.2, line_dash='dash')
    fig.update_xaxes(range=(0,2))

    # Append this mode's figure as an HTML fragment to the shared report.
    with open(file, 'a') as f:
        f.write(fig.to_html(full_html=False, include_plotlyjs='cdn', default_height='50%', default_width='80%'))

In [4]:
#
# Read data per platform and adapt
#

# Sample intensity table of every mode (one DataFrame per mode key).
xm = DotMap({
    mode: pd.read_csv(os.path.join(xm_path[mode], 'shift_sample_cor.csv'))
    for mode in modes
})

# Drop repeated value
xm.hn = xm.hn[xm.hn['sample'] != 'H_1506_3nrrr']

mdata = pd.read_csv(mdata_path, sep='\t')

# Name -> Seqn lookup, built once instead of once per mode.
name2seqn = mdata.set_index('Name')['Seqn']

for i in modes:
    # The metadata 'Name' is the second '_'-separated token of the sample id,
    # with a single leading zero removed (as the original code did).
    names = [sample.split('_')[1] for sample in xm[i]['sample']]
    names = [name[1:] if name.startswith('0') else name for name in names]
    xm[i].index = name2seqn.loc[names]
    # Keep only the feature columns; intensities are divided by 1000.
    xm[i] = xm[i].drop(['sample', 'class'], axis=1)/1000


# Change column names
# Feature-name -> feature-id ('fid') tables, one workbook sheet per mode.
# The workbook is parsed once; passing a list of sheet names returns a dict of frames.
mode2sheet = {'cp': 'C18P', 'cn': 'C18N', 'hp': 'HILP', 'hn': 'HILN'}
f2i_sheets = pd.read_excel('../OriginalFiles/RBR_f2i.xlsx', sheet_name=list(mode2sheet.values()))
f2i = DotMap({mode: f2i_sheets[sheet] for mode, sheet in mode2sheet.items()})

for i in modes:
    # Replace the original feature names by their 'fid' ...
    xm[i].columns = f2i[i].set_index('Name').loc[
        xm[i].columns
    ]['fid']
    # ... and re-index the lookup table by 'fid', in the column order of xm[i].
    f2i[i] = f2i[i].set_index('fid').loc[xm[i].columns]

In [82]:
#
# PCA Quality Control
#

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from plotly import graph_objects as go

# NOTE: this cell indexes `xm` per mode, so it must run before the cell that
# merges the per-mode matrices into a single DataFrame.
file = 'Plots/PCA_QC.html'
os.makedirs(os.path.dirname(file), exist_ok=True)  # a fresh checkout has no Plots/ folder
if os.path.exists(file):
    os.remove(file)

for i in modes:

    # QC intensities transposed to features x QC injections, divided by 1000 like xm.
    qc_by_feature = qc[i].drop('batch', axis=1).set_index('sample').T/1000

    # Translate the QC feature names to 'fid' so they match the columns of xm[i].
    qc_by_feature.index = f2i[i].reset_index(names='fid').set_index('Name').loc[qc_by_feature.index]['fid']

    # Rows: study observations followed by QC injections; columns: shared features.
    combined = xm[i].T.join(
        qc_by_feature,
        how='inner'
    ).T

    # log2 + per-feature standardisation before PCA.
    scaled = pd.DataFrame(
        StandardScaler().fit_transform(np.log2(combined)),
        index=combined.index, columns=combined.columns
    )

    pca = PCA(n_components=2)
    pca.fit(scaled.to_numpy())
    scores = pd.DataFrame(
        pca.transform(scaled.to_numpy()),
        index=scaled.index
    )

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x = scores.loc[xm[i].index, 0],
        y = scores.loc[xm[i].index, 1],
        mode='markers', marker=dict(size=3), name='Observations'
    ))

    fig.add_trace(go.Scatter(
        x = scores.loc[qc[i]['sample'], 0],
        y = scores.loc[qc[i]['sample'], 1],
        mode='markers', marker=dict(size=3), name='QC'
    ))

    fig.update_xaxes(title=f'PCA 1 ({round(pca.explained_variance_ratio_[0], 4)})')
    fig.update_yaxes(title=f'PCA 2 ({round(pca.explained_variance_ratio_[1], 4)})')
    fig.update_layout(width=700, title=f'{i}')

    # Append this mode's figure as an HTML fragment to the shared report.
    with open(file, 'a') as f:
        f.write(fig.to_html(full_html=False, include_plotlyjs='cdn', default_height='50%', default_width='80%'))


In [5]:
#
# Missing values | They were imputed using KNN so it must be 0
#

# Total number of NaN cells per mode.
for mode in modes:
    n_missing = xm[mode].isna().sum().sum()
    print(f"{mode}: {n_missing}")

cp: 0
cn: 0
hp: 0
hn: 0


In [6]:
#
# Number of observations & features per mode
#

# Rows are observations and columns are features in every per-mode matrix.
n_observations = {mode: xm[mode].shape[0] for mode in modes}
n_features = {mode: xm[mode].shape[1] for mode in modes}

for mode, count in n_observations.items():
    print(f'{mode} | N. observations {count}')
print()
for mode, count in n_features.items():
    print(f'{mode} | N. features {count}')

cp | N. observations 384
cn | N. observations 444
hp | N. observations 444
hn | N. observations 443

cp | N. features 482
cn | N. features 224
hp | N. features 539
hn | N. features 581


In [7]:
#
# What was lost in C18P?
#

# Cohort/Group labels of every subject, indexed by Seqn.
cohort_group = mdata.set_index('Seqn').loc[:, ['Cohort', 'Group']]
group_keys = ['Cohort', 'Group']

# Counts over the full metadata ...
print(cohort_group.groupby(group_keys).size())
print()
# ... versus counts over the observations present in the C18P matrix.
print(cohort_group.loc[xm.cp.index].groupby(group_keys).size())

Cohort  Group
2       C        60
        D        60
3       C        60
        D        60
4       C        51
        D        51
5       C        51
        D        51
dtype: int64

Cohort  Group
2       C        60
        D        60
3       C        30
        D        30
4       C        51
        D        51
5       C        51
        D        51
dtype: int64


In [8]:
#
# Join the per-mode matrices into a single table
#

from functools import reduce


def _inner_join(left, right):
    """Join two frames on their index, keeping only rows present in both."""
    return left.join(right, how='inner')


# From here on `xm` is one DataFrame (no longer a per-mode DotMap) holding only
# the observations that are present in every mode.
xm = reduce(_inner_join, [xm[i] for i in modes])

In [9]:
# Hex colours used by the plots below (plotDataDist uses palette[0]).
# NOTE(review): appears to match the first eight colours of Plotly's default
# qualitative colourway -- confirm before relying on that.
palette = ['#636EFA', '#EF553B', '#00CC96', '#AB63FA', '#FFA15A', '#19D3F3', '#FF6692', '#B6E880']

Plot data distribution per platform

In [10]:
#
# Plot data distribution per platform
#

from sklearn.preprocessing import StandardScaler

# Report file that plotDataDist appends to; start from a clean file.
file = 'Plots/DataDistribution.html'
os.makedirs(os.path.dirname(file), exist_ok=True)  # a fresh checkout has no Plots/ folder
if os.path.exists(file):
    os.remove(file)

def plotDataDist(f, i):
    """Append a three-panel histogram of the selected features to the report.

    The panels show the pooled values of ``xm.loc[:, f]`` as they are, after
    log2, and after log2 followed by ``StandardScaler`` (per-column centring
    and scaling).

    Parameters
    ----------
    f : column labels of the module-level ``xm`` to include.
    i : figure title (the mode key, or 'All').

    Relies on the module-level names ``xm``, ``palette`` and ``file``; the
    figure is appended to ``file`` as an HTML fragment and nothing is returned.
    """
    values = xm.loc[:, f]
    log_values = np.log2(values)

    # One flattened value vector per panel, in subplot order.
    panels = [
        values.to_numpy().flatten(),
        log_values.to_numpy().flatten(),
        StandardScaler().fit_transform(log_values).flatten(),
    ]

    fig = make_subplots(rows=1, cols=3, shared_yaxes=True, subplot_titles=['LOESS', 'LOESS+log2', 'LOESS+log2+CenterScal'])

    for col, panel_values in enumerate(panels, start=1):
        fig.add_trace(go.Histogram(
            x=panel_values,
            xbins={'size':0.1}, opacity=0.7, marker_color=palette[0], showlegend=False, histnorm='probability density'
        ), row=1, col=col)

    fig.update_xaxes(range=(-5,5))
    fig.update_layout(title=i)
    #fig.show()
    # The handle is deliberately not called `f`, which would shadow the feature parameter.
    with open(file, 'a') as out:
        out.write(fig.to_html(full_html=False, include_plotlyjs='cdn', default_height='50%', default_width='80%'))


# One figure per mode (using that mode's feature ids), then one over all columns.
feature_sets = [(f2i[mode].index, mode) for mode in modes]
feature_sets.append((xm.columns, 'All'))

for features, label in feature_sets:
    plotDataDist(features, label)

In [11]:
#
# Normalize data
# 

# log2-transform, then centre and scale every feature column.
# fit_transform returns a bare array, so the labels of xm are re-attached.
xm_log2 = np.log2(xm)
xm_scaled = StandardScaler().fit_transform(xm_log2)
xmn = pd.DataFrame(xm_scaled, index=xm.index, columns=xm.columns)

In [12]:
#
# Check Batch Effect by Cohort
#

from PlotEDA import PlotEDA
# Report for the normalised data before batch correction; start from a clean file.
file = 'Plots/CohortBatchEffect.html'
if os.path.exists(file):
    os.remove(file)

# Same plot settings for both groupings; a fresh PlotEDA instance is built for each.
for grouping in ('Cohort', 'Group'):
    plotEDA = PlotEDA(xmn, mdata, file=file)
    plotEDA.plotByGroup(grouping, vl1=[0], vl2=[1], r1=(-5,5), r2=(-5,5), binsize=0.01, plotN=True)

In [13]:
#
# Adjust Cohort Batch Effect
#

from myComBat import myComBat

# Categorical and continuous metadata columns handed to myComBat alongside the
# batch column 'Cohort' (presumably as covariates to preserve -- TODO confirm
# in myComBat). Both lists are reused by the PCA_Var calls further down.
catVars = ['Group', 'Smoke']
conVars = [
    'Calcium_Score', 'HDL', 'LDL', 'Total_Cholesterol',
    'Ox-LDL', 'Lipoprotein(a)', 'CRP', 'Plaque_thickness',
]

r_data_dir = os.path.join('myRData')
xmnb = myComBat(xmn, mdata, 'Cohort', catVars, conVars, Rpath=r_data_dir)

Loading required package: mgcv
Loading required package: nlme
This is mgcv 1.8-41. For overview type 'help("mgcv-package")'.
Loading required package: genefilter
Loading required package: BiocParallel
1: package 'sva' was built under R version 4.2.1 
2: package 'mgcv' was built under R version 4.2.2 
3: package 'nlme' was built under R version 4.2.2 
4: package 'genefilter' was built under R version 4.2.2 
5: package 'BiocParallel' was built under R version 4.2.2 
Found4batches
Adjusting for12covariate(s) or covariate level(s)
Standardizing Data across genes
Fitting L/S model and finding priors
Finding nonparametric adjustments
Adjusting the Data




In [14]:
#
# Check Batch Effect by Cohort after ComBat correction
#

from PlotEDA import PlotEDA
# Same plots as for the uncorrected data, now on the myComBat output (xmnb).
file = 'Plots/CohortBatchEffectComBat.html'
if os.path.exists(file):
    os.remove(file)

plot_options = dict(vl1=[0], vl2=[1], r1=(-5,5), r2=(-5,5), binsize=0.01, plotN=True)
# A fresh PlotEDA instance is built for each grouping column.
for grouping in ('Cohort', 'Group'):
    plotEDA = PlotEDA(xmnb, mdata, file=file)
    plotEDA.plotByGroup(grouping, **plot_options)

In [15]:
#
# Write Xm normalised
#

# Tab-separated dump of the batch-corrected matrix, written to the working directory.
xm_norm_file = 'Xm_norm.tsv'
xmnb.to_csv(xm_norm_file, sep='\t')

In [16]:
#
# Dimensionality Reduction
#

from PCA_UMAP import PCA_UMAP, PCA_Var

# Both the uncorrected and the batch-corrected plots go to the same report.
file = 'Plots/PCA.html'
if os.path.exists(file):
    os.remove(file)

# Metadata columns to plot, in report order.
reduction_vars = ('Cohort', 'Group', 'Global_Batch')

# Normalised data before batch correction.
pcaumap = PCA_UMAP(xmn, mdata, file=file)
for var in reduction_vars:
    pcaumap.plotReduction(var, pcacomp=[0,1])

# Same plots after batch correction, flagged in the title.
pcaumap = PCA_UMAP(xmnb, mdata, file=file)
for var in reduction_vars:
    pcaumap.plotReduction(var, pcacomp=[0,1], titleLabel='- Batch Corrected')

In [17]:
# PCA_Var on the normalised matrix before batch correction, first 10 components.
# The displayed table has a '%Var PCA' column plus one column per variable
# passed here (conVars, then catVars + Cohort and Global_Batch).
# NOTE(review): the per-variable values look like association p-values per
# component -- confirm in PCA_UMAP.PCA_Var.
PCA_Var(xmn, mdata, conVars, catVars+['Cohort', 'Global_Batch'], n_comp=10)

Unnamed: 0,%Var PCA,Calcium_Score,HDL,LDL,Total_Cholesterol,Ox-LDL,Lipoprotein(a),CRP,Plaque_thickness,Group,Smoke,Cohort,Global_Batch
1,18.94805,0.0005,0.9476,0.0957,0.0519,0.0142,0.1757,0.442,0.4535,0.7714,0.0004,0.0,0.0
2,10.814844,0.1546,0.2253,0.5478,0.703,0.9183,0.2899,0.0785,0.4611,0.7376,0.7894,0.0,0.0
3,8.013299,0.2208,0.8223,0.8688,0.2738,0.0162,0.0545,0.2383,0.3194,0.58,0.4574,0.0,0.0
4,4.228183,0.2033,0.8536,0.3032,0.1308,0.271,0.3477,0.9961,0.6959,0.9412,0.1054,0.0007,0.0
5,2.945379,0.5945,0.376,0.0123,0.0753,0.1649,0.976,0.008,0.4604,0.4633,0.0007,0.0045,0.0
6,2.545051,0.1951,0.0313,0.1099,0.0219,0.02,0.9213,0.0062,0.0571,0.1115,0.0242,0.0,0.0
7,2.267414,0.5469,0.0027,0.0,0.0,0.0101,0.1211,0.9378,0.7204,0.1416,0.005,0.0,0.0
8,2.019448,0.7504,0.2487,0.9924,0.2289,0.0017,0.7771,0.125,0.906,0.3041,0.8006,0.0,0.0
9,1.643912,0.5451,0.0036,0.0472,0.0004,0.0015,0.698,0.8741,0.9182,0.5844,0.0049,0.0065,0.0
10,1.582283,0.035,0.6916,0.0357,0.9944,0.3619,0.0823,0.1226,0.3364,0.2404,0.1583,0.0,0.0


In [18]:
# Same PCA_Var summary on the myComBat output (xmnb), for comparison with the
# table computed on xmn.
# NOTE(review): in the stored output the Cohort column is no longer near zero
# while Global_Batch still is -- interpretation depends on what PCA_Var
# reports; confirm in PCA_UMAP.PCA_Var.
PCA_Var(xmnb, mdata, conVars, catVars+['Cohort', 'Global_Batch'], n_comp=10)

Unnamed: 0,%Var PCA,Calcium_Score,HDL,LDL,Total_Cholesterol,Ox-LDL,Lipoprotein(a),CRP,Plaque_thickness,Group,Smoke,Cohort,Global_Batch
1,16.077739,0.6514,0.3915,0.2972,0.2227,0.1137,0.73,0.7191,0.9437,0.7137,0.0119,0.8839,0.0
2,9.574435,0.3823,0.5442,0.7081,0.2861,0.0848,0.064,0.0305,0.504,0.403,0.7146,0.9883,0.0
3,6.85971,0.9457,0.4394,0.5901,0.062,0.0053,0.4475,0.6275,0.6981,0.7708,0.6469,0.9205,0.0007
4,4.858675,0.7274,0.4065,0.344,0.1729,0.4857,0.1886,0.6469,0.4428,0.6181,0.0145,0.9884,0.0017
5,3.110627,0.9583,0.4645,0.0,0.0,0.0008,0.4898,0.0013,0.2466,0.3844,0.0774,0.7523,0.0
6,2.637547,0.8447,0.1112,0.0489,0.002,0.1413,0.2627,0.5167,0.825,0.3455,0.0255,0.9855,0.0
7,2.156443,0.5224,0.1544,0.1576,0.464,0.4712,0.8727,0.2708,0.6938,0.53,0.3062,0.9724,0.0
8,1.877945,0.6991,0.0019,0.1874,0.0004,0.0005,0.9297,0.5152,0.9819,0.4704,0.0506,0.9023,0.0
9,1.758656,0.0568,0.0005,0.0007,0.107,0.3924,0.1335,0.7685,0.0259,0.0041,0.0018,0.4534,0.0
10,1.690301,0.7387,0.1554,0.8687,0.652,0.2625,0.2819,0.7303,0.6339,0.8229,0.1591,0.911,0.0
