In [1]:
#
# Import Libraries
#

import numpy as np
import os
import pandas as pd
import sys
from dotmap import DotMap

from plotly.subplots import make_subplots
import plotly.graph_objects as go
from plotly.colors import n_colors

# Make the project-local helper modules importable. PlotEDA and PCA_UMAP are
# imported further down and are presumably located in this folder (verify).
utilsPath = r'S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\utils'
if utilsPath not in sys.path:
    sys.path.append(utilsPath)

In [2]:
#
# Constants
#

# Keys used for the two data sets throughout the notebook
# ('cp' is read from the C18P sheets, 'cn' from the C18N sheets).
modes = ['cp', 'cn']

# Workbook with the intensity tables (sheets C18P, C18N, C18P_QC, C18N_QC)
xm_path = r"S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Metabolomics\ALDH4\OriginalFiles\RBR_Liver_Antibodies.xlsx"
# Workbook with the f2i tables (sheets C18P, C18N)
f2i_path = r"S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Metabolomics\ALDH4\OriginalFiles\f2i.xlsx"
# Tab-separated sample metadata
mdata_path = r"S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Metadata\ALDH4\WorkingFiles\main_metadata.tsv"
# Output folder for generated working files (f2i.tsv is written here)
w_path = r"S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Metabolomics\ALDH4\WorkingFiles"

In [3]:
#
# Read metadata
#

# Tab-separated sample sheet; its Group and Seqn columns are used below to
# map each group to the list of its sample IDs.
mdata = pd.read_csv(mdata_path, sep='\t')

In [4]:
#
# Generate f2i.tsv
#

# Read the C18P and C18N sheets of the f2i workbook, stack them
# (C18P rows first, then C18N) and save the result as one TSV file.
f2i_sheets = [
    pd.read_excel(f2i_path, sheet_name=sheet)
    for sheet in ('C18P', 'C18N')
]
f2i = pd.concat(f2i_sheets)

f2i_out = os.path.join(w_path, 'f2i.tsv')
f2i.to_csv(f2i_out, sep='\t', index=False)

In [5]:
#
# Read the sample and QC intensity tables (indexed by fid, then transposed)
#

def _read_matrix(sheet):
    """Read one sheet of the intensity workbook, index it by 'fid' and transpose it.

    After the transpose the 'fid' values are the column labels.
    """
    return pd.read_excel(xm_path, sheet_name=sheet).set_index('fid').T


# Sample matrices, one per mode
xm = DotMap({'cp': _read_matrix('C18P'), 'cn': _read_matrix('C18N')})

# Quality-control matrices, one per mode
qc = DotMap({'cp': _read_matrix('C18P_QC'), 'cn': _read_matrix('C18N_QC')})

In [6]:
#
# Remove from xm features with more than 1 missing value in any group
# In QC, remove if it has more than 1 missing value

thr = 1
# Series indexed by group name; each value is the list of Seqn IDs of that group
g2sn = mdata.groupby('Group').agg(list).Seqn

xmf = DotMap()
qcf = DotMap()

for mode in modes:
    samples = xm[mode]
    controls = qc[mode]

    print('Samples:')
    print(f'Features prior to filtration in {mode}: {samples.shape[1]}')
    # One boolean vector per group: True where a feature has more than `thr`
    # missing values among the samples of that group.
    sparse_by_group = [samples.loc[seqns].isna().sum() > thr for seqns in g2sn]
    # A feature is dropped as soon as it is too sparse in any single group.
    sparse_in_any_group = np.logical_or.reduce(sparse_by_group)
    xmf[mode] = samples.loc[:, ~sparse_in_any_group]
    print(f'Features after to filtration in {mode}: {xmf[mode].shape[1]}')

    print('QC:')
    print(f'Features prior to filtration in {mode}: {controls.shape[1]}')
    # QC has no groups: keep features with at most one missing value overall.
    qc_missing = controls.isna().sum()
    qcf[mode] = controls.loc[:, qc_missing <= 1]
    print(f'Features after filtration in {mode}: {qcf[mode].shape[1]}')
    print()

Samples:
Features prior to filtration in cp: 3409
Features after to filtration in cp: 2226
QC:
Features prior to filtration in cp: 3409
Features after filtration in cp: 2884

Samples:
Features prior to filtration in cn: 1185
Features after to filtration in cn: 709
QC:
Features prior to filtration in cn: 1185
Features after filtration in cn: 1096



In [30]:
# The normalization that gives the best results subtracts from each sample its own median, computed over all features

In [31]:
#
# Median normalization: divide every row (sample) by its median intensity.
# The log2 is taken in a later cell, so this amounts to subtracting
# log2(median) from the log-intensities of each sample.
#

xmfr = DotMap()
qcr = DotMap()
for i in modes:
    # Features that define the per-row median. The same set is used for the
    # samples and for the QC so both are normalized consistently (previously
    # the samples used all features while the QC used only the complete ones).
    # ref_features = xmf[i].columns[~xmf[i].isna().any()]  # Common
    ref_features = xmf[i].columns  # All

    # median() skips missing values by default
    median = xmf[i].loc[:, ref_features].median(axis=1)
    # axis=0 aligns the medians with the row index, i.e. one divisor per sample
    xmfr[i] = xmf[i].div(median, axis=0)

    # Apply the ratio to the quality control using the same features used
    # with the samples; every QC column (filtered or not) is rescaled.
    median_qc = qc[i].loc[:, ref_features].median(axis=1)
    qcr[i] = qc[i].div(median_qc, axis=0)

In [28]:
#
# Apply MSTUS using features with no missing values
# Apply MSTUS to QC using the same features

#i = modes[0]

# xmfr = DotMap()
# qcr = DotMap()
# for i in modes:
#     mstus = xmf[i].loc[:, ~xmf[i].isna().any()].sum(axis=1) # Common
#     # mstus = xmf[i].sum(axis=1) # All
#     xmfr[i] = pd.DataFrame(index=xmf[i].index, columns=xmf[i].columns)
#     for j in range(xmfr[i].shape[1]):
#         col = xmfr[i].columns[j]
#         xmfr[i][col] = xmf[i].loc[:, col]/mstus

#     # apply ratio to quality control using the same features used with samples
#     mstusqc = qc[i].loc[:, xmf[i].columns[~xmf[i].isna().any()]].sum(axis=1)
#     qcr[i]  = pd.DataFrame(index=qc[i].index, columns=qc[i].columns)
#     for j in range(qcr[i].shape[1]):
#         col = qcr[i].columns[j]
#         qcr[i][col] = qc[i].loc[:, col]/mstusqc

In [8]:
#
# Check MSTUS in QC
#

In [32]:
# Diagnostic plots (box, violin and density) of the log2 intensities before
# and after normalization. One HTML report is written per entry of `files`:
# the first for the samples, the second for the QC.
files = ['Plots/MedianAll_Samples.html', 'Plots/MedianAll_QC.html']
xmdf = [xmf, xmfr]
qcdf = [qc, qcr]

# Subplot titles in row-major order: one row per mode, raw vs. normalized.
# The second column shows the median-normalized data (the MSTUS cell is
# disabled), so it is labelled accordingly.
titles = ['C18P | Log', 'C18P | Log + Median', 'C18N | Log', 'C18N | Log + Median']

for file, dflist in zip(files, (xmdf, qcdf)):

    fig0 = make_subplots(rows=2, cols=2, subplot_titles=titles, vertical_spacing=0.1)
    fig1 = make_subplots(rows=2, cols=2, subplot_titles=titles, vertical_spacing=0.1)
    fig2 = make_subplots(rows=2, cols=2, subplot_titles=titles, vertical_spacing=0.1)

    for nr, i in enumerate(modes):
        for nc, df in enumerate(dflist):
            # One colour per row of the matrix, interpolated between the two end colours
            colors = n_colors('rgb(5, 200, 200)', 'rgb(200, 10, 10)', df[i].shape[0], colortype='rgb')
            for j, c in zip(df[i].index, colors):
                # log2 intensities of one row, shared by the three traces
                log_values = np.log2(df[i].loc[j].to_numpy())

                fig0.add_trace(go.Box(
                    y=log_values,
                    name=j,
                    showlegend=False, marker_color=c
                ), row=nr+1, col=nc+1)

                fig1.add_trace(go.Violin(
                    y=log_values,
                    name=j, box_visible=True, meanline_visible=False,
                    line_color=c, showlegend=False, side=None
                ), row=nr+1, col=nc+1)

                # Horizontal half-violin used as a density curve
                fig2.add_trace(go.Violin(
                    x=log_values,
                    side='positive', line_color=c, points=False, width=4, name=j, showlegend=False
                ), row=nr+1, col=nc+1)

    fig0.update_layout(height=1200, title='Boxplot')
    fig1.update_layout(height=1200, title='Violin plot')
    fig2.update_layout(height=1200, title='Density plot')

    # Mode 'w' truncates any report left by a previous run, so no explicit
    # removal of the old file is needed.
    with open(file, 'w') as f:
        for figure in (fig0, fig1, fig2):
            f.write(figure.to_html(full_html=False, include_plotlyjs='cdn', default_height='50%', default_width='80%'))

In [33]:
#
# Apply log transformation
#

# NOTE: xmfr is overwritten in place, so this cell is not idempotent:
# running it twice takes the log of already log-transformed values.
# Only the sample matrices are transformed here; qcr is left untouched.
for i in modes:
    xmfr[i] = np.log2(xmfr[i])

In [41]:
from PlotEDA import PlotEDA

# The same exploratory report is produced for each mode; only the input
# matrix and the output file differ.
eda_reports = [
    ('cp', 'Plots/GroupsEffect_C18P_Median.html'),
    ('cn', 'Plots/GroupsEffect_C18N_Median.html'),
]

for mode_key, file in eda_reports:
    # Remove any report left by a previous run
    if os.path.exists(file):
        os.remove(file)

    plotEDA = PlotEDA(xmfr[mode_key], mdata, file=file)

    plotEDA.plotSummary(r11=(-10,10), r12=(0,2), r21=(-0.4,0.4), r22=(0,2), r3=(-4,4),vl3=[0], binsize=0.1)
    # One by-group panel per metadata column
    for factor in ('Group', 'Ig', 'Control'):
        plotEDA.plotByGroup(factor, vl1=[0],vl2=[0], r1=(-10,10),binsize=0.1)

In [42]:
#
# Combine cp and cn
#

# Column-wise join of the two modes on the row index (DataFrame.join keeps
# the rows of xmfr.cp by default).
# NOTE: xmfr is rebound from a DotMap of per-mode frames to a single
# DataFrame, so this cell cannot be re-run without re-running the cells above.
xmfr = xmfr.cp.join(xmfr.cn)

In [89]:
from PlotEDA import PlotEDA
# Exploratory report on the combined (cp joined with cn) matrix
file = 'Plots/GroupsEffect_ALL_Median.html'
# Remove any report left by a previous run
if os.path.exists(file):
    os.remove(file)

plotEDA = PlotEDA(xmfr, mdata, file=file)

plotEDA.plotSummary(r11=(-10,10), r12=(0,2), r21=(-0.4,0.4), r22=(0,2), r3=(-4,4),vl3=[0], binsize=0.1)
# One by-group panel per metadata column
for factor in ('Group', 'Ig', 'Control'):
    plotEDA.plotByGroup(factor, vl1=[0],vl2=[0], r1=(-10,10),binsize=0.1)

In [43]:
#
# Center and scale
#

from sklearn.preprocessing import StandardScaler

# Standardise every column (feature), then put back the row and column
# labels that scikit-learn drops when it returns a plain array.
scaled_values = StandardScaler().fit_transform(xmfr)
xmfrn = pd.DataFrame(scaled_values, index=xmfr.index, columns=xmfr.columns)

In [44]:
#
# Imputation of missing values using KNN
#

from sklearn.impute import KNNImputer

# Fill the remaining missing values from the 3 nearest neighbours, then put
# back the row and column labels of the scaled matrix.
imputer = KNNImputer(n_neighbors=3)
imputed_values = imputer.fit_transform(xmfrn)
xmfrnv = pd.DataFrame(imputed_values, index=xmfrn.index, columns=xmfrn.columns)

In [45]:
# How much of the final matrix was filled in by the imputer
total_values = xmfrnv.shape[0] * xmfrnv.shape[1]
missing_values = xmfrn.isna().sum().sum()
missing_pct = round(missing_values / total_values * 100, 2)

print(f"Total number of values: {total_values}")
print(f"Number of missing values imputed by KNN (n=3): {missing_values} ({missing_pct}%)")

Total number of values: 52830
Number of missing values imputed by KNN (n=3): 1184 (2.24%)


In [46]:
#
# Write normalized 
#

# Writes the scaled and KNN-imputed matrix (xmfrnv) as a tab-separated file.
# NOTE(review): the path is relative to the current working directory, not
# under w_path like f2i.tsv — confirm this is intended.
xmfrnv.to_csv('Xm_norm.tsv', sep='\t')

In [52]:
from PCA_UMAP import PCA_UMAP, PCA_Var
# Dimensionality-reduction report on the imputed matrix
file = 'Plots/PCA_UMAP_Median.html'
# Remove any report left by a previous run
if os.path.exists(file):
    os.remove(file)

pcaumap = PCA_UMAP(xmfrnv, mdata, file=file)
# One plot per metadata column, always on the first two components
for factor in ('Group', 'Ig', 'Control'):
    pcaumap.plotReduction(factor, pcacomp=[0,1])

In [76]:
# PCA_Var is a project helper imported from PCA_UMAP. Judging by the table it
# returns, each row is a principal component (labelled from 1), "%Var PCA" is
# the percentage of variance it explains, and the remaining columns hold one
# value per metadata column — presumably a p-value for the association
# between that column and the component; TODO confirm against PCA_Var.
res = PCA_Var(xmfrnv, mdata, [], ['Group', 'Ig', 'Control'], n_comp=10)
res

Unnamed: 0,%Var PCA,Group,Ig,Control
1,22.407936,0.924013,0.688833,0.879142
2,16.495109,0.027394,0.776986,0.014168
3,11.861955,0.276329,0.462073,0.399388
4,8.986936,0.550177,0.635397,0.536535
5,5.465797,0.000576,9.9e-05,0.197059
6,5.371582,0.19412,0.421151,0.065905
7,4.16376,0.652008,0.594719,0.348738
8,3.806349,0.59854,0.619012,0.603249
9,3.316956,0.801132,0.843476,0.50943
10,2.878576,0.672754,0.615961,0.36646


In [55]:
from sklearn.decomposition import PCA
# Scores of every row of xmfrnv on the first 10 principal components.
# Columns are labelled 0..9 (0-based), whereas the rows of `res` are
# labelled 1..10.
# NOTE(review): no random_state is set; if scikit-learn selects a randomized
# solver for this matrix shape, the scores may differ slightly between runs.
comps = pd.DataFrame(PCA(n_components=10).fit_transform(xmfrnv), index=xmfrnv.index)

In [61]:
from plotly import graph_objects as go

In [87]:
# Strip plot of the scores on one principal component, split by group.
# `n` is the 0-based component, i.e. a column label of `comps`
# (n = 1 is the second component, shown as "PCA 2" in the title).
n = 1

fig = go.Figure()

# Explicit order so the trace order (and default colours) stay unchanged
for group in ('PBS', 'B1-8', 'A12'):
    seqns = g2sn[group]
    fig.add_trace(go.Scatter(
        x=len(seqns)*[group],
        y=comps.loc[seqns, n],
        mode='markers', name=group
    ))

# `res` is read by position: its row labels start at 1 while `n` starts at 0,
# so res.loc[n, ...] would report the variance of the previous component.
explained = res["%Var PCA"].iloc[n]
fig.update_layout(title=f'PCA {n+1} | {round(explained, 2)}%', width=500)
fig.show()



In [88]:
# Display the sample metadata for reference
mdata

Unnamed: 0,Seqn,ID,Group,Ig,Control,TMT,Tag,ID_metabo
0,LD2169,1,A12,1,0,TMT1,127_C,A12_01
1,LD2170,2,A12,1,0,TMT1,127_N,A12_02
2,LD2171,3,A12,1,0,TMT1,128_C,A12_03
3,LD2243,4,A12,1,0,TMT2,127_C,A12_04
4,LD2244,5,A12,1,0,TMT2,127_N,A12_05
5,LD2245,6,A12,1,0,TMT2,128_C,A12_06
6,LD2162,7,B1-8,1,1,TMT1,128_N,B18_01
7,LD2163,8,B1-8,1,1,TMT1,129_C,B18_02
8,LD2164,9,B1-8,1,1,TMT2,128_N,B18_03
9,LD2213,10,B1-8,1,1,,,B18_04
