In [11]:
#
# Import libraries
#

import numpy as np
import os
import pandas as pd
import sys

from itertools import cycle

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.preprocessing import StandardScaler

from IPython.display import Image


#
# Make the project's own helper modules importable
#
# Folder holding the local modules imported right below. It is appended to the
# module search path only when absent, so re-running the cell adds it once.
utilsPath = r'S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\utils'
if sys.path.count(utilsPath) == 0:
    sys.path.append(utilsPath)

from myLog import myLog
from PlotEDA import PlotEDA
from PlotMV import PlotMV
from PCA_UMAP import PCA_UMAP, PCA_Var

In [12]:
#
# Constants
#

# Missing-value thresholds, expressed as fractions:
#   MVF_thr - applied per protein (column) when filtering features
#   MVO_thr - applied per observation (row) when filtering samples
MVF_thr, MVO_thr = 0.2, 0.1

In [13]:
# Root folder of the PESA proteomics data
working_path = r'S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Proteomics\PESA'

# Inputs: the protein table and the Cohorte/TMT -> Seqn mapping
xq_path, sn2tmtc_path = (
    os.path.join(working_path, 'OriginalFiles', fname)
    for fname in ('RBR_Xq_minus_X.xlsx', 'Seqn_TMT_Cohorte.xlsx')
)

# Outputs: HTML plot reports
fileSummary, filePCA = (
    os.path.join(working_path, 'WorkingFiles', 'Plots', fname)
    for fname in ('SummaryPlots.html', 'PCAPlots.html')
)

# Start from a clean slate: delete reports left over from a previous run
# (presumably the plotting helpers append to these files - TODO confirm).
for report in (fileSummary, filePCA):
    if os.path.exists(report):
        os.remove(report)

In [14]:
# Sample metadata (tab-separated); passed to every plotting/analysis helper below
mdata = pd.read_csv(
    r'S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Metadata\PESA\WorkingFiles\main_metadata.tsv',
    sep='\t',
)

In [15]:
#
# Session log
#
# logw is the callable used for every message in this notebook; it is built
# around info.log inside the working folder.
log_file = os.path.join(working_path, 'WorkingFiles', 'info.log')
logw = myLog(log_file)
logw('Start Session')

Start Session


In [16]:
#
# Load the input spreadsheets
#

# Cohorte/TMT -> Seqn mapping (single header row)
sn2tmt = pd.read_excel(sn2tmtc_path)

# Protein table; its first two rows form a two-level column header
xq = pd.read_excel(xq_path, header=[0, 1])

In [17]:
# Short names for the protein annotation columns: accession -> 'fid', description -> 'qdesc'
column_aliases = {'Accession': 'fid', 'Proteins': 'qdesc'}
xq = xq.rename(columns=column_aliases)

In [18]:
#
# Create q2info.tsv
#
# Protein annotation (id, description, Np) taken from the 'q' column group and
# saved on its own, without the row index.
q2info_path = os.path.join(working_path, 'WorkingFiles', 'q2info.tsv')
q2i = xq['q'][['fid', 'qdesc', 'Np']]
q2i.to_csv(q2info_path, sep='\t', index=False)

In [19]:
# The annotation columns were exported above; remove them (matched on the
# second header level) so only 'fid' and the quantitative columns remain.
xq = xq.drop(columns=['Np', 'qdesc'], level=1)

In [20]:
#
# Create Xq_minus_X.tsv
#
# 1. Index the protein table by protein id ('fid'), drop the 'q' column group
#    and transpose, so each row is one (Cohorte, TMT) channel and each column
#    one protein.
# 2. Attach the Seqn identifier of every (Cohorte, TMT) pair and use it as the
#    row index.
xq = pd.merge(
    xq.set_index(xq[('q', 'fid')]).drop('q', level=0, axis=1).T.reset_index(names=['Cohorte', 'TMT']),
    sn2tmt.loc[:, ['Cohorte', 'TMT', 'Seqn']],
    how='outer',
    on=['Cohorte', 'TMT']
).drop(['Cohorte', 'TMT'], axis=1).set_index('Seqn')

# Seqn lives in the index, so the index has to be written (index=True);
# with index=False the saved rows could not be traced back to their samples.
xq.to_csv(os.path.join(working_path, 'WorkingFiles', 'Xq_minus_X.tsv'), sep='\t', index=True)

In [21]:
#
# Start Filtering, Standardization, Missing Value imputation, and Batch Effect analysis
#

In [22]:
#
# Missing-value overview of the unfiltered table
#
plotMV = PlotMV(
    xq,
    mdata,
    file=fileSummary,
)
plotMV.plotSummary()

In [23]:
# Report the table size and how many proteins pass the missing-value threshold.
# The comparison is strict (<) so that the logged count agrees with both the
# message text and the column filter applied to xq afterwards.
n_obs, n_prot = xq.shape
mv_fraction = xq.isna().sum() / n_obs  # per-protein fraction of missing observations
logw(f"Total number of observations: {n_obs}")
logw(f"Total number of proteins: {n_prot}")
logw(f"Total number of proteins with <{MVF_thr*100}% of missing values(<{int(n_obs*MVF_thr)} of obs.): {(mv_fraction < MVF_thr).sum()}")

#logw("Missing elements: No missing element")

Total number of observations: 444
Total number of proteins: 6213
Total number of proteins with <20.0% of missing values(<88 of obs.): 470


In [24]:
#
# Generate Xq_minus_X_norm.tsv
#

# Keep only the proteins whose fraction of missing observations is below MVF_thr
keep_proteins = xq.isna().sum().div(len(xq)).lt(MVF_thr)
xqf = xq.loc[:, keep_proteins].copy()

In [25]:
#
# Filter observations by missing values
#

# Per-observation missing-value plot, drawn before any row is removed
plotMV = PlotMV(xqf, mdata, file=fileSummary)
plotMV.plotSummaryObs()

# Keep the observations whose fraction of missing proteins is below MVO_thr
obs_mv_fraction = xqf.isna().sum(axis=1).div(xqf.shape[1])
xqf = xqf.loc[obs_mv_fraction.lt(MVO_thr)]

logw(f'Total number of observations with <{MVO_thr*100}% of missing values: {xqf.shape[0]} / {xq.shape[0]}')

Total number of observations with <10.0% of missing values: 424 / 444


In [26]:
#
# Center and scale
#
# Each protein column is standardized; the result is wrapped back into a
# DataFrame carrying the same row and column labels as xqf.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(xqf)
xqfn = pd.DataFrame(scaled_values, index=xqf.index, columns=xqf.columns)

In [27]:
#
# Summary plots
#
# NOTE: the plots are built from xqf (filtered, not yet scaled).
plotEDA = PlotEDA(xqf, mdata, file=fileSummary)

plotEDA.plotSummary(
    r11=(-0.4, 0.4), r12=(0, 2),
    r21=(-0.4, 0.4), r22=(0, 2),
    r3=(-4, 4), vl3=[0],
)

# One set of plots per grouping variable
for grouping in ('Group', 'Cohort'):
    plotEDA.plotByGroup(grouping, vl1=[0], vl2=[0])

In [28]:
#
# Impute the remaining missing values with k-nearest neighbours (k=3)
#

from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=3)
imputed_values = imputer.fit_transform(xqfn)

# Restore the row/column labels of the scaled table
xqfnv = pd.DataFrame(imputed_values, index=xqfn.index, columns=xqfn.columns)


In [29]:
# How much of the filtered table had to be imputed
n_values = xqf.shape[0]*xqf.shape[1]
n_missing = xqf.isna().sum().sum()

logw(f"Total number of values after filtration: {n_values}")
logw(f"Number of imputed missing values: {n_missing} ({round(100*n_missing/n_values,2)}%)")

Total number of values after filtration: 199280
Number of imputed missing values: 6024 (3.02%)


In [30]:
# Metadata variables handed to the batch correction and to PCA_Var:
# categorical ones ...
catVars = [
    'Group',
    'Smoke',
]
# ... and continuous ones
conVars = [
    'Calcium_Score',
    'HDL',
    'LDL',
    'Total_Cholesterol',
    'Ox-LDL',
    'Lipoprotein(a)',
    'CRP',
    'Plaque_thickness',
]

In [31]:
#
# Correct batch effect using ComBat, with 'Cohort' as the batch variable
#
# Alternative kept for reference (pure-Python implementation):
# https://github.com/brentp/combat.py
#

# from combat import combat

# xqfnvb = combat(
#     data=xqfnv.T,
#     batch=mdata.set_index('Seqn').loc[xqf.index, 'Cohort']
# ).T


from myComBat import myComBat

# Folder handed to the wrapper as Rpath (presumably where it exchanges data
# with R - TODO confirm against myComBat)
rdata_dir = os.path.join(working_path, 'WorkingFiles', 'myRData')

xqfnvb = myComBat(xqfnv, mdata, 'Cohort', catVars, conVars, Rpath=rdata_dir)

Loading required package: mgcv
Loading required package: nlme
This is mgcv 1.8-41. For overview type 'help("mgcv-package")'.
Loading required package: genefilter
Loading required package: BiocParallel
1: package 'sva' was built under R version 4.2.1 
2: package 'mgcv' was built under R version 4.2.2 
3: package 'nlme' was built under R version 4.2.2 
4: package 'genefilter' was built under R version 4.2.2 
5: package 'BiocParallel' was built under R version 4.2.2 
Found4batches
Adjusting for12covariate(s) or covariate level(s)
Standardizing Data across genes
Fitting L/S model and finding priors
Finding nonparametric adjustments
Adjusting the Data




In [32]:
# Same per-group plots as before, now on the batch-corrected table
plotEDA = PlotEDA(xqfnvb, mdata, file=fileSummary)
for grouping in ('Group', 'Cohort'):
    plotEDA.plotByGroup(
        grouping,
        vl1=[0],
        vl2=[0],
        plotN=False,
        titleLabel='- Corrected by Batch Effect',
    )

In [33]:
# Summary plot (first panel only) of the batch-corrected table
ploteda = PlotEDA(xqfnvb, mdata, file=fileSummary)
ploteda.plotSummary(
    plots=[1],
    titleLabel='- Corrected by Batch Effect',
    r11=(-0.5, 0.5),
    r12=(0.85, 1.15),
)

In [34]:
#
# Kruskal-Wallis H test
#
# Log the test for each grouping variable, before (xqfnv) and after (xqfnvb)
# batch correction.
# NOTE(review): this relies on PlotEDA's private helper _kruskal; consider
# exposing it publicly.
logw('')
logw('Kruskal-Wallis test')
for label, frame in (('Non corrected', xqfnv), ('Corrected', xqfnvb)):
    logw(label)
    for factor in ('Group', 'Cohort'):
        logw(f'{factor} - {plotEDA._kruskal(frame, factor)}')


Kuskal-Wallis test
Non corrected
Group - KruskalResult(statistic=7.295921026031224, pvalue=0.006911133205515113)
Cohort - KruskalResult(statistic=76.52391788736688, pvalue=1.7078472513712089e-16)
Corrected
Group - KruskalResult(statistic=6.258027115254663, pvalue=0.012363180779176308)
Cohort - KruskalResult(statistic=0.6730481351260096, pvalue=0.8795242346430892)


In [35]:
# Persist the batch-corrected table; the row index is written as well
norm_path = os.path.join(working_path, 'WorkingFiles', 'Xq_minus_X_norm.tsv')
xqfnvb.to_csv(norm_path, sep='\t', index=True)

In [36]:
# Dimensionality-reduction plots before batch correction, one per grouping variable
pcaumap = PCA_UMAP(xqfnv, mdata, file=filePCA)
for factor in ('Cohort', 'Group'):
    pcaumap.plotReduction(factor, pcacomp=[0, 1])

In [37]:
# Same plots after batch correction
pcaumap = PCA_UMAP(xqfnvb, mdata, file=filePCA)
for factor in ('Cohort', 'Group'):
    pcaumap.plotReduction(
        factor,
        pcacomp=[0, 1],
        titleLabel='- Corrected by Batch Effect',
    )

In [38]:
# First 10 principal components of the uncorrected table against the metadata
# variables (continuous, then categorical plus 'Cohort'); displayed as cell output
PCA_Var(
    xqfnv, mdata, conVars, catVars + ['Cohort'],
    n_comp=10,
)

Unnamed: 0,%Var PCA,Calcium_Score,HDL,LDL,Total_Cholesterol,Ox-LDL,Lipoprotein(a),CRP,Plaque_thickness,Group,Smoke,Cohort
1,11.428637,0.2784549,0.640947,0.273595,0.07527742,0.07218451,0.422157,0.6816749,0.041296,0.011915,0.194908,0.1979417
2,6.800875,0.7530169,0.1159864,0.014129,0.001187416,0.0002486087,0.495967,0.02025507,0.15424,0.013401,0.041719,0.3723152
3,4.136326,0.03431212,0.219427,0.417542,0.6936876,0.4827007,0.061774,0.0001577544,0.196578,0.277653,0.073204,0.001538343
4,2.95434,4.457627e-05,0.06572315,0.456422,0.3239683,0.1323138,0.650488,0.4849171,0.292248,0.323697,0.602156,0.02453304
5,2.110144,0.419656,5.450338e-10,0.028476,0.1502604,0.1928039,0.971979,5.096182e-10,0.030755,0.038776,0.029869,0.2429616
6,1.77496,0.002945097,2.271334e-05,0.63674,0.4701854,0.2860553,0.434675,1.513644e-13,0.07801,0.218436,0.015265,9.819606e-11
7,1.746863,0.006888004,0.7833515,0.080082,0.01349858,0.01479138,0.087495,0.8605108,0.048459,0.951061,0.000997,3.0248e-09
8,1.514122,0.5551612,1.40263e-05,0.565639,0.567126,0.112616,0.033897,2.293207e-10,0.003147,0.038401,0.063875,0.1647162
9,1.405869,0.03239594,0.8314667,0.002632,6.029339e-09,6.304026e-08,0.888753,0.8150418,0.562716,0.697956,0.488609,4.428902e-07
10,1.361567,8.958535e-07,0.1751845,0.888256,0.3115513,0.008710914,0.043011,0.00115743,0.077105,0.048164,0.987345,2.506219e-08


In [39]:
# Reload the batch-corrected table from disk, restoring Seqn as the row index
xqfnvb = pd.read_csv(
    os.path.join(working_path, 'WorkingFiles', 'Xq_minus_X_norm.tsv'),
    sep='\t',
    index_col='Seqn',
)

In [40]:
# Same table for the batch-corrected data, transposed so that each principal
# component is a column; displayed as cell output
PCA_Var(
    xqfnvb, mdata, conVars, catVars + ['Cohort'],
    n_comp=10,
).T

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
%Var PCA,9.486122,6.751791,4.02429,2.767219,2.121711,1.836975,1.691666,1.58729,1.381879,1.344555
Calcium_Score,0.086204,0.770956,0.025581,0.000117,0.3071057,0.445233,0.04766037,0.1700421,0.005984,0.000937
HDL,0.38595,0.233479,0.04852,0.34156,1.661598e-13,0.160277,2.529151e-05,0.0264377,0.785151,0.444516
LDL,0.03693,0.096086,0.418738,0.454769,0.006251421,0.504709,0.0317192,0.2579116,0.477391,0.870896
Total_Cholesterol,0.002627,0.042899,0.658051,0.287835,0.02847695,0.870821,5.529377e-05,0.0006344648,0.344744,0.423536
Ox-LDL,0.001203,0.016935,0.31681,0.177248,0.002925589,0.569997,0.04983156,3.146647e-07,0.834523,0.794788
Lipoprotein(a),0.880633,0.377564,0.076972,0.897325,0.4532393,0.500301,0.6316735,0.2646867,0.250619,0.050146
CRP,0.415539,0.02083,8.8e-05,0.600963,1.88071e-16,0.007732,6.946226e-11,0.05359135,0.025526,0.017179
Plaque_thickness,0.016218,0.697681,0.068807,0.244265,0.01358161,0.407032,0.00364953,0.1219368,0.392546,0.69279
Group,0.001232,0.332922,0.214763,0.318992,0.01444513,0.905384,0.08825276,0.0868711,0.143828,0.52505
