In [1]:
#
# Import libraries
#

import numpy as np
import os
import pandas as pd
import sys

from itertools import cycle

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.preprocessing import StandardScaler

from IPython.display import Image


#
# Import own libraries
#
# Directory holding the project-local helper modules imported right below.
utilsPath = r'S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\utils'
# Register the directory only once so re-running the cell does not duplicate it.
if sys.path.count(utilsPath) == 0:
    sys.path.append(utilsPath)

from myLog import myLog
from PlotEDA import PlotEDA
from PlotMV import PlotMV
from PCA_UMAP import PCA_UMAP, PCA_Var

In [2]:
#
# Constants
#

# Missing-value threshold for features: columns (proteins) are kept only when
# their fraction of missing values is below this value.
MVF_thr = 0.2
# Missing-value threshold for observations: rows are kept only when their
# fraction of missing values is below this value.
MVO_thr = 0.1

In [3]:
# Root folder of the proteomics data set; every other path is derived from it.
working_path = r'S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Proteomics\PESA'

# Input workbooks.
original_dir = os.path.join(working_path, 'OriginalFiles')
xq_path = os.path.join(original_dir, 'RBR_Xq_minus_X.xlsx')
sn2tmtc_path = os.path.join(original_dir, 'Seqn_TMT_Cohorte.xlsx')

# Output HTML reports.
plots_dir = os.path.join(working_path, 'WorkingFiles', 'Plots')
fileSummary = os.path.join(plots_dir, 'SummaryPlots.html')
filePCA = os.path.join(plots_dir, 'PCAPlots.html')

# Remove reports left over from a previous run
# (presumably the plotting helpers append to these files -- TODO confirm).
for stale_report in (fileSummary, filePCA):
    if os.path.exists(stale_report):
        os.remove(stale_report)

In [4]:
# Tab-separated sample metadata table used by every plotting/statistics helper below.
mdata_path = r'S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Metadata\PESA\WorkingFiles\main_metadata.tsv'
mdata = pd.read_csv(mdata_path, sep='\t')

In [5]:
#
# Set logging
#

# `logw` is the callable used throughout the notebook to record messages;
# it is bound to WorkingFiles/info.log.
log_file = os.path.join(working_path, 'WorkingFiles', 'info.log')
logw = myLog(log_file)
logw('Start Session')

Start Session


In [6]:
#
# Read files
#

# The quantification workbook has a two-row header, so its columns load as a
# two-level MultiIndex.
xq_header_rows = [0, 1]
xq = pd.read_excel(xq_path, header=xq_header_rows)

# Table providing the Cohorte / TMT / Seqn columns used later to label samples.
sn2tmt = pd.read_excel(sn2tmtc_path)

In [7]:
# Harmonise the protein annotation labels with the names used downstream.
column_aliases = {'Proteins': 'qdesc', 'Accession': 'fid'}
xq = xq.rename(columns=column_aliases)

In [8]:
#
# Create q2info.tsv
#

# Protein annotation table: the three annotation columns found under the
# top-level 'q' header.
q_info_columns = ['fid', 'qdesc', 'Np']
q2i = xq['q'].loc[:, q_info_columns]

q2info_path = os.path.join(working_path, 'WorkingFiles', 'q2info.tsv')
q2i.to_csv(q2info_path, sep='\t', index=False)

In [9]:
# Annotation columns are now stored in q2info.tsv; drop them from the second
# header level so only the feature id and the quantitative columns remain.
annotation_columns = ['Np', 'qdesc']
xq = xq.drop(columns=annotation_columns, level=1)

In [10]:
#
# Create Xq_minus_X.tsv
#

# Reshape to one row per (Cohorte, TMT) pair and one column per feature id:
# index by fid, drop the remaining 'q' annotation block, then transpose so the
# two header levels become regular columns.
xq_by_channel = (
    xq.set_index(xq[('q', 'fid')])
    .drop('q', level=0, axis=1)
    .T
    .reset_index(names=['Cohorte', 'TMT'])
)

# Attach the sample identifier (Seqn) to each (Cohorte, TMT) pair and use it as
# the row index of the quantification matrix.
xq = pd.merge(
    xq_by_channel,
    sn2tmt.loc[:, ['Cohorte', 'TMT', 'Seqn']],
    how='outer',
    on=['Cohorte', 'TMT']
).drop(['Cohorte', 'TMT'], axis=1).set_index('Seqn')

# Write the Seqn index as well: with index=False the file would contain no
# sample identifiers at all (the normalized matrix is also saved with its index).
xq.to_csv(os.path.join(working_path, 'WorkingFiles', 'Xq_minus_X.tsv'), sep='\t', index=True)

In [11]:
#
# Start Filtering, Standardization, Missing Value imputation, and Batch Effect analysis
#

In [12]:
#
# Check Missing Values
#

# Missing-value overview of the full, unfiltered matrix. The helper receives
# `fileSummary` as its target file (presumably it appends the figures to that
# HTML report -- TODO confirm against PlotMV).
plotMV = PlotMV(xq, mdata, file=fileSummary)
plotMV.plotSummary()

In [13]:
# Log the size of the unfiltered matrix and how many proteins pass the
# missing-value threshold.
logw(f"Total number of observations: {xq.shape[0]}")
logw(f"Total number of proteins: {xq.shape[1]}")
# Use a strict '<' so the reported count matches both the wording of the
# message and the filter applied to the matrix in the next step.
logw(f"Total number of proteins with <{MVF_thr*100}% of missing values(<{int(xq.shape[0]*MVF_thr)} of obs.): {((xq.isna().sum()/xq.shape[0])<MVF_thr).sum()}")

#logw("Missing elements: No missing element")

Total number of observations: 444
Total number of proteins: 6213
Total number of proteins with <20.0% of missing values(<88 of obs.): 470
Missing elements: No missing element


In [14]:
#
# Generate Xq_minus_X_norm.tsv
#

# Keep only the proteins (columns) whose fraction of missing values is
# strictly below MVF_thr; copy so later steps never alias `xq`.
missing_fraction_per_protein = xq.isna().sum() / xq.shape[0]
keep_protein = missing_fraction_per_protein < MVF_thr
xqf = xq.loc[:, keep_protein].copy()

In [15]:
#
# Filter observations by Missing values
#

# Plot the per-observation missing values before any row is removed.
plotMV = PlotMV(xqf, mdata, file=fileSummary)
plotMV.plotSummaryObs()

# Keep only the observations (rows) whose fraction of missing values is
# strictly below MVO_thr.
missing_fraction_per_obs = xqf.isna().sum(axis=1) / xqf.shape[1]
keep_observation = missing_fraction_per_obs < MVO_thr
xqf = xqf[keep_observation]

logw(f'Total number of observations with <{MVO_thr*100}% of missing values: {xqf.shape[0]} / {xq.shape[0]}')

Total number of observations with <10.0% of missing values: 424 / 444


In [16]:
#
# Center and scale
#

# Standardise the filtered matrix and re-wrap the resulting array so the
# protein columns and the sample index are preserved.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(xqf)
xqfn = pd.DataFrame(scaled_values, index=xqf.index, columns=xqf.columns)

In [17]:
#
# Summary plots
#

# NOTE(review): this cell runs after the centering/scaling step but is given
# `xqf` (filtered, unscaled) rather than `xqfn` -- confirm that is intended.
plotEDA = PlotEDA(xqf, mdata, file=fileSummary)

# The r*/vl* keywords are forwarded to the plotting helper; presumably axis
# ranges and vertical reference lines -- TODO confirm against PlotEDA.
plotEDA.plotSummary(r11=(-0.4,0.4), r12=(0,2), r21=(-0.4,0.4), r22=(0,2), r3=(-4,4),vl3=[0])
# Same plots split by the 'Group' and 'Cohort' metadata variables.
plotEDA.plotByGroup('Group', vl1=[0],vl2=[0])
plotEDA.plotByGroup('Cohort',vl1=[0],vl2=[0])

In [18]:
#
# Imputation of missing values using KNN
#

from sklearn.impute import KNNImputer

# Fill the missing entries of the scaled matrix using the 3 nearest neighbours,
# then restore the column and index labels lost in the array round-trip.
imputer = KNNImputer(n_neighbors=3)
imputed_values = imputer.fit_transform(xqfn)
xqfnv = pd.DataFrame(imputed_values, index=xqfn.index, columns=xqfn.columns)



In [19]:
# Number of imputed missing values

# Counted on `xqf` (the filtered matrix before imputation), where the gaps are
# still present.
n_values = xqf.shape[0]*xqf.shape[1]
n_missing = xqf.isna().sum().sum()

logw(f"Total number of values after filtration: {n_values}")
logw(f"Number of imputed missing values: {n_missing} ({round(100*n_missing/n_values,2)}%)")

Total number of values after filtration: 199280
Number of imputed missing values: 6024 (3.02%)


In [20]:
#
# Correct batch effect using combat
#
# An earlier version of this cell called the pure-Python implementation from
# https://github.com/brentp/combat.py with 'Cohort' as the batch; it has been
# replaced by the project's own wrapper below.
#

from myComBat import myComBat

# Metadata variables handed to the wrapper alongside the batch variable
# ('Cohort'); the same two lists are reused by the PCA_Var tables further down.
catVars = ['Group', 'Smoke']
conVars = [
    'Calcium_Score', 'HDL', 'LDL', 'Total_Cholesterol',
    'Ox-LDL', 'Lipoprotein(a)', 'CRP', 'Plaque_thickness',
]

combat_rdata_path = os.path.join(working_path, 'WorkingFiles', 'myRData')
xqfnvb = myComBat(xqfnv, mdata, 'Cohort', catVars, conVars, Rpath=combat_rdata_path)

Loading required package: mgcv
Loading required package: nlme
This is mgcv 1.8-41. For overview type 'help("mgcv-package")'.
Loading required package: genefilter
Loading required package: BiocParallel
1: package 'sva' was built under R version 4.2.1 
2: package 'mgcv' was built under R version 4.2.2 
3: package 'nlme' was built under R version 4.2.2 
4: package 'genefilter' was built under R version 4.2.2 
5: package 'BiocParallel' was built under R version 4.2.2 
Found4batches
Adjusting for12covariate(s) or covariate level(s)
Standardizing Data across genes
Fitting L/S model and finding priors
Finding nonparametric adjustments
Adjusting the Data




In [21]:
# Per-group plots of the batch-corrected matrix. `plotEDA` is rebound here and
# this instance is the one whose `_kruskal` method is called further down.
plotEDA = PlotEDA(xqfnvb, mdata, file=fileSummary)
plotEDA.plotByGroup('Group', vl1=[0],vl2=[0], plotN=False, titleLabel='- Corrected by Batch Effect')
plotEDA.plotByGroup('Cohort',vl1=[0],vl2=[0], plotN=False, titleLabel='- Corrected by Batch Effect')

In [22]:
# Summary plot of the batch-corrected matrix.
# NOTE(review): `ploteda` (lowercase) is a second PlotEDA instance built from
# the same arguments as `plotEDA` above -- confirm a separate object is needed.
ploteda = PlotEDA(xqfnvb, mdata, file=fileSummary)
ploteda.plotSummary(plots=[1], titleLabel='- Corrected by Batch Effect', r11=(-0.5, 0.5), r12=(0.85, 1.15))

In [23]:
#
# Kruskal-Wallis H test
#

# Log the test result for 'Group' and 'Cohort' on the imputed matrix before
# (`xqfnv`) and after (`xqfnvb`) batch correction. The data set is passed
# explicitly to the helper, so the same `plotEDA` instance serves both.
logw('')
logw('Kruskal-Wallis test')
for stage_label, stage_data in (('Non corrected', xqfnv), ('Corrected', xqfnvb)):
    logw(stage_label)
    for factor in ('Group', 'Cohort'):
        logw(f'{factor} - {plotEDA._kruskal(stage_data, factor)}')


Kuskal-Wallis test
Non corrected
Group - KruskalResult(statistic=7.295921026031224, pvalue=0.006911133205515113)
Cohort - KruskalResult(statistic=76.52391788736688, pvalue=1.7078472513712089e-16)
Corrected
Group - KruskalResult(statistic=6.258027115254663, pvalue=0.012363180779176308)
Cohort - KruskalResult(statistic=0.6730481351260096, pvalue=0.8795242346430892)


In [24]:
# Persist the batch-corrected matrix together with its sample index.
norm_path = os.path.join(working_path, 'WorkingFiles', 'Xq_minus_X_norm.tsv')
xqfnvb.to_csv(norm_path, sep='\t', index=True)

In [25]:
# Dimensionality-reduction plots of the imputed matrix BEFORE batch correction,
# labelled by 'Cohort' and by 'Group'; `filePCA` is passed as the target file.
pcaumap = PCA_UMAP(xqfnv, mdata, file=filePCA)
pcaumap.plotReduction('Cohort', pcacomp=[0,1])
pcaumap.plotReduction('Group', pcacomp=[0,1])

In [26]:
# Same plots for the batch-corrected matrix; `pcaumap` is rebound to a new
# instance built on `xqfnvb`.
pcaumap = PCA_UMAP(xqfnvb, mdata, file=filePCA)
pcaumap.plotReduction('Cohort', pcacomp=[0,1], titleLabel='- Corrected by Batch Effect')
pcaumap.plotReduction('Group', pcacomp=[0,1], titleLabel='- Corrected by Batch Effect')

In [27]:
# Table for the first 10 principal components of the matrix BEFORE batch
# correction, with one column per metadata variable (continuous ones first,
# then the categorical ones plus 'Cohort'). Left as the last expression so the
# notebook displays the returned table.
# NOTE(review): the per-variable values look like p-values -- confirm against PCA_Var.
PCA_Var(xqfnv, mdata, conVars, catVars+['Cohort'], n_comp=10)

Unnamed: 0,%Var PCA,Calcium_Score,HDL,LDL,Total_Cholesterol,Ox-LDL,Lipoprotein(a),CRP,Plaque_thickness,Group,Smoke,Cohort
1,11.428637,0.2785,0.6409,0.2736,0.0753,0.0722,0.4222,0.6817,0.0413,0.0119,0.1949,0.1979
2,6.800875,0.753,0.116,0.0141,0.0012,0.0002,0.496,0.0203,0.1542,0.0134,0.0417,0.3723
3,4.136326,0.0343,0.2194,0.4175,0.6937,0.4827,0.0618,0.0002,0.1966,0.2777,0.0732,0.0015
4,2.95434,0.0,0.0657,0.4564,0.324,0.1323,0.6505,0.4849,0.2922,0.3237,0.6022,0.0245
5,2.110144,0.4197,0.0,0.0285,0.1503,0.1928,0.972,0.0,0.0308,0.0388,0.0299,0.243
6,1.77496,0.0029,0.0,0.6367,0.4702,0.2861,0.4347,0.0,0.078,0.2184,0.0153,0.0
7,1.746863,0.0069,0.7834,0.0801,0.0135,0.0148,0.0875,0.8605,0.0485,0.9511,0.001,0.0
8,1.514122,0.5552,0.0,0.5656,0.5671,0.1126,0.0339,0.0,0.0031,0.0384,0.0639,0.1647
9,1.405869,0.0324,0.8315,0.0026,0.0,0.0,0.8888,0.815,0.5627,0.698,0.4886,0.0
10,1.361567,0.0,0.1752,0.8883,0.3116,0.0087,0.043,0.0012,0.0771,0.0482,0.9873,0.0


In [28]:
# Same table for the batch-corrected matrix, for comparison with the
# uncorrected one. Left as the last expression so the notebook displays it.
PCA_Var(xqfnvb, mdata, conVars, catVars+['Cohort'], n_comp=10)

Unnamed: 0,%Var PCA,Calcium_Score,HDL,LDL,Total_Cholesterol,Ox-LDL,Lipoprotein(a),CRP,Plaque_thickness,Group,Smoke,Cohort
1,9.486122,0.0862,0.386,0.0369,0.0026,0.0012,0.8806,0.4155,0.0162,0.0012,0.054,0.9881
2,6.751791,0.771,0.2335,0.0961,0.0429,0.0169,0.3776,0.0208,0.6977,0.3329,0.4704,0.9807
3,4.02429,0.0256,0.0485,0.4187,0.6581,0.3168,0.077,0.0001,0.0688,0.2148,0.0476,0.9612
4,2.767219,0.0001,0.3416,0.4548,0.2878,0.1772,0.8973,0.601,0.2443,0.319,0.4308,0.5751
5,2.121711,0.3071,0.0,0.0063,0.0285,0.0029,0.4532,0.0,0.0136,0.0144,0.0497,0.6667
6,1.836975,0.4452,0.1603,0.5047,0.8708,0.57,0.5003,0.0077,0.407,0.9054,0.2355,0.8279
7,1.691666,0.0477,0.0,0.0317,0.0001,0.0498,0.6317,0.0,0.0036,0.0883,0.0,0.7292
8,1.58729,0.17,0.0264,0.2579,0.0006,0.0,0.2647,0.0536,0.1219,0.0869,0.7771,0.8374
9,1.381879,0.006,0.7852,0.4774,0.3447,0.8345,0.2506,0.0255,0.3925,0.1438,0.3655,0.9399
10,1.344555,0.0009,0.4445,0.8709,0.4235,0.7948,0.0501,0.0172,0.6928,0.5251,0.4753,0.3788
