In [4]:
#
# Import libraries
#

import numpy as np
import os
import pandas as pd
import sys

from itertools import cycle

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.preprocessing import StandardScaler

from IPython.display import Image


#
# Import own libraries
#
utilsPath = r'S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\utils'
if utilsPath not in sys.path:
    sys.path.append(utilsPath)

from myLog import myLog
from PlotEDA import PlotEDA
from PlotMV import PlotMV
from PCA_UMAP import PCA_UMAP, PCA_Var

In [5]:
#
# Constants
#

MVF_thr = 0.2
MVO_thr = 0.15

In [6]:
working_path = r'S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Proteomics\PESA_V2'

xq_path = os.path.join(working_path, 'OriginalFiles', 'Xq_minus_X.tsv')

fileSummary = os.path.join(working_path, 'WorkingFiles', 'Plots', 'SummaryPlots.html')
filePCA = os.path.join(working_path, 'WorkingFiles', 'Plots', 'PCAPlots.html')
if os.path.exists(fileSummary): os.remove(fileSummary)
if os.path.exists(filePCA): os.remove(filePCA)

In [7]:
mdata = pd.read_csv(r'S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Metadata\PESA_V2\WorkingFiles\main_metadata.tsv', sep='\t')

In [8]:
#
# Set logging
#

logw = myLog(os.path.join(working_path, 'WorkingFiles', 'info.log'))
logw('Start Session')

Start Session


In [9]:
#
# Read files
#

xq = pd.read_csv(xq_path, sep='\t')#, header=[0,1])

In [10]:
#
# Create Xq_minus_X.tsv
#

xq.to_csv(os.path.join(working_path, 'WorkingFiles', 'Xq_minus_X.tsv'), sep='\t', index=False)

In [11]:
#
# Start Filtering, Standardization, Missing Value imputation, and Batch Effect analysis
#

xq = xq.set_index('seqn')

In [12]:
#
# Check Missing Values
#

plotMV = PlotMV(xq, mdata, file=fileSummary)
plotMV.plotSummary()

In [13]:
logw(f"Total number of observations: {xq.shape[0]}")
logw(f"Total number of proteins: {xq.shape[1]}")
logw(f"Total number of proteins with <{MVF_thr*100}% of missing values(<{int(xq.shape[0]*MVF_thr)} of obs.): {((xq.isna().sum()/xq.shape[0])<=MVF_thr).sum()}")

#logw("Missing elements: No missing element")

Total number of observations: 440
Total number of proteins: 3664
Total number of proteins with <20.0% of missing values(<88 of obs.): 251


In [14]:
#
# Generate Xq_minus_X_norm.tsv
# 

# Filter by 0.2 missing values
xqf = xq.loc[:, (xq.isna().sum()/xq.shape[0])<MVF_thr].copy()

In [15]:
#
# Filter observations by Missing values
#

plotMV = PlotMV(xqf, mdata, file=fileSummary)
plotMV.plotSummaryObs()

# Filter Observations by missing values

xqf = xqf[xqf.isna().sum(axis=1)/xqf.shape[1]<MVO_thr]

logw(f'Total number of observations with <{MVO_thr*100}% of missing values: {xqf.shape[0]} / {xq.shape[0]}')

Total number of observations with <15.0% of missing values: 428 / 440


In [16]:
#
# Center and scale
#

xqfn = pd.DataFrame(
    StandardScaler().fit_transform(xqf),
    columns=xqf.columns, index=xqf.index
)

In [17]:
#
# Summary plots
#

plotEDA = PlotEDA(xqf, mdata, file=fileSummary)

plotEDA.plotSummary(r11=(-0.4,0.4), r12=(0,2), r21=(-0.4,0.4), r22=(0,2), r3=(-4,4),vl3=[0])
plotEDA.plotByGroup('Group', vl1=[0],vl2=[0])
plotEDA.plotByGroup('Cohort',vl1=[0],vl2=[0])

In [18]:
#
# Imputation of missing values using KNN
#

from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=3)
xqfnv = pd.DataFrame(
    imputer.fit_transform(xqfn),
    columns=xqfn.columns,
    index=xqfn.index
)


In [19]:
# Number of imputed missing values

logw(f"Total number of values after filtration: {xqf.shape[0]*xqf.shape[1]}")
logw(f"Number of imputed missing values: {xqf.isna().sum().sum()} ({round(100*xqf.isna().sum().sum()/(xqf.shape[0]*xqf.shape[1]),2)}%)")

Total number of values after filtration: 106572
Number of imputed missing values: 3645 (3.42%)


In [20]:
#
# Correct batch effect using combat
#
# https://github.com/brentp/combat.py
#

# from combat import combat

# xqfnvb = combat(
#     data=xqfnv.T,
#     batch=mdata.set_index('Seqn').loc[xqf.index, 'Cohort']
# ).T


from myComBat import myComBat

catVars = ['Group', 'Smoke_dummy']
conVars = ['Calcium_Score', 'HDL', 'LDL', 'Total_Cholesterol','Ox-LDL','Lipoprotein a','CRP', 'Plaque_thickness']
xqfnvb = myComBat(xqfnv, mdata, 'Cohort', catVars, conVars, Rpath=os.path.join(working_path, 'WorkingFiles', 'myRData'))

Loading required package: mgcv
Loading required package: nlme
This is mgcv 1.8-41. For overview type 'help("mgcv-package")'.
Loading required package: genefilter
Loading required package: BiocParallel
1: package 'sva' was built under R version 4.2.1 
2: package 'mgcv' was built under R version 4.2.2 
3: package 'nlme' was built under R version 4.2.2 
4: package 'genefilter' was built under R version 4.2.2 
5: package 'BiocParallel' was built under R version 4.2.2 
Found4batches
Adjusting for10covariate(s) or covariate level(s)
Standardizing Data across genes
Fitting L/S model and finding priors
Finding nonparametric adjustments
Adjusting the Data




In [21]:
plotEDA = PlotEDA(xqfnvb, mdata, file=fileSummary)
plotEDA.plotByGroup('Group', vl1=[0],vl2=[0], plotN=False, titleLabel='- Corrected by Batch Effect')
plotEDA.plotByGroup('Cohort',vl1=[0],vl2=[0], plotN=False, titleLabel='- Corrected by Batch Effect')

In [22]:
ploteda = PlotEDA(xqfnvb, mdata, file=fileSummary)
ploteda.plotSummary(plots=[1], titleLabel='- Corrected by Batch Effect', r11=(-0.5, 0.5), r12=(0.85, 1.15))

In [23]:
#
# Kruskal-Wallisk H test 
#

logw('')
logw('Kuskal-Wallis test')
logw('Non corrected')
logw(f'Group - {plotEDA._kruskal(xqfnv, "Group")}')
logw(f'Cohort - {plotEDA._kruskal(xqfnv, "Cohort")}')
logw('Corrected')
logw(f'Group - {plotEDA._kruskal(xqfnvb, "Group")}')
logw(f'Cohort - {plotEDA._kruskal(xqfnvb, "Cohort")}')


Kuskal-Wallis test
Non corrected
Group - KruskalResult(statistic=0.06748521432747272, pvalue=0.7950341236164762)
Cohort - KruskalResult(statistic=53.87797728263231, pvalue=1.191348983815448e-11)
Corrected
Group - KruskalResult(statistic=0.5980499170836993, pvalue=0.4393230398274126)
Cohort - KruskalResult(statistic=2.6830831027473323, pvalue=0.44310980324030447)


In [24]:
xqfnvb.to_csv(os.path.join(working_path, 'WorkingFiles', 'Xq_minus_X_norm.tsv'), sep='\t', index=True)

In [25]:
pcaumap = PCA_UMAP(xqfnv, mdata, file=filePCA)
pcaumap.plotReduction('Cohort', pcacomp=[0,1])
pcaumap.plotReduction('Group', pcacomp=[0,1])

In [26]:
pcaumap = PCA_UMAP(xqfnvb, mdata, file=filePCA)
pcaumap.plotReduction('Cohort', pcacomp=[0,1], titleLabel='- Corrected by Batch Effect')
pcaumap.plotReduction('Group', pcacomp=[0,1], titleLabel='- Corrected by Batch Effect')

In [27]:
PCA_Var(xqfnv, mdata, conVars, catVars+['Cohort'], n_comp=10)

Unnamed: 0,%Var PCA,Calcium_Score,HDL,LDL,Total_Cholesterol,Ox-LDL,Lipoprotein a,CRP,Plaque_thickness,Group,Smoke_dummy,Cohort
1,11.365935,0.542271,0.9212895,0.000272,2.47389e-05,8e-06,0.219206,0.321285,0.129198,0.002058,0.000137,0.3859038
2,5.467211,0.014642,0.002490983,0.634041,0.9266462,0.256538,0.610876,0.2368771,0.564579,0.617231,0.464218,0.3176661
3,3.564185,0.02888,2.195981e-13,0.069343,0.1229014,0.004978,0.910568,5.75619e-21,0.004443,0.041146,0.02856,0.000536713
4,2.937813,0.000208,0.001398887,0.002302,0.04827053,0.009091,0.572695,0.01586597,0.865962,0.286044,0.092605,0.002048192
5,2.775025,0.001797,0.4885488,0.039668,0.007306665,0.204201,0.806642,0.2585696,0.233813,0.011246,0.606763,0.4950664
6,2.444747,0.219599,0.8725256,1.4e-05,2.967831e-15,2e-06,0.801908,0.5652212,0.007782,0.011673,0.665587,0.8028398
7,2.172028,0.955534,0.3536822,0.737301,0.6957133,0.702358,0.338753,0.08008933,0.000644,0.017658,0.061874,0.1520605
8,1.973684,0.001545,1.829961e-13,0.001435,4.737864e-06,0.331947,0.158072,0.04084,0.253464,0.912247,0.741726,1.278677e-10
9,1.751983,0.859589,0.9537621,0.120294,0.2161744,0.672806,0.026023,3.932942e-07,0.180634,0.725308,0.096051,5.340273e-14
10,1.700577,0.570413,0.3056202,0.763451,0.9272133,0.764018,0.026081,0.009548612,0.619658,0.452286,0.647606,0.0003080129


In [28]:
PCA_Var(xqfnvb, mdata, conVars, catVars+['Cohort'], n_comp=10)

Unnamed: 0,%Var PCA,Calcium_Score,HDL,LDL,Total_Cholesterol,Ox-LDL,Lipoprotein a,CRP,Plaque_thickness,Group,Smoke_dummy,Cohort
1,11.911085,0.385494,0.9677866,0.00028,4.417098e-05,1.6e-05,0.121259,0.3842113,0.126191,0.002403,0.000408,0.894879
2,4.289564,0.006533,6.521466e-05,0.3858,0.5601295,0.153807,0.51846,0.04659501,0.580535,0.71116,0.546783,0.897767
3,3.699641,0.043087,1.432699e-12,0.109117,0.1898298,0.006744,0.901229,2.3238939999999998e-21,0.001722,0.007368,0.007071,0.135321
4,2.817237,0.224637,0.09066836,0.001759,2.486528e-08,0.006017,0.53914,0.3484972,0.537327,0.486149,0.809318,0.913191
5,2.69108,0.000144,5.894975e-05,0.000225,0.00592571,0.000892,0.41571,0.03423735,0.727956,0.85485,0.08132,0.456266
6,2.454717,0.098122,0.8713307,0.001301,4.864788e-09,0.000101,0.786481,0.350422,0.083214,0.010574,0.31581,0.926049
7,2.169716,0.957679,0.1571394,0.618645,0.2471717,0.838545,0.482534,0.06139561,0.000122,0.006222,0.056524,0.969438
8,1.918351,0.010126,3.540296e-12,1.1e-05,4.284494e-08,0.283986,0.177924,0.004360236,0.370029,0.839597,0.648755,0.530559
9,1.744504,0.850269,0.0008591935,0.36768,0.5811748,0.672516,0.019113,0.001054413,0.402305,0.304153,0.950826,0.539226
10,1.645828,0.115055,0.03032296,0.929267,0.5904203,0.82549,0.170437,0.00370508,0.033448,0.593946,0.165854,0.794871
