In [1]:
#
# Import libraries
#

import numpy as np
import os
import pandas as pd
import sys

from itertools import cycle

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.preprocessing import StandardScaler

from IPython.display import Image


#
# Import own libraries
#
# The project's helper modules (imported right below) live in a separate folder,
# so that folder is registered on the module search path, at most once.
utilsPath = r'S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\utils'
already_registered = utilsPath in sys.path
if not already_registered:
    sys.path.append(utilsPath)

from myLog import myLog
from PlotEDA import PlotEDA
from PlotMV import PlotMV
from PCA_UMAP import PCA_UMAP, PCA_Var

In [2]:
#
# Constants
#

# Missing-value threshold for features: columns (proteins) of xq are kept only when
# their fraction of missing values is below this value.
MVF_thr = 0.2
# Missing-value threshold for observations: rows of the filtered matrix are kept only
# when their fraction of missing values is below this value.
MVO_thr = 0.1

In [3]:
#
# Set constants
#

# Root folder of this dataset and the two original Excel inputs
working_path = r'S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Proteomics\AWHS'
xq_path = os.path.join(working_path, 'OriginalFiles', 'RBR_Xq_minus_X.xlsx')
sn2tmtc_path = os.path.join(working_path, 'OriginalFiles', 'SPSS_All.xlsx')

# HTML files handed (as `file=`) to the plotting helpers later in the notebook
plots_dir = os.path.join(working_path, 'WorkingFiles', 'Plots')
fileSummary = os.path.join(plots_dir, 'SummaryPlots.html')
filePCA = os.path.join(plots_dir, 'PCAPlots.html')

# Start from a clean slate: delete the HTML files left by a previous run
# (presumably the helpers append to them -- confirm in PlotEDA / PlotMV / PCA_UMAP).
for stale_report in (fileSummary, filePCA):
    if os.path.exists(stale_report):
        os.remove(stale_report)

In [4]:
# Sample metadata, read from a tab-separated file
mdata_path = r'S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Metadata\AWHS\WorkingFiles\main_metadata.tsv'
mdata = pd.read_csv(mdata_path, sep='\t')

In [5]:
#
# Set logging
#

# logw is the callable used throughout the notebook to record messages;
# it is built from the path of the session log file.
log_file = os.path.join(working_path, 'WorkingFiles', 'info.log')
logw = myLog(log_file)
logw('Start Session')

Start Session


In [6]:
#
# Read Dataframes
#

# The Xq sheet is read with a two-row header, giving two-level column labels
xq = pd.read_excel(
    xq_path,
    sheet_name='Xq_minus_X',
    header=[0, 1],
)
# Sample table; the columns used from it are selected and renamed in the next step
sn2tmtc = pd.read_excel(sn2tmtc_path)

In [7]:
#
# Write TMT2seqn.tsv
#

# Original column name -> name used in the rest of the notebook (order is preserved)
column_names = {
    'SEQN': 'Seqn',
    'Codigo': 'Codigo_Proteo',
    'Cohorte': 'Cohort',
    'Experimento': 'TMT',
}
sn2tmtc = sn2tmtc[list(column_names)].rename(columns=column_names)

# Keep only the last character of each cohort label
sn2tmtc['Cohort'] = sn2tmtc['Cohort'].map(lambda label: label[-1])

tmt2seqn_file = os.path.join(working_path, 'WorkingFiles', 'TMT2seqn.tsv')
sn2tmtc.to_csv(tmt2seqn_file, sep='\t', index=False)

In [8]:
#
# Write q2info.tsv
#

# The first three columns hold the protein annotation; drop the upper header level
# and rename two of them before exporting.
q2i = (
    xq.iloc[:, [0, 1, 2]]
    .droplevel(0, axis=1)
    .rename(columns={'Accession': 'fid', 'Proteins': 'qdesc'})
)
q2info_file = os.path.join(working_path, 'WorkingFiles', 'q2info.tsv')
q2i[['fid', 'qdesc', 'Np']].to_csv(q2info_file, sep='\t', index=False)

In [9]:
#
# Generate Xq_minus_X.tsv
#

# Drop the annotation columns that are not needed and rename 'Accession' to 'fid'
xq = xq.drop(columns=['Np', 'Proteins'], level=1).rename(columns={'Accession': 'fid'})

# Use the first remaining column (the 'fid' values) as row index, then remove that column
fid_values = xq.droplevel(0, axis=1).iloc[:, 0]
xq = xq.set_index(fid_values).drop(columns='fid', level=1)

In [10]:
# One row per column of the original table; the two header levels
# become the 'Cohort' and 'TMT' columns.
xq = xq.transpose().reset_index(names=['Cohort', 'TMT'])
# Keep only the last character of each cohort label
xq['Cohort'] = xq['Cohort'].map(lambda label: label[-1])

In [11]:
### WARNING!! The number of observations in Xq and in SPSS_All does not match:
# These channels were not used. However, when the PSM_Table was created, all
# signals were extracted (including the unused ones, which contained noise). When
# SanXoT was run, the same template was kept. That is why these columns appear.
xq_counts = xq['Cohort'].value_counts().to_dict()
print(f"From Xq: {xq_counts}")
spss_counts = sn2tmtc['Cohort'].value_counts().to_dict()
print(f"From Zq: {spss_counts}")

From Xq: {'1': 112, '2': 112, '3': 112}
From Zq: {'1': 112, '3': 110, '2': 108}


In [12]:
# Attach the sample identifiers to the quantification rows, matching on cohort and
# TMT channel. The outer join keeps rows that have no partner on either side.
xq = sn2tmtc.merge(xq, how='outer', on=['Cohort', 'TMT'])

In [13]:
# Rows left without a sample identifier after the merge
unmatched_channels = xq['Seqn'].isna()
xq.loc[unmatched_channels, ['Seqn', 'Cohort', 'TMT']]

Unnamed: 0,Seqn,Cohort,TMT
330,,2,TMT1_130C
331,,2,TMT1_130N
332,,2,TMT2_130C
333,,2,TMT2_130N
334,,3,TMT14_130C
335,,3,TMT14_130N


In [14]:
# Keep only rows that have a sample identifier, make it an integer and use it as
# the row index; the remaining bookkeeping columns are dropped.
xq = (
    xq[xq['Seqn'].notna()]
    .astype({'Seqn': int})
    .drop(columns=['Codigo_Proteo', 'Cohort', 'TMT'])
    .set_index('Seqn')
)

In [15]:
# Export the table (indexed by Seqn) as a tab-separated file
xq_tsv_file = os.path.join(working_path, 'WorkingFiles', 'Xq_minus_X.tsv')
xq.to_csv(xq_tsv_file, sep='\t', index=True)

In [16]:
#
# Start Standardization and Exploratory Data Analysis
#

In [17]:
#
# Check missing values
#

# PlotMV is a project helper; it receives the quantification table, the sample
# metadata and the summary HTML file path.
plotMV = PlotMV(xq, mdata, file=fileSummary)
# Missing-value summary plot (presumably written to fileSummary -- confirm in PlotMV)
plotMV.plotSummary()

In [18]:
# Log the size of the quantification table and how many proteins pass the
# missing-value filter.
n_obs, n_prot = xq.shape
logw(f"Total number of observations: {n_obs}")
logw(f"Total number of proteins: {n_prot}")

# Fraction of missing values per protein (column). The comparison is strict (<) so
# that the logged count matches both the message text and the filter applied to xq
# when the filtered matrix is built.
mv_fraction = xq.isna().sum() / n_obs
n_prot_kept = (mv_fraction < MVF_thr).sum()
logw(f"Total number of proteins with <{MVF_thr*100}% of missing values(<{int(n_obs*MVF_thr)} of obs.): {n_prot_kept}")

# Samples listed in the metadata that have no row in xq, counted per Group and Cohort
logw("Missing elements:")
not_quantified = ~mdata['Seqn'].isin(xq.index)
logw(mdata.loc[not_quantified, ['Group', 'Cohort']].groupby(['Group', 'Cohort']).size())

Total number of observations: 330
Total number of proteins: 4229
Total number of proteins with <20.0% of missing values(<66 of obs.): 381
Missing elements:
Group  Cohort
C      1         4
       2         4
       3         2
D      1         4
       2         4
       3         2
dtype: int64


In [19]:
# Metadata rows selected and ordered to match the rows of xq; raises KeyError when
# a Seqn of xq is absent from the metadata.
mdataq = (
    mdata
    .set_index('Seqn')
    .loc[xq.index]
    .reset_index(names='Seqn')
)

In [20]:
#
# Generate Xq_minus_X_norm.tsv
#

# Keep only the proteins (columns) whose fraction of missing values is below MVF_thr
protein_mv_fraction = xq.isna().sum() / xq.shape[0]
proteins_to_keep = protein_mv_fraction < MVF_thr
xqf = xq.loc[:, proteins_to_keep].copy()

In [21]:
#
# Filter observations by Missing values
#

# Per-observation missing-value plot, built before the rows are filtered
plotMV = PlotMV(xqf, mdata, file=fileSummary)
plotMV.plotSummaryObs()

# Keep only the observations (rows) whose fraction of missing values is below MVO_thr
obs_mv_fraction = xqf.isna().sum(axis=1) / xqf.shape[1]
xqf = xqf[obs_mv_fraction < MVO_thr]

logw(f'Total number of observations with <{MVO_thr*100}% of missing values: {xqf.shape[0]} / {xq.shape[0]}')

Total number of observations with <10.0% of missing values: 313 / 330


In [22]:
#
# Center and scale
#

# StandardScaler returns a bare array, so the row and column labels of xqf are put back
scaler = StandardScaler()
scaled_values = scaler.fit_transform(xqf)
xqfn = pd.DataFrame(scaled_values, index=xqf.index, columns=xqf.columns)

In [23]:
#
# Summary plots
#

# NOTE(review): the plots are built from xqf (filtered, not scaled), not from xqfn
# -- confirm this is intended.
plotEDA = PlotEDA(xqf, mdata, file=fileSummary)

summary_options = dict(
    r11=(-0.4, 0.4),
    r12=(0, 2),
    r21=(-0.4, 0.4),
    r22=(0, 2),
    r3=(-4, 4),
    vl3=[0],
)
plotEDA.plotSummary(**summary_options)

# Same plot for each metadata column of interest
for grouping in ('Group', 'Cohort'):
    plotEDA.plotByGroup(grouping, vl1=[0], vl2=[0])

In [24]:
#
# Imputation of missing values using KNN
#

from sklearn.impute import KNNImputer

# Fill the gaps of the scaled matrix using 3 nearest neighbours; the imputer
# returns a bare array, so the labels of xqfn are put back.
imputer = KNNImputer(n_neighbors=3)
imputed_values = imputer.fit_transform(xqfn)
xqfnv = pd.DataFrame(imputed_values, index=xqfn.index, columns=xqfn.columns)


In [25]:
# Report how many cells of the scaled matrix were missing (and therefore imputed)
n_values = xqfn.shape[0] * xqfn.shape[1]
logw(f"Total number of values: {n_values}")

n_missing = xqfn.isna().sum().sum()
pct_missing = round(n_missing / n_values * 100, 2)
logw(f"Number of missing values imputed by KNN (n=3): {n_missing} ({pct_missing}%)")

Total number of values: 119253
Number of missing values imputed by KNN (n=3): 3043 (2.55%)


In [26]:
#
# Correct batch effect using combat
#
# https://github.com/brentp/combat.py
#

# Earlier alternative based on the Python package linked above, kept for reference:
# from combat import combat
#
# xqfnvb = combat(
#     data=xqfnv.T,
#     batch=mdataq.set_index('Seqn').loc[xqf.index, 'Cohort']
# ).T

from myComBat import myComBat

# Categorical and continuous metadata columns handed to myComBat alongside the
# batch column 'Cohort'
catVars = ['Group', 'smoker', 'diabetes']
conVars = ['Plaque thickness', 'Glucosa', 'hdl', 'coltot']

# NOTE(review): mdata (not the row-aligned mdataq) is passed here; presumably
# myComBat aligns the samples itself -- confirm.
r_data_dir = os.path.join(working_path, 'WorkingFiles', 'myRData')
xqfnvb = myComBat(xqfnv, mdata, 'Cohort', catVars, conVars, Rpath=r_data_dir)

Loading required package: mgcv
Loading required package: nlme
This is mgcv 1.8-41. For overview type 'help("mgcv-package")'.
Loading required package: genefilter
Loading required package: BiocParallel
1: package 'sva' was built under R version 4.2.1 
2: package 'mgcv' was built under R version 4.2.2 
3: package 'nlme' was built under R version 4.2.2 
4: package 'genefilter' was built under R version 4.2.2 
5: package 'BiocParallel' was built under R version 4.2.2 
Found3batches
Adjusting for7covariate(s) or covariate level(s)
Standardizing Data across genes
Fitting L/S model and finding priors
Finding nonparametric adjustments
Adjusting the Data




In [27]:
# Per-group plots of the batch-corrected matrix
plotEDA = PlotEDA(xqfnvb, mdata, file=fileSummary)
for grouping in ('Group', 'Cohort'):
    plotEDA.plotByGroup(
        grouping,
        vl1=[0],
        vl2=[0],
        plotN=False,
        titleLabel='- Corrected by Batch Effect',
    )

In [28]:
# Summary plot (only panel 1 is requested) of the batch-corrected matrix
ploteda = PlotEDA(xqfnvb, mdata, file=fileSummary)
ploteda.plotSummary(
    plots=[1],
    titleLabel='- Corrected by Batch Effect',
    r11=(-0.5, 0.5),
    r12=(0.85, 1.15),
)

In [29]:
#
# Kruskal-Wallis H test
#

# Log the result returned by PlotEDA._kruskal for each metadata column, first on
# the imputed matrix and then on the batch-corrected one. The plotEDA object
# created in an earlier cell is reused only to reach that helper.
logw('')
logw('Kruskal-Wallis test')
for stage_label, matrix in (('Non corrected', xqfnv), ('Corrected', xqfnvb)):
    logw(stage_label)
    for grouping in ('Group', 'Cohort'):
        logw(f'{grouping} - {plotEDA._kruskal(matrix, grouping)}')


Kuskal-Wallis test
Non corrected
Group - KruskalResult(statistic=78.5722286758321, pvalue=7.712607891046371e-19)
Cohort - KruskalResult(statistic=62.89444897248777, pvalue=2.2011210872263378e-14)
Corrected
Group - KruskalResult(statistic=74.71826035104459, pvalue=5.4291633493358406e-18)
Cohort - KruskalResult(statistic=0.512904777075164, pvalue=0.7737918350433243)


In [30]:
# Export the batch-corrected matrix as a tab-separated file, keeping the row index
norm_tsv_file = os.path.join(working_path, 'WorkingFiles', 'Xq_minus_X_norm.tsv')
xqfnvb.to_csv(norm_tsv_file, index=True, sep='\t')

In [31]:
# Dimensionality-reduction plots of the imputed (not batch-corrected) matrix,
# one per metadata column, using the first two PCA components
pcaumap = PCA_UMAP(xqfnv, mdata, file=filePCA)
for grouping in ('Cohort', 'Group'):
    pcaumap.plotReduction(grouping, pcacomp=[0, 1])

In [32]:
# Same dimensionality-reduction plots, this time for the batch-corrected matrix
pcaumap = PCA_UMAP(xqfnvb, mdata, file=filePCA)
for grouping in ('Cohort', 'Group'):
    pcaumap.plotReduction(
        grouping,
        pcacomp=[0, 1],
        titleLabel='- Corrected by Batch Effect',
    )

In [33]:
# PCA_Var table for the first 10 components of the imputed (not batch-corrected)
# matrix; 'Cohort' is added to the categorical variables.
PCA_Var(xqfnv, mdata, conVars, [*catVars, 'Cohort'], n_comp=10)

Unnamed: 0,%Var PCA,Plaque thickness,Glucosa,hdl,coltot,Group,smoker,diabetes,Cohort
1,13.672072,0.2119,0.5045,0.0411,0.6621,0.6361,0.5599,0.1042,0.002
2,7.066286,0.5616,0.3041,0.8538,0.0284,0.0129,0.0607,0.1993,0.0006
3,6.586803,0.9154,0.7079,0.3728,0.0075,0.0001,0.0351,0.728,0.3742
4,3.086397,0.2195,0.438,0.3946,0.3638,0.0664,0.0024,0.6787,0.0081
5,2.859518,0.048,0.065,0.0005,0.0476,0.0,0.7555,0.3577,0.4026
6,2.466264,0.6443,0.0497,0.6278,0.5358,0.0014,0.3901,0.0115,0.2321
7,2.06559,0.5012,0.6178,0.0167,0.0148,0.3999,0.0006,0.1137,0.8086
8,2.026402,0.1608,0.129,0.0231,0.0,0.0021,0.7385,0.0154,0.0265
9,1.793214,0.3266,0.8119,0.8485,0.0322,0.0024,0.5036,0.9536,0.0
10,1.683503,0.1838,0.258,0.0022,0.3984,0.0381,0.147,0.0002,0.913


In [34]:
# PCA_Var table for the first 10 components of the batch-corrected matrix;
# 'Cohort' is added to the categorical variables.
PCA_Var(xqfnvb, mdata, conVars, [*catVars, 'Cohort'], n_comp=10)

Unnamed: 0,%Var PCA,Plaque thickness,Glucosa,hdl,coltot,Group,smoker,diabetes,Cohort
1,12.890757,0.4124,0.8222,0.0688,0.5106,0.3669,0.8892,0.3593,0.6223
2,6.981794,0.8047,0.2145,0.29,0.0002,0.0986,0.416,0.5051,0.9469
3,6.706447,0.4971,0.346,0.7636,0.7534,0.0,0.0259,0.1199,0.3955
4,3.065087,0.0742,0.9935,0.0184,0.0962,0.0005,0.0319,0.9806,0.5297
5,2.912187,0.1787,0.0588,0.0047,0.1547,0.0006,0.0793,0.2275,0.8921
6,2.43267,0.8372,0.2987,0.5804,0.2522,0.0001,0.5754,0.2873,0.9647
7,2.127614,0.4641,0.4983,0.0028,0.0089,0.1974,0.0005,0.0667,0.7127
8,2.054434,0.2143,0.0815,0.014,0.0004,0.007,0.3154,0.0444,0.6334
9,1.77221,0.2054,0.2681,0.0761,0.0961,0.0048,0.9513,0.1273,0.7321
10,1.667645,0.3254,0.4179,0.0049,0.0298,0.0388,0.0612,0.0104,0.5801
