In [1]:
#
# Import Libraries
#

import numpy as np
import pandas as pd
import sys
import os
import re
from functools import reduce
from sklearn.preprocessing import StandardScaler

#
# Import own libraries
#
# Folder with the project's own modules; it is appended to sys.path only when
# it is not listed there yet, so re-running the cell adds no duplicate entry.
utilsPath = r'S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\utils'
alreadyOnPath = utilsPath in sys.path
if not alreadyOnPath:
    sys.path.append(utilsPath)

from myLog import myLog
from PlotEDA import PlotEDA
from PlotMV import PlotMV
from PCA_UMAP import PCA_UMAP, PCA_Var

In [2]:
#
# Set constants
#

# Root folder of the analysis and the experiment description table inside it
working_path = r'S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Proteomics\ALDH4'
n2i_path = os.path.join(working_path, 'OriginalFiles', 'RBR_n2info.tsv')

# Output files handed to the plotting helpers through their `file` argument
plotsDir = os.path.join(working_path, 'WorkingFiles', 'Plots')
fileSummary = os.path.join(plotsDir, 'SummaryPlots.html')
filePCA = os.path.join(plotsDir, 'PCAPlots.html')

# Remove any existing copy of either output file
for existingReport in (fileSummary, filePCA):
    if os.path.exists(existingReport):
        os.remove(existingReport)

In [3]:
#
# Set logging
#

# The logger is built from the path WorkingFiles/info.log under working_path
logFile = os.path.join(working_path, 'WorkingFiles', 'info.log')
logw = myLog(logFile)
logw('Start Session')

Start Session


In [4]:
#
# Read Experiment information
#

# Tab-separated experiment table (read_table uses '\t' as its default separator)
n2i = pd.read_table(n2i_path)

In [5]:
#
# Read X' values
#

# One table per row of n2i: read the tab-separated file, label its X'inf
# column as '<Mouse>-<TMT>' and discard the Winf column.
xq = []
for mouse, path, tmt in zip(n2i['Mouse'], n2i['file'], n2i['TMT']):
    table = pd.read_csv(path, sep='\t')
    table = table.rename(columns={"X'inf": f'{mouse}-{tmt}'})
    xq.append(table.drop(columns='Winf'))

In [6]:
# Index every table by the text enclosed in the first pair of '|' characters
# of its idinf entry.
accessionPattern = re.compile(r'\|([^|]+)\|')
xq = [
    table.set_index(
        pd.Index([accessionPattern.search(desc).group(1) for desc in table['idinf']])
    )
    for table in xq
]

In [7]:
#
# Generate q2info.tsv file
#

# Collect every (index label, idinf) pair found in the tables, then keep each
# distinct pair only once.
idPairs = []
for table in xq:
    idPairs.extend(zip(table.index.to_list(), table['idinf']))
q2i = pd.DataFrame(idPairs, columns=['fid', 'qdesc']).drop_duplicates()

# Identification tables, one per TMT folder
idq1 = pd.read_csv(
    r"S:\U_Proteomica\LABS\LAB_ARR\ClonesAb-atherosclerosis\Higados\proteomica\TMT1\msf\ID_Q_XV.txt",
    sep='\t'
)
idq2 = pd.read_csv(
    r"S:\U_Proteomica\LABS\LAB_ARR\ClonesAb-atherosclerosis\Higados\proteomica\TMT2\msf\ID_Q_XV.txt",
    sep='\t'
)


In [8]:
def get_nps(idq, suffix):
    """Count rows and distinct peptide sequences per protein.

    Parameters
    ----------
    idq : pandas.DataFrame
        Table providing the columns 'FASTAProteinDescription' and 'Sequence'.
    suffix : str or int
        Appended to the names of the two output columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by protein id (the text between the first pair of '|' in
        'FASTAProteinDescription') with two columns:
        'ScanFreq_<suffix>' - number of rows of `idq` for that protein;
        'pFreq_<suffix>' - number of distinct sequences for that protein,
        where a sequence is reduced to its uppercase A-Z characters first.

    Raises
    ------
    AttributeError
        If a 'FASTAProteinDescription' entry has no '|...|' section.
    """

    nps = pd.DataFrame([
        (
            re.search(r'\|([^|]+)\|', i).groups()[0],
            ''.join(re.findall(r'[A-Z]+', j))
        )
        for i, j in zip(idq['FASTAProteinDescription'].to_list(), idq['Sequence'].to_list())
    ], columns=['fid', 'seq'])

    # Name the count Series explicitly. Relying on value_counts() to keep the
    # source column name ('fid') breaks on pandas >= 2.0, where the result is
    # named 'count': the column rename silently does nothing and the join
    # below fails because both sides carry a 'count' column.
    scanFreq = nps['fid'].value_counts().rename(f'ScanFreq_{suffix}')
    pepFreq = nps.drop_duplicates()['fid'].value_counts().rename(f'pFreq_{suffix}')

    return scanFreq.to_frame().join(pepFreq, how='outer')


# Per-protein frequencies of both identification tables, side by side;
# the outer join keeps proteins present in only one of them.
npsFirst = get_nps(idq1, 1)
npsSecond = get_nps(idq2, 2)
nps = npsFirst.join(npsSecond, how='outer')

In [9]:
# Attach the frequency columns to the protein table (left join on 'fid')
# and save the result as q2info.tsv.
q2i = q2i.set_index('fid').join(nps, how='left')
q2iFile = os.path.join(working_path, 'WorkingFiles/q2info.tsv')
q2i.to_csv(q2iFile, index=True, sep='\t')

In [10]:
#
# Generate Xq_minus_X.tsv
#

# The idinf column is no longer needed in the tables
xq = [table.drop(columns='idinf') for table in xq]

In [11]:
# Select between the TMT1 and TMT2 tables of LD2214 and of LD2223:
# print the number of rows of every table whose first column mentions them.
for mouse in ('LD2214', 'LD2223'):
    for table in xq:
        label = table.columns[0]
        if mouse in label:
            print(f'Number of proteins in {label}: {table.shape[0]}')



Number of proteins in LD2214-TMT1: 5247
Number of proteins in LD2214-TMT2: 4936
Number of proteins in LD2223-TMT1: 5247
Number of proteins in LD2223-TMT2: 4936


In [12]:
# Select both from TMT1

# Tables left out; only the 'LD2214-TMT1' / 'LD2223-TMT1' versions remain
excludedTables = ('LD2214-TMT2', 'LD2223-TMT2')
xq = [table for table in xq if table.columns[0] not in excludedTables]

In [13]:
def _outerJoin(left, right):
    """Join two tables on their index, keeping labels found in either one."""
    return left.join(right, how='outer')


# Fold the list of tables into a single wide table
xq = reduce(_outerJoin, xq)

In [14]:
# Keep only the part of each column label that precedes the first '-'
xq.columns = [label.partition('-')[0] for label in xq.columns]

In [15]:
# Swap rows and columns. Note: re-running this cell swaps them back.
xq = xq.transpose()

In [16]:
# Save the table, index included, as a tab-separated file
xqFile = os.path.join(working_path, 'WorkingFiles/Xq_minus_X.tsv')
xq.to_csv(xqFile, sep='\t', index=True)

In [17]:
# Display xq as the output of this cell
xq

Unnamed: 0,A0A023NE65,A0A068ETZ5,A0A075B5M7,A0A075B5N7,A0A075B5S2,A0A076FRG6,A0A077K846,A0A077S2U6,A0A087WNU5,A0A087WQ04,...,V9GX23,V9GX81,V9GZG5,V9GZG9,V9GZQ6,W8QK46,X5J4Q6,X5J4V5,X5J5L2,Z4YKV1
LD2169,,0.211745,-0.279754,-0.148536,0.042719,0.374578,,0.176766,,0.083081,...,0.296407,0.41203,0.037165,0.078172,,-0.388321,0.635805,-0.124981,,
LD2170,,0.058881,-0.830888,-0.165629,-0.104145,0.173245,,-0.000786,,0.073758,...,0.417525,0.481374,0.090895,0.069856,,-0.214174,0.085245,-0.551075,,
LD2171,,-0.137228,-0.476768,0.773273,0.048258,-0.001902,,-0.116687,,-0.175519,...,-0.299232,0.060244,0.0809,0.101585,,0.072631,0.505704,-0.002342,,
LD2243,0.001684,0.160198,-0.439884,0.01068,,,0.071086,0.175706,-0.214627,0.048655,...,0.154346,0.588669,,-0.079279,-0.048808,,,,-0.156649,-0.355584
LD2244,0.139485,-0.204659,-0.281001,0.193496,,,-0.179846,-0.277834,0.029755,0.015017,...,0.220831,0.155407,,-0.078939,0.375996,,,,-0.063778,-0.314014
LD2245,0.071298,-0.263406,-0.820109,-0.016369,,,0.020832,-0.010656,0.257595,0.075472,...,0.020431,-0.169871,,-0.071655,0.237942,,,,-0.094179,0.034142
LD2162,,-0.007486,-0.145293,0.179016,-0.121668,0.19974,,0.321679,,0.213393,...,0.246303,0.206115,-0.026002,-0.003327,,-0.563918,0.721845,-0.285067,,
LD2163,,-0.260648,-0.607871,-0.069603,-0.184768,0.314014,,0.038032,,-0.079964,...,0.384033,-0.076501,0.040769,-0.052532,,-0.369158,0.333702,-0.515202,,
LD2164,-0.050325,-0.106601,-0.43766,0.070138,,,-0.092557,0.343222,-0.417765,0.247348,...,0.138059,0.166713,,0.049737,0.065082,,,,-0.290823,-0.258613
LD2214,,-0.031807,-0.297856,-0.14541,0.293554,0.015671,,0.15777,,0.062284,...,-0.264564,0.38053,-0.209537,0.031008,,0.034763,0.389654,-0.328073,,


In [18]:
#
# Start Standardization and Exploratory Data Analysis
#

In [19]:
# Tab-separated metadata table
mdataFile = r"S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Metadata\ALDH4\WorkingFiles\main_metadata.tsv"
mdata = pd.read_csv(mdataFile, sep='\t')

In [20]:
#
# Check missing values
#

plotMV = PlotMV(xq, mdata, file=fileSummary)

# One summary per pair of 'Group' values: (g1, g2, title)
groupPairs = [
    ('A12', 'B1-8', 'A12 vs B1-8'),
    ('A12', 'PBS', 'A12 vs PBS'),
    ('B1-8', 'PBS', 'B1-8 vs PBS'),
]
for g1, g2, pairTitle in groupPairs:
    plotMV.plotSummary(col='Group', g1=g1, g2=g2, titleLabel=pairTitle)

In [21]:
# Maximum fraction of missing values tolerated per protein (strictly below)
MVF_thr = 0.2

# Fraction of missing observations for every protein (column of xq)
mvFraction = xq.isna().sum() / xq.shape[0]

logw(f"Total number of observations: {xq.shape[0]}")
logw(f"Total number of proteins: {xq.shape[1]}")
# The comparison is strict so the logged count agrees with the "<" in the
# message and with the `< MVF_thr` filter that builds xqf. The observation
# bound is rounded up because, for an integer count of missing values,
# "count / n_obs < thr" is the same as "count < ceil(n_obs * thr)";
# truncating instead would under-report the bound (e.g. "<3" when 3 missing
# values out of 16 are still accepted).
logw(f"Total number of proteins with <{MVF_thr*100}% of missing values(<{int(np.ceil(xq.shape[0]*MVF_thr))} of obs.): {(mvFraction<MVF_thr).sum()}")

logw("")
# Number of metadata rows per (Group, TMT) among the samples present in xq
logw(mdata[np.isin(mdata['Seqn'], xq.index)].loc[:, ['Group', 'TMT']].groupby(['Group', 'TMT']).size())

Total number of observations: 16
Total number of proteins: 6064
Total number of proteins with <20.0% of missing values(<3 of obs.): 4118

Group  TMT 
A12    TMT1    3
       TMT2    3
B1-8   TMT1    3
       TMT2    2
PBS    TMT1    3
       TMT2    2
dtype: int64


In [22]:
#
# Generate Xq_minus_X_norm.tsv
# 

# Keep the columns whose fraction of missing values is strictly below MVF_thr
missingFraction = xq.isna().sum() / xq.shape[0]
keepColumn = missingFraction < MVF_thr
xqf = xq.loc[:, keepColumn].copy()

In [23]:
# Boolean mask of the gaps that survived the filter
isMissing = xqf.isna()
rowHasGap = isMissing.any(axis=1)
colHasGap = isMissing.any(axis=0)

print(f'Total missing values: {isMissing.sum().sum()}')
print()
# Gaps per row, restricted to rows that have at least one
print(isMissing.sum(axis=1)[rowHasGap])
print()
# Gaps per column, restricted to columns that have at least one
print(isMissing.sum(axis=0)[colHasGap])

# Incomplete columns shown next to the 'Group' of each sample
xqf.loc[:, colHasGap].join(mdata.set_index('Seqn').loc[:, ['Group']])

Total missing values: 5

LD2170    3
LD2163    1
LD2214    1
dtype: int64

P97313    2
P97822    1
Q5EBK7    1
Q8CI11    1
dtype: int64


Unnamed: 0,P97313,P97822,Q5EBK7,Q8CI11,Group
LD2169,-0.271509,0.202039,0.281283,-0.163746,A12
LD2170,-0.172375,,,,A12
LD2171,0.00571,-0.675992,-0.233917,0.051111,A12
LD2243,0.049698,-0.185714,0.129795,-0.065201,A12
LD2244,0.322827,-0.109768,0.230855,0.793996,A12
LD2245,-0.400419,-0.291929,-0.013153,0.607114,A12
LD2162,-0.405263,0.200672,0.610763,-0.150073,B1-8
LD2163,,0.306967,0.299875,0.215773,B1-8
LD2164,-0.542329,-0.219494,0.17017,-0.106408,B1-8
LD2214,,0.253333,-0.046615,-0.353492,B1-8


In [24]:
#
# Center and scale
#

# Standardize every column of xqf and restore the original labels
scaler = StandardScaler()
scaledValues = scaler.fit_transform(xqf)
xqfn = pd.DataFrame(scaledValues, index=xqf.index, columns=xqf.columns)

In [25]:
#
# Summary plots
#

plotEDA = PlotEDA(xqf, mdata, file=fileSummary)

plotEDA.plotSummary(r11=(-0.4,0.4), r12=(0,2), r21=(-0.4,0.4), r22=(0,2), r3=(-4,4),vl3=[0])

# Same per-group plot for each metadata column of interest
for metaCol in ('Group', 'Ig', 'Control', 'TMT'):
    plotEDA.plotByGroup(metaCol, vl1=[0], vl2=[0])

In [26]:
#
# Imputation of missing values using KNN
#

from sklearn.impute import KNNImputer

# Fill the remaining gaps of the scaled table with a 3-neighbour KNN imputer
# and restore the original labels.
imputer = KNNImputer(n_neighbors=3)
imputedValues = imputer.fit_transform(xqfn)
xqfnv = pd.DataFrame(imputedValues, index=xqfn.index, columns=xqfn.columns)


In [27]:
# Size of the table and number of gaps it had before imputation
nValues = xqfn.shape[0]*xqfn.shape[1]
nMissing = xqfn.isna().sum().sum()

logw(f"Total number of values: {nValues}")
logw(f"Number of missing values imputed by KNN (n=3): {nMissing} ({round(nMissing/nValues*100,2)}%)")

Total number of values: 65888
Number of missing values imputed by KNN (n=3): 5 (0.01%)


In [28]:
from myComBat import myComBat

catVars = ['Group']
conVars = []

# NOTE(review): 'TMT' is presumably the batch column and catVars/conVars the
# covariates - confirm against myComBat.
rDataDir = os.path.join(working_path, 'WorkingFiles', 'myRData')
xqfnvb = myComBat(xqfnv, mdata, 'TMT', catVars, conVars, Rpath=rDataDir)

Loading required package: mgcv
Loading required package: nlme
This is mgcv 1.8-41. For overview type 'help("mgcv-package")'.
Loading required package: genefilter
Loading required package: BiocParallel
1: package 'sva' was built under R version 4.2.1 
2: package 'mgcv' was built under R version 4.2.2 
3: package 'nlme' was built under R version 4.2.2 
4: package 'genefilter' was built under R version 4.2.2 
5: package 'BiocParallel' was built under R version 4.2.2 
Found2batches
Adjusting for2covariate(s) or covariate level(s)
Standardizing Data across genes
Fitting L/S model and finding priors
Finding nonparametric adjustments
Adjusting the Data




In [29]:
plotEDA = PlotEDA(xqfnvb, mdata, file=fileSummary)

# Same per-group plot for each metadata column, now on the ComBat output
for metaCol in ('Group', 'Ig', 'Control', 'TMT'):
    plotEDA.plotByGroup(metaCol, vl1=[0], vl2=[0], plotN=True, titleLabel='- Corrected by Batch Effect')

In [30]:
ploteda = PlotEDA(xqfnvb, mdata, file=fileSummary)

# Options of the summary plot, gathered in one place
summaryOptions = dict(
    plots=[1],
    titleLabel='- Corrected by Batch Effect',
    r11=(-0.5, 0.5),
    r12=(0.85, 1.15),
)
ploteda.plotSummary(**summaryOptions)

In [31]:
#
# Kruskal-Wallis H test
#

logw('')
logw('Kuskal-Wallis test')

# Log the test result for 'Group' and 'TMT', first on the imputed table and
# then on the ComBat output.
for stageLabel, stageData in (('Non corrected', xqfnv), ('Corrected', xqfnvb)):
    logw(stageLabel)
    for metaCol in ('Group', 'TMT'):
        logw(f'{metaCol} - {plotEDA._kruskal(stageData, metaCol)}')


Kuskal-Wallis test
Non corrected
Group - KruskalResult(statistic=10.0858385618194, pvalue=0.006454877186878417)
TMT - KruskalResult(statistic=5.113835604017368, pvalue=0.02373578950885407)
Corrected
Group - KruskalResult(statistic=13.134777416737052, pvalue=0.001405462697469717)
TMT - KruskalResult(statistic=4.190686121553881, pvalue=0.040646643522018834)


In [32]:
# Save the ComBat output, index included, as a tab-separated file
normFile = os.path.join(working_path, 'WorkingFiles', 'Xq_minus_X_norm.tsv')
xqfnvb.to_csv(normFile, sep='\t', index=True)

In [33]:
pcaumap = PCA_UMAP(xqfnv, mdata, file=filePCA)

# Components 0 and 1 of the table before ComBat, once per metadata column
for metaCol in ('TMT', 'Group'):
    pcaumap.plotReduction(metaCol, pcacomp=[0,1])

In [34]:
pcaumap = PCA_UMAP(xqfnvb, mdata, file=filePCA)

# NOTE(review): pcacomp=[2,4] presumably holds 0-based component indices
# (i.e. the 3rd and 5th components) - confirm against PCA_UMAP.
correctedTitle = '- Corrected by Batch Effect'
pcaumap.plotReduction('Group', pcacomp=[2,4], titleLabel=correctedTitle)

In [37]:
# PCA_Var on the table before ComBat, for catVars plus 'Ig', 'Control' and 'TMT'
PCA_Var(xqfnv, mdata, conVars, [*catVars, 'Ig', 'Control', 'TMT'], n_comp=10)

Unnamed: 0,%Var PCA,Group,Ig,Control,TMT
1,22.877066,0.6796,0.9041,0.4908,0.1499
2,17.974089,0.4696,0.9012,0.3242,0.0442
3,11.553285,0.0996,0.0443,0.0911,0.8061
4,8.735733,0.3759,0.281,0.8417,0.0018
5,7.582811,0.89,0.9331,0.7124,0.6119
6,5.35951,0.1066,0.5026,0.0354,0.8794
7,5.056408,0.363,0.1588,0.6632,0.3892
8,4.344536,0.3081,0.8573,0.2284,0.8113
9,3.576064,0.7094,0.5311,0.8815,0.9329
10,2.840511,0.0333,0.0078,0.1975,0.9906


In [36]:
# PCA_Var on the ComBat output, for catVars plus 'Ig', 'Control' and 'TMT'
PCA_Var(xqfnvb, mdata, conVars, [*catVars, 'Ig', 'Control', 'TMT'], n_comp=10)

Unnamed: 0,%Var PCA,Group,Ig,Control,TMT
1,23.738437,0.7862,0.8567,0.6275,0.7737
2,17.284722,0.1174,0.7148,0.1331,0.5402
3,13.4286,0.0315,0.0108,0.0641,0.943
4,8.251975,0.9496,0.9605,0.7626,0.9464
5,6.117657,0.0789,0.6826,0.1073,0.8569
6,6.08562,0.1574,0.0698,0.1306,0.6003
7,4.717784,0.48,0.7896,0.3834,0.9162
8,3.881346,0.7006,0.472,0.9834,0.9181
9,3.369429,0.0971,0.0276,0.2865,0.9121
10,2.86382,0.7634,0.6566,0.4557,0.7888
