In [1]:
#
# Import Libraries
#

import numpy as np
import os
import pandas as pd
import sys

from plotly.subplots import make_subplots
import plotly.graph_objects as go

utilsPath = r'S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\utils'
if utilsPath not in sys.path:
    sys.path.append(utilsPath)

from myLog import myLog
from PlotEDA import PlotEDA
from PlotMV import PlotMV
from PCA_UMAP import PCA_UMAP, PCA_Var

In [2]:
#
# Constants
#

# Features are kept when their fraction of NaN values is <= MVF_thr
MVF_thr = 0.2 # Filter by missing value (IS non detected)
# Features are kept when their fraction of (-5)-imputed values is < MV5_thr
MV5_thr = 0.2 # Filter by -5 imputed missing value (feature not detected)
# Observations are kept when their fraction of NaN values is < MVO_thr
MVO_thr = 0.1

In [3]:
#
# Set constants
#

# Root folder of the AWHS metabolomics data and the two input workbooks
workingPath = r"S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Metabolomics\AWHS"
pos_info_path = os.path.join(workingPath, 'OriginalFiles', 'RBR_LOG_POS_INFO.xlsx')
neg_info_path = os.path.join(workingPath, 'OriginalFiles', 'RBR_LOG_NEG_INFO.xlsx')

# HTML reports produced by this notebook
plotsPath = os.path.join(workingPath, 'WorkingFiles', 'Plots')
fileSummaryConc = os.path.join(plotsPath, 'SummaryPlotsConc.html')
fileMSV = os.path.join(plotsPath, 'SummaryMSV.html')
fileSummaryMS1 = os.path.join(plotsPath, 'SummaryPlotsMS1.html')
fileSummaryMS2S = os.path.join(plotsPath, 'SummaryPlotsMS2S.html')
fileSummaryMS2R = os.path.join(plotsPath, 'SummaryPlotsMS2R.html')
filePCAMS1 = os.path.join(plotsPath, 'PCAPlotsMS1.html')
filePCAMS2 = os.path.join(plotsPath, 'PCAPlotsMS2.html')

# Reports left over from a previous run are deleted before anything is written
for stalePlot in (
    fileSummaryConc,
    fileMSV,
    fileSummaryMS1,
    fileSummaryMS2S,
    fileSummaryMS2R,
    filePCAMS1,
    filePCAMS2,
):
    if os.path.exists(stalePlot):
        os.remove(stalePlot)

In [4]:
#
# Set logging
#

# Session logger bound to WorkingFiles/info.log; logw(...) is called with one message string
logFile = os.path.join(workingPath, 'WorkingFiles', 'info.log')
logw = myLog(logFile)
logw('Start Session')

Start Session


In [5]:
#
# Read sheets on dataframes
#

# The 'fid2log' sheet of both workbooks is read with a two-row header and the
# first column as index
fid2logLayout = dict(sheet_name='fid2log', header=[0,1], index_col=0)

# Positive mode
xmp = pd.read_excel(pos_info_path, **fid2logLayout)
mp2i = pd.read_excel(pos_info_path, sheet_name='fid2LipidInfo')

# Negative mode; here the literal ' None' is additionally parsed as a missing value
xmn = pd.read_excel(neg_info_path, na_values=' None', **fid2logLayout)
mn2i = pd.read_excel(neg_info_path, sheet_name='fid2LipidInfo')

# (the 'sample2Seqn' sheets of both workbooks are not loaded)

# Metadata table (tab-separated)
mdataPath = r'S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Metadata\AWHS\WorkingFiles\main_metadata.tsv'
mdata = pd.read_csv(mdataPath, sep='\t')

In [6]:
#
# Generate m2info.tsv joining positive and negative
#

# Stack the positive and negative feature-info tables
m2i = pd.concat([mp2i, mn2i])

# A feature id whose first character is 'P' is tagged 'POS', any other 'NEG'
startsWithP = [fid[0]=='P' for fid in m2i['fid']]
m2i['mode'] = ['POS' if flag else 'NEG' for flag in startsWithP]

m2infoPath = os.path.join(workingPath, 'WorkingFiles', 'm2info.tsv')
m2i.to_csv(m2infoPath, sep='\t', index=False)

In [7]:
#
# Generate Xm_MS1.tsv and Xm_MS2.tsv
# WARNING!!!!! for concentration some values are inf (We only use them for representation)

# Xm_MS1.tsv
# 'P' column group of each workbook, transposed, without all-NaN columns.
# The cm* tables hold 10**x of the corresponding xm* table.
xmp1 = xmp.loc[:, 'P'].transpose().dropna(axis='columns', how='all')
xmn1 = xmn.loc[:, 'P'].transpose().dropna(axis='columns', how='all')
cmp1 = 10 ** xmp1
cmn1 = 10 ** xmn1

# Outer merge of positive and negative tables on 'fid'
xm1 = xmp1.merge(xmn1, how='outer', on='fid')
cm1 = cmp1.merge(cmn1, how='outer', on='fid')

ms1Path = os.path.join(workingPath, 'WorkingFiles', 'Xm_MS1.tsv')
xm1.to_csv(ms1Path, sep='\t', index=True)

# Xm_MS2.tsv
# Same construction; the column group is 'FRAG' in the positive workbook and
# 'FAS' in the negative one.
xmp2 = xmp.loc[:, 'FRAG'].transpose().dropna(axis='columns', how='all')
xmn2 = xmn.loc[:, 'FAS'].transpose().dropna(axis='columns', how='all')
cmp2 = 10 ** xmp2
cmn2 = 10 ** xmn2

xm2 = xmp2.merge(xmn2, how='outer', on='fid')
cm2 = cmp2.merge(cmn2, how='outer', on='fid')

ms2Path = os.path.join(workingPath, 'WorkingFiles', 'Xm_MS2.tsv')
xm2.to_csv(ms2Path, sep='\t', index=True)

In [8]:
# Log the number of columns (features) of the merged and per-polarity tables
featureCounts = (
    ("MS1", cm1),
    ("MS1 (Pos.)", cmp1),
    ("MS1 (Neg.)", cmn1),
    ("MS2", cm2),
    ("MS2 (Pos.)", cmp2),
    ("MS2 (Neg.)", cmn2),
)
for countLabel, countFrame in featureCounts:
    logw(f"Number of features {countLabel}: {countFrame.shape[1]}")

Number of features MS1: 849
Number of features MS1 (Pos.): 653
Number of features MS1 (Neg.): 196
Number of features MS2: 805
Number of features MS2 (Pos.): 650
Number of features MS2 (Neg.): 155


In [9]:
# Metadata rows whose 'Seqn' is present in the index of the MS1 concentration table
inMetabolomics = np.isin(mdata['Seqn'], cm1.index)
mdatam = mdata[inMetabolomics].copy()
# Constant column, so that all observations can be plotted as a single group
mdatam['AllElem'] = 0

In [10]:
#
# Plot concentration distribution
#
# Trace colours (8 entries); PlotConcentration picks one per category by position
palette = ['#636EFA', '#EF553B', '#00CC96', '#AB63FA', '#FFA15A', '#19D3F3', '#FF6692', '#B6E880']


# NOTE(review): the PlotConcentration calls in this cell pass their arguments
# explicitly, so these four module-level names look unused here; confirm no
# other cell relies on them before removing.
column = 'Metabo_Batch'
valMax = 300
dfs = [cm1, cm2]
namedfs = ['MS1 Pos/Neg', 'MS2 Pos/Neg']

def saveToFile(fig, file):
    """
    Append a figure to an HTML report.

    fig:  object exposing to_html(...) (plotly figures in this notebook)
    file: path of the HTML file; it is created if missing, otherwise appended to

    The fragment is rendered before the file is opened, so a failing render does
    not touch the report. It is written as UTF-8 so that non-ASCII characters in
    titles or labels do not depend on the platform default encoding.
    """
    html = fig.to_html(full_html=False, include_plotlyjs='cdn', default_height='50%', default_width='80%')
    with open(file, 'a', encoding='utf-8') as f:
        f.write(html)


def PlotConcentration(dfs, namedfs, column, valMax=500, file=False):
    """
    Plot the distribution of concentration values, one subplot column per data frame.

    Row 1 holds overlaid histograms (probability density, bin size 1) and row 2
    box plots, with one trace per distinct value of mdatam[column].

    dfs:     sequence of (up to two) data frames whose index contains the 'Seqn' values
    namedfs: subplot titles, in the same order as dfs
    column:  name of the mdatam column used to split observations into traces
    valMax:  only values strictly below this limit are plotted
    file:    HTML path to append the figure to; when falsy the figure is shown

    Relies on the notebook-level names mdatam, palette and saveToFile.
    """

    fig = make_subplots(rows=2, cols=2, shared_xaxes=True, vertical_spacing=0.02, horizontal_spacing=0.05, subplot_titles=namedfs)

    # NaN never matches an equality test (it would only give an empty trace) and
    # cannot be sorted together with strings, so missing categories are dropped.
    categories = sorted(set(mdatam[column].dropna()))

    for ndf,df in enumerate(dfs):
        for n,value in enumerate(categories):
            seqns = mdatam.loc[mdatam[column]==value, 'Seqn']
            data = df.loc[seqns, :].to_numpy().flatten()
            data = data[data<valMax]
            # Wrap around the palette instead of failing when there are more
            # categories than colours
            color = palette[n % len(palette)]

            fig.add_trace(go.Histogram(
                x=data,
                name=value,
                opacity=0.75,
                marker_color=color,
                showlegend=(ndf==0), # one legend entry per category is enough
                histnorm='probability density',
                xbins=dict(size=1),
            ), row=1, col=1+ndf)

            fig.add_trace(go.Box(
                x=data,
                name=value,
                opacity=0.75,
                marker_color=color,
                showlegend=False,
                hoverinfo='skip'
            ), row=2, col=1+ndf)

    fig.update_layout(
        title=f"Concentration distribution by {column}",
        barmode='overlay'
    )

    if file:
        saveToFile(fig, file)
    else:
        fig.show()

# Each entry: (data frames, subplot titles, metadata column used to split traces).
# The figures are produced in this order, all with valMax=300 and file=fileSummaryConc.
concentrationPlots = [
    ([cm1, cm2], ['MS1 Pos/Neg', 'MS2 Pos/Neg'], 'AllElem'),
    ([cmp1, cmn1], ['MS1 Pos', 'MS1 Neg'], 'AllElem'),
    ([cmp2, cmn2], ['MS2 Pos', 'MS2 Neg'], 'AllElem'),
    ([cm1, cm2], ['MS1 Pos/Neg', 'MS2 Pos/Neg'], 'Metabo_Batch'),
    ([cm1, cm2], ['MS1 Pos/Neg', 'MS2 Pos/Neg'], 'Group'),
]
# (the per-polarity variants split by 'Cohort' and by 'Group' are disabled)

for plotFrames, plotTitles, splitColumn in concentrationPlots:
    PlotConcentration(plotFrames, plotTitles, splitColumn, valMax=300, file=fileSummaryConc)

In [11]:
#
# Plot a scatter indicating for each feature % of missing value that were imputed with -5.
#

# Feature-info rows restricted to the features present in each table
m2im1 = m2i[np.isin(m2i['fid'], xm1.columns)]
m2im2 = m2i[np.isin(m2i['fid'], xm2.columns)]

# 'Seqn' values of the observations in group 'C' and in group 'D'
groupOfObs = mdatam['Group']
snc = mdatam['Seqn'][groupOfObs=='C']
snd = mdatam['Seqn'][groupOfObs=='D']

fig = make_subplots(rows=1, cols=2, subplot_titles=['MS1', 'MS2'])

# One subplot per table; inside it one marker trace per mode, then the diagonal
modeColors = (('POS', '#636EFA'), ('NEG', '#EF553B'))
for col, (xm, m2im) in enumerate(zip([xm1, xm2], [m2im1, m2im2]), start=1):

    for modeName, modeColor in modeColors:
        modeFids = m2im['fid'][m2im['mode']==modeName]
        fig.add_trace(go.Scatter(
            # per-feature fraction of values equal to -5 in each group
            x=(xm.loc[snc, modeFids]==-5).sum()/len(snc),
            y=(xm.loc[snd, modeFids]==-5).sum()/len(snd),
            mode='markers',
            name=modeName,
            marker_color=modeColor,
            showlegend=(col != 2) # legend entries only for the first subplot
        ), row=1, col=col)

    fig.add_trace(go.Scatter(
        x=[0,1], y=[0,1], line=dict(dash='dash', width=0.5, color='black'), opacity=0.7, showlegend=False, mode='lines'
    ), row=1, col=col)

fig.update_xaxes(title='Control')
fig.update_yaxes(title='Disease')
fig.update_layout(title='Ratio of missing values imputed with -5 in Control vs Disease')
saveToFile(fig, fileSummaryConc)

In [12]:
#
# Log, for MS1 and MS2, the number of values equal to -5 and the number of NaN
#

logw('Missing Values')
for mvLabel, mvFrame in (('MS1', xm1), ('MS2', xm2)):
    nValues = mvFrame.shape[0]*mvFrame.shape[1]
    nMinus5 = (mvFrame==-5).sum().sum()
    nNaN = mvFrame.isna().sum().sum()

    logw('')
    logw(mvLabel)
    logw(f"All values: {nValues}")
    logw(f"Values imputed with -5: {nMinus5} ({round(100*nMinus5/nValues,2)}%)")
    logw(f'Non-imputed missing values (IS not detected): {nNaN} ({round(100*nNaN/nValues,2)}%)')

Missing Values

MS1
All values: 295452
Values imputed with -5: 14543 (4.92%)
Non-imputed missing values (IS not detected): 1205 (0.41%)

MS2
All values: 280140
Values imputed with -5: 31885 (11.38%)
Non-imputed missing values (IS not detected): 714 (0.25%)


In [13]:
# Missing-value summary for MS1 and then MS2; both receive fileMSV as output file
for mvTable, mvTitle in ((xm1, '- MS1'), (xm2, '- MS2')):
    plotMV = PlotMV(mvTable, mdata, file=fileMSV)
    plotMV.plotSummary(titleLabel=mvTitle)

In [14]:
#
# Accepted features as a function of ratio of imputed values
#

# Candidate thresholds: 0.00, 0.01, ... (1.0 itself is excluded by arange)
xdata = np.arange(0,1,0.01)

for df, i in [(xm1, 'MS1'), (xm2, 'MS2')]:

    # Per-feature fraction of values equal to -5. It does not depend on the
    # threshold, so it is computed once per table instead of once per threshold.
    ratioImputed = (df==-5).sum()/df.shape[0]

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=xdata,
        # number of features whose ratio does not exceed each threshold
        # (the comprehension variable no longer shadows the table label `i`)
        y=[(ratioImputed<=thr).sum() for thr in xdata]
    ))

    fig.update_layout(
        title=f'Accepted Features vs Imputed values {i}'
    )

    fig.update_xaxes(title='Ratio of imputed values')
    fig.update_yaxes(title='Number of accepted features')
    saveToFile(fig, fileMSV)


In [15]:
# Log how many features pass each feature-level filter.
# The comparisons mirror the filters applied afterwards: NaN ratio <= MVF_thr and
# (-5) ratio < MV5_thr. The (-5) lines previously used MVF_thr, which only gave
# the right numbers because both thresholds currently share the same value.
for thrLabel, thrFrame in (('MS1', xm1), ('MS2', xm2)):
    nObs = thrFrame.shape[0]
    ratioNaN = thrFrame.isna().sum()/nObs
    ratioMinus5 = (thrFrame==-5).sum()/nObs

    logw('')
    logw(thrLabel)
    logw(f"Total number of observations: {nObs}")
    logw(f"Total number of features: {thrFrame.shape[1]}")
    logw(f"Total number of features with <={MVF_thr*100}% of missing values(<={int(nObs*MVF_thr)} of obs.): {(ratioNaN<=MVF_thr).sum()}")
    logw(f"Total number of features with <{MV5_thr*100}% of (-5)-imputed values (<{int(nObs*MV5_thr)} of obs.): {(ratioMinus5<MV5_thr).sum()}")


MS1
Total number of observations: 348
Total number of features: 849
Total number of features with <20.0% of missing values(<69 of obs.): 849
Total number of features with <20.0% of (-5)-imputed values (<69 of obs.): 773

MS2
Total number of observations: 348
Total number of features: 805
Total number of features with <20.0% of missing values(<69 of obs.): 805
Total number of features with <20.0% of (-5)-imputed values (<69 of obs.): 656


In [16]:
#
# Filter by missing values (although no feature will be removed)
#

# Keep the features (columns) whose fraction of NaN values is at most MVF_thr
nanRatioMS1 = xm1.isna().sum().div(xm1.shape[0])
xm1f = xm1.loc[:, nanRatioMS1.le(MVF_thr)]

nanRatioMS2 = xm2.isna().sum().div(xm2.shape[0])
xm2f = xm2.loc[:, nanRatioMS2.le(MVF_thr)]

In [17]:
#
# Keep only the features whose fraction of (-5)-imputed values is below MV5_thr
#

minus5RatioMS1 = xm1f.eq(-5).sum().div(xm1f.shape[0])
xm1f = xm1f.loc[:, minus5RatioMS1.lt(MV5_thr)]

minus5RatioMS2 = xm2f.eq(-5).sum().div(xm2f.shape[0])
xm2f = xm2f.loc[:, minus5RatioMS2.lt(MV5_thr)]

In [18]:
# Inspect the MS1 table after the two feature-level filters (rich DataFrame display)
xm1f

Unnamed: 0_level_0,P1,P2,P4,P5,P6,P7,P8,P9,P10,P11,...,N174,N175,N176,N178,N179,N180,N181,N183,N188,N192
fid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2515,2.095463,1.092414,0.731614,2.004638,3.591444,1.601048,3.156607,1.812960,1.053862,1.125457,...,1.452542,2.475839,1.225159,2.074344,2.397792,2.224788,1.522018,0.619471,1.989512,2.589751
273710,1.943931,0.790746,0.593961,2.007067,3.395482,1.525246,3.083728,,-5.000000,,...,1.358770,2.417085,0.932336,1.898146,2.422405,2.084777,1.746221,-5.000000,1.646769,2.534357
353918,1.997492,0.717726,0.586638,1.822292,3.508992,1.755538,2.977030,1.916527,0.988410,1.555373,...,1.349207,2.392533,1.240281,2.010245,2.189663,1.945928,1.291314,0.436417,1.645139,2.536466
233292,1.824169,0.699200,0.345699,1.832666,3.510665,1.658584,3.069936,,0.704044,,...,1.398204,2.265840,1.275511,1.833980,2.212894,1.882090,1.392809,0.672716,1.544479,2.450986
456052,2.023524,1.210670,0.800484,1.914443,3.667758,1.882119,3.134274,,1.365654,,...,1.771895,2.437158,1.428423,1.929183,2.368912,2.101952,1.045453,1.167779,1.835953,2.467317
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
504335,1.910573,0.894368,0.689580,2.032865,3.623343,1.598363,3.108388,1.693454,0.857709,1.289752,...,1.615657,2.436585,1.349227,1.961006,2.297576,2.065189,1.035804,0.777547,1.765119,2.547947
575679,2.125031,1.023011,0.803847,1.902651,3.526522,1.760763,3.169825,,0.603663,,...,1.727060,2.544028,1.510688,2.106131,2.363070,2.290222,1.134026,0.938151,1.979531,2.657189
733797,2.126363,0.950849,0.779019,2.079231,3.541726,1.619899,3.135876,1.758719,0.961915,1.357612,...,1.584028,2.528782,1.420058,2.111025,2.338601,2.081349,0.995462,0.763925,1.838135,2.668286
735630,2.035358,0.289307,0.508292,1.845639,3.467188,1.617190,3.109096,1.790774,1.049544,1.318179,...,1.543145,2.465849,1.380361,2.014429,2.269131,2.138384,0.899434,0.301049,1.784816,2.655234


In [19]:
#
# Filter observations by Missing values
#

# MS1: per-observation summary plot, then keep the rows whose fraction of NaN
# values is below MVO_thr
plotMV = PlotMV(xm1f, mdata, file=fileMSV)
plotMV.plotSummaryObs()

nanPerObsMS1 = xm1f.isna().sum(axis=1).div(xm1f.shape[1])
xm1f = xm1f[nanPerObsMS1.lt(MVO_thr)]

logw(f'Total number of observations with <{MVO_thr*100}% of missing values: {xm1f.shape[0]} / {xm1.shape[0]}')


# MS2: same steps
plotMV = PlotMV(xm2f, mdata, file=fileMSV)
plotMV.plotSummaryObs()

nanPerObsMS2 = xm2f.isna().sum(axis=1).div(xm2f.shape[1])
xm2f = xm2f[nanPerObsMS2.lt(MVO_thr)]

logw(f'Total number of observations with <{MVO_thr*100}% of missing values: {xm2f.shape[0]} / {xm2.shape[0]}')

Total number of observations with <10.0% of missing values: 347 / 348
Total number of observations with <10.0% of missing values: 348 / 348


In [20]:
#
# Summary Plots
# Two kinds of standardization are tried

# MS1 sends both scalers to the same report; MS2 uses one report per scaler.
for edaFrame, edaFiles in (
    (xm1f, [fileSummaryMS1, fileSummaryMS1]),
    (xm2f, [fileSummaryMS2S, fileSummaryMS2R]),
):
    for i, file in zip(['s', 'r'], edaFiles): # StandardScaler and RobustScaler
        scalerName = "Standard" if i=="s" else "Robust"

        plotEDA = PlotEDA(edaFrame, mdata, file=file, scaler=i)
        plotEDA.plotSummary(
            r11=(-5, 5), r12=(0,3), r3=(-6,6),
            vl3=[0],
            binsize=0.1,
            titleLabel=f'- Scaler: {scalerName}'
        )
        for groupColumn in ('Metabo_Batch', 'Group'):
            plotEDA.plotByGroup(groupColumn, vl1=[0], vl2=[0], r1=(-6,6), r2=(-6,6))

In [21]:
#
# Standardize
#

from sklearn.preprocessing import StandardScaler, RobustScaler


def _scaleFrame(scaler, frame):
    """Fit `scaler` on `frame` and return the transformed values with the same row and column labels."""
    return pd.DataFrame(
        scaler.fit_transform(frame),
        columns=frame.columns, index=frame.index
    )


# *fn: StandardScaler output, *fr: RobustScaler output
xm1fn = _scaleFrame(StandardScaler(), xm1f)
xm2fn = _scaleFrame(StandardScaler(), xm2f)
xm1fr = _scaleFrame(RobustScaler(), xm1f)
xm2fr = _scaleFrame(RobustScaler(), xm2f)

In [22]:
#
# Imputation of missing values using KNN
#

from sklearn.impute import KNNImputer


def _knnImpute(frame):
    """Return `frame` after KNNImputer(n_neighbors=3).fit_transform, with the same row and column labels."""
    return pd.DataFrame(
        KNNImputer(n_neighbors=3).fit_transform(frame),
        columns=frame.columns,
        index=frame.index
    )


# The trailing 'v' marks the imputed version of each scaled table
xm1fnv = _knnImpute(xm1fn)
xm2fnv = _knnImpute(xm2fn)
xm1frv = _knnImpute(xm1fr)
xm2frv = _knnImpute(xm2fr)


In [23]:
# For each filtered table, log the NaN count (the values filled by KNN) and the
# count of values equal to -5, both relative to the table size
for impLabel, impFrame in (('MS1', xm1f), ('MS2', xm2f)):
    nCells = impFrame.shape[0]*impFrame.shape[1]
    nKnn = impFrame.isna().sum().sum()
    nMinus5 = (impFrame==-5).sum().sum()
    logw(
        f"Imputed missing values in {impLabel}: "
        f"    | 'KNN' {nKnn}/{nCells} ({round(nKnn/nCells*100, 2)}%) "
        f"    | '-5' {nMinus5}/{nCells} ({round(nMinus5/nCells*100, 2)}%) "
    )

Imputed missing values in MS1:     | 'KNN' 1074/268231 (0.4%)     | '-5' 3328/268231 (1.24%) 
Imputed missing values in MS2:     | 'KNN' 572/228288 (0.25%)     | '-5' 6373/228288 (2.79%) 


In [26]:
#
# Correct Batch Effect
# We compare batch effect correction with the two types of scaling
# https://github.com/brentp/combat.py

# from combat import combat
# from scipy.stats import kruskal, median_test

# xm2fnvb = combat(
#     data=xm2fnv.T,
#     batch=mdata.set_index('Seqn').loc[xm2fnv.index, 'Cohort']
# ).T

# xm2frvb = combat(
#     data=xm2frv.T,
#     batch=mdata.set_index('Seqn').loc[xm2frv.index, 'Cohort']
# ).T

#
# Apply myComBat
# We take R ComBat from sva package
#

from myComBat import myComBat

# Categorical and continuous metadata variables passed to myComBat together
# with the batch column 'Metabo_Batch'
catVars = ['Group', 'smoker', 'diabetes']
conVars = ['Plaque thickness', 'Glucosa', 'hdl', 'coltot']

# Folder handed to myComBat as Rpath
# (an explicit Rengine=r"C:\Program Files\R\R-4.1.2\bin\Rscript.exe" is left disabled)
rDataPath = os.path.join(workingPath, 'WorkingFiles', 'myRData')

xm2fnvb = myComBat(xm2fnv, mdata, 'Metabo_Batch', catVars, conVars, Rpath=rDataPath)
xm2frvb = myComBat(xm2frv, mdata, 'Metabo_Batch', catVars, conVars, Rpath=rDataPath)

# Distribution by batch after correction, one report per scaler
for correctedFrame, reportFile, reportTitle in (
    (xm2fnvb, fileSummaryMS2S, '- Batch Corrected (Standard)'),
    (xm2frvb, fileSummaryMS2R, '- Batch Corrected (Robust)'),
):
    ploteda = PlotEDA(correctedFrame, mdata, file=reportFile)
    ploteda.plotByGroup('Metabo_Batch', plotN=False, titleLabel=reportTitle)

Loading required package: mgcv
Loading required package: nlme
This is mgcv 1.8-41. For overview type 'help("mgcv-package")'.
Loading required package: genefilter
Loading required package: BiocParallel
1: package 'sva' was built under R version 4.2.1 
2: package 'mgcv' was built under R version 4.2.2 
3: package 'nlme' was built under R version 4.2.2 
4: package 'genefilter' was built under R version 4.2.2 
5: package 'BiocParallel' was built under R version 4.2.2 
Found8batches
Adjusting for7covariate(s) or covariate level(s)
Standardizing Data across genes
Fitting L/S model and finding priors
Finding nonparametric adjustments
Adjusting the Data


Loading required package: mgcv
Loading required package: nlme
This is mgcv 1.8-41. For overview type 'help("mgcv-package")'.
Loading required package: genefilter
Loading required package: BiocParallel
1: package 'sva' was built under R version 4.2.1 
2: package 'mgcv' was built under R version 4.2.2 
3: package 'nlme' was built under R versio

In [27]:
# Batch-corrected MS2 tables plotted by 'Group', one report per scaler
for correctedTable, groupReport, groupTitle in (
    (xm2fnvb, fileSummaryMS2S, '- Batch Corrected (Standard)'),
    (xm2frvb, fileSummaryMS2R, '- Batch Corrected (Robust)'),
):
    ploteda = PlotEDA(correctedTable, mdata, file=groupReport)
    ploteda.plotByGroup('Group', plotN=False, titleLabel=groupTitle)

In [28]:
# Kruskal-Wallis test by 'Metabo_Batch' on the batch-corrected (standard-scaled) MS2 table;
# showTest=True prints the test result and the post-hoc table.
# NOTE(review): this calls the underscore-prefixed method PlotEDA._kruskal directly.
PlotEDA(xm2fnvb, mdata)._kruskal(xm2fnvb, 'Metabo_Batch', showTest=True)

Kruskal-Wallis:
KruskalResult(statistic=87.63569094818578, pvalue=3.7810369109996196e-16)
PostHoc-MannWhitney
          1             2             3             4         5             6  \
1  1.000000  1.000000e+00  1.000000e+00  1.000000e+00  0.001193  1.064335e-06   
2  1.000000  1.000000e+00  1.000000e+00  1.000000e+00  0.000041  7.799078e-09   
3  1.000000  1.000000e+00  1.000000e+00  1.000000e+00  0.000161  6.944119e-08   
4  1.000000  1.000000e+00  1.000000e+00  1.000000e+00  0.000515  6.576818e-07   
5  0.001193  4.130242e-05  1.612383e-04  5.148815e-04  1.000000  1.000000e+00   
6  0.000001  7.799078e-09  6.944119e-08  6.576818e-07  1.000000  1.000000e+00   
7  0.000265  1.216554e-04  1.948148e-04  1.096842e-03  1.000000  1.000000e+00   
8  1.000000  1.000000e+00  1.000000e+00  1.000000e+00  0.015944  1.236629e-05   

          7         8  
1  0.000265  1.000000  
2  0.000122  1.000000  
3  0.000195  1.000000  
4  0.001097  1.000000  
5  1.000000  0.015944  
6  1.000000  0.0

KruskalResult(statistic=87.63569094818578, pvalue=3.7810369109996196e-16)

In [29]:
#
# Write the normalized tables:
#   Xm_norm_MS1.tsv - standard-scaled, KNN-imputed MS1 (xm1fnv)
#   Xm_norm_MS2.tsv - standard-scaled, KNN-imputed, batch-corrected MS2 (xm2fnvb)
#

workingFiles = os.path.join(workingPath, 'WorkingFiles')
xm1fnv.to_csv(os.path.join(workingFiles, 'Xm_norm_MS1.tsv'), sep='\t')
xm2fnvb.to_csv(os.path.join(workingFiles, 'Xm_norm_MS2.tsv'), sep='\t')

In [30]:
#
# Dimensionality Reduction
#

# Each entry: (table, report file, title, whether the 'Group' plot also passes pcacomp=[0,1]).
# Only the batch-corrected tables pass pcacomp for the 'Group' plot.
reductionRuns = [
    (xm1fnv, filePCAMS1, 'Scaler: Standard', False),
    (xm2fnv, filePCAMS2, 'Scaler: Standard', False),
    (xm2fnvb, filePCAMS2, 'Scaler: Standard - Batch', True),
    (xm1frv, filePCAMS1, 'Scaler: Robust', False),
    (xm2frv, filePCAMS2, 'Scaler: Robust', False),
    (xm2frvb, filePCAMS2, 'Scaler: Robust - Batch', True),
]

for reducedTable, reductionFile, reductionTitle, pinGroupComponents in reductionRuns:
    pcaumap = PCA_UMAP(reducedTable, mdata, file=reductionFile)
    pcaumap.plotReduction('Metabo_Batch', pcacomp=[0,1], titleLabel=reductionTitle)
    if pinGroupComponents:
        pcaumap.plotReduction('Group', pcacomp=[0,1], titleLabel=reductionTitle)
    else:
        pcaumap.plotReduction('Group', titleLabel=reductionTitle)

In [32]:
# PCA_Var on the MS2 table before batch correction (standard-scaled, KNN-imputed),
# 10 components, against the continuous and categorical variables plus 'Metabo_Batch'.
# NOTE(review): the per-variable columns of the result look like p-values; confirm in PCA_Var.
PCA_Var(xm2fnv, mdata, conVars, catVars+['Metabo_Batch'], n_comp=10)

Unnamed: 0,%Var PCA,Plaque thickness,Glucosa,hdl,coltot,Group,smoker,diabetes,Metabo_Batch
1,30.636145,0.7239,0.0253,0.0,0.0,0.0284,0.1331,0.1044,0.0158
2,6.645477,0.694,0.5938,0.0,0.003,0.2594,0.2793,0.6969,0.0245
3,5.335242,0.4498,0.4286,0.0,0.0,0.5372,0.367,0.089,0.0001
4,3.137564,0.4442,0.8364,0.0013,0.0217,0.8772,0.002,0.4167,0.0112
5,2.976598,0.6568,0.3466,0.7924,0.3097,0.6445,0.092,0.9944,0.0
6,2.835709,0.3727,0.1757,0.8963,0.8449,0.8769,0.809,0.4326,0.0002
7,2.296204,0.5788,0.5367,0.3941,0.0052,0.7837,0.8005,0.5992,0.0002
8,1.812636,0.4416,0.5748,0.9727,0.3193,0.8047,0.0039,0.5216,0.376
9,1.717879,0.2527,0.1417,0.6077,0.8766,0.8722,0.6031,0.4029,0.0054
10,1.684994,0.2504,0.4444,0.4912,0.8829,0.8462,0.0835,0.5063,0.0


In [33]:
# Same PCA_Var call on the batch-corrected MS2 table, for comparison with the
# uncorrected table.
PCA_Var(xm2fnvb, mdata, conVars, catVars+['Metabo_Batch'], n_comp=10)

Unnamed: 0,%Var PCA,Plaque thickness,Glucosa,hdl,coltot,Group,smoker,diabetes,Metabo_Batch
1,33.492661,0.4134,0.0341,0.0,0.0,0.0653,0.5422,0.137,0.9888
2,7.681274,0.6661,0.9221,0.0,0.0112,0.2237,0.9991,0.9691,0.9485
3,6.310365,0.5487,0.3287,0.0,0.0001,0.6962,0.1845,0.0878,0.6678
4,3.449286,0.4217,0.2998,0.1011,0.0076,0.7965,0.0004,0.944,0.9783
5,3.066355,0.6041,0.8524,0.048,0.1724,0.6836,0.3804,0.3949,0.9683
6,2.305752,0.7606,0.965,0.9063,0.567,0.3431,0.0081,0.8387,0.7378
7,1.791799,0.7388,0.3435,0.54,0.6949,0.2115,0.4406,0.5209,0.9996
8,1.743469,0.3627,0.6445,0.0004,0.6426,0.5901,0.6888,0.4672,0.9944
9,1.597932,0.4211,0.2412,0.0021,0.0948,0.6484,0.6435,0.9543,0.9832
10,1.447864,0.8043,0.4963,0.1382,0.0356,0.1699,0.6249,0.4248,0.9949


In [34]:
# Number of observations per ('Group', 'Metabo_Batch') combination
mdatam.groupby(['Group', 'Metabo_Batch']).size()

#mdata['Cohort'].value_counts()

Group  Metabo_Batch
C      1.0             22
       2.0             22
       3.0             22
       4.0             22
       5.0             22
       6.0             22
       7.0             22
       8.0             20
D      1.0             22
       2.0             22
       3.0             22
       4.0             22
       5.0             22
       6.0             22
       7.0             22
       8.0             20
dtype: int64