In [1]:
#
# Import Libraries
#

import pandas as pd
import numpy as np
import os
import sys

# Directory added to the import search path; the `MetadataStats` import that
# follows is presumably resolved from here (confirm the module lives in it).
utilsPath = r'S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\utils'

# Register the directory only when it is absent, so re-running this cell
# does not keep appending duplicates to sys.path.
if sys.path.count(utilsPath) == 0:
    sys.path.append(utilsPath)

from MetadataStats import MetadataStats

In [4]:
#
# Set constants
#

# Input workbooks; all four live in the same ...\Metadata\AWHS\OriginalFiles folder.
mdata_path = (
    r'S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Metadata\AWHS\OriginalFiles\Cohortes_AWHS.xlsx'
)
sn2g_path = (
    r'S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Metadata\AWHS\OriginalFiles\Seqn2Disease.xlsx'
)
mbatch_path = (
    r'S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Metadata\AWHS\OriginalFiles\RBR_MetabolomicsBatch.xlsx'
)
fhs10y_path = (
    r'S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Metadata\AWHS\OriginalFiles\FHS10years.xlsx'
)

# Sheet names read from the cohort workbook (one sheet per cohort).
cohs = [
    'Cohorte1_120',
    'Cohorte2_120',
    'Cohorte3_110',
]

# File name handed to MetadataStats as its `file` argument. Any copy left in
# the working directory by a previous run is deleted before this run starts.
filePlot = 'MetadataStats.html'
if os.path.exists(filePlot):
    os.remove(filePlot)

In [5]:
#
# Read df
#

# Read every cohort sheet with a single pass over the workbook: passing a
# list to `sheet_name` makes read_excel return a dict {sheet name: DataFrame},
# so the file is opened and parsed once instead of once per cohort.
cohort_sheets = pd.read_excel(mdata_path, sheet_name=cohs)

# Tag each sheet with its cohort label and stack them in the order of `cohs`.
# The label is the last character of the sheet-name prefix before '_'
# (e.g. 'Cohorte1_120' -> '1'), so it is a string and only covers
# single-digit cohort numbers. `.assign` returns a new frame rather than
# mutating the sheet that was just read.
mdata = pd.concat([
    cohort_sheets[name].assign(Cohort=name.split('_')[0][-1])
    for name in cohs
])

# Lookup table that is later joined to the cohort metadata on 'Seqn'.
sn2g = pd.read_excel(sn2g_path)

In [6]:
#
# Adapt columns
#

# Join the Seqn2Disease table (left) with the cohort metadata (right).
# The cohort key 'SEQN' is renamed to 'Seqn' so both sides share the join
# column, and 'Ind Para Prot' is dropped before merging. The outer join keeps
# rows whose 'Seqn' appears in only one of the two tables.
mdata = sn2g.merge(
    mdata.rename(columns={'SEQN': 'Seqn'}).drop(columns='Ind Para Prot'),
    how='outer',
    on='Seqn',
)


In [7]:
#
# Add metabolomics batch
#

# Attach the metabolomics batch and global run order to each subject.
# Only the three listed columns are taken from the batch workbook; the outer
# join on 'Seqn' keeps subjects that are missing from either side.
mdata = mdata.merge(
    pd.read_excel(mbatch_path)[['Metabo_Batch', 'Metabo_GlobalOrder', 'Seqn']],
    how='outer',
    on='Seqn',
)

In [13]:
#
# Add Framingham 10years
#

# Attach every column of the FHS10years workbook.
# NOTE: unlike the previous joins this one is an inner join, so any row whose
# 'Seqn' is absent from the FHS10years workbook is dropped from `mdata` here.
mdata = mdata.merge(
    pd.read_excel(fhs10y_path),
    on='Seqn',
    how='inner',
)

In [14]:
#
# Write main_metadata.tsv
#

# Persist the merged metadata as a tab-separated file in the WorkingFiles
# folder; the DataFrame index is not written.
mdata.to_csv(
    os.path.join(
        r"S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Metadata\AWHS\WorkingFiles",
        'main_metadata.tsv',
    ),
    index=False,
    sep='\t',
)

In [15]:
#
# Some plots & statistics about metadata
#

# Columns handed to the qualitative and quantitative plot/statistics helpers.
qualCols = ['diabetes', 'smoker', 'HIPERTENSION_RF', 'DISLIPEMIA_RF']
quanCols = ['Plaque thickness', 'Glucosa', 'Calcio Score', 'age', 'coltot', 'hdl', 'presis', 'presdi', 'FHS10y']

# MetadataStats comes from the project's utils folder and receives the merged
# metadata plus the report file name.
mds = MetadataStats(mdata, file=filePlot)

# Quantitative columns first, then qualitative ones. Per the recorded output
# these print Mann-Whitney U results and chi-square homogeneity results
# respectively.
mds.plotQuanCols(quanCols)
mds.plotQualCols(qualCols)

Mann-Whitney U Test

Plaque thickness
Statistic = 29803.5 | p-value = 4.856836395109511e-59

Glucosa
Statistic = 16355.0 | p-value = 0.27042676111508834

Calcio Score
Statistic = 24237.5 | p-value = 6.090529507475739e-37

age
Statistic = 18334.0 | p-value = 0.001290924496258573

coltot
Statistic = 16923.5 | p-value = 0.08870956750507607

hdl
Statistic = 12274.5 | p-value = 0.0013197859550785518

presis
Statistic = 16360.0 | p-value = 0.011518488411675307

presdi
Statistic = 15376.0 | p-value = 0.1551921779076202

FHS10y
Statistic = 19639.5 | p-value = 4.835081904826723e-06

Chi-Square Homogeneity Sample Test

diabetes
Statistic = 0.07989043597352204 | p-value = 0.7774459434458258

smoker
Statistic = 2.713178294573644 | p-value = 0.09952252907239534

HIPERTENSION_RF
Statistic = 0.8177356055621254 | p-value = 0.3658430544271807

DISLIPEMIA_RF
Statistic = 3.108152609346334 | p-value = 0.0779012736823803



In [16]:
# Show the category counts of every qualitative column, one table per column,
# each followed by a blank line.
for col in qualCols:
    counts = mdata[col].value_counts().to_frame()
    print(counts, end='\n\n')

   diabetes
0       337
1        13

   smoker
1     215
0     135

     HIPERTENSION_RF
0.0              242
1.0               94

     DISLIPEMIA_RF
0.0            172
1.0            120

