In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os, sys
import cPickle as pickle
import glob, re
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 1000)

root = 'C:\Users\ethan\Documents\MIT\Alm lab\Revo_healthcare\data\study_data'
multi_to_single = True

def get_labels(df, file_label, label):
    '''
    input: 
        df: pandas dataframe to ake the labels df out of
        file_label: column in the original df you want to to be the INDICIES for the new df
        labael: column in the original df that will be the 'label' column
    output:
        df_label: new dataframe mapping file name to label
    '''
    df_label = df[[file_label, label]]
    df_label.set_index(file_label, inplace=True)
    return df_label

In [2]:
# HELPER FUNCTION TO READ TO METABOLOMICS WORKBENCH DATA - FROM Michael Murphy

from StringIO import StringIO

def read_metadata(fn, label_col, label_key, debug=False):
    sample_lines = []
    feature_lines = []
    metabolite_lines = []
    with open(fn,'r') as f:
        is_sample_line = False
        is_feature_line = False
        is_feature_header = False
        is_metabolite_line = False
        for l in f:
            if 'MS_METABOLITE_DATA_START' in l:
                is_feature_line = True
                is_feature_header = True
            elif 'METABOLITES_START' in l:
                is_metabolite_line = True
            elif '#SUBJECT_SAMPLE_FACTORS' in l:
                is_sample_line = True
                # FACTORS(NAME:VALUE pairs separated by |)[tab]
                sample_lines.append('\t'.join(l.replace('[tab]','\t').split('\t')[1:]))
            elif is_feature_line:
                if 'MS_METABOLITE_DATA_END' in l:
                    is_feature_line = False
                else:
                    if is_feature_header:
                        feature_lines.append(l)
                        is_feature_header = False
                    else:
                        # throw away anything non-numeric after the sample header
                        m = re.search(r'^[^\t]*\t([^\t]*)\t', l).group(1)
                        try:
                            if len(m) > 0:
                                float(m)
                            feature_lines.append(l)
                        except:
                            pass
            elif is_metabolite_line:
                if 'METABOLITES_END' in l:
                    is_metabolite_line = False
                else:
                    metabolite_lines.append(l)
            elif is_sample_line:
#                 print(l.strip().split('\t'))
                if '#' in l:
                    is_sample_line = False
                else:
                    # hack fix -- at least one file semi-duplicates the header line, easier to fix in here
                    if 'Sample name' not in l:
                        sample_lines.append('\t'.join(l.split('\t')[1:]))
    
    # this is basically doing text-to-columns on the sample metadata
    samples = pd.read_csv(StringIO(''.join(sample_lines)), sep='\t').set_index('SAMPLE')
    samples = pd.concat([samples.iloc[:,0],
                         # FACTORS(NAME:VALUE pairs separated by |)[tab]
                         samples.iloc[:,1].astype('str').str.split('[:|]',expand=True),
                         # Additional sample data
                         samples.iloc[:,2].astype('str').str.split('[=;]',expand=True)], # ***
                        axis=1)
    # resulting table has field names and values interleaved as columns, this corrects that
    cols = [samples.columns[0],] + samples.iloc[0,1::2].str.strip().tolist()
    samples = samples.iloc[:,::2]
    samples.columns = cols[:len(samples.columns)] # if *** is empty have to prune its label from end

    if debug:
        display(samples)

    labels = pd.DataFrame(samples[label_col].str.strip().apply(lambda x: label_key[x]))

    df = pd.read_csv(StringIO(''.join(feature_lines)), sep='\t', index_col=False)
    
    # first row of df will contain metabolite identifiers, and we want (samples, features) shape
    features = df.iloc[:,1:].T.copy()
    # turn any leftover symbols into NaNs -- one file has '\N' in place of NAs, for whatever reason
    features = features.replace(r'(?:.*[^0-9\.].*)',np.nan,regex=True).astype('float')
    # drop any rows of all NaNs
    features = features.loc[~np.all(features.isnull(),axis=1)]
    
    metabolites = df.iloc[:,[0,]].copy()
    metabolites.columns = ['metabolite_name',]
    
    if len(metabolite_lines) > 0:
        metabolites = metabolites.merge(pd.read_csv(StringIO(''.join(metabolite_lines)), sep='\t'))
    # thus far I've seen m/z and RT embedded in the metabolite identifier, separated by either @ or _
    if metabolites['metabolite_name'].str.contains('[@_]',regex=True).any():
        mz_rt = metabolites['metabolite_name'].str.split('[@_]',expand=True)
        mz = mz_rt.iloc[:,0].str.strip('*').fillna('')
        # the regex just drops anything that isn't a number
        mz[~mz.str.match('^(?:[0-9]+(?:\.[0-9]*)?)$').astype('bool')] = 'nan'
        mz = mz.astype('float')
        rt = mz_rt.iloc[:,1].str.strip('*').fillna('')
        rt[~rt.str.match('^(?:[0-9]+(?:\.[0-9]*)?)$').astype('bool')] = 'nan'
        rt = rt.astype('float')
        metabolites['mz'] = pd.Series(mz, index=metabolites.index)
        metabolites['rt'] = pd.Series(rt, index=metabolites.index)
        # these two fields are sometimes m/z and RT, not always
        # metabolites['mz'] = metabolites[['moverz_quant','mz']].max(axis=1)
        # metabolites['rt'] = metabolites[['ri','rt']].max(axis=1)
    else:
        metabolites['mz'] = pd.Series(np.nan * np.ones(metabolites.shape[0]))
        metabolites['rt'] = pd.Series(np.nan * np.ones(metabolites.shape[0]))
    metabolites = metabolites.set_index('metabolite_name')
    
    return features, labels, samples, metabolites


def label_hist(labels):
    num_lab = len(set(labels))
    hist = {}
    for ele in set(labels):
        count = 0
        for ele2 in labels:
            if ele2 == ele:
                count+=1
        hist[ele] = count
    return hist

#### USE :
# print(label_hist(label.values.flatten().astype('int').tolist()))
#### to get th number of sampels for each class

def reduce_multi(labels, data_set, label_key):
    '''
    for each of the classes make it a one vs. the rest problem. (ie 1 for it, and 0 for lables of the rest)
    '''
#     try:
#         classes = set(labels.values.flatten())
#     except:
#         classes = set(labels.flatten())
    switch_int = len(label_key)+10 #just some larger number to temporarily switch the labels to 
    label_sets = []
    data_sets = []
#     for ele in classes:
    for k,ele in label_key.items():
        data_sets.append(data_set+'_'+str(ele)+'_'+k)
        label = labels.copy()
        label[label==ele] = switch_int
        label[label!=switch_int] = 0
        label[label==switch_int] = 1
        label_sets.append(label)
    return label_sets, data_sets

In [None]:
###### This function does not actually work ###########
# def filter_samples(data, filters=None):
#     samples = data['samples'].copy()
#     labels = data['labels'].copy()
#     features = data['features'].copy()
#     peaks = data['peaks'].copy()
    
#     # this suggests an off-by-one error
#     assert 'Unnamed' not in ','.join(features.index.astype('str'))
    
#     # make sure all indices are strings -- join won't work if comparing different types
#     samples.index = samples.index.astype('str')
#     features.index = features.index.astype('str')
#     labels.index = labels.index.astype('str')
    
#     # easiest fix: drop anything that has duplicates
#     index = features.index.drop_duplicates(keep=False).intersection(samples.index.drop_duplicates(keep=False))
    
#     # apply filters: drop anything that doesn't match equality constraint
#     if filters:
#         for k, v in filters.iteritems():
#             index = index[samples.loc[index][k].astype(type(v))==v]
    
#     # and filter, ensuring everything's in the same order
#     samples = samples.loc[index]
#     features = features.loc[index].astype('float')
#     labels = labels.loc[index].astype('float')
    
#     assert samples.shape[0] == features.shape[0]
    
#     data_new = {
#         'study': data['study'],
#         'analysis': data['analysis'],
#         'disease': data['disease'],
#         'features': features,
#         'labels': labels,
#         'peaks': data['peaks'],
#         'samples': samples
#     }
    
#     return data_new

In [67]:
# No reprocessed data
# doesnt look like there are duplicates 
# targeted analysis 
study = 'ST000284_data'
disease = 'Colorectal Cancer'
os.chdir(os.path.join(root,study))
data = []
f = 'AN000452.txt'

label_col = 'Patient group'
label_key = {'Healthy': 0, 'CRC': 1, 'Polyp':2} 
features, labels, samples, metabolites = read_metadata(f, label_col, label_key)
feature_names = list(features.index)
feature_names = [int(fi) for fi in feature_names]
label = labels.loc[feature_names]
if multi_to_single:
    labels, ds_names = reduce_multi(label, f[:-4], label_key)
    for l, n in zip(labels, ds_names):
        data.append({'study': study[:-5],
                 'disease': disease,
                 'data_set': n,
                 'features': features,
                 'labels': l,
                 'peaks': metabolites,
                 'samples': samples.loc[feature_names]}) 
else:
    data.append({'study': study[:-5],
                 'disease': disease,
                 'data_set': f[:-4],
                 'features': features,
                 'labels': label,
                 'peaks': metabolites,
                 'samples': samples.loc[feature_names]}) 
pickle.dump(data, open('%s.pkl'%study, 'wb'))

In [68]:
# I dont think there are duplicates except for what I note below
# labels match the data
# 12-7-18: order is same between data sets - can combine
study = 'ST000355_data'
disease = 'Breast Cancer'
os.chdir(os.path.join(root,study))
data = []
files = ['AN000580.txt', 'AN000581.txt']

label_col = 'Diagnosis'
label_key = {'Control': 0, 'Breast cancer': 1} 
for f in files:
    features, labels, samples, metabolites = read_metadata(f, label_col, label_key)
    feature_names = list(features.index)
    label = labels.loc[feature_names]
    seen = []
    t_f_mask = []
    for i, val in enumerate(list(label.index)):
        if val not in seen:
            seen.append(val)
            if list(label.values)[i] != 0 and list(label.values)[i] != 1:
                t_f_mask.append(False)
            else:
                t_f_mask.append(True)
        else:
            t_f_mask.append(False)
    label = label[t_f_mask]
    label_names = list(label.index)
    features = features.loc[label_names]
    # note: there are three samples seen twice in the data, but its from different stages of cancer (i believe)
    data.append({'study': study[:-5],
                 'disease': disease,
                 'data_set': f[:-4],
                 'features': features,
                 'labels': label,
                 'peaks': metabolites,
                 'samples': samples.loc[feature_names]}) 
pickle.dump(data, open('%s.pkl'%study, 'wb'))

In [69]:
# 134 people now, GC data 
# doesnt look like replicates from what I can tell
# labels match to the data
# 12-7-18: order is same between data sets - can combine
study = 'ST000356_data'
disease = 'Breast Cancer'
os.chdir(os.path.join(root,study))
data = []
files = ['AN000582.txt', 'AN000583.txt']

label_col = 'Diagnosis'
label_key = {'control': 0, 'breast cancer': 1} 
for f in files:
    features, labels, samples, metabolites = read_metadata(f, label_col, label_key)
    feature_names = list(features.index)
    label = labels.loc[feature_names]
    # note: there are three samples seen twice in the data, but its from different stages of cancer (i believe)
    data.append({'study': study[:-5],
                 'disease': disease,
                 'data_set': f[:-4],
                 'features': features,
                 'labels': label,
                 'peaks': metabolites,
                 'samples': samples.loc[feature_names]}) 
pickle.dump(data, open('%s.pkl'%study, 'wb'))

In [70]:
# looks like 56 people with no duplicates in samples 
study = 'ST000383_data'
disease = 'Obesity - Non-diabetic and T2 diabetic'
os.chdir(os.path.join(root,study))
data = []
f = 'AN000618.txt'

label_col = 'Health Status'
label_key = {'diabetic': 1, 'non-diabetic': 0} 
features, labels, samples, metabolites = read_metadata(f, label_col, label_key)
feature_names = list(features.index)
label = labels.loc[feature_names]
data.append({'study': study[:-5],
             'disease': disease,
             'data_set': f[:-4],
             'features': features,
             'labels': label,
             'peaks': metabolites,
             'samples': samples.loc[feature_names]}) 
pickle.dump(data, open('%s.pkl'%study, 'wb'))

In [71]:
# 84 people in study
# labels match, no replicates
# 12-7-18: order is same between data sets - can combine
study = 'ST000450_data'
disease = 'Chronic fatigue'
os.chdir(os.path.join(root,study))
data = []
files = ['AN000705.txt', 'AN000706.txt']

label_col = 'Disease'
label_key = {'Normal': 0, 'CFS': 1} 
for f in files:
    features, labels, samples, metabolites = read_metadata(f, label_col, label_key)
    feature_names = list(features.index)
    label = labels.loc[feature_names]
    data.append({'study': study[:-5],
                 'disease': disease,
                 'data_set': f[:-4],
                 'features': features,
                 'labels': label,
                 'peaks': metabolites,
                 'samples': samples.loc[feature_names]}) 
pickle.dump(data, open('%s.pkl'%study, 'wb'))

In [72]:
# 3 replicates for each
# compared dried blood spots vs serum 
# 12-7-18: order is same between data sets - can combine
study = 'ST000608_data'
disease = 'Stability of dried blood samples - diabetic men'
os.chdir(os.path.join(root,study))
data = []
files = ['AN000929.txt', 'AN000930.txt', 'AN000931.txt']

label_col = 'Factor'
label_key = {'control': 0, 'case': 1} 
for f in files:
    features, labels, samples, metabolites = read_metadata(f, label_col, label_key)
    features = features[samples['Replicate']=='1']
    feature_names = list(features.index)
    label = labels.loc[feature_names]
    data.append({'study': study[:-5],
                 'disease': disease,
                 'data_set': f[:-4],
                 'features': features,
                 'labels': label,
                 'peaks': metabolites,
                 'samples': samples.loc[feature_names]}) 
pickle.dump(data, open('%s.pkl'%study, 'wb'))



In [73]:
# some randomly have replicates 
# looks like labels match up to data
study = 'ST000888_data'
disease = 'Lyme disease'
os.chdir(os.path.join(root,study))
data = []
f = 'AN001450.txt'

label_col = 'Disease Status'
# label_key = {'Early Disseminated Lyme': 1, 'Heathy (Non-endemic) control': 0, 
#              'Early Localized Lyme':1, 'Healthy Endemic Control':0, 
#              'Early Lyme': 1, 'Heatlhy Controls - Endemic':0,
#              'Healthy Non-endemic control':0, 'Mononucleosis':0, 
#              'Fibromyalgia':0, 'Lyme C6+ Baseline':1, 'Lyme C6+ Baseline /Atyp':1, 'Severe Periodontitis':0, 'Syphilis':0} 

# note following what the paper did which was classify lyme vs NOT lyme. 
label_key = {'Early Disseminated Lyme': 1, 'Heathy (Non-endemic) control': 0, 
             'Early Localized Lyme':1, 'Healthy Endemic Control':0, 
             'Early Lyme': 1, 'Heatlhy Controls - Endemic':0,
             'Healthy Non-endemic control':0, 'Mononucleosis':0, 
             'Fibromyalgia':0, 'Lyme C6+ Baseline':1, 'Lyme C6+ Baseline /Atyp':1, 'Severe Periodontitis':0, 'Syphilis':0} 
features, labels, samples, metabolites = read_metadata(f, label_col, label_key)
feature_names = list(features.index)
keep_files = []
seen = []
toss = []
for ele in feature_names:
    ind = ele.find('Run')
    part_file = ele[:ind]
    if part_file not in seen:
        seen.append(part_file)
        keep_files.append(ele)
    else:
        toss.append(ele)
features = features.loc[keep_files]
label = labels.loc[keep_files]
label = label.dropna()

features = features.loc[list(label.index)]
data.append({'study': study[:-5],
             'disease': disease,
             'data_set': f[:-4],
             'features': features,
             'labels': label,
             'peaks': metabolites,
             'samples': samples.loc[keep_files]}) 
pickle.dump(data, open('%s.pkl'%study, 'wb'))

In [74]:
study = 'ST000918_data'
disease = 'Breast cancer'
os.chdir(os.path.join(root,study))
data = []
f = 'AN001503.txt'

label_col = 'Disease Status'
label_key = {'Breast cancer patient': 1, 'Control patient': 0} 
features, labels, samples, metabolites = read_metadata(f, label_col, label_key)
feature_names = list(features.index)
label = labels.loc[feature_names]
data.append({'study': study[:-5],
             'disease': disease,
             'data_set': f[:-4],
             'features': features,
             'labels': label,
             'peaks': metabolites,
             'samples': samples.loc[feature_names]}) 
pickle.dump(data, open('%s.pkl'%study, 'wb'))

In [75]:
# dont think there are any duplicates 
# This is just the targeted oxylipin assay. 
study = 'MTBLS253_data' 
disease = 'chronic hepatitis B'

os.chdir(os.path.join(root,study))
data = []
df_lc = pd.read_csv('a_oxylipin_mass_spectrometry.txt',sep='\t')
df_s = pd.read_csv('s_mtbls253.txt', sep='\t')
df_lc = df_s.merge(df_lc, on='Sample Name').set_index('Sample Name')

labels = df_lc['Factor Value[control]']
lab_to_int = {True:0, False:1}
labels = labels.replace(lab_to_int)

#get author data:
features = pd.read_csv('m_oxylipin_analyses_of_chronic_hepatitis_b_metabolite_profiling_mass_spectrometry_v2_maf.tsv', sep='\t')
file_names = list(features.iloc[:,10:])
feat = features.iloc[:,10:].T.astype('float')
new_files = []
for fi in file_names:
    fi = int(fi[8:12])
    new_files.append(fi)
df_l = labels.loc[new_files]
data.append({'study':study[:-5],
             'disease': disease,
             'data_set': 'm_oxylipin_chronic_hep_b',
             'features': feat,
             'labels': df_l,
             'peaks': features,
             'samples': df_lc})
pickle.dump(data, open('%s.pkl'%study, 'wb'))

In [76]:
# 12-7-18: order is same between data sets - can combine
study = 'MTBLS279_data' 
disease = 'chronic hepatitis B'

os.chdir(os.path.join(root,study))
data = []

df_p = pd.read_csv('a_POS_lipid_analyses_of_chronic_hepatitis_b_mass_spectrometry.txt',sep='\t')
df_n = pd.read_csv('a_NEG_lipid_analyses_of_chronic_hepatitis_b_mass_spectrometry.txt',sep='\t')
df_s = pd.read_csv('s_mtbls279.txt', sep='\t')
df_p = df_s.merge(df_p, on='Sample Name').set_index('Sample Name')
df_n = df_s.merge(df_n, on='Sample Name').set_index('Sample Name')

labels_p = df_p['Factor Value[control]']
labels_n = df_n['Factor Value[control]']
lab_to_int = {True:0, False:1}
labels_p = labels_p.replace(lab_to_int)
labels_n = labels_n.replace(lab_to_int)

#get author data:
files = ['m_POS_lipid_analyses_of_chronic_hepatitis_b_mass_spectrometry_v2_maf.tsv',
         'm_NEG_lipid_analyses_of_chronic_hepatitis_b_mass_spectrometry_v2_maf.tsv']
for f in files:
    if 'POS' in f:
        labels = labels_p
        df = df_p
    else:
        labels = labels_n
        df = df_n
    features = pd.read_csv(f, sep='\t')
    file_names = list(features.iloc[:,21:])
    feat = features.iloc[:,21:].T.astype('float')
    file_names = [int(fi) for fi in file_names]
    df_l = labels.loc[file_names]
    data.append({'study':study[:-5],
                 'disease': disease,
                 'data_set': 'm_chronic_hep_b_'+f[2:5],
                 'features': feat,
                 'labels': df_l,
                 'peaks': features,
                 'samples': df})
pickle.dump(data, open('%s.pkl'%study, 'wb'))
# NOTE: since 280 is also jst targeted but for amines (i think) I wont be analyzing since thats not the point of this work
# we care more about untargeted studies

In [77]:
# this study would be super useful for looking at confounders since their final models had many!
# 12-7-18: order is same between data sets - can combine
study = 'MTBLS358_data'
disease = 'COPD'
os.chdir(os.path.join(root,study))
data = []

df_cer = pd.read_csv('a_CER_mass_spectrometry.txt', sep='\t')
df_eico = pd.read_csv('a_EICO_mass_spectrometry.txt', sep='\t')
df_tag = pd.read_csv('a_TAG_mass_spectrometry.txt', sep='\t')
df_shot = pd.read_csv('a_SHOT_mass_spectrometry.txt', sep='\t')
# df_shot_n = pd.read_csv('a_SHOT_mass_spectrometry-neg.txt', sep='\t')
# df_shot_p = pd.read_csv('a_SHOT_mass_spectrometry-pos.txt', sep='\t')

df_s = pd.read_csv('s_Study.txt', sep='\t')

df_cer = df_s.merge(df_cer, on='Sample Name').set_index('Sample Name')
df_eico = df_s.merge(df_eico, on='Sample Name').set_index('Sample Name')
df_tag = df_s.merge(df_tag, on='Sample Name').set_index('Sample Name')
df_shot = df_s.merge(df_shot, on='Sample Name').set_index('Sample Name')
# df_shot_n = df_s.merge(df_shot_n, on='Sample Name').set_index('Sample Name')
# df_shot_p = df_s.merge(df_shot_p, on='Sample Name').set_index('Sample Name')

dfs = [df_cer, df_eico, df_shot, df_tag]
files = ['m_CER_mass_spectrometry_v4.maf', 'm_EICO_mass_spectrometry_v4.maf',
         'm_SHOT_mass_spectrometry_v4.maf', 'm_TAG_mass_spectrometry_v4.maf']
# Factor Value[Study Group] is the label
to_replace = {'COPD':1, 'FS':2, 'CS':3, 'NS':0} 
# ns: never smoke, fs: former smoker, cs: current smoker, also the COPD are all smokers
for f, df in zip(files,dfs):
    features = pd.read_csv(f, sep='\t')
    file_names = list(features.iloc[:,21:])
    feat = features.iloc[:,21:].T.astype('float')
    labels = df['Factor Value[Study Group]']
    df_l = labels.loc[file_names]
    df_l = df_l[df_l.notnull()]
    df_l = df_l.replace(to_replace)
    new_files = list(df_l.index)
    feat = feat.loc[new_files]
    if multi_to_single:
        labels_new, ds_names = reduce_multi(df_l, f[:-4], to_replace)
        for l, n in zip(labels_new, ds_names):
            data.append({'study': study[:-5],
                         'disease': disease,
                         'data_set': n,
                         'features': feat,
                         'labels': l,
                         'peaks': features[['mass_to_charge','retention_time']],
                         'samples': df}) 
    else:
        data.append({'study':study[:-5],
                     'disease': disease,
                     'data_set': f[:-4],
                     'features': feat,
                     'labels': df_l,
                     'peaks': features[['mass_to_charge','retention_time']],
                     'samples': df})   
pickle.dump(data, open('%s.pkl'%study, 'wb'))

In [78]:
study = 'MTBLS579_data'
disease = 'typhoid carriage'
os.chdir(os.path.join(root,study))
data = []

df = pd.read_csv('a_typhoid_carriage_metabolite_profiling_mass_spectrometry.txt', sep='\t')
df_s = pd.read_csv('s_Typhoid carriage.txt', sep='\t')
df = df_s.merge(df, on='Sample Name').set_index('Sample Name')
to_replace = {'S. Paratyphi A carriage':1, 'S. Typhi carriage':1, 'Non-carriage control':0, 'Quality control':np.nan}
labels = df['Factor Value[Carriage status]'].replace(to_replace)
labels = labels[labels.notnull()]
file_names = list(labels.index)

file_name = 'm_typhoid_carriage_metabolite_profiling_mass_spectrometry_v2_maf.tsv'
features = pd.read_csv(file_name, sep='\t')
feat = features.iloc[:,21:].T.astype('float')
feat = feat.loc[file_names]
data.append({'study':study[:-5],
             'disease': disease,
             'data_set': file_name[:-4],
             'features': feat,
             'labels': labels,
             'peaks': features[['mass_to_charge','retention_time']],
             'samples': df})  
pickle.dump(data, open('%s.pkl'%study, 'wb'))

Studies with raw data but I have not fully finished XCMS runs on... or are from XCMSonline

In [79]:
# this was an xcmsonline study
# looks like 97 people from the authors
# looks like the labels match to the samples
# NOTE: for the reprocessed data I did two reprocessings with different parameters
study = 'ST000063_data'
disease = 'Depression'
os.chdir(os.path.join(root,study))
data = []
f = 'AN000101.txt'

label_col = 'Source'
label_key = {'Group 1 - Score 0': 0, 'Group 2 - Score 50': 1} 
features, labels, samples, metabolites = read_metadata(f, label_col, label_key)
feat_files = list(features.index)
labels_sub = labels.loc[feat_files]
data.append({'study': study[:-5],
             'disease': disease,
             'data_set': f[:-4],
             'features': features,
             'labels': labels_sub,
             'peaks': metabolites,
             'samples': samples})   

files = ['XCMS-Report-annotated-SingleClass-GCTOF.xlsx'] #'XCMS-Report-annotated-SingleClass.xlsx',
for f in files:
    features = pd.read_excel(f) 
    file_names = list(features.iloc[:,10:-3])  
    feat = features.iloc[:,10:-3].T.loc[list(labels.index)]
    data.append({'study': study[:-5],
                 'disease': disease,
                 'data_set': f[:-4],
                 'features': feat,
                 'labels': labels,
                 'peaks': features[['mzmed', 'mzmin', 'mzmax', 'rtmed', 'rtmin', 'rtmax', 'npeaks', 'isotopes', 'adduct', 'pcgroup']],
                 'samples': samples})     
pickle.dump(data, open('%s.pkl'%study, 'wb'))

In [35]:
# in the author data: dont think there are duplicates (< the number authors stated)
# got the labels to match to the results
# 12-7-18: for the author data the positive and negative can now be combined! 
study = 'MTBLS352_data'
disease = "Diabetes - healthy v. T2 v. prediabetic"
os.chdir(os.path.join(root,study))

data = []
df_s = pd.read_csv('s_MTBLS352.txt', sep='\t').set_index('Sample Name')

lab_to_int = {'T2D patients':1, 'normal glucose tolerant':0, 'prediabetes':2}
lab_to_int_short = {'T2D':1, 'NGT':0, 'Pre-DM':2}
labels = df_s['Factor Value[Group]'].replace(lab_to_int)
labels = labels[labels.notnull()]

files = ['DEMO_neg-norm-metaboAnalystInput.csv', 'DEMO_pos-norm-metaboAnalystInput.csv']
for f in files:
    features = pd.read_csv(f)
    feat = features.iloc[:,1:].T.astype('float')
    peaks = features.iloc[:,0]
    file_names = list(feat.index)
    label = [p.split('_')[0] for p in file_names]
    names = list(feat.index)
    df_n = pd.DataFrame(names, columns=['Sample Names'])
    df_l = pd.DataFrame(label, columns=['label']).replace(lab_to_int_short)
    df_l = pd.concat([df_n, df_l], axis=1)
    df_l = df_l.set_index('Sample Names')
    df_l = df_l[df_l.label != 'QC']
    keep_files = list(df_l.index)
    feat = feat.loc[keep_files]
    if multi_to_single:
        labels_new, ds_names = reduce_multi(df_l, f[:-4], lab_to_int_short)
        for l, n in zip(labels_new, ds_names):
            data.append({'study': study[:-5],
                         'disease': disease,
                         'data_set': n,
                         'features': feat,
                         'labels': l,
                         'peaks': peaks,
                         'samples': df_s}) 
    else:
        data.append({'study': study[:-5],
                     'disease': disease,
                     'data_set': f[:-4],
                     'features': feat,
                     'labels': df_l,
                     'peaks': peaks,
                     'samples': df_s})      
labels_2 = list(data[3]['features'].index)
labels_1 = list(data[0]['features'].index)
shared = [value for value in labels_1 if value in labels_2]
for i in range(len(data)):
    data[i]['features'] = data[i]['features'].loc[shared]
    data[i]['labels'] = data[i]['labels'].loc[shared]
    print(data[i]['features'].shape, data[i]['labels'].shape)
# get my data: 
dirs = ['neg'] #pos once done...
for folder in dirs:
    data_file = 'IPO_aligned_MTBLS352_' + folder + '.csv'
    open_file = os.path.join(folder,data_file)
    features = pd.read_csv(open_file) 
    file_names = list(features.iloc[:,9:-3])    
    feat = features.iloc[:,9:-3].T.astype('float')
    file_names = [fi[:-9].replace('.','-') for fi in file_names]
    df_l = labels.loc[file_names]
    if multi_to_single:
        labels_new, ds_names = reduce_multi(df_l, data_file[:-4], lab_to_int_short)
        for l, n in zip(labels_new, ds_names):
            data.append({'study': study[:-5],
                         'disease': disease,
                         'data_set': n,
                         'features': feat,
                         'labels': l,
                         'peaks': features[['mz', 'rt', 'X1','isotopes', 'adduct', 'pcgroup']],
                         'samples': df_s}) 
    else:
        data.append({'study': study[:-5],
                     'disease': disease,
                     'data_set': data_file[:-4],
                     'features': feat,
                     'labels': df_l,
                     'peaks': features[['mz', 'rt', 'X1','isotopes', 'adduct', 'pcgroup']],
                     'samples': df_s})
pickle.dump(data, open('%s.pkl'%study, 'wb'))

((230, 100), (230, 1))
((230, 100), (230, 1))
((230, 100), (230, 1))
((230, 100), (230, 1))
((230, 100), (230, 1))
((230, 100), (230, 1))


In [81]:
# note two clear batches here! 
# correct number of samples - no duplicates - 90 people
# 12-7-18: looks like the order of the samples matches between pos and neg
study = 'MTBLS408_data'
disease = "psoriasis"
os.chdir(os.path.join(root,study))
data = []

df_s = pd.read_csv('s_psoriasis.txt', sep='\t').set_index('Sample Name')
df_l = df_s['Source Name']
to_replace = {'healthy':0, 'disease':1}
# Authors give no data...

#ok get my data:
dirs = ['neg', 'pos']
for folder in dirs:
    data_file = 'IPO_aligned_MTBLS408_' + folder + '.csv'
    open_file = os.path.join(folder,data_file)
    features = pd.read_csv(open_file) 
    file_names = list(features.iloc[:,9:-3])    
    feat = features.iloc[:,9:-3].T.astype('float')
    file_names = [fi[:-5] for fi in file_names]
    labels = df_l.loc[file_names]
    labels = labels.replace(to_replace)
    data.append({'study': study[:-5],
                 'disease': disease,
                 'data_set': data_file[:-4],
                 'features': feat,
                 'labels': labels,
                 'peaks': features[['mz', 'rt', 'X1','isotopes', 'adduct', 'pcgroup']],
                 'samples': df_s.loc[file_names]})
pickle.dump(data, open('%s.pkl'%study, 'wb'))    

In [82]:
# looks like triplicates at each time point. 
study = 'snyder_data'
disease = "common cold - longitudinal"
os.chdir(os.path.join(root,study))
data = []
dirs = ['B1_neg', 'B2_neg', 'neg_onebatch', 'pos_onebatch'] #'B1_pos', 'B2_pos'
for folder in dirs:
    data_file = 'IPO_aligned_snyderome_' + folder.lower() + '.csv'
    open_file = os.path.join(folder,data_file)
    features = pd.read_csv(open_file)   
    feat = features.iloc[:,9:-3].T.astype('float')
    file_names = list(features.iloc[:,9:-3])   
    file_names = [fi for fi in file_names if '01' in fi] # forcing analysis on just the first sample
    feat = feat.loc[file_names]
    try:
        peaks = features[['mz', 'rt', 'X1','isotopes', 'adduct', 'pcgroup']]
    except:
        peaks = features[['mz', 'rt']]
    data.append({'study': study[:-5],
                 'disease': disease,
                 'data_set': data_file[:-4],
                 'features': feat,
                 'labels': 'none',
                 'peaks': peaks,
                 'samples': 'none'})

pickle.dump(data, open('%s.pkl'%study, 'wb'))  

In [16]:
# this study is part xcmsonline and part my processing
# random replicates and names 
# think I finally got names and labels to match
# 12-7-18: seems like everything matches in term sof order across datasets...probably can combine!
study = 'ST000046_data'
disease = "Alzheimer's"
os.chdir(os.path.join(root,study))
files = ['AN000076.txt', 'AN000077.txt', 'AN000078.txt', 'AN000079.txt']
data = []
label_col = 'Cognitive Status'
label_key = {'CN': 0, 'MCI': 2, 'AD':1}
for f in files:
    features, labels, samples, metabolites = read_metadata(f, label_col, label_key)
    print(metabolites)
    if multi_to_single:
        labels_new, ds_names = reduce_multi(labels, f[:-4], label_key)
        for l, n in zip(labels_new, ds_names):
            data.append({'study': study[:-5],
                         'disease': disease,
                         'data_set': n,
                         'features': features,
                         'labels': l,
                         'peaks': metabolites,
                         'samples': samples}) 
    else:
        data.append({'study': study[:-5],
                     'disease': disease,
                     'data_set': f[:-4],
                     'features': features,
                     'labels': labels,
                     'peaks': metabolites,
                     'samples': samples}) 
    
dirs = ['20120606_data', '20120613_data', 
        '20120618_data', '20120620_data', '20120625_data']
chroms = ['neg_hilic', 'neg_hilic', 'pos_c18', 'neg_c18', 'pos_c18']
maps = ['06jun12.csv', '13jun12.csv', '18jun12.csv', '20jun12.csv', '25jun12.csv']

lab_index = list(labels.index)
ind_rename = {f:f[7:] for f in lab_index}
labels = labels.rename(index=ind_rename)

for folder, chrom, name_to_pat_map in zip(dirs, chroms, maps):
    data_file = 'IPO_aligned_ST000046_' + folder[:-5] + '_' + chrom + '.csv'
    open_file = os.path.join(folder,data_file)
    features = pd.read_csv(open_file) 
    file_names = list(features.iloc[:,8:-3])
    m = pd.read_csv(name_to_pat_map).set_index('filename')
    m_ind = list(m.index)
    m_ind = {name:name[8:10] for name in m_ind}
    m = m.rename(m_ind)
    m = m.drop_duplicates() # still has a single NaN from a QC or blank due to how dropping works 
    # map file numbers to patient numbers
    file_numb = []
    for fi in file_names:
        file_numb.append(fi[9:11])
    pat_num = m['Patient #'].loc[file_numb].values.astype('int').astype('string')
    pat_num = [str(int(n)) for n in pat_num]
    label = labels.loc[pat_num]
    feat = features.iloc[:,8:-3].T.astype('float')
#     print(list(feat.index), label.index) # NEEDD TO REMOVED DUPLICATES

    # get the mask for to dereplicate the data....
    seen = []
    t_f_mask = []
    for ele in file_names:
        if ele[:11] not in seen:
            seen.append(ele[:11])
            t_f_mask.append(True)
        else:
            t_f_mask.append(False)
    feat = feat[t_f_mask]
    label = label[t_f_mask]
    feat = feat[label.notnull().values]
    label = label[label.notnull().values]
    if multi_to_single:
        labels_new, ds_names = reduce_multi(label, data_file[:-4], label_key)
        for l, n in zip(labels_new, ds_names):
            data.append({'study': study[:-5],
                         'disease': disease,
                         'data_set': n,
                         'features': feat,
                         'labels': l,
                         'peaks': features[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'npeaks', 'isotopes', 'adduct', 'pcgroup']],
                         'samples': m}) 
    else:
        data.append({'study': study[:-5],
                     'disease': disease,
                     'data_set': data_file[:-4],
                     'features': feat,
                     'labels': label,
                     'peaks': features[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'npeaks', 'isotopes', 'adduct', 'pcgroup']],
                     'samples': m}) 
    
# There are the XCMSonline processed files
dirs = ['20120604_data', '20120611_data', '20120627_data']
maps = ['04jun12.csv', '11jun12.csv', '27jun12.csv']
for folder, name_to_pat_map in zip(dirs, maps):
    data_file = 'XCMS-Report-annotated-SingleClass.xlsx'
    open_file = os.path.join(folder,data_file)
    features = pd.read_excel(open_file) 
    file_names = list(features.iloc[:,10:-3])
    m = pd.read_csv(name_to_pat_map).set_index('filename')
    m_ind = list(m.index)
    m_ind_short = [name[8:10] for name in m_ind]
    m_ind_d = {name:name[8:10] for name in m_ind}
    t_f_mask = []
    seen = []
    for ind in list(m_ind_short):
        if ind not in seen:
            seen.append(ind)
            t_f_mask.append(True)
        else:
            t_f_mask.append(False)
    m = m.rename(m_ind_d)
    m = m[t_f_mask]
    
    # map file numbers to patient numbers
    file_numb = []
    for fi in file_names:
        file_numb.append(fi[8:10])
    pat_num = m['Patient #'].loc[file_numb].values.astype('int').astype('string')
    pat_num = [str(int(n)) for n in pat_num]
    label = labels.loc[pat_num]
    feat = features.iloc[:,10:-3].T.astype('float')
#     # get the mask for to dereplicate the data....
    seen = []
    t_f_mask = []
    file_names = list(label.index)
    for ele in file_names:
        if ele not in seen:
#         if ele[:10] not in seen:
#             seen.append(ele[:10])
            seen.append(ele)
            t_f_mask.append(True)
        else:
            t_f_mask.append(False)
    feat = feat[t_f_mask]
    label = label[t_f_mask]
    feat = feat[label.notnull().values]
    label = label[label.notnull().values]
    if multi_to_single:
        labels_new, ds_names = reduce_multi(label, data_file[:-4]+name_to_pat_map[:-4], label_key)
        for l, n in zip(labels_new, ds_names):
            data.append({'study': study[:-5],
                         'disease': disease,
                         'data_set': n,
                         'features': feat,
                         'labels': l,
                         'peaks': features[['mzmed', 'mzmin', 'mzmax', 'rtmed', 'rtmin', 'rtmax', 'npeaks', 'isotopes', 'adduct', 'pcgroup']],
                         'samples': m}) 
    else:
        data.append({'study': study[:-5],
                     'disease': disease,
                     'data_set': data_file[:-4]+name_to_pat_map[:-4],
                     'features': feat,
                     'labels': label,
                     'peaks': features[['mzmed', 'mzmin', 'mzmax', 'rtmed', 'rtmin', 'rtmax', 'npeaks', 'isotopes', 'adduct', 'pcgroup']],
                     'samples': m}) 
pickle.dump(data, open('%s.pkl'%study, 'wb'))  

                                                 Samples
0                          10_12_15-octadecatrienoicacid
1      (10E)-19-fluorovitaminD3/(10E)-19-fluorocholec...
2             10-hydroxy-2E_8E-Decadiene-4_6-diynoicacid
3      10-hydroxy-2E_8E-Decadiene-4_6-diynoicacid+1.2...
4                      1-(11E-octadecenoyl)-rac-glycerol
5                                          11-deoxy-PGE2
6                                12-amino-dodecanoicacid
7           12beta-Hydroxy-3-oxo-5beta-cholan-24-oicAcid
8      12beta-Hydroxy-3-oxo-5beta-cholan-24-oicAcid+1...
9      12beta-Hydroxy-3-oxo-5beta-cholan-24-oicAcid+1...
10                         1_2-Didecanoyl-glycerol(10:0)
11                                12-Ketodeoxycholicacid
12                             12-oxo-9-octadecynoicacid
13                             1-(2-Pyrimidyl)piperazine
14                   1-(2-Pyrimidyl)piperazine+5.2760143
15                           12Z_15Z-octadecadienoicacid
16             1_3-DIPROPYL-8-C

STUDIES WITH RAW DATA THAT I PROCESSED!


In [84]:
# no technical replicates. 
# label order appears to match the feature order
# NO CLUE HOW TO MAP FILE NAMES FROM PLASMA TO URINE - CANNOT COMBINE
study = 'Feng_data'
disease = 'coronary heart disease'
os.chdir(os.path.join(root,study))
files = ['srep22525-s2.xls', 'srep22525-s4.xls']
analysis_type = ['plasma', 'urine']
data = []
label_dict = {}
for f, t in zip(files,analysis_type):
    df = pd.read_excel(f, skiprows=1) # NOTE THIS IS JUST THE PLASMA DATA
    if t == 'plasma':
        key = 'CD'
    else:
        key = 'ZSL'
    labels = pd.DataFrame(df.columns[1:].str.contains(key), index=df.iloc[:,1:].T.index)
    samples = pd.DataFrame(index=df.iloc[:,1:].T.index)
    label_dict[t] = [labels, samples]
    data.append({'study': study[:-5],
                 'disease': disease,
                 'data_set': t+'all_author',
                 'features': df.iloc[:,1:].T.astype('float'),
                 'labels': labels,
                 'peaks': df.iloc[:,[0,]],
                 'samples': samples})

dirs = ['serum', 'urine']
for folder in dirs:
    if 'serum' in folder:
        labels, samples = label_dict['plasma']
    else:
        labels, samples = label_dict['urine']
    for f in os.listdir(folder):
        if f[-3:] == 'csv':
            analysis = folder +'_'+f[:-4]
            open_file = os.path.join(folder,f)
            features = pd.read_csv(open_file) 
            file_names = list(features.iloc[:,9:-3])  
            file_names = [f[:-6]+f[-8:-6] for f in file_names]
            label = labels.loc[file_names]
            data.append({'study': study[:-5],
                         'disease': disease,
                         'data_set': analysis,
                         'features': features.iloc[:,9:-3].T.astype('float'),
                         'labels': label,
                         'peaks': features[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'X1', 'isotopes', 'adduct', 'pcgroup']],
                         'samples': samples.loc[file_names]})      #using the last samples df as well
            
dirs = ['serum_onebatch', 'urine_onebatch']   
for folder in dirs:
    if 'serum' in folder:
        labels, samples = label_dict['plasma']
    else:
        labels, samples = label_dict['urine']
    for f in os.listdir(folder):
        if f[-3:] == 'csv':
            analysis = folder +'_'+f[:-4]
            open_file = os.path.join(folder,f)
            features = pd.read_csv(open_file) 
            file_names = list(features.iloc[:,9:-3])  
            file_names = [f[:-6]+f[-8:-6] for f in file_names]
            file_names = [fi for fi in file_names if 'QC' not in fi]
            label = labels.loc[file_names]
            file_names_2 = [f[:-2]+'.mzXML' for f in file_names]
            feat = features.iloc[:,9:-3].T.astype('float')
            feat = feat.loc[file_names_2]
            data.append({'study': study[:-5],
                         'disease': disease,
                         'data_set': analysis,
                         'features': feat,
                         'labels': label,
                         'peaks': features[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'X1', 'isotopes', 'adduct', 'pcgroup']],
                         'samples': samples.loc[file_names]})  

pickle.dump(data, open('FengEtAl.pkl', 'wb'))

In [21]:
# also some people with same subject name have different sex....
# LOOK AT PAPER BEFORE DEALING ANYMORE WITH THIS
# looks like the label order matches the sample order in the features 
# 12-7-18: looks like the order is good and you can combine features! 
study = 'ST000763_data'
disease = 'scleroderma PAH'
os.chdir(os.path.join(root,study))
data = []
files = ['AN001201.txt', 'AN001202.txt'] #, 'AN001203.txt', 'AN001204.txt']
# this is pos-Qtof, neg-Qtof, pos-3tof, neg-3tof
# 1201 and 1202 are the untargeted samples, 1203 and 1204 are the msms and match to the pos and neg folder
label_col = 'Group'
label_key = {'Healthy': 0, 'PAH': 4,'Normal Pressures':2, 'Borderline Pressures': 3, 'LowRisk':1} 
# 1) healthy controls, 2) patients with scleroderma at low risk for pulmonary hypertension, 
# 3) pateints with scleroderma at high risk for pulmonary hypertension who underwent a catheterization 
# and were found to have normal pressures, borderline elevated pressures or pulmonary arterial hypertensino (PAH).
for f in files:
    features, labels, samples, metabolites = read_metadata(f, label_col, label_key)
    if multi_to_single:
        labels_new, ds_names = reduce_multi(labels, f[:-4], label_key)
        for l, n in zip(labels_new, ds_names):
            data.append({'study': study[:-5],
                         'disease': disease,
                         'data_set': n,
                         'features': features,
                         'labels': l,
                         'peaks': metabolites,
                         'samples': samples}) 
    else:
        data.append({'study': study[:-5],
                     'disease': disease,
                     'data_set': f[:-4],
                     'features': features,
                     'labels': labels,
                     'peaks': metabolites,
                     'samples': samples})   
# get my data:
dirs = ['untargeted']
# dirs = ['neg', 'pos', 'untargeted']
for folder in dirs:
    for fi in os.listdir(folder):
        if fi[-3:] == 'csv':
            open_file = os.path.join(folder,fi)
            features = pd.read_csv(open_file) 
            file_names = list(features.iloc[:,9:-3])  
            if folder == 'untargeted':
                new_ind = {f:f[-16:-7] for f in file_names}
                file_names = [f[-16:-7] for f in file_names]
            else:
                new_ind = {f:f[31:-5] for f in file_names}
                file_names = [f[31:-5] for f in file_names]
            # this next line only works because all the above files make 'labels' dfs that are the same, so just using the last:
            label = labels.loc[file_names]
            features_filt = features.iloc[:,9:-3].T.astype('float')
            features_filt = features_filt.rename(index=new_ind)
            if multi_to_single:
                labels_new, ds_names = reduce_multi(label, fi[:-4], label_key)
                for l, n in zip(labels_new, ds_names):
                    data.append({'study': study[:-5],
                                 'disease': disease,
                                 'data_set': n,
                                 'features': features_filt,
                                 'labels': l,
                                 'peaks': features[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'X1', 'isotopes', 'adduct', 'pcgroup']],
                                 'samples': samples.loc[file_names]}) 
            else:
                data.append({'study': study[:-5],
                             'disease': disease,
                             'data_set': fi[:-4],
                             'features': features_filt,
                             'labels': label,
                             'peaks': features[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'X1', 'isotopes', 'adduct', 'pcgroup']],
                             'samples': samples.loc[file_names]})      #using the last samples df as well
pickle.dump(data, open('%s.pkl'%study, 'wb'))

((218, 138), (138, 10))
((218, 133), (133, 10))


In [86]:
# AN file says 24 subject...at multiple timepoints...total 89 samples - all found, no duplicates
# label order appears to match order in features 
# 12-7-18: looks like features can be combined!
study = 'ST000726_data'
disease = 'Polycystic Ovarian Syndrome'
os.chdir(os.path.join(root,study))
data = []
files = ['AN001138.txt', 'AN001139.txt']
modes = ['Negative', 'Positive']
label_col = 'DISEASE_STATE'
#NOTE GOOD glycemic control in second half and poor in first...because of this I made both '1'
label_key = {'PCOS': 1, 'Control': 0} 
sample_dir = {}
label_dir = {}
for f,mode in zip(files,modes):
    features, labels, samples, metabolites = read_metadata(f, label_col, label_key)
    sample_dir[mode[:3].lower()] = samples 
    label_dir[mode[:3].lower()] = labels
    data.append({'study': study[:-5],
                 'disease': disease,
                 'data_set': f[:-4]+'-'+mode,
                 'features': features,
                 'labels': labels,
                 'peaks': metabolites,
                 'samples': samples})
#get my IPO based features
dirs = ['neg', 'pos']
for folder in dirs:
    data_file = 'IPO_aligned_ST000726_' + folder + '.csv'
    open_file = os.path.join(folder,data_file)
    features = pd.read_csv(open_file) 
    file_names = list(features.iloc[:,9:-3])  
    new_ind = {f:f[30:-7] for f in file_names}
    file_names = [f[30:-7] for f in file_names]
    file_names = [f for f in file_names if 'S' == f[0]]
    labels = label_dir[folder].loc[file_names]
    features_filt = features.iloc[:,9:-3].T.astype('float')
    features_filt = features_filt.rename(index=new_ind)
    features_filt = features_filt.loc[file_names]
    data.append({'study': study[:-5],
                 'disease': disease,
                 'data_set': data_file[:-4],
                 'features': features_filt,
                 'labels': labels,
                 'peaks': features[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'X1', 'isotopes', 'adduct', 'pcgroup']],
                 'samples': sample_dir[folder]}) 
pickle.dump(data, open('%s.pkl'%study, 'wb'))

In [87]:
# THIS IS CODE FROM MM: just showing the filtering is not actualyl removing replicates 
# study = 'ST000578_data'
# disease = 'malaria'
# os.chdir(os.path.join(root,study))
# data = []

# label_col = 'Current Malaria Infection'
# label_key = {'P.Vivax': 1,
#              'None': 0}

# for fn in glob.iglob('AN*.txt'):
#     features, labels, samples, metabolites = read_metadata(fn, label_col, label_key)
    
#     # both the controls and the QCs have disease status 'None'
#     labels[samples['SUBJECT(optional)'].isnull()] = np.nan
#     labels[samples['SUBJECT(optional)']=='REe6'] = np.nan
#     data.append({'study': study,
#                  'disease': disease,
#                  'analysis': os.path.splitext(fn)[0],
#                  'features': features,
#                  'labels': labels,
#                  'peaks': metabolites,
#                  'samples': samples})

# data = [filter_samples(x) for x in data]
# # pickle.dump(data, open('%s.pkl'%study, 'wb'))
# os.chdir('..')

In [88]:
#  data has triplicates
# 12-7-18: looks like you can probably combine the datasets!
study = 'ST000578_data'
disease = 'Malaria (P. vivax)'
os.chdir(os.path.join(root,study))
data = []
files = ['AN000888.txt', 'AN000889.txt']
label_col = 'Current Malaria Infection'
label_key = {'P.Vivax': 1, 'None': 0}
columns = ['C18', 'AE']
label_dir = {}
sample_dir = {}
for f, col in zip(files, columns):
    features, labels, samples, metabolites = read_metadata(f, label_col, label_key)
    labels.index = labels.index.astype('string')
    samples.index = samples.index.astype('string')
    labels = labels[samples['Gender'] != 'N/A']
    samples = samples[samples['Gender'] != 'N/A']
    file_names = list(labels.index)
    features = features.loc[file_names]
    label_dir[col] = labels
    sample_dir[col] = samples
    data.append({'study': study[:-5],
                 'disease': disease,
                 'data_set': f[:-4],
                 'features': features,
                 'labels': labels,
                 'peaks': metabolites,
                 'samples': samples})
    
metadata = pd.read_csv('metadata.csv').set_index('Instrument.File.Name')
metadata = metadata.dropna(subset=['Aliquot.Id'])
metadata['Aliquot.Id'] = metadata['Aliquot.Id'].astype('int').astype('string')
dirs = ['AE', 'C18']
for folder in dirs:
    #VT_140317_103.mzML map this to the 2008...s, 2008603 (this is index in label dir folder)
    # ok - map names from features to IDs in the matadata, take these IDs to get the labels from label_dir[folder].loc[names]
    data_file = 'IPO_aligned_ST000578_' + folder + '.csv'
    open_file = os.path.join(folder,data_file)
    features = pd.read_csv(open_file) 
    file_names = list(features.iloc[:,9:-3:3])  # using the slicing to drop replicates...could probably keep and drop later as well
    file_names = [f[:-5] for f in file_names]
    file_names = metadata['Aliquot.Id'].loc[file_names]
    file_names = file_names.drop_duplicates()
    file_names = file_names.dropna()
    mapped_file_names = list(file_names)
    labels = label_dir[folder].loc[mapped_file_names]
    labels = labels.dropna()
    files_for_feats = list(labels.index)
    new_files = []
    for fi in files_for_feats:
        new_files.append(list(file_names[file_names == fi].index)[0])
    
    files_for_feats = [f+'.mzML' for f in new_files]
    feat = features.iloc[:,9:-3:3].T.astype('float')
    feat = feat.loc[files_for_feats]
    data.append({'study': study[:-5],
                 'disease': disease,
                 'data_set': data_file[:-4],
                 'features': feat,
                 'labels': labels,
                 'peaks': features[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'X1', 'isotopes', 'adduct', 'pcgroup']],
                 'samples': sample_dir[folder]}) 
pickle.dump(data, open('%s.pkl'%study, 'wb'))

  interactivity=interactivity, compiler=compiler, result=result)


In [89]:
# CHECK MY PROCESSING - in the pc/nc datasets there are no replicates and 3 fewer samples - did the authors just not post them all?
#note this is the metadata from both 421 and 422 but there are not files from 422 for reprocessing (its a duplicate of 421)
# look like there are 2 replicates! remove.(labeled as *_r001 and *_r002)
# labels and features order appears to match
# 12-8-18: can combined features across reprocessed, poor or good (but not good with poor!!!)
study = 'ST000421_data'
disease = 'Diabetes - Type I'
os.chdir(os.path.join(root,study))
data = []
files = ['AN000663.txt', 'AN000664.txt', 'AN000665.txt', 'AN000666.txt',
         'AN000667.txt', 'AN000668.txt', 'AN000669.txt', 'AN000670.txt']
poor_v_good = ['Poor glycemic control','Poor glycemic control','Poor glycemic control','Poor glycemic control',
               'Good glycemic control','Good glycemic control','Good glycemic control','Good glycemic control']
label_col = 'treatment'
#NOTE GOOD glycemic control in second half and poor in first...because of this I made both '1'
label_key = {'ND': 0, 'T1D poor glycemic control': 1, 'T1D good glycemic control':1} 
all_labels = []
samples_to_labels = {'Poor':[], 'Good':[]}

for f, gly_con in zip(files,poor_v_good):
    features, labels, samples, metabolites = read_metadata(f, label_col, label_key)
    labels = labels.loc[list(features.index)] # note: in MMs analysis he did not correct for this...
    # remove duplicates:
    keep_labels = [f for f in list(labels.index) if 'r001' not in f] #tossing all the r001s (could do r002s but due to later code, but this is easier)
    min_samples = samples.loc[keep_labels]['SUBJECT(optional)'].reset_index().set_index('SUBJECT(optional)')
    if 'Poor' in gly_con:
        samples_to_labels['Poor'].append(min_samples)
    else:
        samples_to_labels['Good'].append(min_samples)
for gly_type in samples_to_labels:
    samples_to_labels[gly_type] = pd.concat(samples_to_labels[gly_type], axis=1).dropna()

good_count = 0
poor_count = 0
for f, gly_con in zip(files,poor_v_good):
    features, labels, samples, metabolites = read_metadata(f, label_col, label_key)
    if 'Poor' in gly_con:
        keep_labels = list(samples_to_labels['Poor'].iloc[:,poor_count].values)
        poor_count += 1
    else:
        keep_labels = list(samples_to_labels['Good'].iloc[:,good_count].values)
        good_count += 1
    labels = labels.loc[keep_labels]
    features = features.loc[keep_labels]
    all_labels.append(labels)
    data.append({'study': study[:-5],
                 'disease': disease,
                 'data_set': f[:-4]+'-'+gly_con,
                 'features': features,
                 'labels': labels,
                 'peaks': metabolites,
                 'samples': samples.loc[list(features.index)]})  
    
process_samples_to_labels = samples_to_labels['Good']
loc = 0
all_labels = pd.concat(all_labels)
dirs = ['p_hil_data', 'n_hil_data', 'pc18_data', 'nc18_data']
to_remove = ['28','62','80','81']
# the pc and nc folders have only 27 files while the same from the authors have 30...also they have no replicates...not sure what happened here
for folder in dirs:
    data_file = 'IPO_aligned_ST000421_' + folder[:-5] + '.csv'
    open_file = os.path.join(folder,data_file)
    features = pd.read_csv(open_file) 
    feat = features.iloc[:,8:-3].T.astype('float')
    file_names = list(feat.index)
    
    file_names_new = []
    if len(file_names) > 35: #since 2 of the files only have half data, this is a hacky way to do this conversion on only the ones with replicates
        file_names = [f for f in file_names if 'r001' not in f] # tossing r001 again as well
    for f in file_names:
        if '13feb12' in f or '15feb12' in f: # none of the r001 files are in the labels, so calling these as r002 and using that label
            file_names_new.append(f[1:-8].replace('.','-')+'2.d')
        else:
            file_names_new.append(f[1:-7].replace('.','-')+'.d')
    file_names = [f for f in file_names if f[9:11] not in to_remove]
    file_names_new = [f for f in file_names_new if f[8:10] not in to_remove]
    label = all_labels.loc[file_names_new]
    feat = feat.loc[file_names]
    data.append({'study': study[:-5],
                 'disease': disease,
                 'data_set': data_file[:-4],
                 'features': feat, # no X1 so lost a column
                 'labels': label,
                 'peaks': features[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'npeaks', 'isotopes', 'adduct', 'pcgroup']],
                 'samples': samples.loc[file_names_new]})   
pickle.dump(data, open('%s.pkl'%study, 'wb'))

In [90]:
# all good: no duplicates, labels look to be in the correct order
# This is a longitudinal study - not performing logistic regression on it for now...
study = 'ST000397_data'
disease = 'Single human time study'
os.chdir(os.path.join(root,study))
data = []
f = 'AN000634.txt'
# note this is a longitudinal study of one person
# the label_col and key are useless actually and just used to make MMs processing function work
label_col = 'Data'
label_key = {'1': 1}
features, labels, samples, metabolites = read_metadata(f, label_col, label_key)
labels = samples['Blood Draw Date']
data.append({'study': study[:-5],
             'disease': disease,
             'data_set': f[:-4],
             'features': features,
             'labels': labels,
             'peaks': metabolites,
             'samples': samples}) 

f = 'IPO_aligned_ST000397_pos.csv'
features = pd.read_csv(f)
file_names = list(features.iloc[:,9:-3])
file_names = [fi[1:-7] for fi in file_names]
# first get labels for all samples based on labels given in AN file by authors
label = labels.loc[file_names]
data.append({'study': study[:-5],
             'disease': disease,
             'data_set': f[:-4],
             'features': features.iloc[:,9:-3].T.astype('float'),
             'labels': label,
             'peaks': features[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'X1', 'isotopes', 'adduct', 'pcgroup']],
             'samples': samples}) 
pickle.dump(data, open('%s.pkl'%study, 'wb'))

In [91]:
# all good, looks like only 299 samples and none are duplicates and labels look to be in correct order
study = 'ST000396_data'
disease = 'lung cancer - non-small-cell lung cancer (adenocarcinoma, etc)'
os.chdir(os.path.join(root,study))
f = 'AN000633.txt'
data = []
label_col = 'Diagnosis'
label_key = {'-': 0, 'Adenocarcinoma': 1, 'Other NSCLC': 1, 'Squamous cell':1}
features, labels, samples, metabolites = read_metadata(f, label_col, label_key)
data.append({'study': study[:-5],
             'disease': disease,
             'data_set': f[:-4],
             'features': features,
             'labels': labels,
             'peaks': metabolites,
             'samples': samples}) 

f = 'IPO_aligned_ST000396.csv'
features = pd.read_csv(f)
file_names = list(features.iloc[:,9:-3])
file_names = [fi[1:-7] for fi in file_names]
# first get labels for all samples based on labels given in AN file by authors
label = labels.loc[file_names]
data.append({'study': study[:-5],
             'disease': disease,
             'data_set': f[:-4],
             'features': features.iloc[:,9:-3].T.astype('float'),
             'labels': label,
             'peaks': features[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'X1', 'isotopes', 'adduct', 'pcgroup']],
             'samples': samples}) 
pickle.dump(data, open('%s.pkl'%study, 'wb'))

In [12]:
#looks like no replicates! labels are in the correct order with the samples
# 12-7-18: looks like ordering is good so can mix multiple datasets!
study = 'ST000392_data'
disease = 'Lung cancer'
os.chdir(os.path.join(root,study))
data = []
f = 'AN000628.txt'
label_col = 'Disease State'
label_key = {'control': 0, 'cancer': 1, '-': 'NaN'}
features, labels, samples, metabolites = read_metadata(f, label_col, label_key)
labels = labels[labels['Disease State']!='NaN'] # remove QCs
file_names = list(labels.index)
features = features.loc[file_names] #filter the QCs out of the features
# now split the plasma and serum . first get file names 
plasma_names = list(samples[samples['Organ']=='Plasma '].index)
serum_names = list(samples[samples['Organ']=='Serum '].index)
# get labels for plasma and serum
labels_p = labels.loc[plasma_names]
labels_s = labels.loc[serum_names]
# split features:
features_p = features.loc[plasma_names]
features_s = features.loc[serum_names]
data.append({'study': study[:-5],
             'disease': disease,
             'data_set': f[:-4]+'_plasma',
             'features': features_p,
             'labels': labels_p,
             'peaks': metabolites,
             'samples': samples.loc[plasma_names]}) 
data.append({'study': study[:-5],
             'disease': disease,
             'data_set': f[:-4]+'_serum',
             'features': features_s,
             'labels': labels_s,
             'peaks': metabolites,
             'samples': samples.loc[serum_names]})

dirs = ['plasma_data', 'serum_data']
for folder in dirs:
    files = os.listdir(folder)
    for fi in files:
        if 'IPO_aligned' in fi:
            data_file = fi
        else:
            continue
        open_file = os.path.join(folder,data_file)
        features = pd.read_csv(open_file)   
        file_names = list(features.iloc[:,9:-3])
        file_names = [f[1:-7] for f in file_names]
        # first get labels for all samples based on labels given in AN file by authors
        label = labels.loc[file_names]
        data.append({'study': study[:-5],
                     'disease': disease,
                     'data_set': data_file[:-4],
                     'features': features.iloc[:,9:-3].T.astype('float'),
                     'labels': label,
                     'peaks': features[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'X1', 'isotopes', 'adduct', 'pcgroup']],
                     'samples': samples}) 
pickle.dump(data, open('%s.pkl'%study, 'wb'))

In [93]:
# looks like no replicates, fixed label order.
study = 'ST000389_data'
disease = 'Lung cancer'
os.chdir(os.path.join(root,study))
data = []
label_col = 'Group'
label_key = {'Benign': 0, 'Cancer': 1}
f = 'AN000625.txt'
features, labels, samples, metabolites = read_metadata(f, label_col, label_key)
labels = labels.loc[features.index]
data.append({'study': study[:-5],
             'disease': disease,
             'data_set': f[:-4]+'_GC',
             'features': features,
             'labels': labels,
             'peaks': metabolites,
             'samples': samples}) 
# get my features
dirs = ['gc']
for folder in dirs:
    data_file = 'IPO_aligned_ST000388_GC.csv'
    open_file = os.path.join(folder,data_file)
    features = pd.read_csv(open_file)
    # turns out this datalist has like 40ish more samples that lack labels, filtering these
    file_names = list(features.iloc[:,9:-3])
    file_names = [f[1:-7] for f in file_names]
    # first get labels for all samples based on labels given in AN file by authors:
    labels = labels.loc[file_names]
    # now find what doesnt have a label and remove these
    mask = labels.isnull()
    labels = labels[mask['Group']==False]
    # now get the feature data and do the same
    feat_data = features.iloc[:,9:-3].T.astype('float')
    feat_names = list(feat_data.index)
    rename_inx = {feat_name:feat_name[1:-7] for feat_name in feat_names} #need to make a old name:new name mapping dict
    feat_data = feat_data.rename(index=rename_inx) # renames indicies
    feat_data = feat_data[mask['Group']==False] # now mask based on samples with vs without labels
    data.append({'study': study[:-5],
                 'disease': disease,
                 'data_set': data_file[:-4],
                 'features': feat_data,
                 'labels': labels,
                 'peaks': features[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'X1', 'isotopes', 'adduct', 'pcgroup']],
                 'samples': samples})
pickle.dump(data, open('%s.pkl'%study, 'wb'))

In [94]:
#fixed data order, and size and no replicates
# author data 94 samples, I have 94 though...I think this is related to 388
study = 'ST000388_data'
disease = 'Lung cancer'
os.chdir(os.path.join(root,study))
data = []
label_col = 'Group'
label_key = {'Benign': 0, 'Cancer': 1}
f = 'AN000624.txt'
features, labels, samples, metabolites = read_metadata(f, label_col, label_key)
labels = labels.loc[features.index]
data.append({'study': study[:-5],
             'disease': disease,
             'data_set': f[:-4]+'_LC',
             'features': features,
             'labels': labels,
             'peaks': metabolites,
             'samples': samples}) 
# get my features
dirs = ['lc']
for folder in dirs:
    data_file = 'IPO_aligned_ST000388_LC.csv'
    open_file = os.path.join(folder,data_file)
    features = pd.read_csv(open_file)
    file_names = list(features.iloc[:,9:-3])
    file_names = [f.replace('_HILIC_Pos_', '_')[:-5] for f in file_names]
    labels = labels.loc[file_names]
    labels = labels.dropna()   
    
    feat_files = [f.replace('_','_HILIC_Pos_')+'.mzML' for f in list(labels.index)]
    feat = features.iloc[:,9:-3].T.astype('float')
    feat = feat.loc[feat_files]
    data.append({'study': study[:-5],
                 'disease': disease,
                 'data_set': data_file[:-4],
                 'features': feat,
                 'labels': labels,
                 'peaks': features[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'X1', 'isotopes', 'adduct', 'pcgroup']],
                 'samples': samples})
pickle.dump(data, open('%s.pkl'%study, 'wb'))


In [95]:
# order seems fine - CHECK for duplicates
# 92 samples
study = 'ST000381_data'
disease = 'interstitial cystitis/painful bladder syndrome'
os.chdir(os.path.join(root,study))
data = []
# ok the IPO_aligned matches with AN000015, AN16 and 17 are for the nonexistant LC data
# YOU CANNOT DL the LC data sadly
files = ['AN000615.txt', 'AN000616.txt', 'AN000617.txt'] 
label_col = ['Factor3', 'SubCondition', 'SubCondition']
label_key = {'n/a': 0, 'MODEST': 1, 'INTERMEDIATE':1, 'SEVERE':1}

for f, col in zip(files,label_col):
    features, labels, samples, metabolites = read_metadata(f, col, label_key, debug=False)
    print(len(list(features.index)))
    print(list(features.index))
    data.append({'study': study[:-5],
                 'disease': disease,
                 'data_set': f[:-4],
                 'features': features,
                 'labels': labels,
                 'peaks': metabolites,
                 'samples': samples})   
# ONLY FOR DATA WITHOUT LC SAMPLES FROM AUTHORS... it grabs metadata from the first
# element of data above which is the GC data
# THIS DATA MATCHES WITH AN000615 IN TERMS OF NUMBER OF SAMPLES
gc_labels = data[0]['labels']
f = 'IPO_aligned_ST000381_pos.csv'
features = pd.read_csv(f)
file_names = list(features.iloc[:,9:-3])
file_names = [fi[1:-7] for fi in file_names]
labels = gc_labels.loc[file_names]
data.append({'study': study[:-5],
             'disease': disease,
             'data_set': f[:-4],
             'features': features.iloc[:,9:-3].T.astype('float'),
             'labels': labels,
             'peaks': features[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'X1', 'isotopes', 'adduct', 'pcgroup']],
             'samples': data[0]['samples']})
pickle.dump(data, open('%s.pkl'%study, 'wb'))

92
['140813amssa25_1', '140814amssa37_1', '140814amssa14_1', '140814amssa35_1', '140813amssa35_1', '140813amssa13_1', '140813amssa42_1', '140814amssa24_1', '140813amssa38_1', '140813amssa47_1', '140813amssa03_1', '140814amssa50_1', '140813amssa19_1', '140814amssa07_1', '140813amssa11_1', '140813amssa49_1', '140814amssa39_1', '140814amssa23_1', '140813amssa16_1', '140814amssa27_1', '140813amssa10_1', '140814amssa11_1', '140813amssa01_1', '140814amssa49_1', '140813amssa46_1', '140814amssa19_1', '140813amssa40_1', '140814amssa08_1', '140813amssa05_1', '140813amssa37_1', '140814amssa05_1', '140814amssa21_1', '140814amssa10_1', '140813amssa07_1', '140813amssa09_1', '140814amssa22_1', '140813amssa29_1', '140814amssa16_1', '140814amssa12_1', '140814amssa09_1', '140813amssa02_1', '140813amssa18_1', '140814amssa31_1', '140813amssa39_1', '140814amssa38_1', '140814amssa01_1', '140813amssa23_1', '140813amssa15_1', '140814amssa02_1', '140813amssa31_1', '140814amssa15_1', '140813amssa41_1', '140813a

In [96]:
# order of labels appears to match order of samples also doesnt look like there are any duplicates
# 12-8-18: mapping datafiles accross the datasets!!!! 

study = 'ST000385_data'
disease = 'lung cancer - adenocarcinoma'
os.chdir(os.path.join(root,study))
data = []
# this is for ADC 1 and ADC 2 for both serum and plasma
files = ['AN000603.txt', 'AN000620.txt'] #files starting with 14... is ADC 1 is 620
# The AN files HAVE BOTH PLASMA AND SERUM IN THEM 602 ~ 620 
# NOTE I THINK AN000602 is off by one! as is 621....wtf using the two others 
label_col = ['Factor3', 'Health State']
plas_v_serum = ['Factor1', 'Organ']
label_key = {'Healthy': 0, 'Adenocarcinoma': 1,'Adenocarcinoma ': 1,  'NA':'NA', 'Adenosquamous':1, 'Adenocarcnoma':1}
df_m_adc1 = pd.read_csv('metadata_adc1.csv')
df_m_adc1.set_index('Sample name', inplace=True)
df_m_adc2 = pd.read_csv('metadata_adc2.csv')
df_m_adc2.set_index('Sample name', inplace=True)
final_file_names = []

for f, col, col2 in zip(files,label_col,plas_v_serum):
    features, labels, samples, metabolites = read_metadata(f, col, label_key, debug=False)
    #get labels to be in correct order and split into plasma and serum seperate
    plasma_files = list(samples.index[samples[col2] == 'Plasma ']) #note the extra space here
    serum_files = list(samples.index[samples[col2] == 'Serum '])
    serum_features = features.loc[serum_files]
    plasma_features = features.loc[plasma_files]
    labels_s = labels.loc[serum_files]
    labels_p = labels.loc[plasma_files]
    #get a mask for where the non-QC / pool files are - plasma first:
    na_t_f_samples_p = labels_p !='NA' 
    na_samples_p = labels_p[na_t_f_samples_p[col] == False].index
    labels_p = labels_p[na_t_f_samples_p[col] == True]
    plasma_features = plasma_features[na_t_f_samples_p[col]==True]
    p_files = list(plasma_features.index)
    # now for the serum samples
    na_t_f_samples_s = labels_s !='NA' 
    na_samples_s = labels_s[na_t_f_samples_s[col] == False].index
    labels_s = labels_s[na_t_f_samples_s[col] == True]
    serum_features = serum_features[na_t_f_samples_s[col]==True]
    s_files = list(serum_features.index)
    #### MAPPING SAMPLES ACROSS DATASETS:
    samples_s_serum = samples.loc[s_files]['SUBJECT(optional)'].values
    samples_patient_serum = [s[:-6] for s in samples_s_serum]
    samples_label_serum = list(samples.loc[s_files].index)
    samples_s_plasma = samples.loc[p_files]['SUBJECT(optional)'].values
    samples_patient_plasma = [s[:-7] for s in samples_s_plasma]
    samples_label_plasma = list(samples.loc[p_files].index)
    remove_p = []
    remove_s = []
    for p in samples_patient_plasma:
        if p not in samples_patient_serum:
            remove_p.append(p)
    for s in samples_patient_serum:
        if s not in samples_patient_plasma:
            remove_s.append(s)
    for lab, pat in zip(samples_label_serum, samples_patient_serum):
        if pat in remove_s:
            samples_label_serum.remove(lab)
    for lab, pat in zip(samples_label_plasma, samples_patient_plasma):
        if pat in remove_p:
            samples_label_plasma.remove(lab)
    final_file_names.append(samples_label_plasma)
    final_file_names.append(samples_label_serum)
    
    plasma_features = plasma_features.loc[samples_label_plasma]
    serum_features = serum_features.loc[samples_label_serum]
    labels_p = labels_p.loc[samples_label_plasma]
    labels_s = labels_s.loc[samples_label_serum]
    samples_p = samples.loc[samples_label_plasma]
    samples_s = samples.loc[samples_label_serum]
    
    # metabolites are time_mz seperated, when there are 2+, after the 1st it starts with + or _
    #features are already samples X features    
#     print(labels_p, labels_s)
    data.append({'study': study[:-5],
                 'disease': disease,
                 'data_set': f[:-4]+'_plasma',
                 'features': plasma_features,
                 'labels': labels_p,
                 'peaks': metabolites,
                 'samples': samples_p})
    data.append({'study': study[:-5],
                 'disease': disease,
                 'data_set': f[:-4]+'_serum',
                 'features': serum_features,
                 'labels': labels_s,
                 'peaks': metabolites,
                 'samples': samples_s})
    
dirs = ['adc2_plasma', 'adc2_serum', 'adc1_plasma', 'adc1_serum']
data_temp = []
mapping_labels_across_ds = []
for folder, f_names in zip(dirs,final_file_names):
    df = df_m_adc1 if 'adc1' in folder else df_m_adc2
    data_file = 'IPO_aligned_ST000385_' + folder + '.csv'
    open_file = os.path.join(folder,data_file)
    features = pd.read_csv(open_file)
    feat = features.iloc[:,9:-3].T.astype('float')
    ff_names_indata = ['X'+f+'.mzData' for f in f_names] # use this to index the labels, features and samples....
    feat = feat.loc[ff_names_indata]
    labels = df.loc[f_names]['Health State']
    label = {'Healthy':0, 'Healthy ':0, 'Adenocarcinoma':1,'Adenocarcinoma ': 1, 'Adenosquamous':1,'Adenosquamous ':1, 'Adenocarcnoma':1}
    labels = labels.replace(label)
    data.append({'study': study[:-5],
                 'disease': disease,
                 'data_set': data_file[:-4],
                 'features': feat,
                 'labels': labels,
                 'peaks': features[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'X1', 'isotopes', 'adduct', 'pcgroup']],
                 'samples': df.loc[f_names]})
    
pickle.dump(data, open('%s.pkl'%study, 'wb'))

In [97]:
# 30 samples even though the study AN file says 31
# no duplicate files and file order seems correct
# 12-7-18: I think you can go ahead and combine datasets
study = 'ST000329_data'
disease = 'minimal change disease, focal segmental sclerosis'
os.chdir(os.path.join(root,study))
data = []

files = ['AN000525.txt', 'AN000526.txt']
label_col = 'Sample type'
label_key = {'MCD': 1, 'FSGS': 2, 'Control': 0}
df_s = pd.read_csv('metadata.csv')
df_s.set_index('Sample name', inplace=True)

for f in files:
    features, labels, samples, metabolites = read_metadata(f, label_col, label_key, debug=False)
    # metabolites are time_mz seperated, when there are 2+, after the 1st it starts with + or _
    #features are already samples X features 
    if multi_to_single:
        labels_new, ds_names = reduce_multi(labels, f[:-4], label_key)
        for l, n in zip(labels_new, ds_names):
            data.append({'study': study[:-5],
                         'disease': disease,
                         'data_set': n,
                         'features': features,
                         'labels': l,
                         'peaks': metabolites,
                         'samples': samples}) 
    else:
        data.append({'study': study[:-5],
                     'disease': disease,
                     'data_set': f[:-4],
                     'features': features,
                     'labels': labels,
                     'peaks': metabolites,
                     'samples': samples})

dirs = ['pos', 'neg']
for folder in dirs:
    data_file = 'IPO_aligned_ST000329_' + folder + '.csv'
    open_file = os.path.join(folder,data_file)
    features = pd.read_csv(open_file)
    if folder == 'pos':
        file_names = list(features.iloc[:,9:-3])
        feat = features.iloc[:,9:-3].T.astype('float')
        p = features[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'X1', 'isotopes', 'adduct', 'pcgroup']]
    else:
        file_names = list(features.iloc[:,9:])
        feat = features.iloc[:,9:].T.astype('float')  
        p = features[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'X1']]
    file_names = [f[:-5]+'.d' for f in file_names]
    labels = df_s.loc[file_names]
    labels = labels['Treatment'].replace(label_key)
    if multi_to_single:
        labels_new, ds_names = reduce_multi(labels, data_file[:-4], label_key)
        for l, n in zip(labels_new, ds_names):
            data.append({'study': study[:-5],
                         'disease': disease,
                         'data_set': n,
                         'features': feat,
                         'labels': l,
                         'peaks': p,
                         'samples': df_s})
    else:
        data.append({'study': study[:-5],
                     'disease': disease,
                     'data_set': data_file[:-4],
                     'features': feat,
                     'labels': labels,
                     'peaks': p,
                     'samples': df_s})
pickle.dump(data, open('%s.pkl'%study, 'wb'))

In [98]:
study = 'ST000865_data'
disease = 'Hepatocellular Carcinoma'
os.chdir(os.path.join(root,study))
data = []

f = 'AN001390.txt'
label_col = 'Patient group'
label_key = {'CIRR': 0, 'HCC': 1, '-':np.nan, 'Pool HCC': np.nan, 'Pool CIRR':np.nan, 'POSSIBLE CASE':np.nan}

features, labels, samples, metabolites = read_metadata(f, label_col, label_key, debug=False)
labels = labels.loc[list(features.index)]
good_files = list(labels.index)
data.append({'study': study[:-5],
             'disease': disease,
             'data_set': f[:-4]+'all_author',
             'features': features,
             'labels': labels,
             'peaks': metabolites,
             'samples': samples})

folders = ['batch2_raw', 'batch3_raw', 'onebatch']
for folder in folders:
    data_file = 'IPO_aligned_ST000865_' + folder + '.csv'
    open_file = os.path.join(folder,data_file)
    features = pd.read_csv(open_file)
    file_names = list(features.iloc[:,9:-3])
    file_names = [name[1:-7] for name in file_names if name[1:-7] in good_files]
    
    feat = features.iloc[:,9:-3].T.astype('float')
    peaks = features[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'X1', 'isotopes', 'adduct', 'pcgroup']]
    label = labels.loc[file_names]
    sample = samples.loc[file_names]
    file_names = ['X'+name+'.mzData' for name in file_names]
    feat = feat.loc[file_names]
    data.append({'study': study[:-5],
                 'disease': disease,
                 'data_set': data_file[:-4],
                 'features': feat,
                 'labels': label,
                 'peaks': peaks,
                 'samples': sample})
pickle.dump(data, open('%s.pkl'%study, 'wb'))   

In [99]:
# 18 females - the cranberry juice studies.
# study = 'ST000292_data'
# disease = 'None - Plasma'
# os.chdir(os.path.join(root,study))
# data = []

# label_col = 'Treatment'
# label_key = {'Urine after drinking cranberry juice': 1, 'Urine after drinking apple juice': 2, 'Baseline urine': 0}


In [100]:
# 18 females - the cranberry juice studies.
# study = 'ST000291_data'
# disease = 'None - Plasma'
# os.chdir(os.path.join(root,study))
# data = []

# label_col = 'Treatment'
# label_key = {'Urine after drinking cranberry juice': 1, 'Urine after drinking apple juice': 2, 'Baseline urine': 0}


In [101]:
# 63 samples, however all have 3 replicates - removed all but first using pd df slicing every 3
# order appears to match. 
# NOTE: NOT SURE WHAT THE 'ND' data is...I think its 'No Diabetes' and they jst got saline
# 12-7-18: can probably go ahead and combine the datasets...
study = 'ST000045_data'
disease = 't1 diabetes'
os.chdir(os.path.join(root,study))
data = []

label_col = 'Treatment'
# label_key = {'Saline Infusion': 0, 'Insulin Withdrawal': 1}
modes = ['pos', 'neg', 'pos', 'neg']
files = ['AN000072', 'AN000074', 'AN000073', 'AN000075'] #these match: 72 is 02feb, 74 = 11feb, 73 is 11mar, 75 had mar17
# look like these files only use 2/3 of the samples....groupA isnt used (sammples 15-21)
# took the data from the PMID file and will use this instead as the data
files = ['a_2feb.csv', 'a_11feb.csv', 'a_11mar.csv', 'a_17mar.csv']
# author features: get from PMID file, its a .xlsx so use read_excel, the first column is mz@rt 
df_s = pd.read_csv('s_data.csv')
# get dataframe for easy file label look up. 
df_l = df_s[['Filename', 'Treatment']]
df_l.set_index('Filename', inplace=True)
df_l = df_l.replace({'ND': 0, 'II': 1, 'IW': 2})
label_key = {'ND': 0, 'II': 1, 'IW': 2}

replicate_keep = 'r001' #also can choose r002 or r003, HOWEVER then you will need to change iloc when getting the features

for f in files:
#     features, labels, samples, metabolites = read_metadata(fn, label_col, label_key)
    features = pd.read_csv(f)
    file_names = list(features.iloc[:,3:-8])
    n_file_names = []
    for f in file_names:
        if f[-2:] == '_1':
            n_file_names.append(f[:-2]+'.d')
        else:
            n_file_names.append(f+'.d')
    n_file_names = [f for f in n_file_names if replicate_keep in f]
    labels = df_l.loc[n_file_names]
    if multi_to_single:
        labels_new, ds_names = reduce_multi(labels, f[:-4], label_key)
        for l, n in zip(labels_new, ds_names):
            data.append({'study': study[:-5],
                         'disease': disease,
                         'data_set': n,
                         'features': features.iloc[:,3:-8:3].T.astype('float'),
                         'labels': l,
                         'peaks': features[['Mass', 'Retention Time', 'Ionization mode']],
                         'samples': df_s})
    else:
        data.append({'study': study[:-5],
                     'disease': disease,
                     'data_set': f[:-4],
                     'features': features.iloc[:,3:-8:3].T.astype('float'),
                     'labels': labels,
                     'peaks': features[['Mass', 'Retention Time', 'Ionization mode']],
                     'samples': df_s})

#get my data:
dirs = ['2Feb', '11Feb', '11Mar', '17Mar']
for folder, mode in zip(dirs, modes):
    data_file = 'IPO_aligned_ST000045_' + folder.lower() + '_' + mode + '.csv'
    open_file = os.path.join(folder,data_file)
    features = pd.read_csv(open_file)
    # 8:-3 - No X1 feature column, use npeaks instead
    # also note, the file names in my csv files are:  X02Feb10.01.r001.mzML.1 so if matching to a label will need to slice
    file_names = list(features)[8:-3]
    file_names = [f[1:-7].replace('.','-')+'.d' for f in file_names]
    file_names = [f.replace('r002', 'r001') for f in file_names if 'r002' in f]
    # THIS IS NOT DONE YET, NEED TO MAKE SAMPLES TO LABELS! 
    if multi_to_single:
        labels_new, ds_names = reduce_multi(df_l.loc[file_names], data_file[:-4], label_key)
        for l, n in zip(labels_new, ds_names):
            data.append({'study': study[:-5],
                         'disease': disease,
                         'data_set': n,
                         'features': features.iloc[:,8:-3:3].T.astype('float'),
                         'labels': l,
                         'peaks': features[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'npeaks', 'isotopes', 'adduct', 'pcgroup']],
                         'samples': df_s})
    else:
        data.append({'study':study[:-5],
                     'disease': disease,
                     'data_set': data_file[:-4],
                     'features': features.iloc[:,8:-3:3].T.astype('float'),
                     'labels': df_l.loc[file_names],
                     'peaks': features[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'npeaks', 'isotopes', 'adduct', 'pcgroup']],
                     'samples': df_s})
pickle.dump(data, open('%s.pkl'%study, 'wb'))

  interactivity=interactivity, compiler=compiler, result=result)


In [102]:
# order appears to match and after doing df slicing its dereplicated - also still 4 NaN files! 
# from authors: 16 healthy volunteers: 7 malaria naive and 9 semi-immune, but one didnt get malaria so it was not included...
# there appear to be replicates
# 12-7-18: looks like order matches, combine datasets
study = 'MTBLS665_data'
disease = 'Malaria'

os.chdir(os.path.join(root,study))
data = []
df_c18_n = pd.read_csv('a_huc_c18neg.txt',sep='\t')
df_hilic_p = pd.read_csv('a_huc_hilicpos.txt',sep='\t')
lab_to_int = {'Baseline':0, 'Diagnosis':1, '3 weeks':2, '4 months':3}
# turns out this is more of a longitudinal dataset...people naive and semi-immune to malaria were infected with some bug and 
# when the worms showed up in the blood they were given curing amount of cloroquinie or one other drug...
# metabolomics: pre infection, diagnosis, 3 weeks post treatment, 4 months post
# also looks like triplate measurements of each of the 4 time points
def extract_name_label(df):
    name_to_label = {}
    for ind, row in df.iterrows():
        name_to_label[row['MS Assay Name']] = row['Factor Value[Time Point]']
    return name_to_label

name_to_label_hilic = extract_name_label(df_hilic_p)
name_to_label_c18 = extract_name_label(df_c18_n)

# get author features
# CURRENTLY DONT KNOW HOW TO FIND WHICH SAMPLES MAP TO WHICH PERSON TO FOLLOW OVER TIME
files = ['m_huc_c18neg.tsv', 'm_huc_hilicpos.tsv']
for f in files:
    df_m = pd.read_csv(f, sep='\t')
    # get labels to map to this order of the data
    file_names = list(df_m.iloc[:,21::3])
    file_labels = []
    if 'c18' in f:
        mapping_dict = name_to_label_c18
    else:
        mapping_dict = name_to_label_hilic
    for ele in file_names:
        file_labels.append(mapping_dict[ele])
    labels = {'label':file_labels}
    df_l = pd.DataFrame(labels, index=file_names)
    df_l = df_l.dropna()
    df_l = df_l.replace(lab_to_int)
    f_names = list(df_l.index)
    feat = df_m.iloc[:,21::3].T.astype('float')
    feat = feat.loc[f_names]
    data.append({'study':study[:-5],
                 'disease': disease,
                 'data_set': f[:-4],
                 'features': feat, #taking every 3 samples to remove the replicates, Note some NaNs present
                 'labels': df_l['label'],
                 'peaks': df_m[['mass_to_charge', 'retention_time']],
                 'samples': df_c18_n if 'c18' in f else df_hilic_p})
    
# get my IPO-xcms features
dirs = ['c18', 'hilic']

for folder in dirs:
    data_file = 'IPO_aligned_MTBLS665_' + folder + '.csv'
    open_file = os.path.join(folder,data_file)
    features = pd.read_csv(open_file)
    
    file_names = list(features.iloc[:,9:-3:3])
    file_labels = []
    if 'c18' in folder:
        mapping_dict = name_to_label_c18
    else:
        mapping_dict = name_to_label_hilic
    for ele in file_names:
        file_labels.append(mapping_dict[ele[:-7]])
    labels = {'label':file_labels}
    df_l = pd.DataFrame(labels, index=file_names)  
    feat = features.iloc[:,9:-3:3].T.astype('float')
    df_l = df_l.dropna()
    f_names = list(df_l.index)
    feat = feat.loc[f_names]
    df_l = df_l.replace(lab_to_int)
    data.append({'study':study[:-5],
                 'disease': disease,
                 'data_set': data_file[:-4],
                 'features': feat,
                 'labels': df_l['label'],
                 'peaks': features[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'X1' ,'isotopes', 'adduct', 'pcgroup']],
                 'samples': df_c18_n if 'c18' in f else df_hilic_p})
pickle.dump(data, open('%s.pkl'%study, 'wb'))

  interactivity=interactivity, compiler=compiler, result=result)


In [103]:
# order of labels seems to match the order of the samples
# there do not appear to be replicates
# 70 samples in total from 35 children (t=0 and 6)
study = 'MTBLS423_data' 
disease = 'prepubertal children with obesity'

os.chdir(os.path.join(root,study))
data = []
# df_f = pd.read_csv('a_lealwitt_et_al_metabolite_profiling_mass_spectrometry.txt',sep='\t')
# ^ The above file is messed up and only contains names of one file repreated over and over
# since it doesnt have any useful metadata I'm not using it
df_s = pd.read_csv('s_LealWitt et al.txt', sep='\t')
# from sample name you can scrape to get labels, its individual|t|time(?)
# labels: the last digit of the sample name is the time point: either t=0 or 6 months 
labels = df_s[['Sample Name']]
time_point = []
for ind, row in labels.iterrows():
    time_point.append(row['Sample Name'][-1])
labels['Time Point'] = time_point
labels.set_index('Sample Name', inplace=True)


#get author data: only ~6 features when the paper mentions many many more
features = pd.read_csv('m_lealwitt_et_al_metabolite_profiling_mass_spectrometry_v2_maf.tsv', sep='\t')
file_names = list(features.iloc[:,21:])
df_l = labels.loc[file_names] 
data.append({'study':study[:-5],
             'disease': disease,
             'data_set': 'Author data',
             'features': features.iloc[:,21:].T.astype('float'),
             'labels': df_l,
             'peaks': features[['mass_to_charge', 'retention_time']],
             'samples': df_s})

# get my feaures 
f = 'IPO_aligned_MTBLS423.csv'
features = pd.read_csv(f)
# this data looks oddly duplicated...will only take teh second half which seems to have features actually 
file_names = list(features.loc[:,'P2638_HILIC_11t0.mzXML.1':'P2638_HILIC_9t6.mzXML.1'])
file_names = [fi[:-8] for fi in file_names]
df_l = labels.loc[file_names]
data.append({'study':study[:-5],
             'disease': disease,
             'data_set': f[:-4],
             'features': features.loc[:,'P2638_HILIC_11t0.mzXML.1':'P2638_HILIC_9t6.mzXML.1'].T.astype('float'),
             'labels': df_l,
             'peaks': features[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'npeaks']],
             'samples': df_s})
# pickle.dump(data, open('%s.pkl'%study, 'wb'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [104]:
# 184 samples, order seems fine, 2 batches (labeled!), dont think there are duplicates since study says 183 people (why one more?)
# no real labels...maybe for now ill make it a gender prediction study???

# get my data:
data_file = 'IPO_aligned_MTBLS404_neg.csv'
study = 'MTBLS404_data' 
disease = 'urine metabolome'

os.chdir(os.path.join(root,study))
data = []
df_lc = pd.read_csv('a_sacurine.txt',sep='\t')
df_s = pd.read_csv('s_sacurine.txt', sep='\t')
df_lc = df_s.merge(df_lc, on='Sample Name').set_index('Sample Name')
# print(list(df_lc.index))
# labels: Characteristics[bmi], Characteristics[age], Characteristics[gender]
labels = df_lc[ 'Characteristics[gender]'] #'Characteristics[bmi]', 'Characteristics[age]' - took these out
lab_to_int = {'Male':0, 'Female':1}
labels = labels.replace(lab_to_int)
labels = labels[~labels.index.duplicated(keep='first')] # some rows will be the same since there are multiple blanks so remove
# these to avoid duplication when also mapping in datafiles where there can be multiple blanks as well (causes a multiplcation 
# in the number of features)

#get author data:
features = pd.read_csv('m_sacurine.txt', sep='\t')
file_names = list(features.iloc[:,21:])
new_files = []
for f in file_names:
    f = f.replace('_neg_', '_')
    if f[-2:] == 'b2':
        f = f[:-3]
    if 'Blanc' not in f and 'QC' not in f:
        new_files.append(f)
df_l = labels.loc[new_files]
new_feats = features.iloc[:,21:].T.astype('float')
file_names = [f for f in file_names if 'Blanc' not in f and 'QC' not in f]
new_feats = new_feats.loc[file_names]
# 2 batches, 234 files (24 blanks + 26 QCs + 184 samples) 
data.append({'study':study[:-5],
             'disease': disease,
             'data_set': f[:-4],
             'features': new_feats,
             'labels': df_l,
             'peaks': features[['mass_to_charge', 'retention_time']],
             'samples': df_lc})

features = pd.read_csv(data_file)
file_names = list(features.iloc[:,9:-3])
new_files = []
for f in file_names:
    f = f.replace('_neg_', '_')
    f = f.replace('_b2', '')
    f = f[:-5]
    new_files.append(f)
df_l = labels.loc[new_files]
data.append({'study':study[:-5],
             'disease': disease,
             'data_set': data_file[:-4],
             'features': features.iloc[:,9:-3].T.astype('float'),
             'labels': df_l,
             'peaks': features[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'X1' ,'isotopes', 'adduct', 'pcgroup']],
             'samples': df_lc})
pickle.dump(data, open('%s.pkl'%study, 'wb'))

In [105]:
# 108 samples in my data...authors mention ~113...? 
# no replicates, order looks good for my processed data
# 12-7-18: looks like order is good, combine datasets
study = 'MTBLS364_data' 
disease = 'smoker v. nonsmoker'

os.chdir(os.path.join(root,study))
data = []
df_n_hil = pd.read_csv('a_mtbls364_HILIC_NEG_mass_spectrometry.txt',sep='\t')
df_p_hil = pd.read_csv('a_mtbls364_HILIC_POS_mass_spectrometry.txt',sep='\t')
df_n_lc = pd.read_csv('a_mtbls364_RP_NEG_mass_spectrometry.txt',sep='\t')
df_p_lc = pd.read_csv('a_mtbls364_RP_POS_mass_spectrometry.txt',sep='\t')
df_s = pd.read_csv('s_BoEfRTP2_Serum.txt', sep='\t')
df_n_hil = df_s.merge(df_n_hil, on='Sample Name').set_index('Sample Name')
df_p_hil = df_s.merge(df_p_hil, on='Sample Name').set_index('Sample Name')
df_n_lc = df_s.merge(df_n_lc, on='Sample Name').set_index('Sample Name')
df_p_lc = df_s.merge(df_p_lc, on='Sample Name').set_index('Sample Name')
dfs = [df_n_hil, df_p_hil, df_n_lc, df_p_lc]

# label: Factor Value[smoking status]
# get author files:
# files = ['m_mtbls364_HILIC_NEG_mass_spectrometry_v2_maf.tsv', 'm_mtbls364_HILIC_POS_mass_spectrometry_v2_maf.tsv',
#          'm_mtbls364_RP_NEG_mass_spectrometry_v2_maf.tsv', 'm_mtbls364_RP_POS_mass_spectrometry_v2_maf.tsv']
##### TURNS OUT NO AUTHOR DATA AGAIN....
# for f, df in zip(files, dfs):
#     labels = get_labels(df, 'Raw Spectral Data File', 'Factor Value[smoking status]')
#     features = pd.read_csv(f, sep='\t')
#     sample_names = list(features.iloc[:,21:])
#     if ext == '.cdf':
#         sample_names = [s.zfill(3)+'_1'+ext for s in sample_names]
#     else:
#         sample_names = [s+ext for s in sample_names]
#     df_l = labels.loc[sample_names]
#     data.append({'study':study[:-5],
#                  'disease': disease,
#                  'data_set': f[:-4],
#                  'features': features.iloc[:,21:].T.astype('float'),
#                  'labels': df_l,
#                  'peaks': features[['mass_to_charge', 'retention_time']],
#                  'samples': df})

# get my data:
lab_to_int = {'Smoker':1, 'Never Smoker': 0}
dirs = ['hil_neg', 'hil_pos', 'lip_neg', 'lip_pos']
for folder, df in zip(dirs,dfs):
    labels = get_labels(df, 'Raw Spectral Data File', 'Factor Value[smoking status]')
    data_file = 'IPO_aligned_MTBLS364_' + folder + '.csv'
    open_file = os.path.join(folder,data_file)
    features = pd.read_csv(open_file)
    file_names = list(features.iloc[:,9:-3])
    file_names = [f[:-5]+'.raw' for f in file_names]
    df_l = labels.loc[file_names]
    df_l = df_l.replace(lab_to_int)
    data.append({'study':study[:-5],
                 'disease': disease,
                 'data_set': data_file[:-4],
                 'features': features.iloc[:,9:-3].T.astype('float'),
                 'labels': df_l,
                 'peaks': features[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'X1' ,'isotopes', 'adduct', 'pcgroup']],
                 'samples': df})
pickle.dump(data, open('%s.pkl'%study, 'wb'))

In [106]:
# 239 people in study, no replicates
# 12-7-18: looks like order matches between ds, can combine datasets!
study = 'MTBLS354_data'
disease = 'Pneumonia - Community acquired'

os.chdir(os.path.join(root,study))
data = []
df_s = pd.read_csv('s_CAP.txt', sep='\t').set_index('Sample Name')

files = ['m_cap_metabolite_profiling_mass_spectrometry_v2_maf.tsv',
         'm_cap_metabolite_profiling_mass_spectrometry-1_v2_maf.tsv']
lab_to_int = {'non-community acquired pneumonia':0, 'community acquired pneumonia':1}
for f in files:
    features = pd.read_csv(f, sep='\t')
    file_names = list(features.iloc[:,21:].T.index)
    labels = df_s['Factor Value[disease state]'].loc[file_names]
    labels = labels.replace(lab_to_int)
    data.append({'study':study[:-5],
                 'disease': disease,
                 'data_set': f[:-4],
                 'features': features.iloc[:,21:].T.astype('float'),
                 'labels': labels,
                 'peaks': features[['mass_to_charge', 'retention_time']],
                 'samples': df_s.loc[file_names]})
#get my data:
folders = ['neg', 'pos']
for folder in folders:
    data_file = 'IPO_aligned_MTBLS354_' + folder + '.csv'
    open_file = os.path.join(folder,data_file)
    features = pd.read_csv(open_file)
    file_names = list(features.iloc[:,9:-3])
    file_names = [f[:-9].replace('.','-') for f in file_names]
    labels = df_s['Factor Value[disease state]'].loc[file_names]
    labels = labels.replace(lab_to_int)
    data.append({'study':study[:-5],
                 'disease': disease,
                 'data_set': data_file[:-4],
                 'features': features.iloc[:,9:-3].T.astype('float'),
                 'labels': labels,
                 'peaks': features[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'X1' ,'isotopes', 'adduct', 'pcgroup']],
                 'samples': df_s.loc[file_names]})  
pickle.dump(data, open('%s.pkl'%study, 'wb'))

In [4]:
# NEED TO UPDATE WITH THE NEWER FILE AND THE CDF WHEN IT RUNS.....
# order of labels maps to the orders of features, no duplicates.
# 12-8-18: looks like you can now combine datasets! 
study = 'MTBLS315_data' 
disease = 'non-malaria febrile illness'

os.chdir(os.path.join(root,study))
data = []
df_n_uplc = pd.read_csv('a_UPLC_NEG_nmfi_and_bsi_diagnosis.txt',sep='\t')
df_p_uplc = pd.read_csv('a_UPLC_POS_nmfi_and_bsi_diagnosis.txt',sep='\t')
df_p_gc = pd.read_csv('a_GC_nmfi_and_bsi_diagnosis.txt',sep='\t')
df_p_lc = pd.read_csv('a_LC_nmfi_and_bsi_diagnosis.txt',sep='\t')
df_s = pd.read_csv('s_NMFI and BSI diagnosis.txt', sep='\t')
df_n_uplc = df_s.merge(df_n_uplc, on='Sample Name').set_index('Sample Name')
df_p_uplc = df_s.merge(df_p_uplc, on='Sample Name').set_index('Sample Name')
df_p_gc = df_s.merge(df_p_gc, on='Sample Name').set_index('Sample Name')
df_p_lc = df_s.merge(df_p_lc, on='Sample Name').set_index('Sample Name')
dfs = [df_p_gc, df_p_lc, df_n_uplc, df_p_uplc] #note this order needs to make the order of the 'files' or 'dirs' below for zip to work
extensions = ['.cdf', '.mzXML', '.mzML', '.mzML'] # same here!
names = ['GC', 'LC', 'UPLC_N', 'UPLC_P']
# label: Factor Value[patient group]
to_be_df = {}
for df,name in zip(dfs,names):
    to_be_df[name] = df['Raw Spectral Data File']
df_mapping_lab_sample = pd.DataFrame(to_be_df) # use this to get all the names consistent

files = ['m_GC_nmfi_and_bsi_diagnosis_v2_maf.tsv', 'm_LC_nmfi_and_bsi_diagnosis_v2_maf.tsv',
         'm_UPLC_NEG_nmfi_and_bsi_diagnosis_v2_maf.tsv', 'm_UPLC_POS_nmfi_and_bsi_diagnosis_v2_maf.tsv']
lab_to_int = {'malaria': 1, 'non-malarial febrile illness':0, 'bacterial bloodstream infection':0} # NOTE THIS DROPPING ON A LABEL
for f, df, ext, name in zip(files,dfs, extensions, names):
    features = pd.read_csv(f, sep='\t')
    f_names = df_mapping_lab_sample[name].values
    if name == 'GC':
        f_names_feat = [str(int(fi[:-6])) for fi in f_names]
        f_names_lab = f_names
    elif name == 'LC':
        f_names_feat = [fi[:-6] for fi in f_names]
        f_names_lab = f_names
    else:
        f_names_feat = [fi[:-5] for fi in f_names]
        f_names_lab = f_names
    labels = get_labels(df, 'Raw Spectral Data File', 'Factor Value[patient group]')
    df_l = labels.loc[f_names_lab]
    df_l = df_l.replace(lab_to_int)
    feat = features.iloc[:,21:].T.astype('float')
    feat = feat.loc[f_names_feat]
#     sample_names = list(features.iloc[:,21:])
#     if ext == '.cdf':
#         sample_names = [s.zfill(3)+'_1'+ext for s in sample_names]
#     else:
#         sample_names = [s+ext for s in sample_names]
    data.append({'study':study[:-5],
                 'disease': disease,
                 'data_set': f[:-4],
                 'features': feat,
                 'labels': df_l,
                 'peaks': features[['mass_to_charge', 'retention_time']],
                 'samples': df})
# will need to change this! 
dirs = ['mzData', 'mzXML', 'n_mzML', 'p_mzML']
for folder, df, ext, name in zip(dirs,dfs,extensions, names):
    labels = get_labels(df, 'Raw Spectral Data File', 'Factor Value[patient group]')
    data_file = 'IPO_aligned_MTBLS315_' + folder + '.csv'
    open_file = os.path.join(folder,data_file)
    features = pd.read_csv(open_file)
    feat = features.iloc[:,9:-3].T.astype('float')
    
    f_names = df_mapping_lab_sample[name].values
    if name == 'GC':
        f_names_feat = ['X'+f[:-3]+'mzData' for f in f_names]
        f_names_lab = f_names
    else:
        f_names_feat = ['X'+f.replace('-','.') for f in f_names]
        f_names_lab = f_names
        
#     file_names = list(features.iloc[:,9:-3])
#     if ext == '.cdf':
#         file_names = [f[1:-7]+ext for f in file_names]
#     elif ext == '.mzXML':
#         file_names = [f[1:].replace('.','-')[:-6]+ext for f in file_names]
#     else:
#         file_names = [f[1:] for f in file_names]
    df_l = labels.loc[f_names_lab]
    df_l = df_l.replace(lab_to_int)
    feat = feat.loc[f_names_feat]
    data.append({'study':study[:-5],
                 'disease': disease,
                 'data_set': data_file[:-4],
                 'features': feat,
                 'labels': df_l,
                 'peaks': features[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'X1' ,'isotopes', 'adduct', 'pcgroup']],
                 'samples': df})

# files = ['XCMS-Report-annotated-SingleClass-CDF.xlsx', 'XCMS-Report-annotated-SingleClass-n_mzML.xlsx']
# dfs = [df_p_gc, df_n_uplc]
# extensions = ['.cdf', '.mzML']
# first = True
# for fi, df, ext in zip(files, dfs, extensions):
#     labels = get_labels(df, 'Raw Spectral Data File', 'Factor Value[patient group]')
#     features = pd.read_excel(fi)
#     feat = features.iloc[:,10:-3].T.astype('float')
#     file_names = list(features.iloc[:,10:-3])
#     if ext == '.cdf':
#         file_names = [f+ext for f in file_names]
#     else:
#         file_names = [f+ext for f in file_names]
#     df_l = labels.loc[file_names]
#     df_l = df_l.replace(lab_to_int)
#     if first:
#         file_to_sample = df['Raw Spectral Data File'].reset_index().set_index('Raw Spectral Data File')
        
    
#     data.append({'study':study[:-5],
#                  'disease': disease,
#                  'data_set': fi[:-4],
#                  'features': feat,
#                  'labels': df_l,
#                  'peaks': features[['mzmed', 'mzmin', 'mzmax', 'rtmed', 'rtmin', 'rtmax', 'npeaks' ,'isotopes', 'adduct', 'pcgroup']],
#                  'samples': df})
pickle.dump(data, open('%s.pkl'%study, 'wb'))

  interactivity=interactivity, compiler=compiler, result=result)


In [108]:
# no replicates, labels map to the data
# 12-7-18: looks like you can combine datasets!
study = 'MTBLS266_data'
disease = 'Age related metabolomics'
os.chdir(os.path.join(root,study))
data = []
df_n = pd.read_csv('a_mtbls266_NEG_mass_spectrometry.txt',sep='\t')
df_p = pd.read_csv('a_mtbls266_POS_mass_spectrometry.txt',sep='\t')
df_s = pd.read_csv('s_MTBLS266.txt', sep='\t')
df_n = df_s.merge(df_n, on='Sample Name').set_index('Raw Spectral Data File')
df_p = df_s.merge(df_p, on='Sample Name').set_index('Raw Spectral Data File')
dfs = [df_n, df_p]

# get author data:
files =  ['m_mtbls266_NEG_mass_spectrometry_v2_maf.tsv', 'm_mtbls266_POS_mass_spectrometry_v2_maf.tsv']
for f, df in zip(files,dfs):
    features = pd.read_csv(f, sep='\t')
    feat = features.iloc[:,21:].T.astype('float')
    sample_names = list(features.iloc[:,21:])
    sample_names = [name+'.mzML' for name in sample_names]
    labels = df['Factor Value[Age group]'].loc[sample_names]
    to_replace = {'youth':0, 'elder':1}
    labels = labels.replace(to_replace)
    data.append({'study':study[:-5],
                 'disease': disease,
                 'data_set': f[:-4],
                 'features': feat,
                 'labels': labels,
                 'peaks': features[['mass_to_charge', 'retention_time']],
                 'samples': df.loc[sample_names]})
# get my data:
dirs = ['MTBLS266_neg', 'MTBLS266_pos']
for folder, df in zip(dirs,dfs):
    data_file = 'IPO_aligned_' + folder + '.csv'
    open_file = os.path.join(folder,data_file)
    features = pd.read_csv(open_file)
    feat = features.iloc[:,9:-3].T.astype('float')
    file_names = list(features.iloc[:,9:-3])
    labels = df['Factor Value[Age group]'].loc[file_names]
    to_replace = {'youth':0, 'elder':1}
    labels = labels.replace(to_replace)
    data.append({'study':study[:-5],
                 'disease': disease,
                 'data_set': data_file[:-4],
                 'features': feat,
                 'labels': labels,
                 'peaks': features[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'X1' ,'isotopes', 'adduct', 'pcgroup']],
                 'samples': df.loc[file_names]})
pickle.dump(data, open('%s.pkl'%study, 'wb'))

In [109]:
# no replicates, labels map to the data
# not really a classification problem - its a 24 hr time course
# 12-7-18: looks like you can combine datasets!
study = 'MTBLS264_data'
disease = 'Age related metabolomics'
os.chdir(os.path.join(root,study))
data = []
df_n = pd.read_csv('a_mtbls264_NEG_mass_spectrometry.txt',sep='\t')
df_p = pd.read_csv('a_mtbls264_POS_mass_spectrometry.txt',sep='\t')
df_s = pd.read_csv('s_MTBLS264.txt', sep='\t')
df_n = df_s.merge(df_n, on='Sample Name').set_index('Raw Spectral Data File')
df_p = df_s.merge(df_p, on='Sample Name').set_index('Raw Spectral Data File')
dfs = [df_n, df_p]

# package author data:
files = ['m_mtbls264_NEG_mass_spectrometry_v2_maf.tsv', 'm_mtbls264_POS_mass_spectrometry_v2_maf.tsv']
for f, df in zip(files,dfs):
    features = pd.read_csv(f, sep='\t')
    sample_names = list(features.iloc[:,21:])
    for samp_type in ['blood', 'RBC', 'plasma']:
        names = [ele for ele in sample_names if samp_type in ele]
        feat = features.iloc[:,21:].T.astype('float').loc[names]
        names = [ele+'.mzML' for ele in names]
        labels = df['Factor Value[timepoint]'].loc[names]
        data.append({'study':study[:-5],
                     'disease': disease,
                     'data_set': f[:-4]+'_'+samp_type,
                     'features': feat,
                     'labels': labels,
                     'peaks': features[['mass_to_charge', 'retention_time']],
                     'samples': df.loc[names]})
# get my data:
dirs = ['MTBLS264_blood_neg', 'MTBLS264_blood_pos', 'MTBLS264_plasma_neg', 
        'MTBLS264_plasma_pos', 'MTBLS264_rbc_neg', 'MTBLS264_rbc_pos']
dfs = [df_n, df_p, df_n, df_p, df_n, df_p]
for folder, df in zip(dirs,dfs):
    data_file = 'IPO_aligned_' + folder + '.csv'
    open_file = os.path.join(folder,data_file)
    features = pd.read_csv(open_file)
    file_names = list(features.iloc[:,9:-3])
    labels = df['Factor Value[timepoint]'].loc[file_names]
    data.append({'study':study[:-5],
                 'disease': disease,
                 'data_set': data_file[:-4],
                 'features': features.iloc[:,9:-3].T.astype('float'),
                 'labels': labels,
                 'peaks': features[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'X1' ,'isotopes', 'adduct', 'pcgroup']],
                 'samples': df.loc[file_names]})
pickle.dump(data, open('%s.pkl'%study, 'wb'))

In [110]:
# not really a classification problem, but maybe could do predictive modeling of whether someone will no longer be diabetic
# following the year since 9 were in remission post 1 yr. 
study = 'MTBLS218_data'
disease = 'short-term and long-term metabolic changes after bariatric surgery'
os.chdir(os.path.join(root,study))
data = []
df = pd.read_csv('a_bariatric_surgery_metabolite_profiling_mass_spectrometry.txt',sep='\t')
df_s = pd.read_csv('s_Bariatric Surgery.txt', sep='\t')
df = df_s.merge(df, on='Sample Name').set_index('Raw Spectral Data File')
df_l = df['Factor Value[Time]']
mapper = {'2-3':1, '0':0, '52':2}
df_l = df_l.replace(mapper)

# no author data
# get my data:
data_file = 'IPO_aligned_MTBLS218.csv'
features = pd.read_csv(data_file)
feat = features.iloc[:,9:].T.astype('float')
file_names = list(feat.index)
file_names = [i[1:] for i in file_names]
df_l = df_l.loc[file_names]
df = df.loc[file_names]
data.append({'study':study[:-5],
             'disease': disease,
             'data_set': data_file[:-4],
             'features': feat,
             'labels': df_l,
             'peaks': features[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'npeaks' ,'X1']],
             'samples': df})
pickle.dump(data, open('%s.pkl'%study, 'wb'))

In [111]:
study = 'MTBLS200_data'
disease = 'Lung Cancer'
os.chdir(os.path.join(root,study))
data = []
df = pd.read_csv('a_MS_uofc-hcc_serum_metabolomic_study.txt',sep='\t')
df_s = pd.read_csv('s_UofC-HCC Serum Metabolomic Study.txt', sep='\t')
df = df_s.merge(df, on='Sample Name').set_index('Raw Spectral Data File')
names = list(df.index)[:-1]
names_dil = [ele for ele in names if 'DIL' in ele]
names_notdil = [ele for ele in names if 'DIL' not in ele]
df_dil = df.loc[names_dil]
df_notdil = df.loc[names_notdil]
dfs = [df_dil, df_notdil]

names = [e[:-3]+'mzData' for e in names]
names_dil = [ele for ele in names if 'DIL' in ele]
names_notdil = [ele for ele in names if 'DIL' not in ele]
names = [names_dil, names_notdil]
#read in my data:
data_file = 'IPO_aligned_MTBLS200.csv'
features = pd.read_csv(data_file)
feat = features.iloc[:,9:-3].T.astype('float')
for n, d in zip(names,dfs):
    f = feat.loc[n]
    l = d[['Factor Value[CA Type]', 'Factor Value[Timepoint]', 'Factor Value[Outcome]']]
    data.append({'study':study[:-5],
                 'disease': disease,
                 'data_set': data_file[:-4],
                 'features': f,
                 'labels': l,
                 'peaks': features[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'X1' ,'isotopes', 'adduct', 'pcgroup']],
                 'samples': d})
pickle.dump(data, open('%s.pkl'%study, 'wb'))

In [112]:
# order looks correct, 7 people 8 timepoints each, but should only be 4 time points...1st 4 then 2nd 4 are replicates 
study = 'MTBLS191_data'
disease = 'high intensity exercise metabolomics'
os.chdir(os.path.join(root,study))
data = []
df = pd.read_csv('a_VUHIIES1.txt',sep='\t')
df_s = pd.read_csv('s_VUHIIES1.txt', sep='\t')
df = df_s.merge(df, on='Sample Name').set_index('Sample Name')

#authors metadata is empty
# create a df that one can map the list of names from the IPO file to get labels in the IPO files order 
labels = df[['Raw Spectral Data File', 'Factor Value[Time]']] # label: Factor Value[Time]
labels.set_index('Raw Spectral Data File', inplace=True)

# my features only have 8 features...lol maybe rerun the IPO / xcms? 
data_file = 'IPO_aligned_MTBLS191.csv'
features = pd.read_csv(data_file)
file_names = list(features.iloc[:,9:-3])
file_names = [s[:27]+'-'+s[28:42]+'-'+s[43:-7]+'.D.zip' for s in file_names]
df_l = labels.loc[file_names]
lab_map = {0:0, 10:1, 30:2, 90:3}
df_l = df_l.replace(lab_map)
#keeping just the first set of data for each and dropping the replicates
drop_labels = [f for f in list(df_l.index) if int(f[-7]) < 5]
drop_features = [f for f in list(features.iloc[:,9:-3].T.index) if int(f[-8]) < 5] # CHAGE TO >4 if you want the second half
data.append({'study':study[:-5],
             'disease': disease,
             'data_set': data_file[:-4],
             'features': features.iloc[:,9:-3].T.loc[drop_features].astype('float'),
             'labels': df_l.loc[drop_labels],
             'peaks': features[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'X1' ,'isotopes', 'adduct', 'pcgroup']],
             'samples': df})
pickle.dump(data, open('%s.pkl'%study, 'wb'))

In [113]:
# NOT going to classify...its different times at different temps...
# 12-7-18: can combine datasets
import random
# looks like triplicate data - here I randomly dereplicated. 
# order of the labels appears to match the order of the samples (note sorted is required for this!)
study = 'MTBLS148_data'
disease = 'Urine sample storage'
pd.set_option('display.max_columns', 100)
os.chdir(os.path.join(root,study))
data = []
df_n = pd.read_csv('a_NEG_impact_of_collection_conditions_on_the_urine_metabolome_metabolite_profiling_mass_spectrometry.txt',sep='\t')
df_p = pd.read_csv('a_POS_impact_of_collection_conditions_on_the_urine_metabolome_metabolite_profiling_mass_spectrometry.txt',sep='\t')
df_s = pd.read_csv('s_Impact of collection conditions on the urine metabolome.txt', sep='\t')
df_p = df_s.merge(df_p, on='Sample Name').set_index('Sample Name')
df_n = df_s.merge(df_n, on='Sample Name').set_index('Sample Name')

files = ['m_NEG_impact_of_collection_conditions_on_the_urine_metabolome_metabolite_profiling_mass_spectrometry_v2_maf.tsv',
         'm_POS_impact_of_collection_conditions_on_the_urine_metabolome_metabolite_profiling_mass_spectrometry_v2_maf.tsv']
# NEVER MIND, the above files are empty, this is the same group as MTBLS20 where they didn't upload all the data, seemingly on purpose. 
# double check online but i think this group just doesnt like uploading thier data
    
# get my own data from processing the raw files:
dirs = ['neg', 'pos']
for folder in dirs:
    data_file = 'IPO_aligned_MTBLS148_' + folder + '.csv'
    open_file = os.path.join(folder,data_file)
    features = pd.read_csv(open_file)
    
    file_names = [f[1:-5].replace('.', '-') for f in list(features.iloc[:,9:-3])]
    df = df_n if 'n' in folder else df_p
    df_l = df.loc[file_names, ['Factor Value[Temperature]', 'Factor Value[Timepoint]']] #Condition and timepoint

    file_names_labels = list(df_l.index)
    file_names_labels = list(set([f[:-5] for f in file_names_labels]))
    ints = [str(random.randint(1,3)) for i in range(len(file_names_labels))]
    rand_file_names_labels = []
    for f, i in zip(sorted(file_names_labels), ints):
        rand_file_names_labels.append(f+i+'_'+folder.upper())
    
    rand_file_names_features = []
    file_names_features = list(features.iloc[:,9:-3].T.index)
    file_names_features = list(set([f[:-10] for f in file_names_features]))
    for f, i in zip(sorted(file_names_features), ints):
        rand_file_names_features.append(f+i+'_'+folder.upper()+'.mzML')
    f = features.iloc[:,9:-3].T.astype('float')
    f = f.loc[rand_file_names_features]
    labels = df_l.loc[rand_file_names_labels]
    data.append({'study':study[:-5],
                 'disease': disease,
                 'data_set': data_file[:-4],
                 'features': f,
                 'labels': labels,
                 'peaks': features[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'X1' ,'isotopes', 'adduct', 'pcgroup']],
                 'samples': df})
pickle.dump(data, open('%s.pkl'%study, 'wb'))

In [114]:
# order appears to match
# two clearly labeled batches in the positive data
# should be 180 people...but not 180 samples lol
# dont think there are replicates. 

study = 'MTBLS146_data'
disease = 'Pregnancy'

os.chdir(os.path.join(root,study))
data = []
df_n = pd.read_csv('a_pregnancy_negative.txt',sep='\t')
df_p = pd.read_csv('a_pregnancy_positive.txt',sep='\t')
df_s = pd.read_csv('s_pregnancy.txt', sep='\t')
df_p = df_s.merge(df_p, on='Sample Name').set_index('Sample Name')
df_n = df_s.merge(df_n, on='Sample Name').set_index('Sample Name')

# no author feature tables...
lab_to_int = {'9-12':0, '9-12 ':0, '13-16':1, '17-20 ':2, '21-24 ':3, '25-28 ':4, '29-32 ':5}
def extract_name_label(df):
    name_to_label = {}
    for ind, row in df.iterrows():
        name_to_label[row['Raw Spectral Data File']] = row['Factor Value[Pregnancy duration]']
    return name_to_label

name_to_preg_dur_pos = extract_name_label(df_p)
name_to_preg_dur_neg = extract_name_label(df_n)

#get my features 
dirs = ['neg', 'pos']
for folder in dirs:
    data_file = 'IPO_aligned_MTBLS146_global_' + folder + '.csv'
    open_file = os.path.join(folder,data_file)
    features = pd.read_csv(open_file)
    
    file_names = list(features.iloc[:,9:-3])
    file_labels = []
    if 'n' in folder:
        mapping_dict = name_to_preg_dur_neg
    else:
        mapping_dict = name_to_preg_dur_pos
    for ele in file_names:
        file_labels.append(mapping_dict[ele])
    labels = {'label':file_labels}
    df_l = pd.DataFrame(labels, index=file_names) 
    df_l = df_l.replace(lab_to_int)
    data.append({'study':study[:-5],
                 'disease': disease,
                 'data_set': data_file[:-4],
                 'features': features.iloc[:,9:-3].T.astype('float'),
                 'labels': df_l['label'],
                 'peaks': features[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'X1' ,'isotopes', 'adduct', 'pcgroup']],
                 'samples': df_n if 'n' in folder else df_p})
pickle.dump(data, open('%s.pkl'%study, 'wb'))

In [115]:
# the author data has no features present
# the labels match with the files and there are not replicates 
# 12-7-18: looks like you can combine datasets
study = 'MTBLS105_data'
disease = 'Hepatocellular carcinoma'

os.chdir(os.path.join(root,study))
data = []
df_s = pd.read_csv('s_mtbls105.txt', sep='\t').set_index('Sample Name')

# get author data: TURNS OUT ITS EMPTY
# files = ['m_mtbls105_GC_Q_mass_spectrometry_v2_maf.tsv', 
#          'm_mtbls105_GC_SIM_mass_spectrometry_v2_maf.tsv',
#          'm_mtbls105_GC_TOF_mass_spectrometry_v2_maf.tsv']
# for f in files:
#     features = pd.read_csv(f, sep='\t')
#     file_names = list(features.iloc[:,21:].T.index)
#     labels = df_s['Factor Value[Disease]'].loc[file_names]
#     print(labels.shape, features.iloc[:,21:].T.shape)
#     data.append({'study':study[:-5],
#                  'disease': disease,
#                  'data_set': f[:-4],
#                  'features': features.iloc[:,21:].T.astype('float'),
#                  'labels': labels,
#                  'peaks': features[['mass_to_charge', 'retention_time']],
#                  'samples': df_s.loc[file_names]})

# Get my files
lab_to_int = {'hepatocellular carcinoma': 1, 'cirrhosis of liver': 0}
folders = ['qMS', 'SIM-MS'] #TOFMS didnt work for the IPO run
for folder in folders:
    data_file = 'IPO_aligned_MTBLS105_' + folder + '.csv'
    open_file = os.path.join(folder,data_file)
    features = pd.read_csv(open_file)
    file_names = list(features.iloc[:,9:-3])
    file_names = [f[:-7].replace('.','-') for f in file_names]
    labels = df_s['Factor Value[Disease]'].loc[file_names]
    labels = labels.replace(lab_to_int)
    data.append({'study':study[:-5],
                 'disease': disease,
                 'data_set': data_file[:-4],
                 'features': features.iloc[:,9:-3].T.astype('float'),
                 'labels': labels,
                 'peaks': features[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'X1' ,'isotopes', 'adduct', 'pcgroup']],
                 'samples': df_s.loc[file_names]})     
pickle.dump(data, open('%s.pkl'%study, 'wb'))

In [116]:
# No replicates it appears and the labels map to the features order
study = 'MTBLS92_data'
disease = 'Breast Cancer'

os.chdir(os.path.join(root,study))
data = []
df_s = pd.read_csv('s_BreastCancerLipidome.txt',sep='\t')
df_p = pd.read_csv('a_breastcancerlipidome_metabolite_profiling_mass_spectrometry.txt',sep='\t')
df_p = df_s.merge(df_p, on='Sample Name').set_index('Sample Name')

#get my IPO-xcms data
directory = 'MTBLS92_out'
data_file = 'IPO_aligned_MTBLS92.csv'
open_file = os.path.join(directory, data_file)
features = pd.read_csv(open_file)

# map the data names to labels
matching = {'A':{}, 'B':{}}
for ind, row in df_p.iterrows():
    dataset = row['Source Name'][6]
    #making BL = 0 and OP = 1
    if row['Factor Value[Timepoint]'] == 'BL':
        value = 0 
    else:
        value = 1 
    matching[dataset][row['Source Name'][8:]] = (value, row['Source Name'])
data_names = list(features.iloc[:,9:-3].T.index)
info = []
for ele in data_names:
    dataset = ele[1:4]
    if dataset == '229':
        dataset = 'B'
    else: 
        dataset = 'A'
    row_data = matching[dataset][ele[9:-7]] 
    info.append(row_data)
headers = ['label','sample_name']
# NOTE the following data from is not JUST the labels but also the sample name (A/B and number from the authors)
df_l = pd.DataFrame.from_records(info, columns=headers, index=features.iloc[:,9:-3].T.index)
# 447 files
data.append({'study':study[:-5],
             'disease': disease,
             'data_set': data_file[:-4],
             'features': features.iloc[:,9:-3].T.astype('float'),
             'labels': df_l['label'],
             'peaks': features[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'X1' ,'isotopes', 'adduct', 'pcgroup']],
             'samples': df_p})
# get author data:
df_m = pd.read_csv('m_breastcancerlipidome_metabolite_profiling_mass_spectrometry_v2_maf.tsv',sep='\t')
data_names = list(df_m.iloc[:,21:].T.index)
info = []
for ele in data_names:
    dataset = ele[0]
    row_data = matching[dataset][ele[2:]] 
    info.append(row_data)
headers = ['label','sample_name']
# NOTE the following data from is not JUST the labels but also the sample name (A/B and number from the authors)
df_l_a = pd.DataFrame.from_records(info, columns=headers, index=df_m.iloc[:,21:].T.index)
data.append({'study':study[:-5],
             'disease': disease,
             'data_set': 'Author_data',
             'features': df_m.iloc[:,21:].T.astype('float'),
             'labels': df_l_a['label'],
             'peaks': df_m[['mass_to_charge', 'retention_time']],
             'samples': df_p})
pickle.dump(data, open('%s.pkl'%study, 'wb'))

In [117]:
# NOTE number of pos / neg do not match what my spreadsheet says and no mention of 'converters' in my data its just 0 or 1

# looks like there is an A and B for each run 
# post removing the B files, looks like 127 samples in total
# labels match with the feature order, 72 positive samples 
# 12-7-18: looks liek you can combine datasets!
study = 'MTBLS72_data'
disease = 'Alzheimers'
#doesnt look like the authors even put up feature tables for the NEG and POS sample...only the targeted biocrates assay...
# so previous analysis was only with their ~10 targets metabolites

os.chdir(os.path.join(root,study))
data = []
df_s = pd.read_csv('s_Plasma_AD_Lipidomics.txt',sep='\t')
df_n = pd.read_csv('a_NEG_plasma_ad_lipidomics_mass_spectrometry.txt',sep='\t')
df_p = pd.read_csv('a_POS_plasma_ad_lipidomics_mass_spectrometry.txt',sep='\t')
df_p = df_s.merge(df_p, on='Sample Name').set_index('Sample Name')
df_n = df_s.merge(df_n, on='Sample Name').set_index('Sample Name')
dfs = [df_n, df_p]
removed_dfs = []
for df in dfs:
    df = df.set_index('Source Name')
    files_to_keep = [f for f in list(df.index) if 'B' not in f] # tossing all the B labeled files...
    df = df.loc[files_to_keep]
    removed_dfs.append(df)

#get my data since really no author data
dirs = ['neg', 'pos']
for folder, df in zip(dirs,removed_dfs):
    data_file = 'IPO_aligned_MTBLS72_' + folder + '.csv'
    open_file = os.path.join(folder,data_file)
    features = pd.read_csv(open_file)
    
    #get labels 
    mapping = {}
    for ind, row in df_s.iterrows():
        mapping[row['Sample Name']] = row['Factor Value[Cognitive Status]']
    file_names = list(features.iloc[:,9:-3])
    file_to_label = []
    for f in file_names:
        file_to_label.append(mapping[f[1:-5]])
    header_to_label = {'label':file_to_label}
    df_l = pd.DataFrame(header_to_label, index=file_names)
    df_l[df_l['label'] == 'Normal Control'] = 0
    df_l[df_l['label'] != 0] = 1
    # no remove the duplicated file determined from above 
    derep_files = list(df['MS Assay Name'])
    derep_files = ['X'+f+'.mzML' for f in derep_files]
    f = features.iloc[:,9:-3].T.astype('float')
    f = f.loc[derep_files]
    labels = df_l['label']
    labels = labels.loc[derep_files]
    data.append({'study':study[:-5],
                 'disease': disease,
                 'data_set': data_file[:-4],
                 'features': f,
                 'labels': labels,
                 'peaks': features[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'X1' ,'isotopes', 'adduct', 'pcgroup']],
                 'samples': df})
pickle.dump(data, open('%s.pkl'%study, 'wb'))

In [118]:
# I dont think there are replicates - 1005 samples and the order matches
# 12-7-18: looks like you can combine datasets!
study = 'MTBLS28_data'
disease = 'Lung cancer'

os.chdir(os.path.join(root,study))
data = []
df_s = pd.read_csv('s_mtbls28.txt', sep='\t').set_index('Sample Name')

lab_to_int = {'Control':0,'Case':1}
# get author data:
files = ['m_mtbls28_NEG_v2_maf.tsv', 'm_mtbls28_POS_v2_maf.tsv']
for f in files:
    features = pd.read_csv(f, sep='\t')
    file_names = list(features.iloc[:,21:].T.index)
    labels = df_s['Factor Value[Sample Type]'].loc[file_names]
    labels = labels.replace(lab_to_int)
    data.append({'study':study[:-5],
                 'disease': disease,
                 'data_set': f[:-4],
                 'features': features.iloc[:,21:].T.astype('float'),
                 'labels': labels,
                 'peaks': features[['mass_to_charge', 'retention_time']],
                 'samples': df_s.loc[file_names]})

# Get my IPO aligned files:
folders = ['neg', 'pos']
for folder in folders:
    data_file = 'IPO_aligned_MTBLS28_' + folder + '.csv'
    open_file = os.path.join(folder,data_file)
    features = pd.read_csv(open_file)
    file_names = list(features.iloc[:,9:-3])
    file_names = [f[1:-7] for f in file_names]
    labels = df_s['Factor Value[Sample Type]'].loc[file_names]
    labels = labels.replace(lab_to_int)
    data.append({'study':study[:-5],
                 'disease': disease,
                 'data_set': data_file[:-4],
                 'features': features.iloc[:,9:-3].T.astype('float'),
                 'labels': labels,
                 'peaks': features[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'X1' ,'isotopes', 'adduct', 'pcgroup']],
                 'samples': df_s.loc[file_names]})  
    
pickle.dump(data, open('%s.pkl'%study, 'wb'))

In [119]:
#no replicates - also authors did not upload all data since its from a different study thats never published (LOL)
# NOTE ALSO NOT LOOKING AT THE 439033 FILES SINCE ONLY 6 OF THEM....
study = 'MTBLS20_data'
disease = 'Interperson variation'

os.chdir(os.path.join(root,study))
data = []
df_neg_20 = pd.read_csv('a_439020neg_LCMS_metid_urine.txt',sep='\t')
df_neg_33 = pd.read_csv('a_439033neg_LCMS_metid_urine.txt',sep='\t')
df_pos_20 = pd.read_csv('a_439020pos_LCMS_metid_urine.txt',sep='\t')
df_pos_33 = pd.read_csv('a_439033pos_LCMS_metid_urine.txt',sep='\t')
df_s = pd.read_csv('s_Annotation.txt', sep='\t')
df_s = df_s[df_s['Factor Value[Material sample]']=='Individual']
df_neg_20 = df_s.merge(df_neg_20, on='Sample Name').set_index('Sample Name')
df_neg_33 = df_s.merge(df_neg_33, on='Sample Name').set_index('Sample Name')
df_pos_20 = df_s.merge(df_pos_20, on='Sample Name').set_index('Sample Name')
df_pos_33 = df_s.merge(df_pos_33, on='Sample Name').set_index('Sample Name')
dfs = [df_neg_20, df_pos_20] #, df_neg_33, df_pos_33]
#'As the results of these studies are not part of the present manuscript and will be publsihed elsewhere,
#only selected raw data that are representative are provided…
# not looking at now....

dirs = ['439020_neg_out', '439020_pos_out'] #, '439033_neg_out', '439033_pos_out']
for folder, df in zip(dirs,dfs):
    data_file = 'IPO_aligned_MTBLS20_' + folder[:-4] + '.csv'
    open_file = os.path.join(folder,data_file)
    features = pd.read_csv(open_file)  
    file_names = list(features.iloc[:,9:-3].T.index)
    file_names = [f[:-5] for f in file_names]
    data.append({'study':study[:-5],
                 'disease': disease,
                 'data_set': data_file[:-4],
                 'features': features.iloc[:,9:-3].T.astype('float'),
                 'labels': df.loc[file_names],
                 'peaks': features[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'X1' ,'isotopes', 'adduct', 'pcgroup']],
                 'samples': df.loc[file_names]})    
pickle.dump(data, open('%s.pkl'%study, 'wb'))  

In [120]:
# dereplicated the F and R samples. 
# labels appear to map to the samples features 
# 12-7-18: can combine the POS / NEG of expt 1F, expt1R, expt2F and expt2R...maybe combine the Expt1/2F or expt1/2R IF 1 and 2 are actually different
study = 'MTBLS19_data'
disease = 'hepatocellular carcinoma'

os.chdir(os.path.join(root,study))
data = []

#read in the study data:
df_s = pd.read_csv('s_MTBLS19.txt',sep='\t')
df_n = pd.read_csv('a_neg_MTBLS19_metabolite profiling_mass spectrometry.txt',sep='\t')
df_p = pd.read_csv('a_pos_MTBLS19_metabolite profiling_mass spectrometry.txt',sep='\t')
df_p = df_s.merge(df_p, on='Sample Name').set_index('Sample Name')
df_n = df_s.merge(df_n, on='Sample Name').set_index('Sample Name')
a_d_files = ['Exp1F_POS.xlsx', 'Exp1R_POS.xlsx', 'Exp2F_POS.xlsx', 'Exp2R_POS.xlsx',
             'Exp1F_NEG.xlsx', 'Exp1R_NEG.xlsx', 'Exp2F_NEG.xlsx', 'Exp2R_NEG.xlsx']
# note 1F and 1R are the same samples but with order of run in LCMS reversed...
for f in a_d_files:
    df_m = pd.read_excel(f)
    data.append({'study': study[:-5],
                 'disease': disease,
                 'data_set': f[:-5],
                 'features': df_m.iloc[1:,2:].T.astype('float'),
                 'labels': df_m.iloc[[0,],2:].T - 1,
                 'peaks': df_m[['mz','rt']].iloc[1:],
                 'samples': df_p if 'POS' in f else df_n})
#These next files has identified metabolites but not the assocaited data:
# df_m_n = pd.read_csv('m_neg_MTBLS19_metabolite profiling_mass spectrometry_v2_maf.tsv', sep='\t')
# df_m_p = pd.read_csv('m_pos_MTBLS19_metabolite profiling_mass spectrometry_v2_maf.tsv', sep='\t')

#Get IPO processed data:
ipo_d_files = ['IPO_aligned_MTBLS19_neg_exp1.csv', 'IPO_aligned_MTBLS19_neg_exp2.csv',
               'IPO_aligned_MTBLS19_pos_exp1.csv', 'IPO_aligned_MTBLS19_pos_exp2.csv']
for f in ipo_d_files:
    df_m = pd.read_csv(f)
    runs = ['F', 'R']
    for run in runs:
        mask_str = 'Exp.'+run
        mask = list(df_m.columns[df_m.columns.str.contains(mask_str).T])
        masked_d = df_m[mask]
        data.append({'study': study,
                     'disease': disease,
                     'data_set': f[:-4]+'_'+run,
                     'features': masked_d.iloc[:,:].T.astype('float'),
                     'labels': pd.DataFrame(masked_d.columns[:].str.contains('HCC'), index=masked_d.iloc[:,:].T.index),
                     'peaks': df_m[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'X1', 'isotopes', 'adduct', 'pcgroup']].iloc[:],
                     'samples': df_p if 'pos' in f else df_n})
# get single batch datasets
ipo_d_files = ['IPO_aligned_MTBLS19_neg_all_F.csv', 'IPO_aligned_MTBLS19_neg_all_R.csv',
               'IPO_aligned_MTBLS19_pos_all_F.csv', 'IPO_aligned_MTBLS19_pos_all_R.csv']
for f in ipo_d_files:
    df_m = pd.read_csv(f)
    features = df_m.iloc[:,9:-3]
    data.append({'study': study,
                 'disease': disease,
                 'data_set': f[:-4]+'_'+run,
                 'features': features.T.astype('float'),
                 'labels': pd.DataFrame(features.columns[:].str.contains('HCC'), index=features.iloc[:,:].T.index),
                 'peaks': df_m[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'X1', 'isotopes', 'adduct', 'pcgroup']].iloc[:],
                 'samples': df_p if 'pos' in f else df_n})
pickle.dump(data, open('%s.pkl'%study, 'wb'))

In [121]:
#### Another exampels of where MMs code does not actuallly filter#####
# study = 'MTBLS17_data'
# disease = 'hepatocellular carcinoma'

# os.chdir(os.path.join(root,study))
# data = []
# df_s = pd.read_csv('s_live_mtbls17.txt',sep='\t')
# df_p = pd.read_csv('a_live_mtbls17pos_metabolite profiling_mass spectrometry.txt',sep='\t')
# df_n = pd.read_csv('a_live_mtbls17neg_metabolite profiling_mass spectrometry.txt',sep='\t')
# df_p = df_s.merge(df_p, on='Sample Name').set_index('Sample Name')
# df_n = df_s.merge(df_n, on='Sample Name').set_index('Sample Name')
# peak_cols = ['mz','mono_mz','rt','FC','TT_pv','TT_qv','WX_pv','WX_qv','isotopes','adduct']
# for fn in glob.iglob('Peaklist*.xlsx'):
#     df_m = pd.read_excel(fn)
# #     print(pd.DataFrame(df_m.columns[8:-2].str.contains('HCC'), index=df_m.iloc[:,8:-2].T.index))
#     data.append({'study': study,
#                  'disease': disease,
#                  'analysis': os.path.splitext(fn)[0],
#                  'features': df_m.iloc[:,8:-2].T.astype('float'),
#                  'labels': pd.DataFrame(df_m.columns[8:-2].str.contains('HCC'), index=df_m.iloc[:,8:-2].T.index),
#                  'peaks': df_m[peak_cols],
#                  'samples': df_p if 'POS' in fn else df_n})
# data = [filter_samples(x) for x in data]
# for sudy in data:
#     print('postfilter:', sudy['labels'].shape)

In [5]:
# looks like there are a and b for each file - removed the b file
# labels appear to map to the file order in the features 
# 12-7-18: can combine the pos/neg! 
study = 'MTBLS17_data'
disease = 'hepatocellular carcinoma'

os.chdir(os.path.join(root,study))
data = []
df_s = pd.read_csv('s_live_mtbls17.txt',sep='\t')
df_p = pd.read_csv('a_live_mtbls17pos_metabolite profiling_mass spectrometry.txt',sep='\t')
df_n = pd.read_csv('a_live_mtbls17neg_metabolite profiling_mass spectrometry.txt',sep='\t')
df_p = df_s.merge(df_p, on='Sample Name').set_index('Sample Name')
df_n = df_s.merge(df_n, on='Sample Name').set_index('Sample Name')

# get my IPO-xcms features 
dirs = ['neg_exp1', 'neg_exp2', 'neg_exp3', 'pos_exp1', 'pos_exp2', 'pos_exp3', 'neg_onebatch', 'pos_onebatch']
# dirs = ['neg_onebatch', 'pos_onebatch']
for folder in dirs:
    dir_list = os.listdir(folder)
    for fi in dir_list:
        if 'IPO_aligned' in fi:
            data_file = fi
        else:
            continue
#         data_file = 'IPO_aligned_MTBLS17_' + folder + '.csv'
        open_file = os.path.join(folder,data_file)
        features = pd.read_csv(open_file)
        if list(features)[-1] == 'pcgroup': #added this because pos_exp2 didnt get isotope/adduct picking done...
            f = features.iloc[:,9:-3].T.astype('float')
            label = pd.DataFrame(features.columns[9:-3].str.contains('HCC'), index=features.iloc[:,9:-3].T.index)
            keep_index = [fname for fname in list(f.index) if 'a_' in fname]
            f = f.loc[keep_index]
            label = label.loc[keep_index]
            data.append({'study':study[:-5],
                         'disease': disease,
                         'data_set': data_file[:-4],
                         'features': f,
                         'labels': label,
                         'peaks': features[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'X1' ,'isotopes', 'adduct', 'pcgroup']],
                         'samples': df_p if 'pos' in folder else df_n})
        else:
            f = features.iloc[:,9:].T.astype('float')
            label = pd.DataFrame(features.columns[9:].str.contains('HCC'), index=features.iloc[:,9:].T.index)
            keep_index = [fname for fname in list(f.index) if 'a_' in fname]
            f = f.loc[keep_index]
            label = label.loc[keep_index]
            data.append({'study':study[:-5],
                         'disease': disease,
                         'data_set': data_file[:-4],
                         'features': f,
                         'labels': label,
                         'peaks': features[['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'X1']],
                         'samples': df_p if 'pos' in folder else df_n})       
# get the author features
files = ['Peaklist_EXP1_POS.xlsx','Peaklist_EXP2_POS.xlsx','Peaklist_EXP3_POS.xlsx',
         'Peaklist_EXP1_NEG.xlsx','Peaklist_EXP2_NEG.xlsx','Peaklist_EXP3_NEG.xlsx']
for f in files:
    features = pd.read_excel(f)
    feat = features.iloc[:,8:-2].T.astype('float')
    label = pd.DataFrame(features.columns[8:-2].str.contains('HCC'), index=features.iloc[:,8:-2].T.index)
    keep_index = [fname for fname in list(feat.index) if 'a_' in fname]
    feat = feat.loc[keep_index]
    label = label.loc[keep_index]
    data.append({'study':study[:-5],
                 'disease': disease,
                 'data_set': f[:-5],
                 'features': feat,
                 'labels': label,
                 'peaks': features[['mz','mono_mz','rt','FC','TT_pv','TT_qv','WX_pv','WX_qv','isotopes','adduct']],
                 'samples': df_p if 'pos' in folder else df_n})
pickle.dump(data, open('%s.pkl'%study, 'wb'))

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
