# MetaboLights to CMAP
## Krista Longnecker, 8 July 2025


MetaboLights has FTP access to their data files and that is easy enough to access, but there are some downstream steps to add because I did not upload the full station inforamtion to MetaboLights.

# This really needs organizational help

In [114]:
#need to organize this a bit better as I have connections to FTP spreadthroughout this...will be easier if 
#I pull everything I need at once, and then close the connection
#working on the organization now - I am going to assemble the MetaboLights data into a frictionless data package. 
#This way I can reuse code that I already wrote for the BCO-DMO data.
##get all the pieces first and then do things with the pieces

In [115]:
%reset -f

In [116]:
import pandas as pd
import os
import io
from ftplib import FTP
import re
from datetime import datetime, timedelta, timezone

import json
from frictionless import describe, Package

In [117]:
""" HELPER FUNCTIONS """
def rfc3339_datetime_str():
    """
    Construct an RFC3339-compliant datetime
    """
    return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

In [118]:
#make the data folder if it is not already there (it is in .gitignore, so it will not end up at GitHub)
folder = "data"
os.chdir(".")

if os.path.isdir(folder):
    print("Data will go here (but should not be synced to GitHub): %s" % (os.getcwd()) + '\\' + folder)
else:
    os.mkdir(folder)

Data will go here (but should not be synced to GitHub): C:\Users\klongnecker\Documents\Dropbox\GitHub_espresso\data_pipeline\CMAP\data


First - set up the FTP access to MetaboLights and get the files I want
The following cells read a file, write to disk, and then read the result into Python. I am not sure
how to skip the middle/write step, but it seems like setting this up should work (but it doesn't)\
    # Create an in-memory binary stream\
    in_memory_file = io.BytesIO()

Note: the frictionless website talks about FTP access to get data, but I cannot get it to work (by guessing, seems undocumented)

In [119]:
# start with one dataset at MetaboLights, TSQ data described in Longnecker et al. 2024 (only other dataset ready is the untargeted data)
study_id = 'MTBLS2356'

In [143]:
#while testing, if the FTP command fails the connection is left open and the next command gives error
#error is: AttributeError: 'NoneType' object has no attribute 'sendall'
ftp = FTP('ftp.ebi.ac.uk') #address from MetaboLights webpage
ftp.login()
ftpDataAddress = '/pub/databases/metabolights/studies/public/' + study_id
ftp.cwd(ftpDataAddress)
#ftp.retrlines('LIST') #this will only print to console, not what I want
fileList = ftp.nlst() #can use this to make a list that will be searchable
fileList

['FILES',
 'HASHES',
 'METADATA_REVISIONS',
 'a_MTBLS2356_LC-MS_negative__metabolite_profiling.txt',
 'a_MTBLS2356_LC-MS_positive__metabolite_profiling.txt',
 'i_Investigation.txt',
 'm_MTBLS2356_LC-MS_negative__metabolite_profiling_v2_maf.tsv',
 'm_MTBLS2356_LC-MS_positive__metabolite_profiling_v2_maf.tsv',
 's_MTBLS2356.txt']

In [146]:
test = 'ftp://ftp.ebi.ac.uk' + ftpDataAddress + '/' + 's_MTBLS2356.txt'

In [147]:
test

'ftp://ftp.ebi.ac.uk/pub/databases/metabolights/studies/public/MTBLS2356/s_MTBLS2356.txt'

In [154]:


from pprint import pprint
from frictionless import Resource

path=test
resource = Resource(path=path)




In [155]:
resource

{'name': 's_mtbls2356',
 'type': 'text',
 'path': 'ftp://ftp.ebi.ac.uk/pub/databases/metabolights/studies/public/MTBLS2356/s_MTBLS2356.txt',
 'scheme': 'ftp',
 'format': 'txt',
 'mediatype': 'text/txt'}

In [158]:
datafile = describe(resource['path'])

TypeError: 'TextResource' object is not subscriptable

In [121]:
#start with the metadata about the samples so I can convert each sample to time/lat/lon/depth to match the CMAP requirements
str = 's_' + study_id #this is the search string for the data files
metadataFiles = [v for v in fileList if str in v] 
metadataFiles = pd.DataFrame(metadataFiles,columns = ['files'])
readFile = metadataFiles.loc[0,'files']

# metadataFiles: put them here 
# Is there a way to download an FTP file and not write it disk?
writeFile = 'data/' + 'tempMetadata.txt'

with open(writeFile,'wb') as fp:
    try:
        retr_command = f"RETR {readFile}"
        ftp.retrbinary(retr_command, fp.write)
    except Exception as e: 
        print(f"Error during quit: {e}")
    except AttributeError as e: 
        print(f"AttributeError during quit: {e} - connection was likely already closed.")

# now read in the result
metadata_aboutSamples = pd.read_table(writeFile,delimiter = '\t')

In [122]:
# Now get the data files (more than one because things are split positive/negative ion mode...concatenate them later
str = 'm_' + study_id #this is the search string for the data files
dataFiles = [v for v in fileList if str in v] #Python syntax, will make a list
dataFiles = pd.DataFrame(dataFiles,columns = ['files']) #I find the dataframe easier to manage than the list
readDataFile = dataFiles.loc[0,'files']

idx = 0 #make a loop later as can have multiple data files for a single dataset
writeDataFile = 'data/' + 'tempData.tsv'

#dataFiles.loc[0,'files'] #use this to see the file

with open(writeDataFile,'wb') as fp:
    #try-except to make sure the FTP closes
    try:
        retr_command = f"RETR {readDataFile}"
        ftp.retrbinary(retr_command, fp.write)
    except Exception as e: 
        print(f"Error during quit: {e}")


#now read in the resulting file? 
#see question above, can I just read this in and NOT write to disk?
tsvFile = pd.read_table(writeDataFile,delimiter = '\t')

In [123]:
#finally, details about the experiment are easy because the filename is generic
writeFile = 'data/' + 'i_Investigation.txt'
readFile = 'i_Investigation.txt'

with open(writeFile,'wb') as fp:
    #try-except to make sure the FTP closes
    try:
        retr_command = f"RETR {readFile}"
        ftp.retrbinary(retr_command, fp.write)
    except Exception as e: 
        print(f"Error during quit: {e}")

# del writeFile, readFile

#open up the txt file with the experiment data
writeFile = 'data/i_Investigation.txt'  
with open(writeFile, 'r') as f:
    metadata_aboutExperiment = f.read()

In [124]:
ftp.quit()  #close the FTP connection

'221 Goodbye.'

Now do stuff with the files I just collected: make a frictionless package

In [134]:
#setup the frictionless package, just modify the syntax I have
""" Create a Frictionless Data Package """

metabolights = Package(name='metabolights-datasets', profile='data-package')
metabolights.title = 'Bermuda Institute of Ocean Sciences Simons Collaboration on Ocean Processes and Ecology'
metabolights.description = 'Kujawinski laboratory datasets from MetaboLights' #need to update as needed
metabolights.created = rfc3339_datetime_str()
metabolights.sources = []

In [135]:
metabolights

{'name': 'metabolights-datasets',
 'title': 'Bermuda Institute of Ocean Sciences Simons Collaboration on Ocean '
          'Processes and Ecology',
 'description': 'Kujawinski laboratory datasets from MetaboLights',
 'profile': 'data-package',
 'sources': [],
 'created': '2025-07-09T12:53:33.303756Z'}

In [136]:
#trying to follow the BCO-DMO format (not clear that I am actually doing that...but try)
# Save the dataset as a 'source' in the frictionless package
source = {
'path': ftpDataAddress,
'title': study_id,
#'doi': dataset['doi']
}
metabolights.sources.append(source)

In [137]:
metabolights

{'name': 'metabolights-datasets',
 'title': 'Bermuda Institute of Ocean Sciences Simons Collaboration on Ocean '
          'Processes and Ecology',
 'description': 'Kujawinski laboratory datasets from MetaboLights',
 'profile': 'data-package',
 'sources': [{'path': '/pub/databases/metabolights/studies/public/MTBLS2356',
              'title': 'MTBLS2356'}],
 'created': '2025-07-09T12:53:33.303756Z'}

In [138]:
files = {}
files['data'] = tsvFile #this also holds the metadata_variables
files['metadata_project'] = metadata_aboutExperiment
files['metadata_samples'] = metadata_aboutSamples

In [142]:
metabolights.add_resource(files)

AttributeError: 'dict' object has no attribute 'package'

In [132]:
metabolights.sources.append(files)

In [133]:
metabolights

AttributeError: 'dict' object has no attribute 'to_descriptor_source'

In [None]:
# Save the package
print(metabolights.to_json())
metabolights.to_json('datapackage.json')

In [None]:
#pull Source Name as I need that later to match to columns in the data file
sampleNames  = metadata_aboutSamples['Source Name']
depth = metadata_aboutSamples['Factor Value[Depth]']
#time is messier and the MetaboLights columns names are long, so shorten them to make this easier
temp = metadata_aboutSamples[['Factor Value[Sampling year date]','Factor Value[Sampling month date]',
                 'Factor Value[Sampling day date]','Factor Value[Hour of the day]','Factor Value[Minute of the hour]']]
temp.columns = ['year','month','day','hour','minute']

In [None]:
step1 = pd.to_datetime(dict(year=temp.year,month=temp.month,day = temp.day,hour = temp.hour,minute=temp.minute))
date_cmap = step1.dt.strftime("%Y-%m-%dT%H:%M:%S")
# date_cmap.head()

### Need BIOS-SCOPE file for lat/lon information

In [None]:
#will need the BIOS-SCOPE discrete data file for station information - that will have both BATS and BIOS-SCOPE data in it
fName = 'data/BATS_BS_COMBINED_MASTER_latest.xlsx';
BSdata = pd.read_excel(open(fName,'rb'),sheet_name = 'DATA')

In [None]:
#MetaboLights required samples to begin with a letter, I used 's' and need to strip that out 
NewID_inMTBLS  = pd.to_numeric(sampleNames.str.strip('s')) 
#convert the series into a dataframe:
s_df = NewID_inMTBLS.reset_index()

#use merge as it will be sorted in the right order
merged_df = pd.merge(BSdata,s_df,how='right',left_on='New_ID',right_on='Source Name')

In [None]:
#also need list of cruises

# Data

In [None]:
tsvFile.columns

In [None]:
#column with metabolite name is  (database identifier would be more generic, need to talk to CMAP people about this)
mtabColumn = 'database_identifier'
# mtabColumn = 'metabolite_identification' 

#only keep the columns that are in sampleNames
dataColumns = tsvFile.columns[tsvFile.columns.isin(sampleNames)]
dataOnly = tsvFile.loc[:,dataColumns].transpose() #index is the 's' numbered samples

dataOnly.columns = tsvFile[mtabColumn] #label the columns with the metabolite information, will also use this for the sheet with metadata about the variables
nVariables = len(dataOnly.columns) #need this for the sheet for the metadata on the variables

In [None]:
#start assembling into CMAP format
# Required variables are time, lat, lon, depth
df = pd.DataFrame(columns=['time','lat','lon','depth'])
df['time'] = date_cmap.to_frame()
df['depth'] = depth.to_frame()
df['lat'] = merged_df['latN'].to_frame()
df['lon'] = -merged_df['lonW'].to_frame() #need negative number to put this into -180 to 180 space
#df.insert(1,'test',merged_df['New_ID']) #check that I have the indexing right
#df.insert(1,'test2',s_df['Source Name'])
df.insert(1,'forIndex',sampleNames) #need an index to keep the rows matched up
df.set_index('forIndex',inplace=True)

#concatenate with the data in dataOnly
df = pd.concat([df,dataOnly],axis=1,ignore_index = False)

metadata about the variables

In [None]:
# work on the second sheet: metadata about the variables; use the CMAP dataset template to setup the dataframe so I get the column headers right
fName = 'datasetTemplate.xlsx'
sheet_name = 'vars_meta_data'
vars = pd.read_excel(fName, sheet_name=sheet_name)
cols = vars.columns.tolist()
#df2 will be the dataframe with the metadata about the variables, set it up empty here
df2 = pd.DataFrame(columns=cols,index = pd.RangeIndex(0,nVariables,1))

In [None]:
#need to details about the metabolites

# this is only a partial list of variables for the moment
df2['var_short_name'] = dataOnly.columns
df2.loc[:,'var_long_name'] = tsvFile.loc[:,'metabolite_identification']
df2.loc[:,'var_sensor'] = 'Triple quadrupole mass spectrometer (TSQ Vantage, Thermo Scientific)'
df2.loc[:,'var_unit'] = 'pM' #this is in the protocols, but I also have some inside information here
df2.loc[:,('var_spatial_res')] = 'irregular'
df2.loc[:, ('var_temporal_res')] = 'irregular'
df2.loc[:,('var_discipline')] = 'chemistry'
df2.loc[:,('var_visualize')] = 1 #yes/no, all metabolites can be visualized

In [None]:
#'var_keywords' will be the hardest as the metabolites have many, many keywords. 
#I want to talk to the people at CMAP about best options to handle that (start a list of topics)

In [None]:
## This publication may be useful in getting information about metabolites via SPARQL queries
# https://jcheminf.biomedcentral.com/articles/10.1186/s13321-016-0144-4

metadata about the project

In [None]:
#information about the project is in the i_Investigation file, read in the file and pull details

In [None]:


# pattern = r'NAME = (.*)' # Captures anything after "NAME = "
pattern = r'Study Description(.*)' # Captures anything after "NAME = "
#pattern = r'\d+'  # Matches one or more digits
extracted_data = re.findall(pattern, metadata_aboutExperiment) #this is a list
extracted_data = ' '.join(extracted_data) # ? really, this seems odd, but works.

#tidy up the string
original_string = extracted_data
chars_to_remove = ['<p>', '</p>','\t']

# # Using iteration and replace()
# modified_string_replace = original_string
# for char in chars_to_remove:
#     modified_string_replace = modified_string_replace.replace(char, "")
# print(f"String after replace: {modified_string_replace}")

# Using regular expressions (for more complex patterns or multiple occurrences)
pattern = "|".join(map(re.escape, chars_to_remove)) # Escapes special characters for regex
project_description = re.sub(pattern, "", original_string)
project_description

In [None]:
# gather up the dataset_meta_data into df3

df3 = pd.DataFrame({
    'dataset_short_name': ['BIOSSCOPE_v1'],
    'dataset_long_name': ['BIOS-SCOPE discrete sample data'],
    'dataset_version': ['1.0'],
    'dataset_release_date': ['2025-06-25'],
    'dataset_make': ['observation'],
    'dataset_source': ['Craig Carlson, Bermuda Institute of Ocean Sciences'],
    'dataset_distributor': ['Craig Carlson, Bermuda Institute of Ocean Sciences'],
    'dataset_acknowledgement': ['We thank the BIOS-SCOPE project team and the BATS team for assistance with sample collection, processing, and analysis. The efforts of the captains, crew, and marine technicians of the R/V Atlantic Explorer are a key aspect of the success of this project. This work supported by funding from the Simons Foundation International.'],
    'dataset_history': [''],
    'dataset_description': [project_description],
    'dataset_references': ['Carlson, C. A., Giovannoni, S., Liu, S., Halewood, E. (2025) BIOS-SCOPE survey biogeochemical data as collected on Atlantic Explorer cruises (AE1614, AE1712, AE1819, AE1916) from 2016 through 2019. Biological and Chemical Oceanography Data Management Office (BCO-DMO). (Version 1) Version Date 2021-10-17. doi:10.26008/1912/bco-dmo.861266.1 [25 June 2025]'],
    'climatology': [0]
    })

In [None]:
# # assemble the details here, might setup in a separate text file later
# df3 = pd.DataFrame({
#     'dataset_short_name': ['BIOSSCOPE_v1'],
#     'dataset_long_name': ['BIOS-SCOPE discrete sample data'],
#     'dataset_version': ['1.0'],
#     'dataset_release_date': ['2025-06-25'],
#     'dataset_make': ['observation'],
#     'dataset_source': ['Craig Carlson, Bermuda Institute of Ocean Sciences'],
#     'dataset_distributor': ['Craig Carlson, Bermuda Institute of Ocean Sciences'],
#     'dataset_acknowledgement': ['We thank the BIOS-SCOPE project team and the BATS team for assistance with sample collection, processing, and analysis. The efforts of the captains, crew, and marine technicians of the R/V Atlantic Explorer are a key aspect of the success of this project. This work supported by funding from the Simons Foundation International.'],
#     'dataset_history': [''],
#     'dataset_description': ['This dataset includes analyses from Niskin bottle samples collected on R/V Atlantic Explorer cruises as part of the BIOS-SCOPE campaign in the time period from 2016 until 2025. Included are CTD data, and survey biogeochemical samples including inorganic nutrients, particulate organic carbon and nitrogen, dissolved organic carbon, dissolved organic nitrogen, total dissolved amino acids, bacterial abundance and production.'],
#     'dataset_references': ['Carlson, C. A., Giovannoni, S., Liu, S., Halewood, E. (2025) BIOS-SCOPE survey biogeochemical data as collected on Atlantic Explorer cruises (AE1614, AE1712, AE1819, AE1916) from 2016 through 2019. Biological and Chemical Oceanography Data Management Office (BCO-DMO). (Version 1) Version Date 2021-10-17. doi:10.26008/1912/bco-dmo.861266.1 [25 June 2025]'],
#     'climatology': [0]
#     })

# #get the list of cruise names from the bcodmo data file
# t = pd.DataFrame(bcodmo['Cruise_ID'].unique())
# t.columns = ['cruise_names']
# df3 = pd.concat([df3,t],axis=1,ignore_index = True)



In [None]:
fName_CMAP = 'data/forCMAPfromMetabolights.xlsx'
dataset_names = {'data': df, 'dataset_meta_data': df3, 'vars_meta_data': df2}
with pd.ExcelWriter(fName_CMAP) as writer:
    for sheet_name, data in dataset_names.items():
        data.to_excel(writer, sheet_name=sheet_name, index=False)

In [None]:
raise UserWarning('Stopping and leave code below for historical reasons, code will not run')

In [None]:
#If I can get isatools to install, this will be an easier way (I think/hope) to access data at BCO-DMO. There is also an R equivalent at metabolighteR that provides access to MetaboLights REST API

In [None]:
#pip install isatools #fails with this error:AttributeError: module 'pkgutil' has no attribute 'ImpImporter'. Did you mean: 'zipimporter'?
pip install git+https://github.com/ISA-tools/isa-api/

In [None]:
import isatools

In [None]:
from isatools.net import mtbls as MTBLS
tmp_dir = MTBLS.get('MTBLS2356')

In [None]:
""" Create a Frictionless Data Package """

biosscope = Package(name='biosscope-bcodmo-datasets', profile='data-package')
biosscope.title = 'Bermuda Institute of Ocean Sciences Simons Collaboration on Ocean Processes and Ecology'
biosscope.description = 'BIOSSCOPE datasets from BCO-DMO'
biosscope.created = rfc3339_datetime_str()
biosscope.sources = []

for index, dataset in metadata.iterrows():

  # Save the dataset as a 'source' in the Package
  source = {
    'path': dataset['dataset'],
    'title': dataset['title'],
    'doi': dataset['doi']
  }
  biosscope.sources.append(source)


  # Get the BCO-DMO parameters
  parameters = get_sparql_dataframe(SPARQL_ENDPOINT, DATASET_PARAMS_QUERY.replace('{dataset_uri}', dataset['dataset']))
  schema = []
  for param_index, parameter in parameters.iterrows():
    param = {}
    param['bcodmo:name'] = parameter['supplied_name']
    if parameter['supplied_definition'] is not None:
      param['bcodmo:description'] = parameter['supplied_definition']
    if parameter['datatype'] is not None:
      param['bcodmo:datatype'] = parameter['datatype']
    if parameter['units'] is not None:
      param['bcodmo:units'] = parameter['units']
    if parameter['format'] is not None:
      param['bcodmo:valueFormat'] = parameter['format']
    schema.append(param)

  # Get the 'data' files for a Dataset (skip any supplemental documentation)
  files = get_sparql_dataframe(SPARQL_ENDPOINT, DATASET_FILES_QUERY.replace('{dataset_uri}', dataset['dataset']))
  for file_index, file in files.iterrows():

    # Use Frictionless to describe the file
    datafile = describe(file['url'])

    # Get Table stats
    if datafile.type == 'table':
      datafile.infer(stats=True)

    # Specify which dataset this file belongs to
    datafile.sources = [source]

    # If the file is marked as the primary file for the dataset, attach the parameters to the file
    if schema is not None and file['is_primary_data_file'] == 'true':
      datafile.custom['bcodmo:parameters'] = schema

    # Add the file to the package
    biosscope.add_resource(datafile)

# Save the package
print(biosscope.to_json())
biosscope.to_json('datapackage.json')