# Arrange metadata for ContNeXt

In [1]:
import getpass
import json
import os
import re
import sys
import time
from difflib import get_close_matches

import pandas as pd
from tqdm import tqdm

In [2]:
getpass.getuser()

'rfigueiredo'

In [3]:
sys.version

'3.9.1 (v3.9.1:1e5d33e9b9, Dec  7 2020, 12:10:52) \n[Clang 6.0 (clang-600.0.57)]'

In [4]:
time.asctime()

'Fri Jan  7 15:54:07 2022'

In [5]:
# replace here the location of the external data dir
data_dir = os.path.join(os.path.expanduser("~"), "contnext_data", "data")

### open processed Gemma data

In [6]:
df = pd.read_table(os.path.join(data_dir, "metadata", "gemma_dump_FINAL.tsv"))
df

Unnamed: 0,taxon.Name,experiment.ShortName,class.Type,class.URL,class.Name,term.URL,term.Name,platform.ShortName
0,human,GSE2018,ExperimentTag,http://www.ebi.ac.uk/efo/EFO_0000635,organism part,http://purl.obolibrary.org/obo/UBERON_0002048,lung,GPL96
1,human,GSE2018,ExperimentTag,http://www.ebi.ac.uk/efo/EFO_0000324,cell type,http://purl.obolibrary.org/obo/CL_0002368,respiratory epithelial cell,GPL96
2,rat,GSE2872,ExperimentTag,http://www.ebi.ac.uk/efo/EFO_0000324,cell type,http://purl.obolibrary.org/obo/CL_0001031,cerebellar granule cell,GPL1355
3,mouse,GSE4523,ExperimentTag,http://www.ebi.ac.uk/efo/EFO_0000635,organism part,http://purl.obolibrary.org/obo/UBERON_0000955,brain,GPL1261
4,human,GSE4036,ExperimentTag,http://www.ebi.ac.uk/efo/EFO_0000635,organism part,http://purl.obolibrary.org/obo/UBERON_0002037,cerebellum,GPL570
...,...,...,...,...,...,...,...,...
19783,human,GSE5281,FactorValue,http://www.ebi.ac.uk/efo/EFO_0000635,organism part,http://purl.obolibrary.org/obo/UBERON_0002740,posterior cingulate gyrus,GPL570
19784,human,GSE5281,FactorValue,http://www.ebi.ac.uk/efo/EFO_0000635,organism part,http://purl.obolibrary.org/obo/UBERON_0002661,superior frontal gyrus,GPL570
19785,human,GSE5281,FactorValue,http://www.ebi.ac.uk/efo/EFO_0000635,organism part,http://purl.obolibrary.org/obo/UBERON_0002436,primary visual cortex,GPL570
19786,human,GSE5281,FactorValue,http://www.ebi.ac.uk/efo/EFO_0000635,organism part,http://purl.obolibrary.org/obo/UBERON_0001954,Ammon's horn,GPL570


In [7]:
ontology_url_dict = {
    "organism part" : {None : pd.NA},
    "cell type" : {None : pd.NA},
    "cell line" : {None : pd.NA},
}

for i, row in df.drop_duplicates(['class.Name','term.URL','term.Name'])[['class.Name','term.URL','term.Name']].iterrows():
    if row['term.Name'] not in ontology_url_dict[row['class.Name']]:
        ontology_url_dict[row['class.Name']][row['term.Name']] = row['term.URL']

In [8]:
df_human_only = df.loc[df["taxon.Name"] == "human"]
df_GPL570 = df[df['platform.ShortName'].str.contains("GPL570")]

### Process data, arrange metadata, prep manual curation check

#### functions

In [9]:
def is_control(sample_data):
    """ TODO
    this function is adapted from R code given by the PavlidisLab team behind Gemma"""
    
    
    # The recommended version strictly limits to terms that are present in ontologies Gemma supports.
    # The commented out section is for IFF you want to include free text terms that are not in ontology form.
    # Note: Use at own risk.
    controlPatternQuery = '|'.join(['control',
                                    'reference substance role',
                                    'reference subject role',
                                    'baseline participant role',
                                    'wild type genotype',
                                    'initial time point',
    #                                'phosphate buffered saline', #see note
    #                                'control group',  #see note
    #                                'normal', #see note
    #                                'untreated', #see note
    #                                'baseline', #see note
    #                                'wild type', #see note
                                   ])



    positiveFlag = False
    
    factorValues = sample_data["sample"]["factorValues"].values()
    samplePositiveCount = [None]*len(factorValues)
    for i, fvObject in enumerate(factorValues):
        #print(fvObject)
        
        if not fvObject:
                continue
                
        valueGrep = re.search(controlPatternQuery, fvObject) is not None
        
        if valueGrep:
            samplePositiveCount[i] = True

    if any(samplePositiveCount):
        return True
    return False


def get_sample_acc(sample_data):
    return sample_data["accession"]["accession"]


def get_alt_sample_acc(sample_data):
    return sample_data["sample"]["name"]
    
    
def get_term_FV(sample_data, term_type):
    match = ""
    FV = sample_data["sample"]["factorValues"]
    for fvObject in FV.values():
        if not fvObject: continue
        fvObject = fvObject.split("::::")
        for part in fvObject:
            if "Category = "+term_type in part:
                if match: 
                    match += ";"+part.split("Value = ")[1]
                else:
                    match = part.split("Value = ")[1]
    if not match or match == "Male/Female; Normal/Diseased":
        match = sample_data["sample"]["characteristicValues"][term_type] #fallback value
    return match


In [10]:
terms_per_ds = {}
for ds in df["experiment.ShortName"].unique():
    #if ds in error_ds: continue 
    subdf = df.loc[df["experiment.ShortName"] == ds]
    terms_per_ds[ds] = {}
    for term in subdf["class.Name"].unique():
        terms_per_ds[ds][term] = list(subdf.loc[subdf["class.Name"] == term]["term.Name"].unique())

In [11]:
for ds in df["experiment.ShortName"].unique():
    rows =  df.loc[df["experiment.ShortName"]== ds]
    if len(set(rows["taxon.Name"])) != 1:
        raise Warning("too many taxa")

In [12]:
path = os.path.join(data_dir, "metadata", "dataset_sample_metadata")

sample_dict = {}
for ds, categories in terms_per_ds.items():
    ETflag = False
    rows =  df.loc[df["experiment.ShortName"]== ds]
    if "ExperimentTag" in list(rows["class.Type"]):
        ETflag = True
    taxon = list(rows["taxon.Name"])[0]
    pltfm = list(rows["platform.ShortName"])[0]
    if ds == "McLean Hippocampus":
        with open(os.path.join(path, '670/sample_metadata.json'), 'r') as f:
            ds_data = json.load(f)
    else:
        with open(os.path.join(path, ds,'sample_metadata.json'), 'r') as f:
            ds_data = json.load(f)
    
    for j, sample in enumerate(ds_data['data']):
        if not is_control(sample):
            continue
        try:
            acc = get_sample_acc(sample)
        except TypeError:
            acc = get_alt_sample_acc(sample)
        sample_dict[acc] = {"dataset" : ds, 
                            "platform" : pltfm,
                            "species" : taxon, 
                            "original organism part" : pd.NA,
                            "organism part" : pd.NA,
                            "organism part URL": pd.NA,
                            "original cell type" : pd.NA,
                            "cell type" : pd.NA, 
                            "cell type URL" : pd.NA,
                            "original cell line" : pd.NA,
                            "cell line" : pd.NA,
                            "cell line URL" : pd.NA,
                            "manual check": "",
                           }  
        if ETflag:
            for term_type, terms in categories.items():
                for term in terms:    
                    sample_dict[acc][term_type] = term
                    sample_dict[acc][term_type+" URL"] = ontology_url_dict[term_type][term]
        else:
            for term_type, terms in categories.items():
                try:
                    term = get_term_FV(sample, term_type)
                except KeyError:
                    continue
                if term == "Male/Female; Normal/Diseased":
                    continue
                sample_dict[acc]["original "+term_type] = term
                match = []
                iters = 0
                similarity_cutoff = .8
                match = get_close_matches(term, terms, 1, similarity_cutoff)
                if not match: #no match found
                    for word in term.split(): #try and break up the words
                        match = get_close_matches(word, terms, 1, similarity_cutoff)
                        if match: 
                            sample_dict[acc]["manual check"] += f"{j} word split on {term_type};"
                            break
                if not match: #still no match
                    for word in term.split(";"): #try and break up parts
                        match = get_close_matches(word, terms, 1, similarity_cutoff)
                        if match: 
                            sample_dict[acc]["manual check"] += f"{j} semicolon split on {term_type};"
                            break
                while len(match) == 0: #still no match, lower standards for match
                    similarity_cutoff -= .05
                    match = get_close_matches(term, terms, 1, similarity_cutoff)
                    if match and similarity_cutoff >.35:
                        sample_dict[acc]["manual check"] += f"{j} {round(similarity_cutoff, 2)} on {term_type} {terms};"
                    if similarity_cutoff < .35:
                        match = [None]
                        sample_dict[acc]["manual check"] += f"{j} check {term_type} {terms};"
                        break
                sample_dict[acc][term_type] = match[0]
                sample_dict[acc][term_type+" URL"] = ontology_url_dict[term_type][match[0]]
        

In [13]:
sample_df = pd.DataFrame.from_dict(sample_dict, orient="index")
print(len(sample_df))
sample_df

124451


Unnamed: 0,dataset,platform,species,original organism part,organism part,organism part URL,original cell type,cell type,cell type URL,original cell line,cell line,cell line URL,manual check
GSM36426,GSE2018,GPL96,human,,lung,http://purl.obolibrary.org/obo/UBERON_0002048,,respiratory epithelial cell,http://purl.obolibrary.org/obo/CL_0002368,,,,
GSM36437,GSE2018,GPL96,human,,lung,http://purl.obolibrary.org/obo/UBERON_0002048,,respiratory epithelial cell,http://purl.obolibrary.org/obo/CL_0002368,,,,
GSM36448,GSE2018,GPL96,human,,lung,http://purl.obolibrary.org/obo/UBERON_0002048,,respiratory epithelial cell,http://purl.obolibrary.org/obo/CL_0002368,,,,
GSM36428,GSE2018,GPL96,human,,lung,http://purl.obolibrary.org/obo/UBERON_0002048,,respiratory epithelial cell,http://purl.obolibrary.org/obo/CL_0002368,,,,
GSM36439,GSE2018,GPL96,human,,lung,http://purl.obolibrary.org/obo/UBERON_0002048,,respiratory epithelial cell,http://purl.obolibrary.org/obo/CL_0002368,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
GSM4742254,GSE156689.1,Generic_mouse_ncbiIds,mouse,,cortex,http://purl.obolibrary.org/obo/UBERON_0001851,,blood vessel endothelial cell,http://purl.obolibrary.org/obo/CL_0000071,,,,
GSM4742256,GSE156689.1,Generic_mouse_ncbiIds,mouse,,cortex,http://purl.obolibrary.org/obo/UBERON_0001851,,blood vessel endothelial cell,http://purl.obolibrary.org/obo/CL_0000071,,,,
GSM4742266,GSE156689.2,Generic_mouse_ncbiIds,mouse,,barrel cortex,http://purl.obolibrary.org/obo/UBERON_0010415,,blood vessel endothelial cell,http://purl.obolibrary.org/obo/CL_0000071,,,,
GSM4742265,GSE156689.2,Generic_mouse_ncbiIds,mouse,,barrel cortex,http://purl.obolibrary.org/obo/UBERON_0010415,,blood vessel endothelial cell,http://purl.obolibrary.org/obo/CL_0000071,,,,


In [14]:
print(len(sample_df["organism part"].unique()), "tissues, totaling in", len(sample_df.loc[sample_df["organism part"].notnull()]), "samples from",  len(sample_df.loc[sample_df["organism part"].notnull()]["dataset"].unique()),"datasets")
print(len(sample_df["cell type"].unique()), "cell type, totaling in", len(sample_df.loc[sample_df["cell type"].notnull()]), "samples from",  len(sample_df.loc[sample_df["cell type"].notnull()]["dataset"].unique()),"datasets")
print(len(sample_df["cell line"].unique()), "cell line, totaling in", len(sample_df.loc[sample_df["cell line"].notnull()]), "samples from",  len(sample_df.loc[sample_df["cell line"].notnull()]["dataset"].unique()),"datasets")

895 tissues, totaling in 88424 samples from 5356 datasets
554 cell type, totaling in 36984 samples from 2757 datasets
622 cell line, totaling in 13115 samples from 1150 datasets


In [15]:
sample_df.to_csv(os.path.join(data_dir, "metadata", "metadata_before_curation.tsv"), sep='\t', index=True)
dropped = sample_df.drop_duplicates()
dropped.to_csv(os.path.join(data_dir, "metadata", "metadata_for_manual_curation.tsv"), sep='\t', index=True)

In [16]:
print(len(dropped))
len(dropped.loc[~dropped["manual check"].isin([""])])

13608


3338

In [17]:
platform_to_keep = "GPL570"

In [18]:
subset_sample_df = sample_df[sample_df['platform'].str.contains(platform_to_keep)]
subset_sample_df.to_csv(os.path.join(data_dir, "metadata", "metadata_before_curation_shortened.tsv"), sep='\t', index=True)
dropped_subset = subset_sample_df.drop_duplicates()
dropped_subset.to_csv(os.path.join(data_dir, "metadata", "metadata_for_manual_curation_shortened.tsv"), sep='\t', index=True)
print(len(dropped_subset))
len(dropped_subset.loc[~dropped_subset["manual check"].isin([""])])

1570


323