# Preliminary stats on metadata for ContNeXt

In [1]:
import getpass
import json
import os
import sys
import time
from urllib import request

import pandas as pd
from tqdm import tqdm

In [2]:
# Provenance: show the login name of the user who ran this notebook.
getpass.getuser()

'rfigueiredo'

In [3]:
# Provenance: show the version string of the Python interpreter used for this run.
sys.version

'3.9.1 (v3.9.1:1e5d33e9b9, Dec  7 2020, 12:10:52) \n[Clang 6.0 (clang-600.0.57)]'

In [4]:
# Provenance: show the local date and time at which this cell was executed.
time.asctime()

'Fri Jan  7 15:39:10 2022'

In [5]:
# Root of the external data directory. Every path this notebook reads from or
# writes to is built from it, so edit this line if your copy of the data does
# not live in ~/contnext_data/data.
data_dir = os.path.join(os.path.expanduser("~"), "contnext_data", "data")

### Load Gemma dump

In [6]:
# Read the Gemma ontology dump and keep only the annotation classes analysed
# in this notebook (organism part, cell type, cell line).
df = pd.read_table(
    os.path.join(data_dir, "metadata", "gemma_dump_ontologies.tsv")
).loc[
    lambda frame: frame["class.Name"].isin(['organism part', 'cell type', 'cell line'])
]
df

Unnamed: 0,taxon.Name,experiment.ShortName,class.Type,class.URL,class.Name,term.URL,term.Name
0,human,GSE2018,ExperimentTag,http://www.ebi.ac.uk/efo/EFO_0000635,organism part,http://purl.obolibrary.org/obo/UBERON_0002048,lung
1,human,GSE2018,ExperimentTag,http://www.ebi.ac.uk/efo/EFO_0000324,cell type,http://purl.obolibrary.org/obo/CL_0002368,respiratory epithelial cell
6,rat,GSE2872,ExperimentTag,http://www.ebi.ac.uk/efo/EFO_0000324,cell type,http://purl.obolibrary.org/obo/CL_0001031,cerebellar granule cell
11,mouse,GSE4523,ExperimentTag,http://www.ebi.ac.uk/efo/EFO_0000635,organism part,http://purl.obolibrary.org/obo/UBERON_0000955,brain
13,human,GSE4036,ExperimentTag,http://www.ebi.ac.uk/efo/EFO_0000635,organism part,http://purl.obolibrary.org/obo/UBERON_0002037,cerebellum
...,...,...,...,...,...,...,...
58678,human,GSE5281,FactorValue,http://www.ebi.ac.uk/efo/EFO_0000635,organism part,http://purl.obolibrary.org/obo/UBERON_0002740,posterior cingulate gyrus
58679,human,GSE5281,FactorValue,http://www.ebi.ac.uk/efo/EFO_0000635,organism part,http://purl.obolibrary.org/obo/UBERON_0002661,superior frontal gyrus
58680,human,GSE5281,FactorValue,http://www.ebi.ac.uk/efo/EFO_0000635,organism part,http://purl.obolibrary.org/obo/UBERON_0002436,primary visual cortex
58681,human,GSE5281,FactorValue,http://www.ebi.ac.uk/efo/EFO_0000635,organism part,http://purl.obolibrary.org/obo/UBERON_0001954,Ammon's horn


### Add platform column

In [7]:
def get_platform(dataset):
    """Return the platform short names of a Gemma dataset as one string.

    Requests ``/rest/v2/datasets/{dataset}/platforms`` from the Gemma REST API
    and joins the ``shortName`` of every entry under the response's ``"data"``
    key with ``", "``.

    Parameters
    ----------
    dataset : str
        Dataset identifier used in the request URL (the notebook passes the
        ``experiment.ShortName`` values, e.g. ``"GSE2018"``).

    Returns
    -------
    str
        Comma-separated platform short names; an empty string when the
        response lists no platform.

    Raises
    ------
    Exception
        Whatever ``urlopen`` or the JSON decoding raises (network/HTTP errors,
        malformed payload, missing ``"data"``/``"shortName"`` keys) is
        propagated to the caller.
    """
    # Local import so the function works without touching the notebook's
    # import cell.
    from urllib.parse import quote

    # Workaround: this dataset cannot be requested by its name (escaping the
    # space does not help), so it is requested as "670" instead --
    # presumably its numeric Gemma ID, TODO confirm.
    if dataset == "McLean Hippocampus":
        dataset = "670"

    # Percent-encode the identifier as a single path segment: spaces become
    # %20 as before, and any other URL-unsafe character is escaped as well.
    encoded = quote(dataset, safe="")
    url = f"https://gemma.msl.ubc.ca/rest/v2/datasets/{encoded}/platforms"

    # The context manager closes the HTTP response even if decoding fails.
    with request.urlopen(url) as response:
        encoding = response.info().get_content_charset('utf-8')
        contents = json.loads(response.read().decode(encoding))

    return ", ".join(platform["shortName"] for platform in contents["data"])

In [8]:
# Load the dataset -> platform cache written by a previous run. A missing or
# unreadable/corrupt cache file simply means starting from an empty cache.
platforms_file = os.path.join(data_dir, "metadata", 'platforms.json')
try:
    with open(platforms_file, 'r') as f:
        platform_dict = json.load(f)
except (OSError, ValueError):  # ValueError covers json.JSONDecodeError
    platform_dict = {}

dict_changed = False

datasets = df["experiment.ShortName"]
platforms = []        # one entry per row of `datasets` (pd.NA when unknown)
error_datasets = []   # datasets whose request raised, each recorded once
unresolved = set()    # datasets already looked up in this run without a result

for dataset in tqdm(datasets):
    if dataset in platform_dict:
        platforms.append(platform_dict[dataset])
        continue
    # `datasets` repeats the same ID on many rows; do not hit the API again
    # for an ID that already failed or came back empty during this run.
    if dataset in unresolved:
        platforms.append(pd.NA)
        continue
    try:
        # An empty answer (no platform listed) is stored as missing.
        pltfm = get_platform(dataset) or pd.NA
    except Exception:
        # Best effort: any request/parsing failure marks the dataset as
        # missing. `Exception` (rather than a bare except) keeps Ctrl-C /
        # kernel interrupts able to stop the loop.
        pltfm = pd.NA
        error_datasets.append(dataset)
    platforms.append(pltfm)
    if pd.isnull(pltfm):
        unresolved.add(dataset)
    else:
        dict_changed = True
        platform_dict[dataset] = pltfm

# Only rewrite the cache file when at least one new platform was resolved.
if dict_changed:
    with open(platforms_file, 'w') as f:
        json.dump(platform_dict, f)

100%|██████████| 20261/20261 [01:11<00:00, 284.86it/s]


In [9]:
# Sanity check: the loop above appends exactly one platform entry per row of
# `datasets`, so both lengths must be equal.
len(platforms), len(datasets)

(20261, 20261)

In [10]:
# Attach the platform annotation (one entry per row of df; pd.NA when no
# platform could be retrieved).
df['platform.ShortName'] = platforms
# NOTE(review): dropna() without `subset=` discards rows that have a missing
# value in ANY column, not only rows lacking a platform -- confirm that this is
# the intended filter.
df_subset = df.dropna()

In [11]:
# Preview the first 20 rows that survived the dropna() filter.
df_subset.head(20)

Unnamed: 0,taxon.Name,experiment.ShortName,class.Type,class.URL,class.Name,term.URL,term.Name,platform.ShortName
0,human,GSE2018,ExperimentTag,http://www.ebi.ac.uk/efo/EFO_0000635,organism part,http://purl.obolibrary.org/obo/UBERON_0002048,lung,GPL96
1,human,GSE2018,ExperimentTag,http://www.ebi.ac.uk/efo/EFO_0000324,cell type,http://purl.obolibrary.org/obo/CL_0002368,respiratory epithelial cell,GPL96
6,rat,GSE2872,ExperimentTag,http://www.ebi.ac.uk/efo/EFO_0000324,cell type,http://purl.obolibrary.org/obo/CL_0001031,cerebellar granule cell,GPL1355
11,mouse,GSE4523,ExperimentTag,http://www.ebi.ac.uk/efo/EFO_0000635,organism part,http://purl.obolibrary.org/obo/UBERON_0000955,brain,GPL1261
13,human,GSE4036,ExperimentTag,http://www.ebi.ac.uk/efo/EFO_0000635,organism part,http://purl.obolibrary.org/obo/UBERON_0002037,cerebellum,GPL570
17,mouse,GSE4034,FactorValue,http://www.ebi.ac.uk/efo/EFO_0000635,organism part,http://purl.obolibrary.org/obo/UBERON_0001954,Ammon's horn,GPL1261
20,mouse,GSE4034,FactorValue,http://www.ebi.ac.uk/efo/EFO_0000635,organism part,http://purl.obolibrary.org/obo/UBERON_0001876,amygdala,GPL1261
22,mouse,GSE2866,FactorValue,http://www.ebi.ac.uk/efo/EFO_0000635,organism part,http://purl.obolibrary.org/obo/UBERON_0002037,cerebellum,GPL81
23,mouse,GSE2866,FactorValue,http://www.ebi.ac.uk/efo/EFO_0000635,organism part,http://purl.obolibrary.org/obo/UBERON_0000956,cerebral cortex,GPL81
24,mouse,GSE2866,FactorValue,http://www.ebi.ac.uk/efo/EFO_0000635,organism part,http://purl.obolibrary.org/obo/UBERON_0001954,Ammon's horn,GPL81


In [12]:
# Rows annotated with the taxon "human".
df_human_only = df_subset.loc[df_subset["taxon.Name"] == "human"]
# Rows whose platform list contains GPL570. 'platform.ShortName' holds one or
# more IDs joined by ", ", so the pattern matches GPL570 only as a complete
# list entry; a plain substring test would also select IDs that merely start
# with the same characters (e.g. GPL5700).
df_GPL570 = df_subset[df_subset['platform.ShortName'].str.contains(r"(?:^|, )GPL570(?:, |$)")]

### Get sample metadata

In [13]:
# Directory holding one sub-directory per dataset with its cached sample
# metadata. exist_ok=True creates it when missing without the
# check-then-create race of a separate os.path.exists() test.
path = os.path.join(data_dir, "metadata", "dataset_sample_metadata")
os.makedirs(path, exist_ok=True)
    
def get_sample_metadata(dataset, path):
    """Return the sample metadata of a Gemma dataset, caching it on disk.

    The metadata is stored in ``<path>/<dataset>/sample_metadata.json``. When
    that file already exists it is loaded and returned without any network
    access; otherwise ``/rest/v2/datasets/{dataset}/samples`` is requested
    from the Gemma REST API, the decoded JSON is written to the cache file and
    then returned.

    Parameters
    ----------
    dataset : str
        Dataset identifier; used both in the request URL and as the name of
        the cache sub-directory.
    path : str
        Directory under which the per-dataset cache directories live.

    Returns
    -------
    object
        The decoded JSON document (cached or freshly downloaded).

    Raises
    ------
    Exception
        Errors from ``urlopen``, JSON decoding or file access are propagated
        to the caller.
    """
    # Local import so the function works without touching the notebook's
    # import cell.
    from urllib.parse import quote

    # Workaround: this dataset is requested (and cached) as "670" instead of
    # by its name -- presumably its numeric Gemma ID, TODO confirm.
    if dataset == "McLean Hippocampus":
        dataset = "670"

    dataset_dir = os.path.join(path, dataset)
    cache_file = os.path.join(dataset_dir, 'sample_metadata.json')

    if os.path.exists(cache_file):
        with open(cache_file, 'r') as f:
            return json.load(f)

    # Percent-encode the identifier so that spaces and other URL-unsafe
    # characters yield a valid request, as get_platform does for spaces.
    encoded = quote(dataset, safe="")
    url = f"https://gemma.msl.ubc.ca/rest/v2/datasets/{encoded}/samples"

    # The context manager closes the HTTP response even if decoding fails.
    with request.urlopen(url) as response:
        encoding = response.info().get_content_charset('utf-8')
        contents = json.loads(response.read().decode(encoding))

    os.makedirs(dataset_dir, exist_ok=True)
    # Write to a temporary file and rename it into place, so an interrupted
    # run can never leave a truncated cache file that later fails to parse.
    tmp_file = cache_file + ".tmp"
    with open(tmp_file, 'w') as f:
        json.dump(contents, f)
    os.replace(tmp_file, cache_file)
    return contents

In [14]:
# Download (or load from the on-disk cache) the sample metadata of every
# distinct dataset; the return value is not needed here, only the cache files.
datasets = df_subset["experiment.ShortName"].unique()
error_ds = []  # datasets whose sample metadata could not be obtained

for dataset in tqdm(datasets):
    try:
        get_sample_metadata(dataset, path)
    except Exception:
        # Best effort: remember the failing dataset and carry on. Catching
        # `Exception` instead of using a bare except keeps Ctrl-C / kernel
        # interrupts able to stop this long-running loop.
        error_ds.append(dataset)

100%|██████████| 10218/10218 [00:43<00:00, 234.57it/s]


In [15]:
# Discard every row that belongs to a dataset whose sample-metadata download
# failed, and report how many rows that removed.
length = df_subset.shape[0]
df_subset = df_subset[~df_subset["experiment.ShortName"].isin(error_ds)]
print(length - df_subset.shape[0], "removed rows due to missing sample data")

0 removed rows due to missing sample data


### Save the filtered table

In [16]:
# Write the filtered table to <data_dir>/metadata/gemma_dump_FINAL.tsv as
# tab-separated text; index=False leaves the DataFrame index out of the file.
df_subset.to_csv(os.path.join(data_dir, "metadata", "gemma_dump_FINAL.tsv"), sep='\t', index=False)

### Preliminary stats

#### Helper functions

In [17]:
def flatten_list(l):
    """Expand the ", "-joined entries of *l* into individual items.

    An entry that does not contain ", " is kept unchanged; an entry that does
    is split on ", " and replaced by its pieces. The input order is preserved
    and duplicates are NOT removed.
    """
    flat = []
    for entry in l:
        pieces = entry.split(", ") if ", " in entry else [entry]
        flat.extend(pieces)
    return flat

In [18]:
def make_stats(category, overall_df, min_datasets=5):
    """Count datasets and platforms per term of one annotation class.

    Parameters
    ----------
    category : str
        Value of the ``class.Name`` column to analyse (e.g. "organism part").
    overall_df : pandas.DataFrame
        Table with the columns ``class.Name``, ``term.Name``,
        ``experiment.ShortName`` and ``platform.ShortName``. A platform entry
        may list several platforms joined by ", ".
    min_datasets : int, optional
        Minimum number of distinct datasets a term needs to be reported
        (default 5).

    Returns
    -------
    pandas.DataFrame
        One row per reported term, indexed by term (index name = *category*),
        with the columns ``number_datasets`` (distinct datasets) and
        ``number_platforms`` (distinct platforms), sorted by
        ``number_datasets`` in descending order. Terms with equal counts keep
        their order of first appearance. The two columns are present even
        when no term qualifies.
    """
    columns = ["number_datasets", "number_platforms"]
    rows = overall_df.loc[overall_df["class.Name"] == category]

    records = []
    # One pass over the groups instead of re-filtering the frame per term;
    # sort=False keeps the terms in order of first appearance.
    for term, group in rows.groupby("term.Name", sort=False):
        number_datasets = group["experiment.ShortName"].nunique(dropna=False)
        if number_datasets < min_datasets:
            continue
        # A set counts every platform once, even when it occurs both alone
        # and inside a combined "A, B" entry; missing entries are ignored.
        platform_names = {
            name
            for entry in group["platform.ShortName"].unique()
            if isinstance(entry, str)
            for name in entry.split(", ")
        }
        records.append(
            (
                term,
                {
                    "number_datasets": number_datasets,
                    "number_platforms": len(platform_names),
                },
            )
        )

    # sorted() is stable, so ties remain in order of first appearance.
    records.sort(key=lambda record: record[1]["number_datasets"], reverse=True)

    stats_df = pd.DataFrame(
        [stats for _, stats in records],
        index=[term for term, _ in records],
        columns=columns,
    )
    stats_df.index.name = category
    return stats_df

#### Tissue stats

In [19]:
# Tissue ("organism part") statistics over all rows of df_subset; head() shows
# the five terms with the most datasets.
tissue_stats_from_all = make_stats("organism part", df_subset)
tissue_stats_from_all.head()

Unnamed: 0_level_0,number_datasets,number_platforms
organism part,Unnamed: 1_level_1,Unnamed: 2_level_1
brain,671,112
liver,554,94
Ammon's horn,454,79
blood,454,66
cerebral cortex,334,67


In [20]:
# Tissue ("organism part") statistics restricted to the human rows
# (df_human_only); head() shows the five terms with the most datasets.
tissue_stats_from_human = make_stats("organism part", df_human_only)
tissue_stats_from_human.head()

Unnamed: 0_level_0,number_datasets,number_platforms
organism part,Unnamed: 1_level_1,Unnamed: 2_level_1
blood,414,47
brain,165,35
breast,110,34
bone marrow,110,22
lung,109,33


In [21]:
# Tissue ("organism part") statistics restricted to the rows selected for
# platform GPL570 (df_GPL570); head() shows the five terms with the most datasets.
tissue_stats_from_topplatform = make_stats("organism part", df_GPL570)
tissue_stats_from_topplatform.head()

Unnamed: 0_level_0,number_datasets,number_platforms
organism part,Unnamed: 1_level_1,Unnamed: 2_level_1
blood,126,1
bone marrow,42,1
brain,42,1
lung,33,1
breast,31,1


#### Cell line stats

In [22]:
# Cell line statistics over all rows of df_subset; head() shows the five terms
# with the most datasets.
cell_line_stats_from_all = make_stats("cell line", df_subset)
cell_line_stats_from_all.head()

Unnamed: 0_level_0,number_datasets,number_platforms
cell line,Unnamed: 1_level_1,Unnamed: 2_level_1
MCF7 cell,103,26
iPSC derived cell line,48,9
lymphoblastoid cell line,39,13
A549 cell,35,14
LNCaP cell,33,12


In [23]:
# Cell line statistics restricted to the human rows (df_human_only); head()
# shows the five terms with the most datasets.
cell_line_stats_from_human = make_stats("cell line", df_human_only)
cell_line_stats_from_human.head()

Unnamed: 0_level_0,number_datasets,number_platforms
cell line,Unnamed: 1_level_1,Unnamed: 2_level_1
MCF7 cell,103,26
iPSC derived cell line,47,8
lymphoblastoid cell line,39,13
A549 cell,35,14
LNCaP cell,33,12


In [24]:
# Cell line statistics restricted to the rows selected for platform GPL570
# (df_GPL570); head() shows the five terms with the most datasets.
cell_line_stats_from_topplatform = make_stats("cell line", df_GPL570)
cell_line_stats_from_topplatform.head()

Unnamed: 0_level_0,number_datasets,number_platforms
cell line,Unnamed: 1_level_1,Unnamed: 2_level_1
MCF7 cell,30,1
A549 cell,13,1
HCT 116 cell,9,1
lymphoblastoid cell line,8,1
HT-29 cell,8,1


#### Cell type stats

In [25]:
# Cell type statistics over all rows of df_subset; head() shows the five terms
# with the most datasets.
cell_type_stats_from_all = make_stats("cell type", df_subset)
cell_type_stats_from_all.head()

Unnamed: 0_level_0,number_datasets,number_platforms
cell type,Unnamed: 1_level_1,Unnamed: 2_level_1
embryonic stem cell,248,48
neuronal stem cell,201,42
neuron,191,40
microglial cell,191,30
peripheral blood mononuclear cell,165,32


In [26]:
# Cell type statistics restricted to the human rows (df_human_only); head()
# shows the five terms with the most datasets.
cell_type_stats_from_human = make_stats("cell type", df_human_only)
cell_type_stats_from_human.head()

Unnamed: 0_level_0,number_datasets,number_platforms
cell type,Unnamed: 1_level_1,Unnamed: 2_level_1
peripheral blood mononuclear cell,162,29
fibroblast,123,32
embryonic stem cell,91,21
neuronal stem cell,85,20
B cell,63,24


In [27]:
# Cell type statistics restricted to the rows selected for platform GPL570
# (df_GPL570); head() shows the five terms with the most datasets.
cell_type_stats_from_topplatform = make_stats("cell type", df_GPL570)
cell_type_stats_from_topplatform.head()

Unnamed: 0_level_0,number_datasets,number_platforms
cell type,Unnamed: 1_level_1,Unnamed: 2_level_1
peripheral blood mononuclear cell,55,1
T cell,25,1
fibroblast,22,1
monocyte,18,1
B cell,15,1
