# Analyze metadata for ContNeXt

In [1]:
import getpass
import json
import os
import sys
import time

import pandas as pd

In [2]:
# Show the login name of the user running this notebook (run provenance).
getpass.getuser()

'rfigueiredo'

In [3]:
# Show the version string of the Python interpreter executing this notebook.
sys.version

'3.9.1 (v3.9.1:1e5d33e9b9, Dec  7 2020, 12:10:52) \n[Clang 6.0 (clang-600.0.57)]'

In [4]:
# Show the current local time as a human-readable string (timestamp of this run).
time.asctime()

'Fri Jan  7 22:39:37 2022'

In [5]:
# Location of the external data directory -- edit this to match your local setup.
# Defaults to <home>/contnext_data/data.
home_dir = os.path.expanduser("~")
data_dir = os.path.join(home_dir, "contnext_data", "data")

### Metadata for the top human platform (GPL570) + ontology term name mappings

In [6]:
# Read the tab-separated metadata table; its first column becomes the index.
metadata_path = os.path.join(data_dir, "metadata", "final_metadata.tsv")
metadata = pd.read_csv(metadata_path, sep="\t", index_col=0)

In [7]:
# Restrict the metadata to human samples measured on the GPL570 platform.
# na=False: a row with a missing platform is excluded; without it the mask would
# contain NaN and pandas refuses to index with such a mask.
is_gpl570 = metadata["platform"].str.contains("GPL570", na=False)
is_human = metadata["species"] == "human"
# rename_axis returns a new frame, so the index of `metadata` is never touched.
human_data = metadata.loc[is_gpl570 & is_human].rename_axis("sample_id")

In [8]:
# UBERON id -> tissue name lookup (used for the tissue overview table).
# JSON text is UTF-8 by specification, so decode it explicitly instead of relying
# on the platform's locale-dependent default encoding.
uberon_mappings_path = os.path.join(data_dir, "mappings", "uberon_name_mappings.json")
with open(uberon_mappings_path, "r", encoding="utf-8") as f:
    uberon_name_mappings = json.load(f)

In [9]:
# CL id -> cell type name lookup (used for the cell type overview table).
# JSON text is UTF-8 by specification, so decode it explicitly instead of relying
# on the platform's locale-dependent default encoding.
CL_mappings_path = os.path.join(data_dir, "mappings", "CL_name_mappings.json")
with open(CL_mappings_path, "r", encoding="utf-8") as f:
    CL_name_mappings = json.load(f)

In [10]:
# CLO name mappings loaded from JSON.
# JSON text is UTF-8 by specification, so decode it explicitly instead of relying
# on the platform's locale-dependent default encoding.
# NOTE(review): CLO_name_mappings is rebuilt from the metadata in a later cell
# before it is first read, so this load looks redundant -- confirm.
CLO_mappings_path = os.path.join(data_dir, "mappings", "CLO_name_mappings.json")
with open(CLO_mappings_path, "r", encoding="utf-8") as f:
    CLO_name_mappings = json.load(f)

### Term overview

#### tissues

In [11]:
# Keep only samples that carry a non-empty tissue label mapped to an UBERON term.
has_tissue_label = human_data["organism part"].notnull() & (human_data["organism part"] != "")
# na=False: a sample with a tissue label but no URL is simply dropped; without it
# the mask would contain NaN and the boolean indexing below would raise.
has_uberon_url = human_data["organism part URL"].str.contains("UBERON", na=False)
human_tissue_subset = human_data[has_tissue_label & has_uberon_url]

In [12]:
# Display the UBERON-annotated tissue subset (pandas truncates the rendered rows).
human_tissue_subset

Unnamed: 0_level_0,dataset,platform,species,organism part,organism part URL,cell type,cell type URL,cell line,cell line URL
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
GSM92476,GSE4036,GPL570,human,cerebellum,http://purl.obolibrary.org/obo/UBERON_0002037,,,,
GSM92487,GSE4036,GPL570,human,cerebellum,http://purl.obolibrary.org/obo/UBERON_0002037,,,,
GSM92478,GSE4036,GPL570,human,cerebellum,http://purl.obolibrary.org/obo/UBERON_0002037,,,,
GSM92480,GSE4036,GPL570,human,cerebellum,http://purl.obolibrary.org/obo/UBERON_0002037,,,,
GSM92460,GSE4036,GPL570,human,cerebellum,http://purl.obolibrary.org/obo/UBERON_0002037,,,,
...,...,...,...,...,...,...,...,...,...
GSM4763688,GSE157363,GPL570,human,white matter,http://purl.obolibrary.org/obo/UBERON_0002316,,,,
GSM4763694,GSE157363,GPL570,human,white matter,http://purl.obolibrary.org/obo/UBERON_0002316,,,,
GSM4763695,GSE157363,GPL570,human,white matter,http://purl.obolibrary.org/obo/UBERON_0002316,,,,
GSM4763692,GSE157363,GPL570,human,white matter,http://purl.obolibrary.org/obo/UBERON_0002316,,,,


In [13]:
# Number of samples per UBERON id (the part of the URL after the last "_").
# The URL is normalised to its bare id *before* counting, so that different
# spellings of the same term (e.g. stray surrounding whitespace) are summed
# together rather than overwriting each other as duplicate dict keys.
uberon_ids = human_tissue_subset["organism part URL"].map(lambda url: url.split("_")[-1].strip())
samples_per_uberon = uberon_ids.value_counts().to_dict()

In [14]:
# Number of distinct datasets/experiments per UBERON id.
# Grouping on the normalised id (not the raw URL) makes URL variants of one term
# share a single group; selecting the "dataset" column before aggregating avoids
# the deprecated pattern of groupby.apply over a frame that still contains the
# grouping column.
uberon_id_per_sample = human_tissue_subset["organism part URL"].map(lambda url: url.split("_")[-1].strip())
exp_per_uberon = (
    human_tissue_subset["dataset"]
    .groupby(uberon_id_per_sample.to_numpy())
    .nunique(dropna=False)  # dropna=False matches len(unique()), which counts NaN
    .to_dict()
)

In [15]:
# Write one row per UBERON id: id, tissue name, #experiments, #samples.
# Rows are ordered by descending sample count.
# NOTE(review): the cell type and cell line overviews are ordered by experiment
# count instead -- confirm which ordering is intended here.
tissue_overview_path = os.path.join(data_dir, "misc_data", "FULL_tissue_overview.tsv")
with open(tissue_overview_path, "w") as tissue_out:
    tissue_out.write("UBERON_id\ttissue_name\tnumber_experiments\tnumber_samples\n")
    ranked_uberon_ids = sorted(samples_per_uberon, key=samples_per_uberon.get, reverse=True)
    for uberon_id in ranked_uberon_ids:
        tissue_out.write(f"UBERON:{uberon_id}\t{uberon_name_mappings[uberon_id]}\t{exp_per_uberon[uberon_id]}\t{samples_per_uberon[uberon_id]}\n")

#### cell types

In [16]:
# Keep only samples that carry a non-empty cell type label mapped to a CL term.
has_celltype_label = human_data["cell type"].notnull() & (human_data["cell type"] != "")
# na=False: a sample with a cell type label but no URL is simply dropped; without
# it the mask would contain NaN and the boolean indexing below would raise.
has_CL_url = human_data["cell type URL"].str.contains("CL_", na=False)
human_celltype_subset = human_data[has_celltype_label & has_CL_url]

In [17]:
# Number of samples per CL id (the part of the URL after the last "_").
# The URL is normalised to its bare id *before* counting, so that different
# spellings of the same term (e.g. stray surrounding whitespace) are summed
# together rather than overwriting each other as duplicate dict keys.
CL_ids = human_celltype_subset["cell type URL"].map(lambda url: url.split("_")[-1].strip())
samples_per_CL = CL_ids.value_counts().to_dict()

In [18]:
# Number of distinct datasets/experiments per CL id.
# Grouping on the normalised id (not the raw URL) makes URL variants of one term
# share a single group; selecting the "dataset" column before aggregating avoids
# the deprecated pattern of groupby.apply over a frame that still contains the
# grouping column.
CL_id_per_sample = human_celltype_subset["cell type URL"].map(lambda url: url.split("_")[-1].strip())
exp_per_CL = (
    human_celltype_subset["dataset"]
    .groupby(CL_id_per_sample.to_numpy())
    .nunique(dropna=False)  # dropna=False matches len(unique()), which counts NaN
    .to_dict()
)

In [19]:
# Write one row per CL id: id, cell type name, #experiments, #samples.
# Rows are ordered by descending experiment count.
celltype_overview_path = os.path.join(data_dir, "misc_data", "FULL_celltype_overview.tsv")
with open(celltype_overview_path, "w") as celltype_out:
    celltype_out.write("CL_id\tcell_type_name\tnumber_experiments\tnumber_samples\n")
    ranked_CL_ids = sorted(exp_per_CL, key=exp_per_CL.get, reverse=True)
    for CL_id in ranked_CL_ids:
        celltype_out.write(f"CL:{CL_id}\t{CL_name_mappings[CL_id]}\t{exp_per_CL[CL_id]}\t{samples_per_CL[CL_id]}\n")

#### cell lines

In [20]:
# Keep only samples that carry a non-empty cell line label mapped to a CLO term.
has_cellline_label = human_data["cell line"].notnull() & (human_data["cell line"] != "")
# na=False: a sample with a cell line label but no URL is simply dropped; without
# it the mask would contain NaN and the boolean indexing below would raise.
has_CLO_url = human_data["cell line URL"].str.contains("CLO_", na=False)
human_cellline_subset = human_data[has_cellline_label & has_CLO_url]

In [21]:
# Number of samples per CLO id (the part of the URL after the last "_").
# The URL is normalised to its bare id *before* counting, so that different
# spellings of the same term (e.g. stray surrounding whitespace) are summed
# together rather than overwriting each other as duplicate dict keys.
CLO_ids = human_cellline_subset["cell line URL"].map(lambda url: url.split("_")[-1].strip())
samples_per_CLO = CLO_ids.value_counts().to_dict()

In [22]:
# Number of distinct datasets/experiments per CLO id.
# Grouping on the normalised id (not the raw URL) makes URL variants of one term
# share a single group; selecting the "dataset" column before aggregating avoids
# the deprecated pattern of groupby.apply over a frame that still contains the
# grouping column.
CLO_id_per_sample = human_cellline_subset["cell line URL"].map(lambda url: url.split("_")[-1].strip())
exp_per_CLO = (
    human_cellline_subset["dataset"]
    .groupby(CLO_id_per_sample.to_numpy())
    .nunique(dropna=False)  # dropna=False matches len(unique()), which counts NaN
    .to_dict()
)

In [25]:
# Build the CLO id -> cell line name lookup from the metadata itself, keeping the
# first name seen for each id. This replaces any CLO_name_mappings defined earlier.
# The id is stripped of surrounding whitespace so that its keys match those of
# samples_per_CLO / exp_per_CLO, which are stripped as well; otherwise the lookup
# when writing the overview could raise KeyError.
CLO_name_mappings = {}
cellline_urls = human_cellline_subset["cell line URL"]
cellline_names = human_cellline_subset["cell line"]
# Iterating the two columns in parallel avoids the per-row Series that iterrows builds.
for cellline_url, cellline_name in zip(cellline_urls, cellline_names):
    CLO_name_mappings.setdefault(cellline_url.split("_")[-1].strip(), cellline_name)

In [26]:
# Write one row per CLO id: id, cell line name, #experiments, #samples.
# Rows are ordered by descending experiment count.
cellline_overview_path = os.path.join(data_dir, "misc_data", "FULL_cellline_overview.tsv")
with open(cellline_overview_path, "w") as cellline_out:
    cellline_out.write("CLO_id\tcell_line_name\tnumber_experiments\tnumber_samples\n")
    ranked_CLO_ids = sorted(exp_per_CLO, key=exp_per_CLO.get, reverse=True)
    for CLO_id in ranked_CLO_ids:
        cellline_out.write(f"CLO:{CLO_id}\t{CLO_name_mappings[CLO_id]}\t{exp_per_CLO[CLO_id]}\t{samples_per_CLO[CLO_id]}\n")