# Finalize metadata for ContNeXt

In [1]:
import getpass
import json
import os
import sys
import time

import pandas as pd

In [2]:
# Show the login name of the user running the notebook.
getpass.getuser()

'rfigueiredo'

In [3]:
# Show the version string of the Python interpreter executing the notebook.
sys.version

'3.9.1 (v3.9.1:1e5d33e9b9, Dec  7 2020, 12:10:52) \n[Clang 6.0 (clang-600.0.57)]'

In [4]:
# Show the current local time as a human-readable timestamp.
time.asctime()

'Fri Jan  7 23:20:36 2022'

In [5]:
# Location of the external data directory; edit this to point at your copy.
home_dir = os.path.expanduser("~")
data_dir = os.path.join(home_dir, "contnext_data", "data")

### Use curation to finalize sample metadata

In [6]:
# Load the manual-curation table (first column as index) and turn missing
# cells into empty strings so tag comparisons work on plain text.
curation_path = os.path.join(data_dir, "metadata", "metadata_manual_curation_version.tsv")
curation = pd.read_table(curation_path, index_col=0).fillna("")

In [7]:
# List every distinct value of the "curated?" column, in sorted order.
sorted(curation["curated?"].unique())

['',
 'add cell line + URL',
 'add cell type + URL',
 'add org. part + URL',
 'add org. part + URL;add cell type + URL',
 'change cell type + URL',
 'change org. part + URL',
 'delete',
 'ok',
 'remove cell line + URL',
 'remove cell type + URL',
 'remove org. part + URL']

In [8]:
# Index labels of the samples whose curation tag is exactly "delete".
is_marked_delete = curation["curated?"] == "delete"
samples_to_delete = curation[is_marked_delete].index

In [9]:
# Atomic curation tags meaning "this sample's row was edited during curation".
changed_tags = [
    'add cell line + URL',
    'add cell type + URL',
    'add org. part + URL',
    'change cell type + URL',
    'change org. part + URL',
    'remove cell line + URL',
    'remove cell type + URL',
    'remove org. part + URL'
]
# A "curated?" cell can hold several tags joined by ";" (for example
# 'add org. part + URL;add cell type + URL'). Matching the whole cell against
# changed_tags would silently skip those samples, so test each part instead.
has_changed_tag = [
    any(part.strip() in changed_tags for part in str(cell).split(";"))
    for cell in curation["curated?"]
]
# Index labels of every sample carrying at least one "changed" tag.
changed_samples = curation.loc[has_changed_tag].index

In [10]:
# Load the pre-curation metadata table (tab-separated, first column as index).
original_metadata_path = os.path.join(data_dir, "metadata", "metadata_before_curation.tsv")
original_metadata = pd.read_csv(original_metadata_path, sep="\t", index_col=0)

In [11]:
# Remove the rows of all samples that curation marked for deletion.
updated_metadata = original_metadata.drop(index=samples_to_delete)

In [12]:
# Change updated samples: overwrite each changed row with its curated version.
for sample_id in changed_samples:
    curated_row = curation.loc[sample_id]
    updated_metadata.loc[sample_id] = curated_row

In [13]:
# Remove the manual-check flag and the "original ..." columns from the table.
columns_to_remove = [
    "manual check",
    "original organism part",
    "original cell type",
    "original cell line",
]
updated_metadata = updated_metadata.drop(columns=columns_to_remove)

In [14]:
# Deprecated cell type URLs and the URLs that replace them.
deprecated_ids = {
    "http://purl.obolibrary.org/obo/CL_0000493": "http://purl.obolibrary.org/obo/CL_0000815",
    "http://purl.obolibrary.org/obo/CL_0000230": "http://purl.obolibrary.org/obo/CL_0000084",
}
# Cell type names to rewrite alongside the URL changes.
changed_terms = {
    "T lymphoblast" : "T cell"
}
# One nested-dict replace: each outer key names the column, each inner dict
# maps old value -> new value within that column only.
updated_metadata = updated_metadata.replace({
    'cell type URL': deprecated_ids,
    'cell type': changed_terms,
})

### Re-map selected terms

In [15]:
# Source id -> target id, built from "target: (sources...)" groups so every
# target id and its label appear exactly once. Group order and the order of
# sources within a group fix the key order of the resulting dict.
remapping = {
    source_id: target_id
    for target_id, source_ids in {
        "0001013": ("0002190", "0014455"),  # adipose tissue
        "0001134": ("0006907",),  # skeletal muscle tissue
        "0004802": ("0002051", "0005384", "0001901", "0004814", "0004815"),  # respiratory tract epithelium
        "0004911": ("0001304",),  # epithelium of female gonad
        "0016529": ("0001870", "0016530", "0016538", "0016540"),  # cortex of cerebral lobe
        "0002081": ("0002079", "0002078"),  # cardiac atrium
        "0002082": ("0002084", "0002080"),  # cardiac ventricle
        "0001800": ("0001675", "0000044"),  # sensory ganglion
    }.items()
    for source_id in source_ids
}

In [16]:
 #id to name map
# Load the UBERON name mappings. JSON text is UTF-8, so decode it explicitly
# instead of relying on the platform's default locale encoding.
with open(os.path.join(data_dir, "mappings", "uberon_name_mappings.json"), 'r', encoding="utf-8") as f:
    uberon_name_mappings = json.load(f)

In [17]:
 # dict w/ full replacement strings for the df
url_prefix = "http://purl.obolibrary.org/obo/UBERON_"
# Full-URL version of the id remapping, for the URL column.
uberonURL_remapping = {
    f"{url_prefix}{source_id}": f"{url_prefix}{target_id}"
    for source_id, target_id in remapping.items()
}

# Name version of the same remapping, looked up through uberon_name_mappings.
names_remapping = {
    uberon_name_mappings[source_id]: uberon_name_mappings[target_id]
    for source_id, target_id in remapping.items()
}

In [18]:
 # re-map chosen ids
# Apply both remappings in one nested-dict replace; each outer key restricts
# its old -> new mapping to that single column.
updated_metadata = updated_metadata.replace({
    'organism part URL': uberonURL_remapping,
    'organism part': names_remapping,
})

In [19]:
# Write the finalized metadata as a tab-separated file, keeping the index.
final_metadata_path = os.path.join(data_dir, "metadata", "final_metadata.tsv")
updated_metadata.to_csv(final_metadata_path, sep='\t', index=True)