# Prepare metadata for the R scripts to build the co-expression networks

In [1]:
import getpass
import json
import os
import sys
import time

import pandas as pd

In [2]:
# Show the login name of the user running this notebook.
# NOTE(review): the rendered output exposes a local username; consider clearing it before sharing.
getpass.getuser()

'rfigueiredo'

In [3]:
# Show the Python interpreter version string this notebook was executed with.
sys.version

'3.9.1 (v3.9.1:1e5d33e9b9, Dec  7 2020, 12:10:52) \n[Clang 6.0 (clang-600.0.57)]'

In [4]:
# Show the current local time as a string (timestamp of this run).
time.asctime()

'Sat Jan  8 00:10:47 2022'

In [5]:
# Location of the external data directory; adjust this path to your own setup.
# Defaults to <home>/contnext_data/data.
data_dir = os.path.expanduser(os.path.join("~", "contnext_data", "data"))

___

In [6]:
# Tissue overview table (tab-separated) from the misc_data folder.
tissues = pd.read_csv(
    os.path.join(data_dir, "misc_data", "FULL_tissue_overview.tsv"),
    sep="\t",
)

In [7]:
# Tissue overview table written after download (tab-separated).
tissues_afterDL = pd.read_csv(
    os.path.join(data_dir, "misc_data", "FULL_tissue_overview_after_download.tsv"),
    sep="\t",
)

In [8]:
# Cell type overview table (tab-separated) from the misc_data folder.
celltypes = pd.read_csv(
    os.path.join(data_dir, "misc_data", "FULL_celltype_overview.tsv"),
    sep="\t",
)

In [9]:
# Cell type overview table written after download (tab-separated).
celltypes_afterDL = pd.read_csv(
    os.path.join(data_dir, "misc_data", "FULL_celltype_overview_after_download.tsv"),
    sep="\t",
)

In [10]:
# Cell line overview table (tab-separated) from the misc_data folder.
celllines = pd.read_csv(
    os.path.join(data_dir, "misc_data", "FULL_cellline_overview.tsv"),
    sep="\t",
)

In [11]:
# Cell line overview table written after download (tab-separated).
celllines_afterDL = pd.read_csv(
    os.path.join(data_dir, "misc_data", "FULL_cellline_overview_after_download.tsv"),
    sep="\t",
)

___

In [12]:
# Display the tissue overview table (one row per UBERON id with experiment and sample counts).
tissues

Unnamed: 0,UBERON_id,tissue_name,number_experiments,number_samples
0,UBERON:0000178,blood,106,3152
1,UBERON:0002113,kidney,21,847
2,UBERON:0002371,bone marrow,21,787
3,UBERON:0001379,vastus lateralis,15,586
4,UBERON:0002097,skin of body,22,570
...,...,...,...,...
173,UBERON:0004087,vena cava,1,1
174,UBERON:0002245,cerebellar hemisphere,1,1
175,UBERON:0001875,globus pallidus,1,1
176,UBERON:0000995,uterus,1,1


In [13]:
# Names of tissues backed by at least 3 experiments and at least 20 samples,
# computed for the overview before and after download.
tis_terms = tissues.loc[
    tissues["number_experiments"].ge(3) & tissues["number_samples"].ge(20),
    "tissue_name",
]
tis_terms_afterDL = tissues_afterDL.loc[
    tissues_afterDL["number_experiments"].ge(3) & tissues_afterDL["number_samples"].ge(20),
    "tissue_name",
]

# How many tissues pass the thresholds in each table.
num_tissues = tis_terms.shape[0]
num_tissues_afterDL = tis_terms_afterDL.shape[0]

print(f"tissues meeting criteria in top platform: {num_tissues}, after download(top platform): {num_tissues_afterDL}")

tissues meeting criteria in top platform: 48, after download(top platform): 46


In [14]:
# Names of cell types backed by at least 3 experiments and at least 20 samples,
# computed for the overview before and after download.
celltype_terms = celltypes.loc[
    celltypes["number_experiments"].ge(3) & celltypes["number_samples"].ge(20),
    "cell_type_name",
]
celltype_terms_afterDL = celltypes_afterDL.loc[
    celltypes_afterDL["number_experiments"].ge(3) & celltypes_afterDL["number_samples"].ge(20),
    "cell_type_name",
]

# How many cell types pass the thresholds in each table.
num_celltypes = celltype_terms.shape[0]
num_celltypes_afterDL = celltype_terms_afterDL.shape[0]

print(f"cell types meeting criteria in top platform: {num_celltypes}, after download(top platform): {num_celltypes_afterDL}")

cell types meeting criteria in top platform: 30, after download(top platform): 30


In [15]:
# Names of cell lines backed by at least 3 experiments and at least 20 samples,
# computed for the overview before and after download.
cellline_terms = celllines.loc[
    celllines["number_experiments"].ge(3) & celllines["number_samples"].ge(20),
    "cell_line_name",
]
cellline_terms_afterDL = celllines_afterDL.loc[
    celllines_afterDL["number_experiments"].ge(3) & celllines_afterDL["number_samples"].ge(20),
    "cell_line_name",
]

# How many cell lines pass the thresholds in each table.
num_celllines = cellline_terms.shape[0]
num_celllines_afterDL = cellline_terms_afterDL.shape[0]

print(f"cell lines meeting criteria in top platform: {num_celllines}, after download(top platform): {num_celllines_afterDL}")

cell lines meeting criteria in top platform: 23, after download(top platform): 22


In [16]:
# Sample-level metadata; the first column becomes the index and is labelled "sample_id".
metadata = pd.read_csv(
    os.path.join(data_dir, "metadata", "final_metadata.tsv"),
    sep="\t",
    index_col=0,
).rename_axis("sample_id")

In [17]:
# Sample-level metadata after data loading; the first column becomes the index,
# labelled "sample_id".
metadata_afterDL = pd.read_csv(
    os.path.join(data_dir, "metadata", "metadataFinal_afterDataLoading.tsv"),
    sep="\t",
    index_col=0,
).rename_axis("sample_id")

In [18]:
# Disabled variant that applied the same filter to `metadata` (left commented out):
#human_data = metadata[metadata['platform'].str.contains("GPL570")]
#human_data = human_data.loc[human_data["species"] == "human"]

# Keep samples whose platform string contains "GPL570", then only the human ones.
human_data_after_DL = (
    metadata_afterDL
    .loc[lambda df: df["platform"].str.contains("GPL570")]
    .loc[lambda df: df["species"].eq("human")]
)

In [19]:
def prep_metadata_for_nets(samples_per_id, exp_per_id, metaData, term_type, name_mappings,
                           base_dir=None, min_samples=20, min_experiments=3):
    """Write per-group metadata and dataset lists for co-expression network construction.

    For every ID in ``samples_per_id`` that passes both thresholds, a folder
    ``<base_dir>/data_for_coexp_network_construction/<term_type>/<ID>`` is created
    containing:

    * ``metadata.tsv`` - the rows of ``metaData`` belonging to that ID
    * ``datasets.txt`` - the unique values of the ``dataset`` column, one per line

    A one-line summary is printed at the end.

    Parameters
    ----------
    samples_per_id : dict
        Maps group ID -> number of samples.
    exp_per_id : dict
        Maps group ID -> number of datasets/experiments. Only looked up for IDs
        that already pass the sample threshold.
    metaData : pandas.DataFrame
        Must contain the columns ``<term_type>.URL`` and ``dataset``.
    term_type : str
        Column prefix and output sub-folder name (e.g. ``"organism.part"``).
    name_mappings : dict
        Maps group ID -> group name.
    base_dir : str, optional
        Root data directory. Defaults to the notebook-level ``data_dir``.
    min_samples : int, optional
        Minimum number of samples a group needs (default 20).
    min_experiments : int, optional
        Minimum number of datasets a group needs (default 3).

    Returns
    -------
    tuple
        ``(group_name_list, group_id_list, datasets_total)``: names and IDs of the
        retained groups (in iteration order of ``samples_per_id``) and the set of
        all datasets that contribute to at least one retained group.

    Raises
    ------
    KeyError
        If a retained ID is missing from ``exp_per_id`` or ``name_mappings``.
    """
    if base_dir is None:
        base_dir = data_dir  # notebook-level default location

    path = os.path.join(base_dir, "data_for_coexp_network_construction", term_type)

    # Trailing ID of every URL (text after the last "_", whitespace stripped).
    # Rows are matched on this exact ID rather than with a regex substring search,
    # so an ID that happens to be contained in another URL no longer pulls in
    # rows (and datasets) that belong to a different group.
    row_ids = metaData[term_type + ".URL"].str.split("_").str[-1].str.strip()

    group_name_list = []
    group_id_list = []
    total_sample_num = 0
    datasets_total = set()
    for ID, num in samples_per_id.items():
        # thresholds; short-circuit keeps exp_per_id untouched for small groups
        if num < min_samples or exp_per_id[ID] < min_experiments:
            continue

        group_dir = os.path.join(path, ID)
        os.makedirs(group_dir, exist_ok=True)

        # save group-specific metadata
        metaD = metaData.loc[row_ids == ID]
        metaD.to_csv(os.path.join(group_dir, "metadata.tsv"), sep="\t", index=True)

        # save list of datasets that contain specific group
        datasets = list(metaD["dataset"].unique())
        with open(os.path.join(group_dir, "datasets.txt"), "w") as f:
            f.write("\n".join(datasets))
            f.write("\n")

        total_sample_num += num
        datasets_total.update(datasets)

        group_name_list.append(name_mappings[ID])
        group_id_list.append(ID)

    print(f"{total_sample_num} {term_type} samples from {len(datasets_total)} datasets with {len(group_name_list)} groups")
    return group_name_list, group_id_list, datasets_total


## tissue

In [20]:
# Load the UBERON lookup that is later passed as the id -> name mapping for tissue groups.
with open(os.path.join(data_dir, "mappings", "uberon_name_mappings.json")) as f:
    uberon_name_mappings = json.loads(f.read())

In [21]:
# Disabled variant operating on `human_data` (left commented out):
#human_tissue_subset = human_data[human_data["organism part"].notnull()]
#human_tissue_subset = human_tissue_subset[human_tissue_subset["organism part"] != ""]
#human_tissue_subset = human_tissue_subset[human_tissue_subset['organism part URL'].str.contains("UBERON")]

# Keep samples that have a non-missing, non-empty "organism.part" annotation
# whose URL contains "UBERON".
human_tissue_subset_afterDL = (
    human_data_after_DL
    .dropna(subset=["organism.part"])
    .loc[lambda df: df["organism.part"].ne("")]
    .loc[lambda df: df["organism.part.URL"].str.contains("UBERON")]
)

In [22]:
# Number of samples per uberon_id; the id is the text after the last "_" of the URL.
# (disabled pre-download variant kept below)
#samples_per_uberon = {(k.split("_")[-1]).strip() : v for k,v in dict(human_tissue_subset["organism part URL"].value_counts()).items()}
samples_per_uberon_afterDL = {
    url.split("_")[-1].strip(): n_samples
    for url, n_samples in dict(
        human_tissue_subset_afterDL["organism.part.URL"].value_counts()
    ).items()
}

# Number of distinct datasets/experiments per uberon_id.
# (disabled pre-download variant kept below)
#exp_per_uberon = {(k.split("_")[-1]).strip() : v for k,v in dict(human_tissue_subset.groupby('organism part URL').apply(lambda x: len(x['dataset'].unique()))).items()}
exp_per_uberon_afterDL = {
    url.split("_")[-1].strip(): n_datasets
    for url, n_datasets in dict(
        human_tissue_subset_afterDL.groupby("organism.part.URL")["dataset"].nunique(dropna=False)
    ).items()
}

In [23]:
# Write the per-tissue metadata/dataset files and collect the retained groups.
tissue_group_list, tissue_id_list, tissue_ds_list = prep_metadata_for_nets(
    samples_per_id=samples_per_uberon_afterDL,
    exp_per_id=exp_per_uberon_afterDL,
    metaData=human_tissue_subset_afterDL,
    term_type="organism.part",
    name_mappings=uberon_name_mappings,
)

10145 organism.part samples from 364 datasets with 46 groups


In [24]:
# Display the IDs of the tissue groups that passed the thresholds.
tissue_id_list

['0000178',
 '0002113',
 '0002371',
 '0002097',
 '0001155',
 '0001379',
 '0002048',
 '0000310',
 '0004802',
 '0002107',
 '0001013',
 '0000029',
 '0000955',
 '0002367',
 '0001134',
 '0000992',
 '0002046',
 '0001295',
 '0001225',
 '0001264',
 '0012168',
 '0001296',
 '0012652',
 '0002018',
 '0000945',
 '0001052',
 '0003729',
 '0001836',
 '0000947',
 '0002331',
 '0009835',
 '0001507',
 '0004911',
 '0002037',
 '0001876',
 '0000002',
 '0001235',
 '0016529',
 '0001987',
 '0000317',
 '0001377',
 '0001158',
 '0001891',
 '0002038',
 '0002316',
 '0000173']

## cell type

In [25]:
# Load the CL lookup that is later passed as the id -> name mapping for cell type groups.
with open(os.path.join(data_dir, "mappings", "CL_name_mappings.json")) as f:
    CL_name_mappings = json.loads(f.read())

In [26]:
# Disabled variant operating on `human_data` (left commented out):
#human_celltype_subset = human_data[human_data["cell type"].notnull()]
#human_celltype_subset = human_celltype_subset[human_celltype_subset["cell type"] != ""]
#human_celltype_subset = human_celltype_subset[human_celltype_subset['cell type URL'].str.contains("CL_")]

# Keep samples that have a non-missing, non-empty "cell.type" annotation
# whose URL contains "CL_".
human_celltype_subset_after_DL = (
    human_data_after_DL
    .dropna(subset=["cell.type"])
    .loc[lambda df: df["cell.type"].ne("")]
    .loc[lambda df: df["cell.type.URL"].str.contains("CL_")]
)

In [27]:
# Number of samples per CL_id; the id is the text after the last "_" of the URL.
# (disabled pre-download variant kept below)
#samples_per_CL = {(k.split("_")[-1]).strip() : v for k,v in dict(human_celltype_subset["cell type URL"].value_counts()).items()}
samples_per_CL_afterDL = {
    url.split("_")[-1].strip(): n_samples
    for url, n_samples in dict(
        human_celltype_subset_after_DL["cell.type.URL"].value_counts()
    ).items()
}

# Number of distinct datasets/experiments per CL_id.
# (disabled pre-download variant kept below)
#exp_per_CL = {(k.split("_")[-1]).strip() : v for k,v in dict(human_celltype_subset.groupby('cell type URL').apply(lambda x: len(x['dataset'].unique()))).items()}
exp_per_CL_afterDL = {
    url.split("_")[-1].strip(): n_datasets
    for url, n_datasets in dict(
        human_celltype_subset_after_DL.groupby("cell.type.URL")["dataset"].nunique(dropna=False)
    ).items()
}


In [28]:
# Write the per-cell-type metadata/dataset files and collect the retained groups.
ct_group_list, ct_id_list, ct_ds_list = prep_metadata_for_nets(
    samples_per_id=samples_per_CL_afterDL,
    exp_per_id=exp_per_CL_afterDL,
    metaData=human_celltype_subset_after_DL,
    term_type="cell.type",
    name_mappings=CL_name_mappings,
)

4737 cell.type samples from 222 datasets with 30 groups


## cell line 

In [29]:
# Disabled variant operating on `human_data` (left commented out):
#human_cellline_subset = human_data[human_data["cell line"].notnull()]
#human_cellline_subset = human_cellline_subset[human_cellline_subset["cell line"] != ""]
#human_cellline_subset = human_cellline_subset[human_cellline_subset['cell line URL'].str.contains("CLO_")]

# Keep samples that have a non-missing, non-empty "cell.line" annotation
# whose URL contains "CLO_".
human_cellline_subset_afterDL = (
    human_data_after_DL
    .dropna(subset=["cell.line"])
    .loc[lambda df: df["cell.line"].ne("")]
    .loc[lambda df: df["cell.line.URL"].str.contains("CLO_")]
)

In [30]:
# Number of samples per CLO id; the id is the text after the last "_" of the URL.
# (disabled pre-download variant kept below)
#samples_per_CLO = {(k.split("_")[-1]).strip() : v for k,v in dict(human_cellline_subset["cell line URL"].value_counts()).items()}
samples_per_CLO_afterDL = {
    url.split("_")[-1].strip(): n_samples
    for url, n_samples in dict(
        human_cellline_subset_afterDL["cell.line.URL"].value_counts()
    ).items()
}

# Number of distinct datasets/experiments per CLO id.
# (disabled pre-download variant kept below)
#exp_per_CLO = {(k.split("_")[-1]).strip() : v for k,v in dict(human_cellline_subset.groupby('cell line URL').apply(lambda x: len(x['dataset'].unique()))).items()}
exp_per_CLO__afterDL = {
    url.split("_")[-1].strip(): n_datasets
    for url, n_datasets in dict(
        human_cellline_subset_afterDL.groupby("cell.line.URL")["dataset"].nunique(dropna=False)
    ).items()
}

In [31]:
# Build the CLO id -> cell line name lookup from the metadata itself.
# The first name encountered for an id is kept.
CLO_name_mappings = {}
for cell_line_url, cell_line_name in zip(
    human_cellline_subset_afterDL["cell.line.URL"],
    human_cellline_subset_afterDL["cell.line"],
):
    # strip() so the keys are derived exactly like those of
    # samples_per_CLO_afterDL / exp_per_CLO__afterDL; otherwise a URL with
    # trailing whitespace would later fail the name lookup with a KeyError.
    clo_id = cell_line_url.split("_")[-1].strip()
    CLO_name_mappings.setdefault(clo_id, cell_line_name)

In [32]:
# Write the per-cell-line metadata/dataset files and collect the retained groups.
cl_group_list, cl_id_list, cl_ds_list = prep_metadata_for_nets(
    samples_per_id=samples_per_CLO_afterDL,
    exp_per_id=exp_per_CLO__afterDL,
    metaData=human_cellline_subset_afterDL,
    term_type="cell.line",
    name_mappings=CLO_name_mappings,
)

1115 cell.line samples from 103 datasets with 22 groups
