# Integration of Allen TType data to Nexus

Project allocation:
    
- GO terms live in `neurosciencegraph/datamodels`
- Genes live in `bbp/gene-annotations`
- TTypes and gene expression profiles in `bbp/aibs` (later we should move ttypes to `neurosciencegraph/datamodels`)

In [None]:
import rdflib
from rdflib import XSD, RDF, RDFS, OWL
from zipfile import ZipFile

import scanpy as sc
from scipy.io import mmwrite

import getpass
import json
import operator
from itertools import groupby

import networkx as nx
import numpy as np
import pandas as pd

from bmo.kbs.gene_kbs import get_gene_data

import nexussdk as nxs

from kgforge.core.forge import KnowledgeGraphForge
from kgforge.core.resource import Resource
from kgforge.specializations.resources import Dataset

from bluegraph import PandasPGFrame
from bluegraph.backends.networkx import pgframe_to_networkx

## 0. Helpers

In [None]:
def is_nan(el):
    """Tell whether ``el`` is a NaN value.

    Values that ``np.isnan`` cannot reduce to a single truth value (strings,
    dicts, ``None``, multi-element sequences, ...) are reported as "not NaN"
    instead of raising.

    Args:
        el: Any object.

    Returns:
        ``True`` if ``np.isnan(el)`` evaluates to a true value, else ``False``.
    """
    try:
        return bool(np.isnan(el))
    except Exception:
        return False


def remove_empty_hierarchy_nodes(frame):
    """Drop nodes without a ``cell_set_alias`` and reconnect the nodes around them.

    The rewiring is done on a networkx graph built from ``frame`` with
    ``pgframe_to_networkx``; the result is then mirrored back into ``frame``,
    which is modified in place.

    Args:
        frame: Property graph frame whose node table has a ``cell_set_alias``
            column. Presumably edges point from a child type to its parent
            (the code names the edge target "father") - confirm with caller.

    Returns:
        None.
    """
    graph = pgframe_to_networkx(frame)
    # Repeat full passes until one of them finds nothing left to rewire.
    while True:
        edges_to_remove = set()
        edges_to_add = set()
        for n in graph.nodes():
            # Only nodes that have an alias themselves get rewired.
            if frame._nodes.loc[n, "cell_set_alias"]:
                for _, t in graph.out_edges(n):
                    father_alias = frame._nodes.loc[t, "cell_set_alias"]
                    if not father_alias:
                        # The target has no alias: bypass it by linking ``n``
                        # to every target of ``t`` instead.
                        edges_to_remove.add((n, t))
                        for _, tt in graph.out_edges(t):
                            edges_to_add.add((n, tt))

        # Changes are applied after the scan so the graph is not mutated
        # while it is being iterated.
        for s, t in edges_to_remove:
            graph.remove_edge(s, t)
        for s, t in edges_to_add:
            graph.add_edge(s, t)

        if not edges_to_remove and not edges_to_add:
            break
    # Clean up: first nodes left without any edge, then every remaining node
    # whose alias is empty.
    graph.remove_nodes_from(list(nx.isolates(graph)))
    graph.remove_nodes_from([
        n for n in graph.nodes() if not frame._nodes.loc[n, "cell_set_alias"]
    ])
    # Mirror the pruned graph back into the frame.
    frame.remove_nodes([
        n for n in frame.nodes() if n not in graph.nodes()
    ])
    frame.remove_edges([
        (s, t) for s, t in frame.edges() if (s, t) not in graph.edges()
    ])
    new_edges = [
        (s, t) for s, t in graph.edges() if (s, t) not in frame.edges()
    ]
    frame.add_edges(new_edges)
    # Edges created by the bypassing step get the same type label as the
    # edges the notebook builds from the original hierarchy.
    for s, t in new_edges:
        frame._edges.loc[(s, t), "@type"] = "IS_SUBCLASS_OF"

        
def autocomplete_types_from_children(frame, feature_column, aggregator=None):
    """Fill missing values of ``feature_column`` from the values of child nodes.

    A node whose value is NaN receives ``aggregator(values of its children)``
    as soon as none of its children (sources of its incoming edges that are
    still present in ``frame``) is NaN. Passes are repeated until nothing is
    missing or a pass leaves the set of missing nodes unchanged. ``frame`` is
    modified in place.

    Args:
        frame: Property graph frame with a ``feature_column`` node property.
        feature_column: Name of the node property to complete.
        aggregator: Callable mapping the list of child values to the parent
            value. Defaults to the element-wise mean converted to a list.

    Returns:
        None.
    """
    if aggregator is None:
        def aggregator(vectors):
            return np.mean(vectors, axis=0).tolist()

    hierarchy = pgframe_to_networkx(frame)
    previously_missing = set()

    while True:
        missing = {
            node for node in frame.nodes()
            if is_nan(frame._nodes.loc[node, feature_column])
        }
        # Stop when everything is filled or when no progress was made.
        if not missing or missing == previously_missing:
            break

        for node in missing:
            child_values = []
            for child, _ in hierarchy.in_edges(node):
                if child not in frame.nodes():
                    continue
                value = frame._nodes.loc[child, feature_column]
                if is_nan(value):
                    # A child is still incomplete: retry this node later.
                    break
                child_values.append(value)
            else:
                frame._nodes.at[node, feature_column] = aggregator(child_values)
        previously_missing = missing


def xrefs_to_payload(x):
    """Turn a mapping of cross-references into ``CrossReference`` payloads.

    Args:
        x: Mapping from a reference name to its value, or NaN when the record
            has no cross-references.

    Returns:
        A list with one ``{"@type", "name", "value"}`` dict per entry of
        ``x``; an empty list when ``x`` is NaN.
    """
    if is_nan(x):
        return []
    return [
        {"@type": "CrossReference", "name": name, "value": value}
        for name, value in x.items()
    ]


def augment_cross_refs(x):
    """Build the identifier payloads of one gene record.

    Args:
        x: Record (for instance a data frame row) exposing ``refs`` - an
            iterable of dicts with ``"name"`` and ``"value"`` keys - together
            with ``ensembl_id`` and ``uniprot_ac``.
            NOTE(review): the notebook stores its cross-reference payloads in
            a column called ``xrefs``; confirm that ``refs`` is the attribute
            meant here.

    Returns:
        A list of ``{"propertyID", "value"}`` dicts: one per cross-reference,
        followed by the Ensembl and the UniProtAC identifiers.
    """
    # The original loop appended to the undefined name ``identifier`` and
    # therefore failed on the first cross-reference.
    identifiers = [
        {"propertyID": el["name"], "value": el["value"]}
        for el in x.refs
    ]
    identifiers.append({"propertyID": "Ensembl", "value": x.ensembl_id})
    identifiers.append({"propertyID": "UniProtAC", "value": x.uniprot_ac})
    return identifiers


def edges_to_go_payloads(x, term_payloads=None):
    """Group the GO edges of one gene by relation and turn them into references.

    Args:
        x: Iterable of ``(relation, GO term id)`` pairs, e.g.
            ``("involved_in", "GO:0008150")``. A value that cannot be
            iterated (such as NaN for a gene without annotations) yields an
            empty dict.
        term_payloads: Mapping from a GO term id to a payload holding at
            least ``"label"`` and ``"@type"``. Defaults to the module-level
            ``go_term_payloads``.

    Returns:
        A dict mapping each relation, converted from snake_case to camelCase,
        to the list of ``{"@id", "label", "@type"}`` references of its terms.
        Terms missing from ``term_payloads`` are left out.
    """
    if term_payloads is None:
        term_payloads = go_term_payloads

    try:
        grouped = {}
        # Accumulate per relation instead of using ``itertools.groupby``:
        # groupby only merges *consecutive* items, so on unsorted edges a
        # relation seen twice kept only its last run of terms.
        for edge in x:
            relation = edge[0]
            _, term = edge
            head, *tail = relation.split("_")
            key = head + "".join(part.title() for part in tail)
            references = grouped.setdefault(key, [])
            if term in term_payloads:
                references.append({
                    "@id": f"http://purl.obolibrary.org/obo/GO_{term.split(':')[1]}",
                    "label": term_payloads[term]["label"],
                    "@type": term_payloads[term]["@type"]
                })
        return grouped
    except TypeError:
        return {}


def _first_positions(labels):
    """Map each label to the index of its first occurrence (empty for ``None``)."""
    positions = {}
    if labels is not None:
        for position, label in enumerate(labels):
            positions.setdefault(label, position)
    return positions


def generate_gene_expression_payload(gene_batch,
                                     mean_genes, mean_features,
                                     median_genes, median_features,
                                     mean_profiles_dataset_id,
                                     median_profiles_dataset_id,
                                     gene_resource_ids):
    """Build a ``GeneExpressionProfile`` payload for a batch of genes.

    Args:
        gene_batch: Iterable of gene labels to include.
        mean_genes: Gene labels aligned with ``mean_features``.
        mean_features: Mean expression values, indexable by position.
        median_genes: Gene labels aligned with ``median_features``, or
            ``None`` when no median profile is available.
        median_features: Median expression values, or ``None``.
        mean_profiles_dataset_id: Id of the dataset holding the mean profiles.
        median_profiles_dataset_id: Id of the dataset holding the median
            profiles.
        gene_resource_ids: Mapping from a gene label to the id of its Gene
            resource; genes missing from it get ``"@id": None``.

    Returns:
        A dict with ``"@type"``, ``"derivation"`` (both datasets) and
        ``"expression"``: one record per gene that has at least one non-zero
        statistic. A record lists every statistic known for the gene, even
        one whose value is zero.
    """
    payload = {
        "@type": "GeneExpressionProfile",
        "expression": [],
        "derivation": {
            "@type": "Derivation",
            "entity": [
                {
                    "@id": mean_profiles_dataset_id,
                    "@type": "Dataset"
                }, {
                    "@id": median_profiles_dataset_id,
                    "@type": "Dataset"
                },
            ]
        }
    }

    # Index the gene labels once; the previous implementation scanned the
    # whole label array twice for every gene of the batch.
    mean_positions = _first_positions(mean_genes)
    median_positions = _first_positions(median_genes)

    for gene in gene_batch:
        mean_value = None
        median_value = None
        if gene in mean_positions:
            mean_value = mean_features[mean_positions[gene]]
        if gene in median_positions:
            median_value = median_features[median_positions[gene]]

        # Skip genes for which no statistic is known or all of them are zero.
        has_signal = (
            (mean_value is not None and mean_value != 0) or
            (median_value is not None and median_value != 0)
        )
        if not has_signal:
            continue

        series = []
        if mean_value is not None:
            series.append({
                "statistic": "trimmed mean",
                "value": mean_value,
                "unitCode": "normalized UMI count"
            })
        if median_value is not None:
            series.append({
                "statistic": "median",
                "value": median_value,
                "unitCode": "normalized UMI count"
            })

        payload["expression"].append({
            "isMeasurementOf": {
                "@id": gene_resource_ids.get(gene),
                "label": gene,
                "@type": "Gene"
            },
            "series": series
        })

    return payload


def register_gene_expression_profiles(forge, frame, mean_genes, median_genes, ttype,
                                      mean_profiles_dataset_id, median_profiles_dataset_id, 
                                      gene_resource_ids,
                                      batch_size=8000):
    """Register the expression profile of one t-type in batches and link it.

    Args:
        forge: Forge session used for ``as_json``, ``from_json``,
            ``register`` and ``update``.
        frame: Graph frame whose node table holds ``mean_features`` and
            ``median_features`` for the node ``ttype["identifier"]``.
        mean_genes: Gene labels aligned with the ``mean_features`` vector.
        median_genes: Gene labels aligned with the ``median_features`` vector.
        ttype: Either a dict payload of a new t-type or an existing resource
            (converted with ``forge.as_json`` to read its fields).
        mean_profiles_dataset_id: Id of the mean profiles dataset.
        median_profiles_dataset_id: Id of the median profiles dataset.
        gene_resource_ids: Mapping from a gene label to its resource id.
        batch_size: Used to derive the number of batches, see below.

    Returns:
        The newly registered t-type resource when ``ttype`` is a dict;
        ``None`` when ``ttype`` is an existing resource (it is updated
        through ``forge.update`` instead).
    """
    if isinstance(ttype, dict):
        p = ttype
    else:
        p = forge.as_json(ttype)

    # Keep only the genes with a non-zero mean value.
    mean_features = frame._nodes.loc[p["identifier"], "mean_features"]
    non_zero_mean_features = np.array([el for el in mean_features if el != 0])
    non_zero_mean_genes = np.array([
        g for i, g in enumerate(mean_genes) if mean_features[i] != 0
    ])
    
    median_features = frame._nodes.loc[p["identifier"], "median_features"]
    non_zero_median_features = None
    non_zero_median_genes = None
    # A float here is presumably the NaN left by the left merge for nodes
    # without a median profile - TODO confirm.
    if not isinstance(median_features, float):
        non_zero_median_features = np.array([el for el in median_features if el != 0])
        non_zero_median_genes = np.array([
            g for i, g in enumerate(median_genes) if median_features[i] != 0
        ])

    # Genes to register: non-zero in the mean profile, the median one, or both.
    if non_zero_median_genes is not None:
        all_non_zero_genes = np.array(list(set(non_zero_mean_genes).union(non_zero_median_genes)))
    else:
        all_non_zero_genes = np.array(non_zero_mean_genes)
        

    size = all_non_zero_genes.shape[0]
    print(f"Processing the profile of {p['prefLabel']} (total {size} records)...")
    # ``np.array_split`` yields ``batches`` chunks of nearly equal length, so
    # ``batch_size`` bounds the chunk length rather than fixing it.
    batches = int(size / batch_size) + 1
    batched_indices = np.array_split(range(size), batches)

    # One GeneExpressionProfile resource is registered per batch.
    expression_resource_ids = []
    for batch in batched_indices:
        expression_resource = forge.from_json(
            generate_gene_expression_payload(
                all_non_zero_genes[batch],
                non_zero_mean_genes,
                non_zero_mean_features,
                non_zero_median_genes,
                non_zero_median_features,
                mean_profiles_dataset_id,
                median_profiles_dataset_id,
                gene_resource_ids))
        print(f"\tRegistering a batch with {len(expression_resource.expression)} records...")
        forge.register(expression_resource)
        expression_resource_ids.append(expression_resource.id) 
    
    if isinstance(ttype, dict):
        # New t-type: reference the profiles in the payload and register it.
        p["expressionProfile"] = [
            {
                "@id": el,
                "@type": "GeneExpressionProfile"
            } for el in expression_resource_ids
        ]
        ttype_resource = forge.from_json(p)
        forge.register(ttype_resource)
        return ttype_resource
    else:
        # Existing t-type: replace its profile references and update it.
        ttype.expressionProfile = [
            forge.from_json({
                "@id": el,
                "@type": "GeneExpressionProfile"
            }) for el in expression_resource_ids
        ]
        forge.update(ttype)

Initialize forge sessions

In [None]:
TOKEN = getpass.getpass()

In [None]:
SPECIES = 'mus_musculus'

In [None]:
ng_forge = KnowledgeGraphForge(
    "../../config/prod-forge-config.yml",
    token=TOKEN,
    bucket="neurosciencegraph/datamodels")

In [None]:
genes_forge = KnowledgeGraphForge(
    "../../config/prod-forge-config.yml",
    token=TOKEN,
    bucket="bbp/gene-annotations")

In [None]:
aibs_forge = KnowledgeGraphForge(
    "../../config/prod-forge-config.yml",
    token=TOKEN,
    bucket="bbp/aibs")

In [None]:
nxs.config.set_environment("https://bbp.epfl.ch/nexus/v1")
nxs.config.set_token(TOKEN)

In [None]:
DATASET_TAG = "2022-02-17"

## 1. Load the dataset for TType hierarchy and mean expression profiles

In [None]:
with ZipFile("../data/allen_ttypes_data_25_11_2021/trimmed_means.csv.zip", 'r') as zipObj:
   # Extract all the contents of zip file in current directory
   zipObj.extractall("../data/allen_ttypes_data_25_11_2021/")

In [None]:
with ZipFile("../data/allen_ttypes_data_25_11_2021/medians.csv.zip", 'r') as zipObj:
   # Extract all the contents of zip file in current directory
   zipObj.extractall("../data/allen_ttypes_data_25_11_2021/")

In [None]:
with ZipFile("../data/allen_ttypes_data_25_11_2021/dend.json.zip", 'r') as zipObj:
   # Extract all the contents of zip file in current directory
   zipObj.extractall("../data/allen_ttypes_data_25_11_2021/")

In [None]:
with ZipFile("../data/allen_ttypes_data_25_11_2021/metadata.csv.zip", 'r') as zipObj:
   # Extract all the contents of zip file in current directory
   zipObj.extractall("../data/allen_ttypes_data_25_11_2021/")

In [None]:
mean_ttype_profiles = pd.read_csv(
    "../data/allen_ttypes_data_25_11_2021/trimmed_means.csv").set_index("feature")

In [None]:
median_ttype_profiles = pd.read_csv(
    "../data/allen_ttypes_data_25_11_2021/medians.csv").set_index("feature")

In [None]:
with open("../data/allen_ttypes_data_25_11_2021/dend.json", "r") as f:
    type_hierarchy = json.load(f)

In [None]:
non_zero_mean_profiles = mean_ttype_profiles[mean_ttype_profiles != 0].dropna(how="all")
non_zero_mean_profiles.sample(5)
non_zero_mean_profiles[non_zero_mean_profiles.isna()] = 0.0

In [None]:
non_zero_median_profiles = median_ttype_profiles[median_ttype_profiles != 0].dropna(how="all")
non_zero_median_profiles.sample(5)
non_zero_median_profiles[non_zero_median_profiles.isna()] = 0.0

In [None]:
genes = non_zero_mean_profiles.index

In [None]:
medain_genes = non_zero_median_profiles.index

Make sure there are no genes in the median dataset that are not in the mean dataset.

In [None]:
set(medain_genes).difference(set(genes))

## 2. Register resources for TType hierarchy and mean expression profiles

In [None]:
mean_profiles_dataset = Dataset(aibs_forge)
mean_profiles_dataset.type = ["Dataset", "GeneExpressionDataset"]
mean_profiles_dataset.name = "Allen Transcriptomic Types (Trimmed) Mean Expression Profiles"
mean_profiles_dataset.description = "This dataset includes single-cell transcriptomes from multiple cortical areas and the hippocampal formation, including 1.1M total cells. Samples were collected from dissections of brain regions from ~8 week-old male and female mice, from pan-neuronal transgenic lines. Gene expression aggregated per cell type, calculated as trimmed means (25%-75%)."
mean_profiles_dataset.distribution = aibs_forge.attach(
    "../data/allen_ttypes_data_25_11_2021/trimmed_means.csv")

In [None]:
aibs_forge.register(mean_profiles_dataset)

In [None]:
aibs_forge.tag(mean_profiles_dataset, DATASET_TAG)

In [None]:
# mean_profiles_dataset_id = mean_profiles_dataset.id

In [None]:
mean_profiles_dataset_id = 'https://bbp.epfl.ch/neurosciencegraph/data/ded3cbe3-541f-43da-a388-c64136194417'

In [None]:
median_profiles_dataset = Dataset(aibs_forge)
median_profiles_dataset.type = ["Dataset", "GeneExpressionDataset"]
median_profiles_dataset.name = "Allen Transcriptomic Types Median Expression Profiles"
median_profiles_dataset.description = "This dataset includes single-cell transcriptomes from multiple cortical areas and the hippocampal formation, including 1.1M total cells. Samples were collected from dissections of brain regions from ~8 week-old male and female mice, from pan-neuronal transgenic lines. Gene expression aggregated per cluster, calculated as medians."
median_profiles_dataset.distribution = aibs_forge.attach(
    "../data/allen_ttypes_data_25_11_2021/medians.csv")

In [None]:
aibs_forge.register(median_profiles_dataset)

In [None]:
aibs_forge.tag(median_profiles_dataset, DATASET_TAG)

In [None]:
median_profiles_dataset_id = "https://bbp.epfl.ch/neurosciencegraph/data/c56e2a37-4477-4d18-b34e-84fa86fa1c43"

In [None]:
hierarchy_dataset = Dataset(aibs_forge)
hierarchy_dataset.type = ["Dataset", "TypeHierarchyDataset"]
hierarchy_dataset.name = "Allen Transcriptomic Type Hierarchy"
hierarchy_dataset.description = "Serialized cluster hierarchy with all node information embedded."
hierarchy_dataset.distribution = aibs_forge.attach(
    "../data/allen_ttypes_data_25_11_2021/dend.json")

In [None]:
aibs_forge.register(hierarchy_dataset)
aibs_forge.tag(hierarchy_dataset, DATASET_TAG)

In [None]:
hierarchy_dataset_id = hierarchy_dataset.id

In [None]:
hierarchy_dataset_id = 'https://bbp.epfl.ch/neurosciencegraph/data/4c55b935-bc10-4b90-b3c0-b7365d1ddbce'

## 3. Register Genes and GO terms

Fetch gene data from Ensembl + UniProt + QuickGO

In [None]:
# Fetch the annotation record of every gene once. The fetch is best effort:
# a gene whose lookup fails is reported and skipped.
records = {}
for gene in genes:
    if gene in records:
        continue
    print(gene)
    try:
        records[gene] = get_gene_data(gene, SPECIES)
    except Exception as e:
        # ``Exception`` rather than a bare ``except`` so that interrupting
        # this long-running loop (KeyboardInterrupt) still works.
        print("Skipped: ", gene, e)

In [None]:
gene_data = pd.DataFrame(records.values(), index=records.keys())
gene_data["label"] = gene_data.index
gene_data.to_pickle(
    "../data/allen_ttypes_data_25_11_2021/non_zero_expression_genes.pkl")

Load data from the dump if already fetched

In [None]:
gene_data = pd.read_pickle(
    "../data/allen_ttypes_data_25_11_2021/non_zero_expression_genes.pkl")

### 3.1. Register GO terms

Load the GO file

In [None]:
go = rdflib.Graph()
go.parse("../../ontologies/external/go.ttl", format="turtle")

In [None]:
records = gene_data.to_dict("index")

In [None]:
go_terms_to_include = {}
for g, record in records.items():
    if "go_edges" in record:
        if not is_nan(record["go_edges"]):
            for _, t in record["go_edges"]:
                if t not in go_terms_to_include:
                    for s in go.subjects(
                            rdflib.URIRef("http://www.geneontology.org/formats/oboInOwl#id"),
                            rdflib.Literal(t, datatype=XSD.string)):
                        go_terms_to_include[t] = s
                        break

In [None]:
predicates_to_map = {
    rdflib.URIRef("http://www.geneontology.org/formats/oboInOwl#hasOBONamespace"): "hasOboNamespace",
    rdflib.URIRef("http://purl.obolibrary.org/obo/IAO_0000115"): "definition",
    rdflib.URIRef("http://www.geneontology.org/formats/oboInOwl#id"): "identifier"
}

Register a payload representing the gene ontology

In [None]:
go_payload = {
    "@id": "https://bbp.epfl.ch/ontologies/external/go",
    "@type": "Ontology",
    "label": "Gene Ontology"
}
# ng_forge.register(ng_forge.from_json(go_payload))

Create payloads for relevant GO terms

In [None]:
# Build one payload per GO term referenced by the genes, keyed by the GO
# identifier of the term (e.g. "GO:0008150").
go_term_payloads = {}
# Hoisted out of the loop: a set gives constant-time membership tests, while
# ``in go_terms_to_include.values()`` scanned every value for each superclass.
included_term_uris = set(go_terms_to_include.values())
for v in go_terms_to_include.values():
    payload = {
        "label": go.label(v).value
    }
    # Copy the first value found for each mapped predicate.
    for p, key in predicates_to_map.items():
        for o in go.objects(v, p):
            payload[key] = o.value
            break
    identifier = payload["identifier"].split(":")[1]
    payload["@id"] = f"http://purl.obolibrary.org/obo/GO_{identifier}"
    payload["@type"] = "Class"
    payload["isDefinedBy"] = {
        "@id": "http://purl.obolibrary.org/obo/go.owl"
    }
#     payload["@type"] = ["Entity", "".join([el.title() for el in payload["hasOboNamespace"].split("_")])]

    # Keep only the superclasses that are registered as well.
    super_classes = []
    for o in go.objects(v, RDFS.subClassOf):
        if isinstance(o, rdflib.URIRef) and o in included_term_uris:
            super_classes.append({
                "@id": str(o)
            })

    payload["subClassOf"] = super_classes
    go_term_payloads[payload["identifier"]] = payload

In [None]:
len(go_term_payloads)

Register GO term payloads

In [None]:
ng_forge.register(ng_forge.from_json(list(go_term_payloads.values())),
                  schema_id="https://neuroshapes.org/dash/ontology")

### 3.2. Register Genes

In [None]:
gene_data["go_edges"] = gene_data["go_edges"].apply(edges_to_go_payloads)

In [None]:
gene_data["xrefs"] = gene_data["xrefs"].apply(xrefs_to_payload)

In [None]:
gene_data.apply(augment_cross_refs, axis=1)

In [None]:
gene_data["involvedIn"] = gene_data["go_edges"].apply(
    lambda x: x.get("involvedIn", []))
gene_data["enables"] = gene_data["go_edges"].apply(
    lambda x: x.get("enables", []))
gene_data["partOf"] = gene_data["go_edges"].apply(
    lambda x: x.get("partOf", []))

Create payloads for genes

In [None]:
gene_payloads = gene_data.rename(
    columns={"xrefs": "identifier"}).drop(
    columns=["ensembl_id", "uniprot_ac", "go_edges"]).to_dict("records")

In [None]:
for p in gene_payloads:
    p.update(
        {
            "@type": "Gene",
            "subject": {
                "@type": "Subject",
                "species": {
                    "@id": "http://purl.obolibrary.org/obo/NCBITaxon_10090",
                    "label": "Mus musculus"
                }
            }
        })
    if is_nan(p["prefLabel"]):
        p["prefLabel"] = p["label"]
    if is_nan(p["hasXRef"]):   
        del p["hasXRef"]
    if is_nan(p["description"]): 
        del p["description"]

Register gene payloads

In [None]:
gene_resources = genes_forge.from_json(gene_payloads)
genes_forge.register(gene_resources)

Add a cross-project resolver to `neurosciencegraph/datamodels`

In [None]:
IDENTITY_resolver = {"realm": "bbp"}

try:
    nxs.resolvers.create(
        org_label="bbp",
        project_label="gene-annotations",
        projects=["neurosciencegraph/datamodels"],
        identities=[IDENTITY_resolver], priority=60)
except nxs.HTTPError as e:
    print(e)
    print("---")
    nxs.tools.pretty_print(e.response.json())

In [None]:
gene_resources = genes_forge.search({"type": "Gene"}, limit=20000)

In [None]:
def replace_entities_by_classes(payloads):
    """Set the ``type`` attribute of one object, or of each object in a list, to "Class".

    Args:
        payloads: A single object or a list of objects; modified in place.

    Returns:
        None.
    """
    targets = payloads if isinstance(payloads, list) else [payloads]
    for target in targets:
        target.type = "Class"

In [None]:
for r in gene_resources:
    d = genes_forge.as_json(r)
    if "enables" in d:
        replace_entities_by_classes(r.enables)
    if "involvedIn" in d:
        replace_entities_by_classes(r.involvedIn)
    if "partOf" in d:
        replace_entities_by_classes(r.partOf)

In [None]:
genes_forge.update(gene_resources)

## 4. Register TTypes and their expression profiles

Create a graph from the imported hierarchy:

- _nodes_ are t-types;
- _node features_ are gene expression profiles;
- _edges_ are 'isSubclassOf' relationships.

In [None]:
# Accumulators filled by ``extract_data``: (child, parent) accession pairs
# and the attributes of every node keyed by accession.
edges = set()
nodes = {}


def extract_data(hierarchy, father=None):
    """Walk the nested hierarchy and collect its nodes and child-to-parent edges.

    Every child contributes its first ``node_attributes`` entry (or, failing
    that, its first ``leaf_attributes`` entry) to ``nodes``, keyed by
    ``cell_set_accession``. An edge ``(child, father)`` is added to ``edges``
    whenever ``father`` is set.

    Args:
        hierarchy: Dict that may carry a ``"children"`` list.
        father: Accession of the node ``hierarchy`` stands for, if any.

    Returns:
        None; the module-level ``nodes`` and ``edges`` are updated.
    """
    if "children" not in hierarchy:
        return

    for child in hierarchy["children"]:
        attribute_key = (
            "node_attributes" if "node_attributes" in child
            else "leaf_attributes"
        )
        attrs = child[attribute_key][0]
        accession = attrs["cell_set_accession"]
        nodes[accession] = attrs
        extract_data(child, accession)
        if father:
            edges.add((accession, father))

In [None]:
extract_data(type_hierarchy)
mean_expression_matrix = non_zero_mean_profiles.T.values
median_expression_matrix = non_zero_median_profiles.T.values

In [None]:
node_df = pd.DataFrame(nodes.values()).rename(
    columns={"cell_set_accession": "@id"})

mean_node_vectors = {}
median_node_vectors = {}
for i, c in enumerate(mean_ttype_profiles.columns):
    mean_node_vectors[c] = mean_expression_matrix[i, :].tolist()
for i, c in enumerate(median_ttype_profiles.columns):
    median_node_vectors[c] = median_expression_matrix[i, :].tolist()

mean_node_vectors = pd.DataFrame(
    mean_node_vectors.items(), columns=["cell_set_alias", "mean_features"])
median_node_vectors = pd.DataFrame(
    median_node_vectors.items(), columns=["cell_set_alias", "median_features"])

node_df = node_df.merge(
    mean_node_vectors, on="cell_set_alias", how="left").merge(
        median_node_vectors, on="cell_set_alias", how="left").set_index("@id")

In [None]:
edge_df = pd.DataFrame(
    edges, columns=["@source_id", "@target_id"]).set_index(["@source_id", "@target_id"])
edge_df["@type"] = "IS_SUBCLASS_OF"
frame = PandasPGFrame.from_frames(node_df, edge_df)
for c in frame._nodes.columns:
    frame.node_prop_as_category(c)

Some nodes in the original hierarchy file don't have an identity (they represent some intermediate clusters). We remove them at the moment and connect their children to their parent class.

In [None]:
remove_empty_hierarchy_nodes(frame)

Mean expression profiles are provided only for the leaf t-types. For non-leaf t-types we compute their expression profiles as mean of all of their children (recursively).

In [None]:
autocomplete_types_from_children(frame, "mean_features")

There is one type that doesn't have an expression profile, but also doesn't have any children from which we can compute their mean profiles. We simply remove it from the graph.

In [None]:
frame.remove_nodes(["CS202106160_381"])
edges_to_remove = set()
for s, t in frame._edges.index:
    if s == "CS202106160_381":
        edges_to_remove.add((s, t))
    if t == "CS202106160_381":
        edges_to_remove.add((s, t))
frame.remove_edges(edges_to_remove)

We check that there are no nodes with empty expression profiles

In [None]:
frame._nodes[frame._nodes["mean_features"].apply(lambda x: is_nan(x))]

Create t-type payloads

In [None]:
ttype_payloads = frame._nodes[[
    "label",
    "cell_set_alias",
    "cell_set_alias_citation",
    "taxonomy_id",
    "color"
]].reset_index().rename(columns={
    "label": "cellSetLabel",
    "cell_set_alias": "prefLabel",
    "@id": "identifier",
    "taxonomy_id": "taxonomyId"
}).to_dict("records")

Add static properties:
- brain region (all from the same region)
- derivation (from the previously registered hierarchy dataset)
- url with scholarly article
- subject with species

In [None]:
hierarchy_derivation_payload = {
    "@type": "Derivation",
    "entity": {
        "@id": hierarchy_dataset_id,
        "@type": "Dataset"
    }
}

brain_region_payload = [
    {
        "@id": "http://api.brain-map.org/api/v2/data/Structure/695",
        "label": "Cortical plate"
    },
    {
        "@id": "http://api.brain-map.org/api/v2/data/Structure/1089",
        "label": "Hippocampal formation"
    }
]

url_payload =  {
   "@id": "https://doi.org/10.1016/j.cell.2021.04.021",
   "title": "A taxonomy of transcriptomic cell types across the isocortex and hippocampal formation",
   "@type": "ScholarlyArticle"
} 

for p in ttype_payloads:
    p["@type"] = "TType"
    p["derivation"] = hierarchy_derivation_payload
    p["brainRegion"] = brain_region_payload
    p["url"] = url_payload
    del p["cell_set_alias_citation"]
    p["subject"] = {
        "@type": "Subject",
        "species": {
            "@id": "http://purl.obolibrary.org/obo/NCBITaxon_10090",
            "label": "Mus musculus"
        }
    }

Remove old expression profiles if needed

In [None]:
# expression_profiles = aibs_forge.search({"type": "GeneExpressionProfile"}, limit=1000)
# print(len(expression_profiles))
# for p in expression_profiles:
#     if not p._store_metadata._deprecated:
#         aibs_forge.deprecate(aibs_forge.retrieve(p.id))

Add aggregated brain regions and sexes to TTypes

In [None]:
with ZipFile("../data/allen_ttypes_data_25_11_2021/grouped_meta_data.csv.zip", 'r') as zipObj:
   # Extract all the contents of zip file in current directory
   zipObj.extractall("../data/allen_ttypes_data_25_11_2021/")

In [None]:
# ``literal_eval`` is not imported at the top of the notebook; import it here
# so that this cell does not fail with a NameError.
from ast import literal_eval

# Load the per-type metadata and keep only the sex and region columns,
# indexed by the accession label of the cell type.
grouped_meta_data = pd.read_csv("../data/allen_ttypes_data_25_11_2021/grouped_meta_data.csv")
grouped_meta_data = grouped_meta_data[
    ["cell_type_accession_label", "donor_sex_label", "region_label"]].set_index("cell_type_accession_label")
# The cells of both columns are Python literals serialized as text; parse
# them back into Python objects.
for c in grouped_meta_data.columns:
    grouped_meta_data[c] = grouped_meta_data[c].apply(literal_eval)

In [None]:
# Split every region label on "-", "_" and spaces and flatten the pieces of
# all labels of a cell type into a single list.
grouped_meta_data["region_label"] = grouped_meta_data["region_label"].apply(
    lambda labels: [
        part
        for label in labels
        for part in label.replace("-", " ").replace("_", " ").split(" ")
    ]
)

In [None]:
frame._nodes = frame._nodes.reset_index().merge(
    grouped_meta_data[["region_label"]].reset_index().rename(
        columns={"cell_type_accession_label": "@id"}),
    how="left").set_index("@id")

In [None]:
frame._nodes = frame._nodes.reset_index().merge(
    grouped_meta_data[["donor_sex_label"]].reset_index().rename(
        columns={"cell_type_accession_label": "@id"}),
    how="left").set_index("@id")

In [None]:
frame._nodes["region_label"] = frame._nodes["region_label"].apply(lambda x: list(x) if isinstance(x, set) else x)

In [None]:
empty_ttypes = frame._nodes[frame._nodes["region_label"].isna()].index

In [None]:
autocomplete_types_from_children(
    frame, "donor_sex_label", aggregator=lambda x: list(set(sum(x, []))))

In [None]:
autocomplete_types_from_children(
    frame, "region_label", aggregator=lambda x: list(set(sum(x, []))))

In [None]:
# Static payloads for the two donor sexes referenced by the t-types.
male_payload = {
    "id": "http://purl.obolibrary.org/obo/PATO_0000384",
    "label": "male"
}

female_payload = {
    "id": "http://purl.obolibrary.org/obo/PATO_0000383",
    "label": "female"
}

# Resolve every region notation found in the metadata to an ontology class
# stored in the datamodels project.
regions_to_resources = {}
regions = set(sum(grouped_meta_data["region_label"].to_list(), []))
for r in regions:
    res = ng_forge.search({
        "type": "Class",
        "notation": r
    })
    try:
        resource = res[0]
        regions_to_resources[r] = {
            "id": resource.id,
            "label": resource.label,
            "notation": resource.notation
        }
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that interrupting
        # the cell (KeyboardInterrupt) is not mistaken for a failed lookup.
        # Fallback: look up each part of a compound notation on its own.
        # NOTE(review): ``res[0]`` below is not guarded, so a part that
        # cannot be resolved either still stops the cell - confirm that this
        # is intended.
        r = r.replace("-", " ").replace("_", " ")
        parts = r.split(" ")
        for p in parts:
            res = ng_forge.search({
                "type": "Class",
                "notation": p
            })
            resource = res[0]
            regions_to_resources[p] = {
                "id": resource.id,
                "label": resource.label,
                "notation": resource.notation
            }

In [None]:
# Push the region and sex values to the stored resources of the t-types that
# had no region label of their own.
# NOTE(review): ``annotations_label_to_id`` and ``annotation_id_to_resources``
# are not defined anywhere in this notebook - they must exist in the session
# before this cell is run.
for ttype_label in empty_ttypes:
    r_id = annotations_label_to_id.get(ttype_label)
    if r_id:
        resource = annotation_id_to_resources[r_id]
        print(ttype_label)
        # Region notations -> region payloads resolved earlier.
        r_payloads = []
        for el in frame._nodes.loc[ttype_label]["region_label"]:
            r_payloads.append(regions_to_resources[el])
    
        # Donor sex codes -> sex payloads; codes other than "M"/"F" are ignored.
        s_payloads = []
        for el in frame._nodes.loc[ttype_label]["donor_sex_label"]:
            if el == "M":
                s_payloads.append(male_payload)
            elif el == "F":
                s_payloads.append(female_payload)
        
        resource.brainLocation.brainRegion = aibs_forge.from_json(
            r_payloads)
        resource.subject.sex = aibs_forge.from_json(
            s_payloads)
        # NOTE(review): self-assignment; looks like a no-op - confirm whether
        # it is needed (e.g. to mark the resource as modified).
        resource.label = resource.label
        aibs_forge.update(resource)
        aibs_forge.tag(resource, DATASET_TAG)

Add expression profiles

In [None]:
batch_size = 800

Add new TType resources

In [None]:
# ``gene_resource_ids`` was used below without ever being defined in this
# notebook. Build it from the Gene resources: the expression payloads look
# genes up by label and reference them by resource id.
# NOTE(review): assumes every item of ``gene_resources`` exposes ``label``
# and ``id`` - confirm.
gene_resource_ids = {r.label: r.id for r in gene_resources}

# Register every t-type together with its batched expression profiles.
ttypes = []
for p in ttype_payloads:
    new_resource = register_gene_expression_profiles(
        aibs_forge, frame, genes, medain_genes, p,
        mean_profiles_dataset_id, median_profiles_dataset_id,
        gene_resource_ids, batch_size=batch_size)
    ttypes.append(new_resource)

In [None]:
ttype_id_to_resource_id = {
    el.identifier: el.id
    for el in ttypes
}

Add 'subClassOf' properties to t-types

In [None]:
edges = frame._edges.reset_index()
edges["source_resource"] = edges.apply(
    lambda x: ttype_id_to_resource_id[x["@source_id"]], axis=1)
edges["target_resource"] = edges.apply(
    lambda x: ttype_id_to_resource_id[x["@target_id"]], axis=1)
subclass_payloads = edges[["source_resource", "target_resource"]].groupby("source_resource").aggregate(
    {"target_resource": lambda x: [{"@id": x.tolist()[0], "@type": "TType"}]})

In [None]:
# Index the registered t-type resources by resource id. The original iterated
# ``ttype_resources``, a name that is never defined in this notebook; the
# resources returned by the registration loop are collected in ``ttypes``.
labeled_resources = {el.id: el for el in ttypes}

In [None]:
for p in subclass_payloads.reset_index().to_dict("records"):
    r = labeled_resources[p["source_resource"]]
    r.subClassOf = aibs_forge.from_json(p["target_resource"])
    aibs_forge.update(r)

Run the following in case you want to update existing TType resources

In [None]:
# Recompute and re-attach the expression profiles of the t-types that are
# already stored in the project.
ttypes = aibs_forge.search({"type": "TType"}, limit=1000)
for ttype in ttypes:
    try:
        # The original call passed too few positional arguments, so every
        # iteration raised a TypeError that a bare ``except: pass`` hid, and
        # nothing was ever updated. The surrounding ``while`` loop also never
        # terminated when the search returned no t-type.
        register_gene_expression_profiles(
            aibs_forge, frame, genes, medain_genes, ttype,
            mean_profiles_dataset_id, median_profiles_dataset_id,
            gene_resource_ids, batch_size=800)
    except Exception as e:
        # Best effort: report the failure and carry on with the next t-type.
        print(f"Failed to update the expression profiles of "
              f"{getattr(ttype, 'id', ttype)}: {e}")

Clean-up expression profiles not belonging to any t-type

In [None]:
attached_profiles = set()
for r in ttypes:
    if isinstance(r.expressionProfile, list):
        for el in r.expressionProfile:
            attached_profiles.add(aibs_forge.as_json(el)["@id"])
    else:
        attached_profiles.add(aibs_forge.as_json(r.expressionProfile)["@id"])

In [None]:
print("Attached profiles:", len(attached_profiles))
expression_profiles = aibs_forge.search({"type": "GeneExpressionProfile"}, limit=5000)
print("All profiles: ", len(expression_profiles))
print("Removing detached profiles...")
for p in expression_profiles:
    if not p._store_metadata._deprecated and p.id not in attached_profiles:
        aibs_forge.deprecate(aibs_forge.retrieve(p.id))

Add a cross project resolver to `bbp/gene-annotations`

In [None]:
IDENTITY_resolver = {"realm": "bbp"}

try:
    nxs.resolvers.create(
        org_label="bbp",
        project_label="aibs",
        projects=["bbp/gene-annotations"],
        identities=[IDENTITY_resolver], priority=60)
except nxs.HTTPError as e:
    print(e)
    print("---")
    nxs.tools.pretty_print(e.response.json())