In [None]:
import getpass
import json
import os

import numpy as np
import pandas as pd

from kgforge.core.forge import KnowledgeGraphForge

In [None]:
def retrieve_expression_profile(forge, annotation_resource, tag=None):
    """Collect the complete gene expression profile of an annotation.

    The profile is stored in batches: every entry of
    ``annotation_resource.hasBody.expressionProfile`` references a resource
    whose ``expression`` attribute holds part of the records.

    Parameters
    ----------
    forge : object
        Forge session providing ``retrieve(id, version=...)`` and ``as_json``.
    annotation_resource : object
        Annotation exposing ``hasBody.expressionProfile``.
    tag : str, optional
        Forwarded as ``version`` to ``forge.retrieve`` for every batch.

    Returns
    -------
    list
        JSON records of all batches, concatenated in batch order.
    """
    batches = annotation_resource.hasBody.expressionProfile
    # A lone batch reference (an object carrying ``id``) is not iterable;
    # wrap it so single-batch annotations are handled like multi-batch ones.
    if hasattr(batches, "id"):
        batches = [batches]

    expression_profile = []
    for batch in batches:
        profile = forge.retrieve(batch.id, version=tag)
        records = forge.as_json(profile.expression)
        # ``list += dict`` would silently append the dict's keys, so a single
        # record returned as a dict is wrapped before concatenation.
        if isinstance(records, dict):
            records = [records]
        expression_profile += records
    return expression_profile

## Initialize and configure

### Get an authentication token

For now, the [Nexus web application](https://bbp.epfl.ch/nexus/web) can be used to get a token. We are looking for other simpler alternatives.

- Step 1: On the opened web page, click the login button in the right corner and follow the instructions.

- Step 2: Once logged in, you’ll see a token button in the right corner. Click on it to copy the token.

In [None]:
# Nexus endpoint passed to every forge session created in this notebook.
ENDPOINT = "https://bbp.epfl.ch/nexus/v1"

In [None]:
# Prompt for the authentication token interactively instead of hardcoding it.
TOKEN = getpass.getpass()

In [None]:
# Directory into which the datasets of this notebook are downloaded.
DOWNLOAD_DIR = "."  # here you can specify any path, current directory is selected in the example

### Configure a client (forge) to access the knowledge graph 

Forge session for working with AIBS data:

In [None]:
# Forge session bound to the "bbp/aibs" bucket.
aibs_forge = KnowledgeGraphForge(
    "https://raw.githubusercontent.com/BlueBrain/nexus-forge/master/examples/notebooks/use-cases/prod-forge-nexus.yml",
    bucket="bbp/aibs",
    endpoint=ENDPOINT,
    token=TOKEN,
)

Forge session for working with gene data:

In [None]:
# Forge session bound to the "bbp/gene-annotations" bucket.
genes_forge = KnowledgeGraphForge(
    "https://raw.githubusercontent.com/BlueBrain/nexus-forge/master/examples/notebooks/use-cases/prod-forge-nexus.yml",
    bucket="bbp/gene-annotations",
    endpoint=ENDPOINT,
    token=TOKEN,
)

## Search and Download

Here, before we begin fetching the datasets and resources, we need to set the version of the dataset we want to work with. All the datasets (taxonomy of clusters, gene expression profiles) are tagged with the same tag.

The tag can be set to `None` to get the latest version.

In [None]:
# Tag shared by all the datasets fetched below; it is passed as `version`
# when retrieving resources. Set to None to get the latest version.
DATASET_TAG = '2022-02-17'

### Original AIBS datasets

1. Taxonomy of clusters (serialized cluster hierarchy with all node information embedded).

In [None]:
# Take the first TypeHierarchyDataset found by the search and re-fetch it
# by id at the requested version.
type_hierarchy_dataset = aibs_forge.retrieve(
    aibs_forge.search({"type": "TypeHierarchyDataset"}, limit=None)[0].id,
    version=DATASET_TAG,
)

In [None]:
# Show the retrieved taxonomy dataset resource.
print(type_hierarchy_dataset)

Download the dataset

In [None]:
# Download the taxonomy distribution (following its `contentUrl`) into
# DOWNLOAD_DIR, overwriting any existing copy.
aibs_forge.download(
    type_hierarchy_dataset.distribution,
    path=DOWNLOAD_DIR,
    follow="contentUrl",
    overwrite=True,
)

2. Gene Expression by Cluster, trimmed means (gene expression aggregated per cell type, calculated as trimmed means (25%-75%)).

In [None]:
# Take the first dataset matching this type and name, then re-fetch it by id
# at the requested version.
mean_expression_dataset = aibs_forge.retrieve(
    aibs_forge.search(
        {
            "type": "GeneExpressionDataset",
            "name": "Allen Transcriptomic Types (Trimmed) Mean Expression Profiles",
        },
        limit=None,
    )[0].id,
    version=DATASET_TAG,
)

In [None]:
# Show the retrieved (trimmed) mean expression dataset resource.
print(mean_expression_dataset)

Download the dataset

In [None]:
# Download the mean expression distribution (following its `contentUrl`)
# into DOWNLOAD_DIR, overwriting any existing copy.
aibs_forge.download(
    mean_expression_dataset.distribution,
    path=DOWNLOAD_DIR,
    follow="contentUrl",
    overwrite=True,
)

Read the downloaded gene expression data into a pandas dataframe.

In [None]:
# The downloaded file is stored under the distribution's name in DOWNLOAD_DIR.
raw_mean_expression_data = pd.read_csv(
    os.path.join(DOWNLOAD_DIR, mean_expression_dataset.distribution.name)
)
raw_mean_expression_data.head(n=3)

3. Gene Expression by Cluster, medians (gene expression aggregated per cell type).

In [None]:
# Take the first dataset matching this type and name, then re-fetch it by id
# at the requested version.
median_expression_dataset = aibs_forge.retrieve(
    aibs_forge.search(
        {
            "type": "GeneExpressionDataset",
            "name": "Allen Transcriptomic Types Median Expression Profiles",
        },
        limit=None,
    )[0].id,
    version=DATASET_TAG,
)

In [None]:
# Show the retrieved median expression dataset resource.
print(median_expression_dataset)

In [None]:
# Download the median expression distribution (following its `contentUrl`)
# into DOWNLOAD_DIR, overwriting any existing copy.
aibs_forge.download(
    median_expression_dataset.distribution,
    path=DOWNLOAD_DIR,
    follow="contentUrl",
    overwrite=True,
)

In [None]:
# The downloaded file is stored under the distribution's name in DOWNLOAD_DIR.
raw_median_expression_data = pd.read_csv(
    os.path.join(DOWNLOAD_DIR, median_expression_dataset.distribution.name)
)
raw_median_expression_data.head(n=3)

### Annotations of TTypes

Get all TTypes for the retrieved taxonomy dataset.

In [None]:
# Select the ids of all non-deprecated gene expression annotations derived
# from the retrieved taxonomy dataset.
query = f"""
    SELECT ?id
    WHERE {{
        ?id a GeneExpressionAnnotation ;
            <https://bluebrain.github.io/nexus/vocabulary/deprecated> false ;
            derivation/entity <{type_hierarchy_dataset.id}>.
    }}
"""
annotation_ids = aibs_forge.sparql(query, limit=None)

# Retrieve every annotation at the version given by DATASET_TAG so that the
# annotations match the dataset version in use. The retrieval is done
# unconditionally: with DATASET_TAG set to None no specific version is
# requested, instead of leaving the list empty (which broke the
# `ttype_annotations[0]` accesses further down).
ttype_annotations = [
    aibs_forge.retrieve(annotation.id, version=DATASET_TAG)
    for annotation in annotation_ids
]

In [None]:
# Report how many annotations were retrieved.
print("Retrieved", len(ttype_annotations), "TType annotations")

You can work with the retrieved resources as a dataframe

In [None]:
# Convert the retrieved annotation resources into a dataframe, list its
# columns and preview the first three rows.
df = aibs_forge.as_dataframe(ttype_annotations)
print(df.columns)
df.head(3)

We can visualize brain regions aggregated from all individual cells for each of the T-types.

We can visualize subject sex aggregated from all individual cells for each of the T-types.

In [None]:
def get_brain_region_notations(records):
    """Return the ``"notation"`` value of every record.

    A float input (e.g. the NaN pandas uses for a missing cell) yields an
    empty list; otherwise ``records`` is iterated and each item is indexed
    with ``"notation"``.
    """
    notations = []
    if not isinstance(records, float):
        for entry in records:
            notations.append(entry["notation"])
    return notations

In [None]:
# Add a column holding the list of brain region notations of each row, then
# show it next to the label.
df["regionLabel"] = df["brainLocation.brainRegion"].apply(get_brain_region_notations)
df[["label", "regionLabel"]]

In [None]:
def get_sex_labels(records):
    """Return the ``"label"`` value of every record.

    A float input (e.g. the NaN pandas uses for a missing cell) yields an
    empty list; otherwise ``records`` is iterated and each item is indexed
    with ``"label"``.
    """
    labels = []
    if not isinstance(records, float):
        for entry in records:
            labels.append(entry["label"])
    return labels

In [None]:
# Add a column holding the list of subject sex labels of each row, then show
# it next to the label.
df["sexLabel"] = df["subject.sex"].apply(get_sex_labels)
df[["label", "sexLabel"]]

We can get a list of "high-order" types from the cell clustering whose gene expression annotation was inferred (only the mean is calculated, as the mean of all the representatives of the cluster).

In [None]:
# Labels of the rows whose `inferred` value is not missing.
df.loc[df["inferred"].notna(), "label"]

You can work with individual results as json:

In [None]:
# Convert the first retrieved annotation to its JSON representation.
json_repr = aibs_forge.as_json(ttype_annotations[0])

# Pretty print of the JSON dict (three-space indentation)
print(json.dumps(json_repr, indent="   "))

You can search by a particular TType label

In [None]:
# Search the annotations whose target source is labelled "376_Astro" and that
# derive from the retrieved taxonomy dataset.
annotation_376_Astro = aibs_forge.search(
    {
        "type": "GeneExpressionAnnotation",
        "hasTarget": {"hasSource": {"label": "376_Astro"}},
        "derivation": {"entity": {"id": type_hierarchy_dataset.id}},
    },
    limit=None,
)

In [None]:
# Print every annotation returned by the search.
for a in annotation_376_Astro:
    print(a)

### Gene Expression Profiles of TTypes

Retrieve the gene expression profile associated with a given TType annotation.

In [None]:
# Use the first retrieved TType annotation as the running example.
annotation = ttype_annotations[0]

In [None]:
# Label of the source targeted by the selected annotation.
annotation.hasTarget.hasSource.label

In [None]:
# Gather all expression profile batches of the selected annotation at the
# dataset version in use.
gene_expression_profile = retrieve_expression_profile(
    aibs_forge,
    annotation,
    tag=DATASET_TAG,
)

In [None]:
# Display the whole retrieved profile (the output can be long).
gene_expression_profile

Gene expression profile contains all non-zero expression values for different genes.

In [None]:
# Print 5 records of the retrieved expression profile
for el in gene_expression_profile[:5]:
    if isinstance(el["series"], dict):
        el["series"] = [el["series"]]
    print("Gene: ", el["isMeasurementOf"]["label"])
    for record in el["series"]:
        print("\tStatistic: ", record["statistic"])
        print("\tValue: ", record["value"], record["unitCode"])
        print()

We can also get a raw matrix with gene expression for each 'leaf' T-type and perform the necessary operations on it (e.g. compute the variance).

Download and open the raw expression matrix

In [None]:
# Retrieve the raw expression dataset referenced by the selected annotation
# (no version is passed to this retrieval).
raw_expression_dataset = aibs_forge.retrieve(annotation.rawExpressionDataset.id)

In [None]:
# Download the raw expression distribution (following its `contentUrl`) into
# DOWNLOAD_DIR, overwriting any existing copy.
aibs_forge.download(
    raw_expression_dataset.distribution,
    path=DOWNLOAD_DIR,
    follow="contentUrl",
    overwrite=True,
)

In [None]:
# Load the downloaded matrix, stored under the distribution's name in
# DOWNLOAD_DIR.
expression_matrix = np.load(
    os.path.join(DOWNLOAD_DIR, raw_expression_dataset.distribution.name)
)

In [None]:
# Dimensions of the loaded expression matrix.
expression_matrix.shape

Now we also need to load the index for this matrix (the list of genes to which columns correspond).

In [None]:
# Retrieve the gene index resource referenced by the raw expression dataset
# and download its distribution into DOWNLOAD_DIR, overwriting any copy.
index_resource = aibs_forge.retrieve(raw_expression_dataset.geneExpressionIndex.id)
aibs_forge.download(
    index_resource.distribution,
    path=DOWNLOAD_DIR,
    follow="contentUrl",
    overwrite=True,
)

In [None]:
# Parse the downloaded index file (JSON), stored under the distribution's
# name in DOWNLOAD_DIR.
with open(os.path.join(
        DOWNLOAD_DIR, index_resource.distribution.name), "r") as f:
    gene_index = json.load(f)

In [None]:
# Preview the first entries of the index and report its length.
print("First five genes: ", gene_index[:5])
print("Number of genes in the index: ", len(gene_index))

Finally, we can compute, for example, variance:

In [None]:
# axis=0 reduces over the rows, giving one variance per matrix column; the
# columns correspond to the genes listed in `gene_index`.
gene_variance = np.var(expression_matrix, axis=0)

In [None]:
# Pair the first five genes of the index with the variance at the same position.
print("Variance for the five first genes:")
for i, gene in enumerate(gene_index[:5]):
    print("\t", gene, gene_variance[i])

### Gene meta-data and their GO annotations

Retrieve some gene meta-data:

- name, description
- cross-references
- go terms (molecular function, biological process, cellular component)

Search for a gene by UniprotAC

In [None]:
# Search genes whose identifier has propertyID "UniProtAC" and the given
# value, then print each hit.
gene_uniprot_ac = "Q6PFX2"
genes = genes_forge.search(
    {
        "type": "Gene",
        "identifier": {"propertyID": "UniProtAC", "value": gene_uniprot_ac},
    }
)
for gene in genes:
    print(gene)

Search for a gene by its common name (for now, the name must exactly match the one registered in Nexus)

In [None]:
# Search genes by label, then print each hit.
gene_name = "Bend6"
genes = genes_forge.search({"type": "Gene", "label": gene_name})
for gene in genes:
    print(gene)

Let us take 5 arbitrary records from the previously extracted gene expression profiles and retrieve their meta-data.

In [None]:
# For the first five profile records, retrieve and print the gene resource
# that the record is a measurement of.
for el in gene_expression_profile[:5]:
    gene_link = el["isMeasurementOf"]
    # Records whose gene link carries no "id" are skipped.
    if "id" in gene_link:
        gene_resource = genes_forge.retrieve(gene_link["id"])
        print(gene_resource)