# Process the large transcriptomic data

In [None]:
import getpass
import json
import re

from ast import literal_eval
from collections import defaultdict

import urllib.request

import pandas as pd
import numpy as np

from kgforge.core.forge import KnowledgeGraphForge
from kgforge.core.resource import Resource
from kgforge.specializations.resources import Dataset

### Download the data if not done already

In [None]:
# urllib.request.urlretrieve(
#     "https://idk-etl-prod-download-bucket.s3.amazonaws.com/aibs_mouse_ctx-hpf_10x/matrix.csv", "allen_cell_expression.csv")
# urllib.request.urlretrieve(
#     "https://idk-etl-prod-download-bucket.s3.amazonaws.com/aibs_mouse_ctx-hpf_10x/metadata.csv", "allen_cell_metadata.csv")

### Group the metadata of cell samples by t-type.

In [None]:
# Load the per-cell metadata table, then collapse it to one row per
# `cell_type_accession_label`: every other column becomes the list of
# distinct values seen for that label. The values pass through a set, so
# the order inside each list is not guaranteed.
metadata = pd.read_csv("allen_cell_metadata.csv")
grouped_data = (
    metadata
    .groupby("cell_type_accession_label")
    .aggregate(lambda values: list(set(values)))
)

In [None]:
# Save the grouped table to disk so it can be reloaded from
# "grouped_meta_data.csv" without recomputing the group-by.
grouped_data.to_csv("grouped_meta_data.csv")

In [None]:
# Reload the grouped table, index it by t-type accession label and keep only
# the `sample_name` column.
grouped_data = pd.read_csv("grouped_meta_data.csv")
grouped_data = grouped_data.set_index("cell_type_accession_label")
grouped_data = grouped_data[["sample_name"]]

# Each remaining column is run through `literal_eval` to turn the textual
# cell values read from the CSV back into Python objects; the column name is
# printed as a simple progress trace.
for c in grouped_data.columns:
    print(c)
    grouped_data[c] = grouped_data[c].apply(literal_eval)

### Extract cell samples for each t-type.

Here we split the huge matrix (70GB) containing all cell samples into small matrices (`npy` extension), each holding the expression values measured from the cells belonging to one particular t-type.


This is a long and costly process implemented in the script `aggregate_cells_v2.py` (takes ~10h to run). The resulting matrices are stored in the `cells_by_type` directory.

## Register t-type cell samples for individual t-types and link them to TType annotations in `bbp/aibs`.

In [None]:
# Prompt for the access token without echoing it, so it is never stored in
# the notebook. The value is passed as `token=` to the forge sessions below.
TOKEN = getpass.getpass()

In [None]:
# Forge session for the `bbp/aibs` bucket, configured from the
# `prod-forge-nexus.yml` file hosted in the nexus-forge repository and
# authenticated with TOKEN.
aibs_forge = KnowledgeGraphForge(
    "https://raw.githubusercontent.com/BlueBrain/nexus-forge/master/examples/notebooks/use-cases/prod-forge-nexus.yml",
    token=TOKEN,
    bucket="bbp/aibs")

In [None]:
from os import listdir
from os.path import isfile, join

# Names (not full paths) of the regular files located directly inside
# `cells_by_type`; sub-directories are left out.
matrix_files = list(
    filter(
        lambda f: isfile(join("cells_by_type", f)),
        listdir("cells_by_type"),
    )
)

In [None]:
# Forge session for the `bbp/gene-annotations` bucket, built from the same
# configuration file and token as the `bbp/aibs` session.
# NOTE(review): `genes_forge` is not used in any of the cells shown here —
# confirm whether it is still needed.
genes_forge = KnowledgeGraphForge(
    "https://raw.githubusercontent.com/BlueBrain/nexus-forge/master/examples/notebooks/use-cases/prod-forge-nexus.yml",
    token=TOKEN,
    bucket="bbp/gene-annotations")

In [None]:
# Ordered list of the column labels of `data`; it is written out below as
# the gene index shared by all expression matrices.
# NOTE(review): `data` is not defined in any cell shown here, so this cell
# fails with NameError on a fresh kernel. Presumably it is the expression
# matrix (or its header) loaded as a DataFrame with one column per gene —
# confirm and add the cell that loads it.
gene_index = list(data.columns)

In [None]:
# Persist the gene index as JSON in "gene_expression_index.json"; this is
# the file attached to the index dataset further down.
with open("gene_expression_index.json", "w") as f:
    f.write(json.dumps(gene_index))

In [None]:
# Describe the gene index as a dataset of type `GeneExpressionIndex` and
# attach the JSON file written above as its distribution.
index_dataset = Dataset(aibs_forge)
index_dataset.type = ["Dataset", "GeneExpressionIndex"]
index_dataset.name = "Index of genes for gene expressions"
index_dataset.description = (
    "Index of genes for raw gene expression datasets. "
    "Gene expression datasets are given by matrices whose columns "
    "correspond to genes specified in this index"
)
index_dataset.distribution = aibs_forge.attach(
    "gene_expression_index.json", content_type="application/json")

In [None]:
# Register the index dataset through the `bbp/aibs` forge session, then tag
# the registered resource with the release label "2022-02-17" (the same tag
# is applied to the per-t-type datasets below).
aibs_forge.register(index_dataset)
aibs_forge.tag(index_dataset, "2022-02-17")

In [None]:
# Keep the identifier of the index dataset so that every per-t-type
# expression dataset can reference it; the bare expression on the last line
# displays the value as the notebook cell output.
index_dataset_id = index_dataset.id
index_dataset_id

In [None]:
# For every t-type matrix file: register the matrix as a
# `GeneExpressionDataset` (pointing at the shared gene index) and link it
# from the matching `GeneExpressionAnnotation` resource through its
# `rawExpressionDataset` property. Both resources are tagged "2022-02-17".
for file in matrix_files:
    print("Processing... ", file)

    # Matrix files are named "<t-type identifier>_cells.npy". Any other file
    # found in the directory is skipped instead of crashing on a failed
    # match (`re.match` returning None).
    match = re.fullmatch(r"(.*)_cells\.npy", file)
    if match is None:
        print("Skipping, unexpected file name: ", file)
        continue
    ttype_id = match.group(1)

    # Look up the annotation whose identifier value equals the t-type id.
    found = aibs_forge.search({
        "type": "GeneExpressionAnnotation",
        "identifier": {
            "value": ttype_id
        }
    })
    # Without an annotation there is nothing to link the matrix to; an empty
    # (or missing) search result is reported and the file is skipped.
    if not found:
        print("Skipping, no GeneExpressionAnnotation found for: ", ttype_id)
        continue

    # Only the first hit is used.
    resource = found[0]
    ttype = resource.label

    expression_dataset = Dataset(aibs_forge)
    expression_dataset.type = ["Dataset", "GeneExpressionDataset"]
    expression_dataset.name = f"Raw gene expression matrix for {ttype}"
    # Reference to the gene index registered earlier, which names the
    # columns of the matrix.
    expression_dataset.geneExpressionIndex = aibs_forge.from_json(
        {
            "id": index_dataset_id,
            "type": "GeneExpressionIndex"
        })
    expression_dataset.description = f"Gene expression matrix for {ttype} provided as a npy (a servialized numpy matrix)"
    expression_dataset.distribution = aibs_forge.attach(
        f"cells_by_type/{file}", content_type="application/octet-stream")

    aibs_forge.register(expression_dataset)
    aibs_forge.tag(expression_dataset, "2022-02-17")

    # Link the freshly registered dataset from the annotation and push the
    # updated annotation.
    resource.rawExpressionDataset = aibs_forge.from_json({
        "id": expression_dataset.id,
        "type": "GeneExpressionDataset"
    })
    # NOTE(review): self-assignment kept from the original; it looks like a
    # no-op unless the resource class reacts to attribute assignment —
    # confirm before removing.
    resource.label = resource.label
    aibs_forge.update(resource)
    aibs_forge.tag(resource, "2022-02-17")