In [1]:
import scanpy as sc
import numpy as np
import pandas as pd
from flecs.utils import get_project_root
import os

## Load original data

In [2]:
# Directory holding the raw Paul15 files inside the project
paul15_dir = os.path.join(get_project_root(), "datasets", "Paul15")
umi_table_path = os.path.join(paul15_dir, "GSE72857_umitab.txt")
design_path = os.path.join(paul15_dir, "GSE72857_experimental_design.txt")

# Read the tab-separated UMI table, then transpose it
umi_table = sc.read_csv(umi_table_path, delimiter="\t")
adata = umi_table.T

# Experimental design table: the first 19 lines of the file are skipped, rows are indexed by "Well_ID"
exp_des = pd.read_csv(design_path, sep="\t", skiprows=19, index_col="Well_ID")

# Use the experimental design rows, selected and ordered by adata's observation index, as obs
adata.obs = exp_des.loc[adata.obs.index]

## Load mouse ATAC-seq-derived GRN

In [3]:
# Base GRN table; the first CSV column is used as the row index
grn_csv_path = os.path.join(get_project_root(), "datasets", "Paul15", "mouse_scATAC_atlas_base_GRN.csv")
grn_df = pd.read_csv(grn_csv_path, index_col=0)

Extract all gene names from the GRN.

In [4]:
# TFs: every column of grn_df from the third one onwards
all_grn_tfs = [column for column in grn_df.columns[2:]]

# Other genes: the entries of the "gene_short_name" column
all_grn_other_genes = grn_df["gene_short_name"].to_list()

# Sorted, de-duplicated union of both lists
all_grn_genes = np.unique([*all_grn_tfs, *all_grn_other_genes])

## Map GRN gene names to the variable names in the adata

In [5]:
def find_match(grn_gene, var_names):
    """Map a GRN gene name to a single variable name.

    Matching is case-insensitive and proceeds in two steps:

    1. If exactly one variable name equals ``grn_gene``, it is returned. This
       keeps a gene such as "Myc" from being discarded only because other
       variable names ("Mycn", ...) happen to contain it.
    2. Otherwise, the gene is matched to the variable name that contains it as
       a substring, provided that there is exactly one such name.

    Args:
        grn_gene: Gene name (string) to look up.
        var_names: Iterable of candidate variable names (strings).

    Returns:
        The matched variable name, or -1 if there is no match or the match
        is ambiguous.
    """
    query = grn_gene.lower()
    # Lower-case every candidate once; also materialises var_names so that it is iterated a single time
    candidates = [(name, name.lower()) for name in var_names]

    exact_matches = [name for name, lowered in candidates if lowered == query]
    if len(exact_matches) == 1:
        return exact_matches[0]

    # Fall back to substring matching, accepted only when unambiguous
    match_list = [name for name, lowered in candidates if query in lowered]
    if len(match_list) == 1:
        return match_list[0]
    return -1


In [None]:
var_names = list(adata.var.index)

# Map every GRN gene to a variable name, keeping only the genes that have been mapped
# (find_match returns -1 for the others)
grn_genes_to_var_match_unfiltered = {}
for grn_gene in all_grn_genes:
    matched_var = find_match(grn_gene, var_names)
    if matched_var != -1:
        grn_genes_to_var_match_unfiltered[grn_gene] = matched_var

# Drop duplicate values (several GRN genes may map to the same var): only the first GRN gene
# mapped to a given var is kept. A set of already used vars avoids rescanning the dict values
# at every iteration.
grn_genes_to_var_match = {}
seen_vars = set()
for grn_gene, matched_var in grn_genes_to_var_match_unfiltered.items():
    if matched_var not in seen_vars:
        seen_vars.add(matched_var)
        grn_genes_to_var_match[grn_gene] = matched_var

## Subset adata and build GRN adjacency matrix

In [None]:
# Position of each mapped GRN gene, following the insertion order of grn_genes_to_var_match
all_keys_list = [*grn_genes_to_var_match]
grn_genes_to_idx = {gene: position for position, gene in enumerate(all_keys_list)}

In [None]:
# Keep only the variables matched to a GRN gene, in the order of grn_genes_to_var_match
matched_var_names = [var_name for var_name in grn_genes_to_var_match.values()]
adata_subset = adata[:, matched_var_names].copy()

In [None]:
# Square all-zero matrix with one row and one column per variable of adata_subset
n_subset_vars = adata_subset.shape[1]
grn_adj_mat = np.zeros((n_subset_vars, n_subset_vars))

In [None]:
# Fill the GRN adjacency matrix using the indices of grn_genes_to_idx, so that rows and columns
# are in the order consistent with adata_subset. Entry [tf, gene] is set to 1 when the tf column
# of grn_df equals 1 for that gene.
mapped_grn_tfs = [tf for tf in all_grn_tfs if tf in grn_genes_to_var_match]

for tf in mapped_grn_tfs:
    tf_row = grn_genes_to_idx[tf]
    is_targeted = grn_df[tf] == 1
    targeted_genes = grn_df.loc[is_targeted, "gene_short_name"].unique()
    # Targets that were not mapped to a variable are skipped
    target_indices = [
        grn_genes_to_idx[gene] for gene in targeted_genes if gene in grn_genes_to_var_match
    ]
    grn_adj_mat[tf_row, target_indices] = 1

In [None]:
# Attach the GRN adjacency matrix to the AnnData object as a pairwise variable annotation,
# stored under the key "grn_adj_mat"
adata_subset.varp["grn_adj_mat"] = grn_adj_mat

## Save

In [None]:
# Write the subset (with the GRN adjacency matrix) to disk.
# The output directory is created first so that the write does not fail when it is missing.
processed_dir = os.path.join(get_project_root(), "datasets", "Paul15", "processed")
os.makedirs(processed_dir, exist_ok=True)
adata_subset.write_h5ad(os.path.join(processed_dir, "paul15_with_mouse_grn.h5ad"))