# Preprocessing the ABA spatial transcriptomics data

In [None]:
# Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from morphontogeny.functions.IO import nifti_to_array
from sklearn.preprocessing import StandardScaler

In [None]:
# The genes used in this analysis are taken from:
# Bohland et al., 2010: https://doi.org/10.1016/j.ymeth.2009.09.001

# Load the gene table, which is included in this repo at
# files/list_top_genes.csv.
# The table is bound to `gene_table` rather than `list` so that the builtin
# `list` type is not shadowed for the rest of the session.
gene_table = pd.read_csv("files/list_top_genes.csv")

# Keep only the rows flagged as used in the analysis.
# NOTE: the filtered frame keeps the row labels of the full table, so its
# index is not guaranteed to be 0..n-1; use positional access (.iloc) when
# iterating over it by position.
list_top = gene_table[gene_table['Used in analysis?'] == 1]

In [None]:
# All gene expression files are downloaded using the tools provided by
# Allen Institute in R language (not provided here) and converted to CSV format

# Make the E(v,g) matrix of ISH data for top genes
# Voxels stored in rows, genes as columns
# (but some of the genes in the list can't be found in the ABA dataset;
# their columns are left as zeros and their IDs are collected in
# `missing_genes`)

# The matrix is backed by a numpy MEMMAP since it is large for the RAM.
# It is created lazily, once the first gene file has been read, so that the
# number of rows (voxels) is taken from the data itself.
topgenes_mmp = None
missing_genes = []

# Positional list of gene IDs: `list_top` is a filtered frame, so its row
# labels are not necessarily 0..n-1 and label-based `[i]` lookups can fail.
gene_ids = list_top['ID'].tolist()

for i, gene_id in enumerate(gene_ids):
    try:
        # Skip the header line and the first column, read the next 2378
        # columns, and flatten the table into one vector per gene.
        expression = np.genfromtxt('data/' + str(gene_id) + '.csv',
                                   delimiter=',', skip_header=1,
                                   usecols=range(1, 2379)).flatten()
    except OSError:
        # Only a missing/unreadable file is tolerated; any other error
        # (e.g. a malformed file) is allowed to surface.
        missing_genes.append(gene_id)
        continue

    if topgenes_mmp is None:
        # mode='w+' creates a zero-filled file, so columns of genes that
        # are skipped stay at zero.
        topgenes_mmp = np.memmap('data/genes_mmp.mymemmap',
                                 dtype='float32', mode='w+',
                                 shape=(expression.size, len(gene_ids)))

    topgenes_mmp[:, i] = expression

if topgenes_mmp is None:
    raise FileNotFoundError(
        'None of the gene expression CSV files could be read from data/')

# Make sure everything written so far reaches the file on disk
topgenes_mmp.flush()

if missing_genes:
    print('Genes skipped (no readable CSV file):', len(missing_genes))

In [None]:
# The data contain -1 entries that have to be handled.
# The basic approach is to replace them with 0s;
# imputing them instead is the recommended alternative.

# Changing negative values to 0s on an in-memory copy of the matrix
X_pos = np.array(topgenes_mmp)
X_pos[X_pos < 0] = 0

In [None]:
# Standardize every feature (column): subtract its mean and scale it to
# unit variance
feature_scaler = StandardScaler()
X_std = feature_scaler.fit_transform(X_pos)

In [None]:
# Masking the data for half brain
# Using the NIFTI files for the Allen Mouse Brain Average in 200 um res.
# Provided in the 'files' folder

# Loading the neuroanatomy
anat_arr = nifti_to_array('files/allen_annot200.nii')

# Halving by Neuroanatomy:
# copy only the first 29 slices along the second axis, leave the rest at zero
anat_half = np.zeros_like(anat_arr)
anat_half[:, :29, :] = anat_arr[:, :29, :]

# Vectorizing and getting indices of non-zero values.
# np.flatnonzero returns a plain 1-D index array (np.nonzero returns a tuple,
# which has no .shape and cannot be used to size the output matrix).
anat_half_vec = anat_half.flatten()
half_indices = np.flatnonzero(anat_half_vec)
anat_half_masked = anat_half_vec[half_indices]

# Saving indices to file (stored as a 1-D array of flat voxel indices)
# Indices file is provided in this repo
np.save('files/half_indices.npy', half_indices)

# Making a new array to save the half-brain masked gene expression data:
# one row per selected voxel, one column per gene.
# Rows are selected in one step instead of copying column by column.
half_arr = np.zeros((half_indices.shape[0], X_std.shape[1]))
half_arr[:] = X_std[half_indices, :]

# Standardizing the matrix
half_mask_pos_std = StandardScaler().fit_transform(half_arr)

In [None]:
# Masking the data for whole brain
# Using the NIFTI files for the Allen Mouse Brain Average in 200 um res.
# Provided in the 'files' folder

# Loading the neuroanatomy as a flat vector.
# `nifti_to_array` is the loader imported at the top of this file; the volume
# is flattened here the same way as in the half-brain masking, so both sets
# of indices refer to the same voxel ordering.
anat_vec = nifti_to_array('files/allen_annot200.nii').flatten()

# Getting indices of non-zero values.
# np.flatnonzero returns a plain 1-D index array (np.nonzero returns a tuple,
# which has no .shape and cannot be used to size the output matrix).
indices = np.flatnonzero(anat_vec)

# Masking the anatomy
anat_masked = anat_vec[indices]

# Saving indices to file (stored as a 1-D array of flat voxel indices)
# Indices file is provided in this repo
np.save('files/mask_indices.npy', indices)

# Making a new array to save the whole-brain masked gene expression data:
# one row per selected voxel, one column per gene.
# Rows are selected in one step instead of copying column by column.
mask_arr = np.zeros((indices.shape[0], X_std.shape[1]))
mask_arr[:] = X_std[indices, :]

# Standardizing the matrix
mask_pos_std = StandardScaler().fit_transform(mask_arr)