In [1]:
import pandas as pd
from functools import reduce
import numpy as np
import matplotlib.pyplot as plt
import scipy.io as sio
from cplAE_MET.preproc.data_proc_MET import set_paths

# Candidate data roots so the notebook can be run from either machine.
# Neither constant is referenced by the cells visible in this notebook section;
# the input locations actually used come from set_paths().
# NOTE(review): despite its name, remote_dir holds what looks like a local
# workstation path (/Users/...), while Allen_dir points at the shared /allen
# filesystem — confirm the naming, and consider moving these hard-coded
# absolute paths into the config file.
Allen_dir = "/allen/programs/celltypes/workgroups/rnaseqanalysis/Fahimehb/git_workspace/cplAE_MET/data/"
remote_dir = "/Users/fahimehb/Documents/git-workspace/cplAE_MET/data/"

In [2]:
# Resolve the input locations from the preprocessing config file. The returned
# object is indexed by key in the following cells ('specimen_ids', 'e_input',
# 't_input', 'm_input', 'gene_id_input', 'anno').
# NOTE(review): set_paths is a project helper; how it locates
# "config_preproc.toml" is not visible here — confirm it does not depend on the
# current working directory.
pth = set_paths(config_file="config_preproc.toml")

### Reading T, E and M data to be combined together in a mat file

In [3]:
# Read every model input from the locations held in `pth`, printing progress
# and the shape of each data table/array right after it is loaded.
print("...................................................", "Loading specimens", sep="\n")
cells = pd.read_csv(pth['specimen_ids'])

# E and T are CSV tables; M is a MATLAB file read into a dict of arrays.
print("Loading E, T and M data")

E_data = pd.read_csv(pth['e_input'])
print("shape of E data:", E_data.shape)

T_data = pd.read_csv(pth['t_input'])
print("shape of T data:", T_data.shape)

M_dat = sio.loadmat(pth['m_input'])
print("shape of hist_ax_de_api_bas data:", M_dat['hist_ax_de_api_bas'].shape)

# Gene identifiers, stored in their own CSV.
gene_id = pd.read_csv(pth['gene_id_input'])

print("Loading T annotations")
T_ann = pd.read_csv(pth['anno'])

...................................................
Loading specimens
Loading E, T and M data
shape of E data: (6570, 133)
shape of T data: (6570, 1253)
shape of hist_ax_de_api_bas data: (6570, 120, 4, 4)
Loading T annotations


### Aligning T, E and M data

In [4]:
print("...................................................")
print("read specimen ids from m data and align other with that")
# The M mat file fixes the reference row order: its arrays are used
# positionally later on, so every other table is aligned to its specimen ids.
# `[0]` takes the first row of the stored arrays
# (presumably shape (1, n_cells) as returned by loadmat — TODO confirm).
m_anno = pd.DataFrame({"specimen_id": M_dat['specimen_id'][0], "soma_depth": M_dat["soma_depth"][0]})

print("...................................................")
print("Combining M, E and T data and metadata")
# Left-join each table onto m_anno in turn. A single fold over the whole list
# is equivalent to the chain of pairwise merges and keeps m_anno's key order.
result = reduce(
    lambda left, right: pd.merge(left, right, on=['specimen_id'], how='left'),
    [m_anno, E_data, T_data, cells, T_ann],
)

# A left merge silently adds rows when a right-hand table repeats a
# specimen_id. That would shift E/T/metadata rows relative to the M arrays,
# so fail loudly here instead of writing a misaligned file.
assert np.array_equal(
    result["specimen_id"].to_numpy(), m_anno["specimen_id"].to_numpy()
), "merged table is no longer aligned row-for-row with the M data (duplicate specimen_id in an input table?)"


...................................................
read specimen ids from m data and align other with that
...................................................
Combining M, E and T data and metadata


### Combining all the data in a mat file

In [5]:
print("...................................................")
print("Writing the output mat")

# Feature columns of each modality: everything in the original table except
# the join key.
e_feature_cols = [c for c in E_data.columns if c != "specimen_id"]
t_feature_cols = [c for c in T_data.columns if c != "specimen_id"]

#writing M, E and T data
model_input_mat = {}
model_input_mat["E_dat"] = np.array(result[e_feature_cols])
model_input_mat["T_dat"] = np.array(result[t_feature_cols])
# M data is taken straight from the mat file (not from `result`), so it relies
# on `result` having the same row order as the mat file.
model_input_mat["M_dat"] = np.array(M_dat["hist_ax_de_api_bas"])
model_input_mat["soma_depth"] = np.array(result["soma_depth"])
model_input_mat["gene_ids"] = gene_id["gene_id"].to_list()


#writing the sample_ids and the masks and some meta data
model_input_mat["specimen_id"] = result.specimen_id.to_list()
model_input_mat["cluster_id"] = result.Tree_first_cl_id.to_list()
model_input_mat["cluster_color"] = result.Tree_first_cl_color.to_list()
model_input_mat["cluster_label"] = result.Tree_first_cl_label.to_list()

# Every per-cell array must have one row per specimen; a mismatch means the
# merged table and the M arrays are out of step.
n_cells = len(model_input_mat["specimen_id"])
for key in ("M_dat", "soma_depth", "E_dat", "T_dat"):
    assert model_input_mat[key].shape[0] == n_cells, (
        f"{key} has {model_input_mat[key].shape[0]} rows, expected {n_cells}"
    )

#Saving input mat
print("Size of M data:", model_input_mat['M_dat'].shape)
# Label fixed: this line reports soma_depth, not M data.
print("Size of soma_depth data:", model_input_mat["soma_depth"].shape)
print("Size of E data:", model_input_mat["E_dat"].shape)
print("Size of T data:", model_input_mat["T_dat"].shape)
# NOTE(review): no save call (e.g. sio.savemat) is visible in this cell;
# presumably a later cell writes model_input_mat to disk — confirm.
print("saving!")

...................................................
Writing the output mat
Size of M data: (6570, 120, 4, 4)
Size of M data: (6570,)
Size of E data: (6570, 132)
Size of T data: (6570, 1252)
saving!
