# Prepare the environment
Load the libraries and packages

In [51]:
import scanpy as sc
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib import colors
import seaborn as sb
from gprofiler import GProfiler

import rpy2.rinterface_lib.callbacks
import logging

from rpy2.robjects import pandas2ri
import anndata2ri

In [52]:
# Sample labels have the form '<region>_<donor>'. Each label is paired, by
# position, with the id suffix at the same index in sample_id_strings.
sample_strings = [
    'Duo_M1', 'Duo_M2',
    'Jej_M1', 'Jej_M2',
    'Il_M1', 'Il_M2',
]
sample_id_strings = [str(id_suffix) for id_suffix in range(3, 9)]

# Pieces of the per-sample file names; a full path is assembled as
# file_base + sample_id + exp_string + sample + <one of the *_file_end values>.
file_base = '/home/yiyang/github/MD/data/GSE92332_uncompressed/GSM283657'
exp_string = '_Regional_'
data_file_end = '_matrix.mtx'
barcode_file_end = '_barcodes.tsv'
gene_file_end = '_genes.tsv'

# Cell-cycle gene list, stored separately from the per-sample files.
cc_genes_file = '/home/yiyang/github/MD/data/Macosko_cell_cycle_genes.txt'

def load_and_process_sample(sample, sample_id):
    """Read one sample's matrix and attach its cell and gene annotations.

    The three input paths are assembled from the module-level ``file_base``,
    ``exp_string`` and ``*_file_end`` constants together with the arguments.

    Args:
        sample: Sample label. It is split on ``"_"``; the first piece is
            stored as the region and the second as the donor.
        sample_id: String placed between ``file_base`` and ``exp_string``
            when building the file paths.

    Returns:
        The transposed result of ``sc.read_mtx`` with
        ``obs`` indexed by ``'<sample>_<barcode>'`` (columns ``barcode``,
        ``sample``, ``region``, ``donor``) and ``var`` indexed by
        ``'<gene_symbol>_<gene_id>'`` (columns ``gene_id``, ``gene_symbol``).
    """
    path_stem = f"{file_base}{sample_id}{exp_string}{sample}"

    # Both annotation tables are read as header-less, tab-separated files.
    adata = sc.read_mtx(f"{path_stem}{data_file_end}").transpose()
    barcodes = pd.read_csv(f"{path_stem}{barcode_file_end}", header=None, sep='\t')
    genes = pd.read_csv(f"{path_stem}{gene_file_end}", header=None, sep='\t')

    # Cell annotations: prefix each barcode with the sample label and use
    # the prefixed value as the obs index.
    barcodes.columns = ['barcode']
    barcodes['unique_barcode'] = sample + '_' + barcodes['barcode'].astype(str)
    adata.obs = barcodes.set_index('unique_barcode')

    label_parts = sample.split("_")
    adata.obs['sample'] = sample
    adata.obs['region'] = label_parts[0]
    adata.obs['donor'] = label_parts[1]

    # Gene annotations: the var index combines symbol and id.
    genes.columns = ['gene_id', 'gene_symbol']
    genes['unique_gene'] = genes['gene_symbol'] + '_' + genes['gene_id']
    adata.var = genes.set_index('unique_gene')

    return adata


In [53]:
# Load every sample; labels and id suffixes are paired by position.
adatas = []
for sample, sample_id in zip(sample_strings, sample_id_strings):
    adatas.append(load_and_process_sample(sample, sample_id))

# Combine all samples into one object with an outer join on the genes.
# NOTE(review): AnnData.concatenate is deprecated in recent anndata releases
# in favour of anndata.concat. It is kept here because the two differ in how
# they name cells/batches and merge var columns -- migrate deliberately.
adata = adatas[0].concatenate(adatas[1:], join='outer')

adata.var_names_make_unique()
adata.obs_names_make_unique()

# var names were built as '<gene_symbol>_<gene_id>'. Split once, on the LAST
# underscore, so a gene symbol that itself contains an underscore is kept
# whole instead of being truncated at its first underscore.
# Assumes gene ids contain no underscore -- TODO confirm for this dataset.
symbol_and_id = adata.var.index.str.rsplit('_', n=1)
adata.var['original_gene_symbol'] = symbol_and_id.str[0]
adata.var['gene_id'] = symbol_and_id.str[1]

# Every symbol receives a '_<n>' suffix, where n counts earlier occurrences of
# the same symbol (starting at 0) -- including symbols that occur only once.
adata.var['unique_gene_symbol'] = (
    adata.var['original_gene_symbol']
    + '_'
    + adata.var.groupby('original_gene_symbol').cumcount().astype(str)
)

adata.var_names = adata.var['unique_gene_symbol']

adata.var_names_make_unique()

# Cell-cycle gene lists. Each line of the file is read as a single string
# (one column, default comma separator).
# NOTE(review): this presumes lines of the form 's_genes:<tab-separated
# symbols>' and 'g2m_genes:<...>'; if no line starts with those prefixes both
# lists come back empty -- verify against the actual file layout.
# NOTE(review): the symbols are taken verbatim from the file, while var_names
# above carry a '_<n>' suffix -- confirm they match before using these lists
# for lookups.
cc_genes = pd.read_csv(cc_genes_file, header=None, names=['gene_symbols'])
cc_lines = cc_genes['gene_symbols']
s_genes = (
    cc_lines[cc_lines.str.startswith('s_genes')]
    .str.split('s_genes:').str[1]
    .str.split('\t')
    .explode()
    .tolist()
)
g2m_genes = (
    cc_lines[cc_lines.str.startswith('g2m_genes')]
    .str.split('g2m_genes:').str[1]
    .str.split('\t')
    .explode()
    .tolist()
)



  adata = adatas[0].concatenate(adatas[1:], join='outer')


In [59]:
# Report how many cells fall into each region, donor and sample, then save
# the annotated object to disk.
# NOTE(review): the output recorded for this cell lists a single group
# (Duo / M1 / Duo_M1, 2348 cells) although six samples are loaded earlier --
# confirm that `adata` is the combined object when this cell runs.
for obs_column in ('region', 'donor', 'sample'):
    print(f"{obs_column}:")
    print(adata.obs[obs_column].value_counts())
    print("\n" + "="*50 + "\n")

adata.write('/home/yiyang/github/MD/data/mouse_intestinal_epithelium.h5ad')



region:
region
Duo    2348
Name: count, dtype: int64


donor:
donor
M1    2348
Name: count, dtype: int64


sample:
sample
Duo_M1    2348
Name: count, dtype: int64


