# Set up

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as pl
import scanpy as sc
import scanpy.external as sce
import os
import scipy.sparse as ssp
import sys
import pickle
from collections import Counter

# Folder containing helper_functions_py3.py.
# Set the SPRING_HELPER_DIR environment variable to point at your own copy;
# when it is unset, the path below (the original hard-coded location) is used.
HELPER_DIR = os.environ.get(
    'SPRING_HELPER_DIR',
    '/Users/kalki/Dropbox (HMS)/PhD_lab/SPRING_dev/data_prep/',
)
# Guard so that re-running this cell does not append the same entry repeatedly.
if HELPER_DIR not in sys.path:
    sys.path.append(HELPER_DIR)
import helper_functions_py3 as hf

sc.settings.verbosity = 2  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.set_figure_params(dpi=150)  # low dpi (dots per inch) yields small inline figures

# Load data

In [2]:
# Read the AnnData object for the 24 perturbation dataset from disk.
perturbation_h5ad = '../../../data/anndata_objects/adata24_perturbations.h5ad'
adata = sc.read_h5ad(perturbation_h5ad)


In [3]:
# Raw AnnData files live in two folders: adata6/8/10/14 under 4.Indrops5
# and adata18/21 under 3.Indrops4.
indrops5_dir = '../../../../4.Indrops5/data/anndata_objects/'
indrops4_dir = '../../../../3.Indrops4/data/anndata_objects/'

adata6 = sc.read_h5ad(indrops5_dir + 'adata6_raw.h5ad')
adata8 = sc.read_h5ad(indrops5_dir + 'adata8_raw.h5ad')
adata10 = sc.read_h5ad(indrops5_dir + 'adata10_raw.h5ad')
adata14 = sc.read_h5ad(indrops5_dir + 'adata14_raw.h5ad')
adata18 = sc.read_h5ad(indrops4_dir + 'adata18_raw.h5ad')
adata21 = sc.read_h5ad(indrops4_dir + 'adata21_raw.h5ad')


def _control_cells(ad):
    """Return a copy of `ad` restricted to rows whose obs.treatment equals 'control'."""
    return ad[ad.obs.treatment == 'control'].copy()


# adata6 is not subset here; it is used in full by the cells further down.
adata8c = _control_cells(adata8)
adata10c = _control_cells(adata10)
adata14c = _control_cells(adata14)
adata18c = _control_cells(adata18)
adata21c = _control_cells(adata21)

In [4]:
# Rescale every cell to a total of 10,000 counts.
# sc.pp.normalize_per_cell updates each object in place (X is normalized and
# obs['n_counts'] is added), so the loop variable needs no re-assignment.
counts_target = 10000
for dataset in (adata, adata6, adata8c, adata10c, adata14c, adata18c, adata21c):
    sc.pp.normalize_per_cell(dataset, counts_per_cell_after=counts_target)




normalizing by total count per cell
    finished (0:00:01): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
normalizing by total count per cell
    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
normalizing by total count per cell
    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
normalizing by total count per cell
    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
normalizing by total count per cell
    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
normalizing by total count per cell
    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
normalizing by total count per cell
    finished (0:00:00): normalized adata.X and

In [5]:
# Split the 24 perturbation dataset into one object per treatment label.
treatment = adata.obs.treatment
adata24c = adata[treatment == 'control'].copy()
adata24h = adata[treatment == 'perturbed'].copy()
adata24e = adata[treatment == 'emi'].copy()

# Split the control cells in two by replicate label:
# replicates '1', '3', '5', '7' form set 1; '2', '4', '6', '8' form set 2.
ctrl_replicate = adata24c.obs.replicate
ctrl_mask1 = ctrl_replicate.isin(['1', '3', '5', '7'])
ctrl_mask2 = ctrl_replicate.isin(['2', '4', '6', '8'])

adata24c1 = adata24c[ctrl_mask1].copy()
adata24c2 = adata24c[ctrl_mask2].copy()

# Find marker genes

In [6]:
# Run hf.find_markers on the control cells of the 24 perturbation dataset,
# grouping cells by their obs.state_v3p2 label.
# The labels are read from adata24c.obs so they are aligned row-for-row with
# adata24c.X and the cell does not depend on any object (`adata_pr`) that is
# not created in this notebook.
# NOTE(review): assumes adata.obs carries state_v3p2 (the replicate-split
# cells for adata24c1 / adata24c2 rely on the same column) - confirm.
marker_ctrl_coarse = hf.find_markers(
    adata24c.X,
    gene_list=np.array(adata24c.var_names),
    groups=np.array(adata24c.obs.state_v3p2),
    min_frac_expr=0.01,
    min_fold_change=1.2,
    pseudocount=0.1,
    max_p=0.05,
)


Testing 9622 genes for cluster Cardiac muscle 1
cluster=Cardiac muscle 1, n_cells=860, n_diff=4
Testing 9726 genes for cluster Cardiac muscle 2
cluster=Cardiac muscle 2, n_cells=49, n_diff=86
Testing 9738 genes for cluster Differentiating neurons
cluster=Differentiating neurons, n_cells=4390, n_diff=325
Testing 10020 genes for cluster Endodermal lining
cluster=Endodermal lining, n_cells=411, n_diff=17
Testing 10608 genes for cluster Endodermal: pancreatic
cluster=Endodermal: pancreatic, n_cells=55, n_diff=117
Testing 9889 genes for cluster Endodermal: pharyngeal pouch
cluster=Endodermal: pharyngeal pouch, n_cells=431, n_diff=29
Testing 9246 genes for cluster Endothelial
cluster=Endothelial, n_cells=1307, n_diff=241
Testing 10511 genes for cluster Epidermal other
cluster=Epidermal other, n_cells=5666, n_diff=85
Testing 8893 genes for cluster Erythroid
cluster=Erythroid, n_cells=2139, n_diff=238
Testing 10441 genes for cluster Floorplate
cluster=Floorplate, n_cells=127, n_diff=94
Testing

KeyboardInterrupt: 

In [None]:
# Run hf.find_markers on control set 1 (adata24c1), grouped by obs.state_v3p2.
ctrl1_genes = np.array(adata24c1.var_names)
ctrl1_states = np.array(adata24c1.obs.state_v3p2)
marker_ctrl1_coarse = hf.find_markers(
    adata24c1.X,
    gene_list=ctrl1_genes,
    groups=ctrl1_states,
    min_frac_expr=0.01,
    min_fold_change=1.2,
    pseudocount=0.1,
    max_p=0.05,
)


In [None]:
# Run hf.find_markers on control set 2 (adata24c2), grouped by obs.state_v3p2.
ctrl2_thresholds = dict(min_frac_expr=0.01, min_fold_change=1.2, pseudocount=0.1, max_p=0.05)
marker_ctrl2_coarse = hf.find_markers(
    adata24c2.X,
    gene_list=np.array(adata24c2.var_names),
    groups=np.array(adata24c2.obs.state_v3p2),
    **ctrl2_thresholds,
)


In [None]:
# Run hf.find_markers on the 'emi'-treated cells (adata24e), grouped by
# obs.state_v3p2.
# The labels are read from adata24e.obs so they are aligned row-for-row with
# adata24e.X and the cell does not depend on any object (`adata_pr`) that is
# not created in this notebook.
# NOTE(review): assumes adata.obs carries state_v3p2 (the replicate-split
# cells for adata24c1 / adata24c2 rely on the same column) - confirm.
marker_emi_coarse = hf.find_markers(
    adata24e.X,
    gene_list=np.array(adata24e.var_names),
    groups=np.array(adata24e.obs.state_v3p2),
    min_frac_expr=0.01,
    min_fold_change=1.2,
    pseudocount=0.1,
    max_p=0.05,
)


In [None]:
# Run hf.find_markers on the cells whose treatment is 'perturbed' (adata24h),
# grouped by obs.state_v3p2.
# The labels are read from adata24h.obs so they are aligned row-for-row with
# adata24h.X and the cell does not depend on any object (`adata_pr`) that is
# not created in this notebook.
# NOTE(review): assumes adata.obs carries state_v3p2 (the replicate-split
# cells for adata24c1 / adata24c2 rely on the same column) - confirm.
marker_hua_coarse = hf.find_markers(
    adata24h.X,
    gene_list=np.array(adata24h.var_names),
    groups=np.array(adata24h.obs.state_v3p2),
    min_frac_expr=0.01,
    min_fold_change=1.2,
    pseudocount=0.1,
    max_p=0.05,
)


In [None]:
# Run hf.find_markers on the full adata6 object, grouped by obs.state_v3p2.
genes_6 = np.array(adata6.var_names)
states_6 = np.array(adata6.obs.state_v3p2)
marker_6_coarse = hf.find_markers(
    adata6.X,
    gene_list=genes_6,
    groups=states_6,
    min_frac_expr=0.01,
    min_fold_change=1.2,
    pseudocount=0.1,
    max_p=0.05,
)


In [None]:
# Run hf.find_markers on the control cells of adata8, grouped by obs.state_v3p2.
thresholds_8 = dict(min_frac_expr=0.01, min_fold_change=1.2, pseudocount=0.1, max_p=0.05)
marker_8_coarse = hf.find_markers(
    adata8c.X,
    gene_list=np.array(adata8c.var_names),
    groups=np.array(adata8c.obs.state_v3p2),
    **thresholds_8,
)


In [None]:

# Run hf.find_markers on the control cells of adata10, grouped by obs.state_v3p2.
genes_10 = np.array(adata10c.var_names)
states_10 = np.array(adata10c.obs.state_v3p2)
marker_10_coarse = hf.find_markers(
    adata10c.X,
    gene_list=genes_10,
    groups=states_10,
    min_frac_expr=0.01,
    min_fold_change=1.2,
    pseudocount=0.1,
    max_p=0.05,
)


In [None]:

# Run hf.find_markers on the control cells of adata14, grouped by obs.state_v3p2.
thresholds_14 = dict(min_frac_expr=0.01, min_fold_change=1.2, pseudocount=0.1, max_p=0.05)
marker_14_coarse = hf.find_markers(
    adata14c.X,
    gene_list=np.array(adata14c.var_names),
    groups=np.array(adata14c.obs.state_v3p2),
    **thresholds_14,
)


In [None]:

# Run hf.find_markers on the control cells of adata18, grouped by obs.state_v3p2.
genes_18 = np.array(adata18c.var_names)
states_18 = np.array(adata18c.obs.state_v3p2)
marker_18_coarse = hf.find_markers(
    adata18c.X,
    gene_list=genes_18,
    groups=states_18,
    min_frac_expr=0.01,
    min_fold_change=1.2,
    pseudocount=0.1,
    max_p=0.05,
)


In [None]:
# Run hf.find_markers on the control cells of adata21, grouped by obs.state_v3p2.
# (This cell only processes adata21c; it does not compare time points.)

thresholds_21 = dict(min_frac_expr=0.01, min_fold_change=1.2, pseudocount=0.1, max_p=0.05)
marker_21_coarse = hf.find_markers(
    adata21c.X,
    gene_list=np.array(adata21c.var_names),
    groups=np.array(adata21c.obs.state_v3p2),
    **thresholds_21,
)


In [None]:

# Write the five marker tables of the 24 perturbation dataset to CSV,
# in the same order as before: ctrl1, ctrl2, ctrl, emi, hua.
out_dir_24h = '../../../data/differentiation_continues/'
tables_24h = (
    (marker_ctrl1_coarse, 'marker_genes24h_ctrl1_coarse_grain.csv'),
    (marker_ctrl2_coarse, 'marker_genes24h_ctrl2_coarse_grain.csv'),
    (marker_ctrl_coarse, 'marker_genes24h_ctrl_coarse_grain.csv'),
    (marker_emi_coarse, 'marker_genes24h_emi_coarse_grain.csv'),
    (marker_hua_coarse, 'marker_genes24h_hua_coarse_grain.csv'),
)
for marker_table, csv_name in tables_24h:
    marker_table.to_csv(out_dir_24h + csv_name, sep=',')

In [None]:

# Write the marker tables of the adata6/8/10/14/18/21 datasets to CSV,
# one file per dataset, in the same order as before.
out_dir_tp = '../../../data/differentiation_continues/'
tables_tp = (
    (marker_6_coarse, 'marker_genes6_coarse_grain.csv'),
    (marker_8_coarse, 'marker_genes8_coarse_grain.csv'),
    (marker_10_coarse, 'marker_genes10_coarse_grain.csv'),
    (marker_14_coarse, 'marker_genes14_coarse_grain.csv'),
    (marker_18_coarse, 'marker_genes18_coarse_grain.csv'),
    (marker_21_coarse, 'marker_genes21_coarse_grain.csv'),
)
for marker_table, csv_name in tables_tp:
    marker_table.to_csv(out_dir_tp + csv_name, sep=',')
