# Pre-processing steps for SCENICplus, including:
1. Create a pycisTopic object to identify cis-regulatory topics.
2. Run pycisTarget to identify enriched motifs in the region sets found by pycisTopic.

## Create pycisTopic object

In [None]:
### SET UP WORKING ENVIRONMENT ##
import os
import warnings

# Install the filter before the heavy imports so that warnings raised while
# importing them are silenced as well.
warnings.simplefilter(action = 'ignore')
import pycisTopic
# A bare expression in the middle of a cell is never displayed; print it.
print(pycisTopic.__version__)
import numpy as np
import pandas as pd

projDir = 'YOURWORKDIR/'
# Keep a trailing separator on outDir: the cells below build their output
# paths as `outDir + 'file_name'`, which would otherwise produce names such
# as '.../outputCGS_models.pkl' next to (not inside) the output folder.
outDir = os.path.join(projDir, 'output', '')
os.makedirs(outDir, exist_ok = True)
tmpDir = 'YOURTMPDIR'

In [None]:
## Read the sparse count matrix together with its feature and barcode labels.
from scipy.io import mmread, mmwrite, mminfo

coo_mtx_path = 'matrix.mtx'
coo_mtx = mmread(coo_mtx_path)
# Convert the loaded matrix to CSR before handing it to pycisTopic.
mtx = coo_mtx.tocsr()

path_to_blacklist = 'mm10-blacklist.v2.bed'

# Both label files are read as string tables; the single column name is
# supplied here rather than taken from the file.
features = pd.read_csv(
    'features.csv',
    sep = ',',
    names = ['features'],
    dtype = 'str',
)
barcodes = pd.read_csv(
    'barcodes.csv',
    sep = ',',
    names = ['barcode'],
    dtype = 'str',
)

from pycisTopic.cistopic_class import *
# NOTE(review): presumably the matrix rows are regions and the columns are
# cells, matching region_names / cell_names -- confirm against the input.
cistopic_obj = create_cistopic_object(
    fragment_matrix = mtx,
    path_to_blacklist = path_to_blacklist,
    cell_names = barcodes['barcode'],
    region_names = features['features'],
)

## Adding cell information
cell_data = pd.read_csv('metadata.csv', sep = ',', dtype = 'str')
# The metadata rows are re-labelled positionally with the object's own cell
# index. NOTE(review): this assumes metadata.csv lists the cells in the same
# order as barcodes.csv -- confirm.
cell_data.index = cistopic_obj.cell_data.index
cistopic_obj.add_cell_data(cell_data)

## Run pycisTopic models

In [None]:
# Settings for the topic models; one model is fitted per entry in n_topics.
# NOTE(review): n_cpu = 200 is machine specific -- adjust to the cores
# actually available.
cgs_settings = {
    'n_topics': [2, 5, 10, 15, 20, 25],
    'n_cpu': 200,
    'n_iter': 200,
    'random_state': 555,
    'alpha': 50,
    'alpha_by_topic': True,
    'eta': 0.1,
    'eta_by_topic': False,
    'save_path': None,
    '_temp_dir': tmpDir,
}
models = run_cgs_models(cistopic_obj, **cgs_settings)

In [None]:
import pickle

# Persist every fitted model before selecting one.
models_path = outDir + 'CGS_models.pkl'
with open(models_path, 'wb') as handle:
    pickle.dump(models, handle)

# Metrics passed to the model evaluation; the evaluation plot is written to
# a PDF instead of being shown (plot_metrics = False).
selection_metrics = ['Arun_2010', 'Cao_Juan_2009', 'Minmo_2011', 'loglikelihood']
model = evaluate_models(
    models,
    select_model = None,
    return_model = True,
    metrics = selection_metrics,
    plot_metrics = False,
    save = outDir + 'model_selection.pdf',
)

In [None]:
# Attach the selected model to the object, then checkpoint the object.
cistopic_obj.add_LDA_model(model)

object_path = outDir + 'cistopicObject.pkl'
with open(object_path, 'wb') as handle:
    pickle.dump(cistopic_obj, handle)

## Clustering and visualization

In [None]:
import pickle

# Reload the checkpointed object. The `with` block closes the file even if
# unpickling raises. NOTE: only unpickle files you produced yourself.
with open(outDir + 'cistopicObject.pkl', 'rb') as infile:
    cistopic_obj = pickle.load(infile)

In [None]:
from pycisTopic.clust_vis import *

# Cluster the cells once per resolution. The plotting cell further down
# refers to the results as 'pycisTopic_leiden_15_<resolution>'.
cluster_resolutions = [0.3, 0.5, 0.7, 0.9]
find_clusters(
    cistopic_obj,
    target = 'cell',
    k = 15,
    res = cluster_resolutions,
    prefix = 'pycisTopic_',
    scale = True,
    split_pattern = '-',
)

In [None]:
# Compute the UMAP of the cells; it is plotted below under the name 'UMAP'.
run_umap(
    cistopic_obj,
    target = 'cell',
    scale = True,
)

In [None]:
# Folder for all visualisation output; exist_ok makes the cell safe to
# re-run and also creates missing parent folders.
visual_path = outDir + '/visualization'
os.makedirs(visual_path, exist_ok = True)

# Colour the UMAP by cell type and by each clustering resolution, one panel
# per variable.
plot_variables = [
    'CELLTYPE',
    'pycisTopic_leiden_15_0.9',
    'pycisTopic_leiden_15_0.7',
    'pycisTopic_leiden_15_0.5',
    'pycisTopic_leiden_15_0.3',
]
plot_metadata(cistopic_obj,
             reduction_name = 'UMAP',
             variables = plot_variables,
             target = 'cell',  num_columns = 1,
             text_size = 10, dot_size = 5,
             figsize = (15,5),
             # Reuse visual_path so the plot lands in the folder created above.
             save = visual_path + '/dimensionality_reduction_label.pdf')

In [None]:
import os

# Save the heatmap into the same `visualization` folder as the UMAP plots.
# os.path.join inserts the separator that `outDir + 'visualization/...'`
# was missing, and the folder is created if it does not exist yet.
heatmap_dir = os.path.join(outDir, 'visualization')
os.makedirs(heatmap_dir, exist_ok = True)
cell_topic_heatmap(cistopic_obj,
                  variables = ['CELLTYPE'],
                  scale = True,
                  legend_loc_x = 1.05,
                  legend_loc_y = -1.2,
                  legend_dist_y = -1,
                  figsize = (10, 10),
                  save = os.path.join(heatmap_dir, 'heatmap_topic_contr.pdf'))

In [None]:
# Checkpoint the object again, overwriting the earlier pickle.
object_path = outDir + 'cistopicObject.pkl'
with open(object_path, 'wb') as handle:
    pickle.dump(cistopic_obj, handle)

## Topic binarization & QC

In [None]:
import pickle

# Reload the checkpointed object. The `with` block closes the file even if
# unpickling raises. NOTE: only unpickle files you produced yourself.
with open(outDir + 'cistopicObject.pkl', 'rb') as infile:
    cistopic_obj = pickle.load(infile)

In [None]:
# exist_ok keeps the cell re-runnable; a plain os.mkdir raises
# FileExistsError on the second run.
os.makedirs(outDir + 'topic_binarization', exist_ok = True)
from pycisTopic.topic_binarization import *
# Binarize the topics with the 'otsu' method and save the plot as a PDF.
region_bin_topics = binarize_topics(
    cistopic_obj,
    method = 'otsu',
    ntop = 3000,
    plot = True,
    num_columns = 5,
    save = outDir + 'topic_binarization/otsu.pdf',
)

In [None]:
# Binarize again with target = 'cell', using the 'li' method; the plot is
# shown but not saved.
binarized_cell_topic = binarize_topics(
    cistopic_obj,
    target = 'cell',
    method = 'li',
    plot = True,
    num_columns = 5,
    nbins = 60,
)

In [None]:
## Compute topic QC metrics and draw them as a grid of scatter plots.
from pycisTopic.topic_qc import *
topic_qc_metrics = compute_topic_metrics(cistopic_obj)

# (dictionary key, x variable, y variable) for each panel; every panel is
# coloured by the Gini index.
qc_panels = [
    ('CoherenceVSAssignments', 'Coherence', 'Log10_Assignments'),
    ('AssignmentsVSCells_in_bin', 'Log10_Assignments', 'Cells_in_binarized_topic'),
    ('CoherenceVSCells_in_bin', 'Coherence', 'Cells_in_binarized_topic'),
    ('CoherenceVSRegions_in_bin', 'Coherence', 'Regions_in_binarized_topic'),
    ('CoherenceVSMarginal_dist', 'Coherence', 'Marginal_topic_dist'),
    ('CoherenceVSGini_index', 'Coherence', 'Gini_index'),
]
fig_dict = {
    panel_key: plot_topic_qc(
        topic_qc_metrics,
        var_x=x_var,
        var_y=y_var,
        var_color='Gini_index',
        plot=False,
        return_fig=True,
    )
    for panel_key, x_var, y_var in qc_panels
}

# Plot topic stats in one figure.
# NOTE(review): `plt` and `fig2img` are not imported in this cell; presumably
# they come in through one of the star imports -- confirm.
fig = plt.figure(figsize=(40, 43))
for position, panel_fig in enumerate(fig_dict.values(), start=1):
    plt.subplot(2, 3, position)
    # Each figure is converted to an image so they can be shown side by side.
    plt.imshow(fig2img(panel_fig))
    plt.axis('off')
plt.subplots_adjust(wspace=0, hspace=-0.70)
fig.savefig(outDir + 'topic_binarization/Topic_qc.pdf', bbox_inches='tight')
plt.show()

In [None]:
# Annotate the topics by CELLTYPE, using the binarized cell-topic result.
topic_annot = topic_annotation(
    cistopic_obj,
    annot_var='CELLTYPE',
    binarized_cell_topic=binarized_cell_topic,
    general_topic_thr = 0.2,
)

In [None]:
# Put the annotation columns in front of the QC metrics (column-wise concat).
# Re-running this cell adds the annotation columns a second time, because
# topic_qc_metrics is overwritten with the merged table.
annot_columns = ['CELLTYPE', 'Ratio_cells_in_topic', 'Ratio_group_in_population']
topic_qc_metrics = pd.concat(
    [topic_annot[annot_columns], topic_qc_metrics],
    axis=1,
)

In [None]:
# Save the QC table and both binarization results, one pickle each.
qc_outputs = (
    ('topic_binarization/Topic_qc_metrics_annot.pkl', topic_qc_metrics),
    ('topic_binarization/binarized_cell_topic.pkl', binarized_cell_topic),
    ('topic_binarization/binarized_topic_region.pkl', region_bin_topics),
)
for rel_path, payload in qc_outputs:
    with open(outDir + rel_path, 'wb') as f:
        pickle.dump(payload, f)

## Differentially Accessible Regions (DARs)

In [None]:
import pickle

# Reload the checkpointed object. The cell previously contained the load
# sequence twice, fused into the invalid line `infile.close()import pickle`;
# a single load is sufficient. The `with` block closes the file even if
# unpickling raises. NOTE: only unpickle files you produced yourself.
with open(outDir + 'cistopicObject.pkl', 'rb') as infile:
    cistopic_obj = pickle.load(infile)

In [None]:
from pycisTopic.diff_features import *

# Impute accessibility for every region name held by the object;
# selected_cells is left as None.
imputed_acc_obj = impute_accessibility(
    cistopic_obj,
    selected_cells=None,
    selected_regions=cistopic_obj.region_names,
    scale_factor=10**6,
)

In [None]:
# Normalize the imputed scores with a scale factor of 10**4.
normalized_imputed_acc_obj = normalize_scores(
    imputed_acc_obj,
    scale_factor=10**4,
)

In [None]:
import os

# The plot below and the pickles written later all go into the DARs folder,
# so it must exist before anything is saved. exist_ok keeps the cell
# re-runnable.
os.makedirs(outDir + 'DARs', exist_ok = True)
# Select variable regions from the normalized imputed accessibility and save
# the diagnostic plot.
variable_regions = find_highly_variable_features(normalized_imputed_acc_obj,
                                           min_disp = 0.05,
                                           min_mean = 0.0125,
                                           max_mean = 3,
                                           max_disp = np.inf,
                                           n_bins=20,
                                           n_top_features=None,
                                           plot=True,
                                           save= outDir + 'DARs/HVR_plot.pdf')

In [None]:
import os

# Test the variable regions for differential accessibility between CELLTYPE
# groups; contrasts is left as None.
markers_dict = find_diff_features(cistopic_obj,
                      imputed_acc_obj,
                      variable='CELLTYPE',
                      var_features=variable_regions,
                      contrasts=None,
                      adjpval_thr=0.05,
                      log2fc_thr=np.log2(1.5),
                      n_cpu=5,
                      # os.path.join keeps the folder inside tmpDir;
                      # `tmpDir + 'ray_spill'` produced a sibling such as
                      # 'YOURTMPDIRray_spill' when tmpDir has no trailing slash.
                      _temp_dir=os.path.join(tmpDir, 'ray_spill'),
                      split_pattern = '-')

In [None]:
# Report the number of entries per key of markers_dict. A plain loop
# replaces the list comprehension that was used only for its printing side
# effect and left `x` bound to a list of None.
for contrast_name, contrast_regions in markers_dict.items():
    print(contrast_name + ': ' + str(len(contrast_regions)))

In [None]:
# Save the imputed accessibility and the DAR dictionary, one pickle each.
dar_outputs = (
    ('DARs/Imputed_accessibility.pkl', imputed_acc_obj),
    ('DARs/DARs.pkl', markers_dict),
)
for rel_path, payload in dar_outputs:
    with open(outDir + rel_path, 'wb') as f:
        pickle.dump(payload, f)

## Run with pycisTarget

In [None]:
import pickle

# Reload the two inputs for pycisTarget. The `with` blocks close each file
# even if unpickling raises. NOTE: only unpickle files you produced yourself.
with open(outDir + 'topic_binarization/binarized_topic_region.pkl', 'rb') as infile:
    binarized_topic_region = pickle.load(infile)

with open(outDir + 'DARs/DARs.pkl', 'rb') as infile:
    DARs_dict = pickle.load(infile)

In [None]:
# Format the region sets and run pycisTarget on them.
# `os` is imported at the top of the cell because it is used before the
# pycisTarget call.
import os
import re
import pyranges as pr
from pycistarget.utils import *
from scenicplus.wrappers.run_pycistarget import *

# Build one PyRanges object per topic and per DAR set. Sets with no regions
# are skipped because they have no coordinates to convert.
region_sets = {}
region_sets['Topics'] = {
    topic: pr.PyRanges(region_names_to_coordinates(regions.index.tolist()))
    for topic, regions in binarized_topic_region.items()
    if len(regions.index) > 0
}
# Any run of non-alphanumeric characters in a DAR key is replaced by '_'.
region_sets['DARs'] = {
    re.sub('[^A-Za-z0-9]+', '_', contrast): pr.PyRanges(region_names_to_coordinates(dars.index.tolist()))
    for contrast, dars in DARs_dict.items()
    if len(dars.index) > 0
}

# Run pycistarget
# With run_without_promoters = True the methods are run on all regions and
# additionally on the region sets without promoters; it is disabled here.
savepath = outDir + 'pycisTarget/'
os.makedirs(savepath, exist_ok = True)

run_pycistarget(region_sets,
                ctx_db_path = 'mm10_screen_v10_clust.regions_vs_motifs.rankings.feather', ## download this file at aertslab's website
                species = 'mus_musculus',
                save_path = savepath,
                run_without_promoters = False,
                biomart_host = 'http://www.ensembl.org',
                promoter_space = 500,
                ctx_auc_threshold = 0.005,
                ctx_nes_threshold = 3.0,
                ctx_rank_threshold = 0.05,
                dem_log2fc_thr = 0.5,
                dem_motif_hit_thr = 3.0,
                dem_max_bg_regions = 500,
                n_cpu = 16,
                _temp_dir = tmpDir,
                ignore_reinit_error = True,
                path_to_motif_annotations = 'motifs-v10nr_clust-nr.mgi-m0.001-o0.0.tbl',
                annotation_version = 'v10nr_clust',
                annotation = ['Direct_annot', 'Orthology_annot'])