In [1]:
import dask
import dask.dataframe as dd
from scipy import stats
import os
import sys
import pandas as pd
import subprocess as sp
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import seaborn as sns
import shutil
import glob
import gimmemotifs
from pathlib import Path
import qnorm
from sklearn import datasets, linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, average_precision_score

%matplotlib inline

In [2]:
def hue_regplot(data, x, y, hue, palette=None, **kwargs):
    from matplotlib.cm import get_cmap
    
    regplots = []
    
    levels = data[hue].unique()
    
    if palette is None:
        default_colors = get_cmap('tab10')
        palette = {k: default_colors(i) for i, k in enumerate(levels)}
    
    for key in levels:
        regplots.append(
            sns.regplot(
                x=x,
                y=y,
                fit_reg=False, 
                data=data[data[hue] == key],
                color=palette[key],
                **kwargs
            )
        )
    
    return regplots

In [3]:
output_dir = '/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/outs_v1'
Path(f"{output_dir}").mkdir(parents=True, exist_ok=True)       

sample_data_file = '/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/Cell_types_files_v1_esccomp.csv'

#load the genome info
genome_path_size = "/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/genome/hg38/hg38.fa.sizes"
genome_path = "/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/genome/hg38/hg38.fa"

In [3]:
# specifiy files for network generation intra comparison stromal and epi
output_dir = '/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_22042022/outs_v1'
Path(f"{output_dir}").mkdir(parents=True, exist_ok=True)       

sample_data_file = '/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE/Cell_types_files_v1_intracomp.csv'

#load the genome info
genome_path_size = "/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/genome/hg38/hg38.fa.sizes"
genome_path = "/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/genome/hg38/hg38.fa"

In [4]:
#Lets loop over all the comparisons Ananse needs to make:
sample_data = pd.read_table(sample_data_file, 
                            sep = ',', comment = '#')
sample_data

Unnamed: 0,cell_type,Accesibility_peakfiles,Merged_peakfiles,scATAC_BAMfiles,TPM_matrix,compare_with,count_table_files
0,epi,/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data...,unused,/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data...,/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data...,stromal,/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data...
1,stromal,/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data...,unused,/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data...,/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data...,epi,/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data...


# Ananse binding

In [5]:
#lets first perform motif enrichment
if not os.path.exists(f"{output_dir}/scan_results_acc_v4.scanfile"): 
    sp.check_call(f'nice -15 gimme scan '
        f'-T -N 4 '
        f'-g hg38 '
        f'/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_11032022/ATAC_qquant/tmp/mergedpeaks3.bed '
        f'> {output_dir}/scan_results_acc_v4.scanfile',shell = True)

In [10]:
# lets do ananse binding for your cell populations
i = 0
for index, cell_type in sample_data.iterrows():
    cell_id = sample_data.iloc[index,0]
    print(cell_id)
    Path(f"{output_dir}/{cell_id}").mkdir(parents=True, exist_ok=True)       
    bam_file = sample_data.iloc[index,3]
    
    if not os.path.exists(f"{output_dir}/{cell_id}/binding.h5"): 
        print(f'running ananse binding for celltype {cell_id} using the scATAC bamfile {bam_file} with the ATAC merged peaks of all joined peaks')
        sp.check_call(f'ananse binding '
            f'-A {bam_file} '
            f'-o {output_dir}/{cell_id}/ '
            f'--jaccard-cutoff 0.1 '
            f'-g hg38 '
            f'-r /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_11032022/ATAC_qquant/tmp/mergedpeaks3.bed '
            f'--pfmscorefile /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/outs_v1/scan_results_acc_v4.scanfile '
            f'-n 1 '    
            f'2> {output_dir}/{cell_id}/ananse_binding_log.txt',shell = True)
        
#

LSC
LESC
CE
Cj
LE
CF
CSSC


In [None]:
# lets do ananse binding for your cell populations now without the pfmscorefile and only the narrowpeak
i = 0
for index, cell_type in sample_data.iterrows():
    cell_id = sample_data.iloc[index,0]
    print(cell_id)
    Path(f"{output_dir}/{cell_id}/narrow").mkdir(parents=True, exist_ok=True)       
    bam_file = sample_data.iloc[index,3]
    peak_file = sample_data.iloc[index,1]
    
    if not os.path.exists(f"{output_dir}/{cell_id}/narrow/binding.h5"): 
        print(f'running ananse binding for celltype {cell_id} using the scATAC bamfile {bam_file} with the narrowpeakfile {peak_file}')
        sp.check_call(f'ananse binding '
            f'-A {bam_file} '
            f'-o {output_dir}/{cell_id}/narrow/ '
            f'--jaccard-cutoff 0.1 '
            f'-g hg38 '
            f'-r {peak_file} '
            f'-n 1 '    
            f'2> {output_dir}/{cell_id}/narrow/ananse_binding_log_narrow.txt',shell = True)
        
#

epi
running ananse binding for celltype epi using the scATAC bamfile /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/ATAC_bam/epi/outs/subsets/epi.bam with the narrowpeakfile /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/ATAC_bam/epi/outs/subsets/epi.narrowPeak
stromal
running ananse binding for celltype stromal using the scATAC bamfile /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/ATAC_bam/stromal/outs/subsets/stromal.bam with the narrowpeakfile /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/ATAC_bam/stromal/outs/subsets/stromal.narrowPeak


In [9]:
# generating the binding.h5 file for ESC data ananse develop 0.3.0
Path(f"{output_dir}/ESC").mkdir(parents=True, exist_ok=True)       
if not os.path.exists(f"{output_dir}/ESC/binding.h5"): 
    sp.check_call(f'ananse binding '
        f'-H /ceph/rimlsfnwi/data/moldevbio/zhou/jsmits/Ananse_test_data/H3K27ac/results/final_bam/hg38-GSM466732.samtools-coordinate.bam /ceph/rimlsfnwi/data/moldevbio/zhou/jsmits/Ananse_test_data/H3K27ac/results/final_bam/hg38-GSM663427.samtools-coordinate.bam '
        f'-r /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/ESC_ANANSE/all_peaks.tsv '
        f'-o {output_dir}/ESC '
        f'--jaccard-cutoff 0.1 '
        f'-g hg38 '
        f'-n 4 '   
        f'2> {output_dir}/ESC/ananse_binding_log.txt',shell = True)

# Ananse network

Generate GRN's based on the binding.tsv output and RNAseq expression

In [12]:
for index, cell_type in sample_data.iterrows():
    cell_id = sample_data.iloc[index,0]
    print(cell_id)
    Path(f"{output_dir}/{cell_id}").mkdir(parents=True, exist_ok=True)  
    binding_file = f"/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/outs_v1/{cell_id}/binding.h5"
    print(f'running ananse binding using the binding file {binding_file}')
    file_name = sample_data.iloc[index,4]
    print(f'together with the gene expression file {file_name}')
    print(f'and the output file full_network_includeprom.txt')
    
    if not os.path.exists(f"{output_dir}/{cell_id}/full_network_includeprom.txt"): 
        sp.check_call(f'nice -15 ananse network '
            f' {binding_file} '
            f'-e {file_name} '
            f'-o {output_dir}/{cell_id}/full_network_includeprom.txt '
            f'--full-output '
            f'-n 1 '
            f'2> {output_dir}/{cell_id}/ananse_network_log.txt',shell = True)
        
#             f'-a /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/genome/genomehg38gimme/hg38/hg38.annotation.gtf.gz ' Check if this can be re-run with all tfs given the right name

LSC
running ananse binding using the binding file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/outs_v1/LSC/binding.h5
together with the gene expression file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/RNA_CPM/LSC_cpm.tsv
and the output file full_network_includeprom.txt
LESC
running ananse binding using the binding file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/outs_v1/LESC/binding.h5
together with the gene expression file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/RNA_CPM/LESC_cpm.tsv
and the output file full_network_includeprom.txt
CE
running ananse binding using the binding file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/outs_v1/CE/binding.h5
together with the gene expression file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/RNA_CPM/CE_cpm.tsv
and the output file full_network_includeprom.txt
Cj
running ananse binding using the binding file /ceph/rimlsfnwi/da

In [6]:
# Generate networks from Narrow peak binding files
for index, cell_type in sample_data.iterrows():
    cell_id = sample_data.iloc[index,0]
    print(cell_id)
    Path(f"{output_dir}/{cell_id}").mkdir(parents=True, exist_ok=True)  
    binding_file = f"{output_dir}/{cell_id}/narrow/binding.h5"
    print(f'running ananse binding using the binding file {binding_file}')
    file_name = sample_data.iloc[index,4]
    print(f'together with the gene expression file {file_name}')
    print(f'and the output file full_network_includeprom.txt')
    
    if not os.path.exists(f"{output_dir}/{cell_id}/narrow/full_network_includeprom.txt"): 
        sp.check_call(f'nice -15 ananse network '
            f' {binding_file} '
            f'-e {file_name} '
            f'-o {output_dir}/{cell_id}/narrow/full_network_includeprom.txt '
            f'--full-output '
            f'-n 1 '
            f'2> {output_dir}/{cell_id}/narrow/ananse_network_log.txt',shell = True)
        
#       

epi
running ananse binding using the binding file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_22042022/outs_v1/epi/narrow/binding.h5
together with the gene expression file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE/RNA_intra_cpm/20220421/RNA_CPM/epi_cpm.tsv
and the output file full_network_includeprom.txt
stromal
running ananse binding using the binding file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_22042022/outs_v1/stromal/narrow/binding.h5
together with the gene expression file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE/RNA_intra_cpm/20220421/RNA_CPM/stromal_cpm.tsv
and the output file full_network_includeprom.txt


In [13]:
# generate the ESC network file from the binding.h5 file
binding_file = f"{output_dir}/ESC/binding.h5"
print(f'running ananse binding using the enhancer file {binding_file}')
file_name = "/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/RNA_CPM/ESC_cpm.tsv"
print(f'together with the CPM file {file_name}')
print(f'and the output file full_network_includeprom.txt')
    
if not os.path.exists(f"{output_dir}/ESC/full_network_includeprom.txt"): 
    sp.check_call(f'nice -15 ananse network '
        f'{binding_file} '
        f'-e {file_name} '
        f'-o {output_dir}/ESC/full_network_includeprom.txt '
        f'--full-output '
        f'-g hg38 '
        f'-n 1 '
        f'2> {output_dir}/ESC/ananse_network_log.txt',shell = True)

running ananse binding using the enhancer file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/outs_v1/ESC/binding.h5
together with the CPM file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/RNA_CPM/ESC_cpm.tsv
and the output file full_network_includeprom.txt


# Ananse Influence

Lets finally run Ananse influence to predict key TFs that differ between cell types.
it will compare each network with the networks listed in the 'comparison' column

In [6]:
# NARROW test
# cell population of interest against only ESC data
sample_data_file = '/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/Cell_types_files_v1_esccomp.csv'
#Lets loop over all the comparisons Ananse needs to make:
sample_data = pd.read_table(sample_data_file, 
                            sep = ',', comment = '#')

for index, cell_type in sample_data.iterrows():
    cell_id = sample_data.iloc[index,0]
    print(cell_id)
    network_file = f"{output_dir}/{cell_id}/narrow/full_network_includeprom.txt"
    print(f'running ananse influence using the network file {network_file}')
    net2 = "/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/outs_v1/ESC/full_network_includeprom.txt"
    print(f'compared to the network file {net2}')
    Path(f"{output_dir}/{cell_id}/narrow/ESC_to_{cell_id}_influence_500000_0401").mkdir(parents=True, exist_ok=True) 
    file_name = sample_data.iloc[index,6]
    print(f'together with DEG file {file_name} and the output file influence.txt')

    if not os.path.exists(f"{output_dir}/{cell_id}/narrow/ESC_to_{cell_id}_influence_500000_0401/ANANSE_influence.tsv"):
        sp.check_call(f'nice -15 ananse influence '
            f'-t {network_file} '
            f'-s {net2} '
            f'-d {file_name} '
            f'--select-after-join '
            f'-i 500000 '
            f'--full-output '    
            f'-o {output_dir}/{cell_id}/narrow/ESC_to_{cell_id}_influence_500000_0401/ANANSE_influence.tsv '
            f'-n 1 '
            f'2> {output_dir}/{cell_id}/narrow/ananse_influence_log_ESC_0401.txt',shell = True)

LSC
running ananse influence using the network file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/outs_v1/LSC/narrow/full_network_includeprom.txt
compared to the network file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/outs_v1/ESC/full_network_includeprom.txt
together with DEG file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE/RNA_DE/20220401/LSC_ESC_pseudobulkpadj.tsv and the output file influence.txt
LESC
running ananse influence using the network file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/outs_v1/LESC/narrow/full_network_includeprom.txt
compared to the network file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/outs_v1/ESC/full_network_includeprom.txt
together with DEG file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE/RNA_DE/20220401/LESC_ESC_pseudobulkpadj.tsv and the output file influence.txt
CE
running ananse influence using the network file /ceph/rimlsfnwi/data/moldevbio/zho

In [7]:
# Compare cell_types against other cells Narrow (intra epi-stromal comp)
for index, cell_type in sample_data.iterrows(): 
    cell_id = sample_data.iloc[index,0]
    comparisons = sample_data.compare_with[index]
    network_file = f"{output_dir}/{cell_id}/narrow/full_network_includeprom.txt"
    print(f'running ananse influence using the network file {network_file}')
    net2 = f"{output_dir}/{comparisons}/narrow/full_network_includeprom.txt"
    print(f'compared to the network file {net2}')
    Path(f"{output_dir}/{cell_id}/narrow/{comparisons}_to_{cell_id}_influence_250000_2604").mkdir(parents=True, exist_ok=True) 
    file_name = sample_data.iloc[index,6]
    print(f'together with DEG file {file_name} and the output file influence.txt')

    if not os.path.exists(f"{output_dir}/{cell_id}/narrow/{comparisons}_to_{cell_id}_influence_250000_2604/ANANSE_influence.tsv"):
        sp.check_call(f'nice -15 ananse influence '
            f'-t {network_file} '
            f'-s {net2} '
            f'-d {file_name} '
            f'--select-after-join '
            f'-i 250000 '
            f'--full-output '    
            f'-o {output_dir}/{cell_id}/narrow/{comparisons}_to_{cell_id}_influence_250000_2604/ANANSE_influence.tsv '
            f'-n 1 '
            f'2> {output_dir}/{cell_id}/narrow/ananse_influence_log_intra_2604.txt',shell = True)

running ananse influence using the network file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_22042022/outs_v1/epi/narrow/full_network_includeprom.txt
compared to the network file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_22042022/outs_v1/stromal/narrow/full_network_includeprom.txt
together with DEG file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE/RNA_intra_cpm/20220421/epi_stromal_pseudobulkpadj.tsv and the output file influence.txt
running ananse influence using the network file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_22042022/outs_v1/stromal/narrow/full_network_includeprom.txt
compared to the network file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_22042022/outs_v1/epi/narrow/full_network_includeprom.txt
together with DEG file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE/RNA_intra_cpm/20220421/stromal_epi_pseudobulkpadj.tsv and the output file influence.txt


In [8]:
# Compare cell_types against epi or stromal Narrow (INTER epi-stromal comp)
# cell population of interest against only ESC data
sample_data_file = '/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_22042022/Cell_types_files_v1_intercomp.csv'
#Lets loop over all the comparisons Ananse needs to make:
sample_data = pd.read_table(sample_data_file, 
                            sep = ',', comment = '#')

for index, cell_type in sample_data.iterrows(): 
    cell_id = sample_data.iloc[index,0]
    comparisons = sample_data.compare_with[index]
    network_file = f"/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/outs_v1/{cell_id}/narrow/full_network_includeprom.txt"
    print(f'running ananse influence using the network file {network_file}')
    net2 = f"{output_dir}/{comparisons}/narrow/full_network_includeprom.txt"
    print(f'compared to the network file {net2}')
    Path(f"{output_dir}/{cell_id}/narrow/{comparisons}_to_{cell_id}_influence_250000_2604").mkdir(parents=True, exist_ok=True) 
    file_name = sample_data.iloc[index,6]
    print(f'together with DEG file {file_name} and the output file influence.txt')

    if not os.path.exists(f"{output_dir}/{cell_id}/narrow/{comparisons}_to_{cell_id}_influence_250000_2604/ANANSE_influence.tsv"):
        sp.check_call(f'nice -15 ananse influence '
            f'-t {network_file} '
            f'-s {net2} '
            f'-d {file_name} '
            f'--select-after-join '
            f'-i 250000 '
            f'--full-output '    
            f'-o {output_dir}/{cell_id}/narrow/{comparisons}_to_{cell_id}_influence_250000_2604/ANANSE_influence.tsv '
            f'-n 1 '
            f'2> {output_dir}/{cell_id}/narrow/ananse_influence_log_intra_2604.txt',shell = True)

running ananse influence using the network file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/outs_v1/LSC/narrow/full_network_includeprom.txt
compared to the network file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_22042022/outs_v1/stromal/narrow/full_network_includeprom.txt
together with DEG file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE/RNA_intra_cpm/20220421/LSC_stromal_pseudobulkpadj.tsv and the output file influence.txt
running ananse influence using the network file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/outs_v1/LESC/narrow/full_network_includeprom.txt
compared to the network file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_22042022/outs_v1/stromal/narrow/full_network_includeprom.txt
together with DEG file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE/RNA_intra_cpm/20220421/LESC_stromal_pseudobulkpadj.tsv and the output file influence.txt
running ananse influence using the network file /c

In [13]:
# Generate union_file for no sel after join
all_interactions_set = set()
Path(f"{output_dir}/influence/").mkdir(parents=True, exist_ok=True) 
interaction_file = f'{output_dir}/influence/ESC_all_interactions_0604_500000.txt'
            
for index, cell_type in sample_data.iterrows():
    cell_id = sample_data.iloc[index,0]
    diff_network = f'{output_dir}/{cell_id}/narrow/ESC_to_{cell_id}_influence_500000_0401/ANANSE_influence_diffnetwork.tsv'
    rnet = pd.read_csv(
                    diff_network,
                    sep="\t",
                    dtype="float64",
                    converters={"source": str,
                               "target": str},
                )
    rnet['source_target'] = rnet['source'] + "—" + rnet['target']
    rnet_set = set(rnet['source_target'])
    all_interactions_set.update(rnet_set)
    print(len(all_interactions_set))

interaction_file = f'{output_dir}/influence/ESC_all_interactions_0604_500000.txt'
interaction_df = pd.DataFrame(data=list(all_interactions_set))
interaction_df.to_csv(interaction_file, index=False, header = False)

500000
760668
998967
1217553
1363376
1514422
1619553


In [9]:
# NARROW peak file networks with no sel after join
# cell population of interest against only ESC data
sample_data_file = '/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/Cell_types_files_v1_esccomp.csv'
#Lets loop over all the comparisons Ananse needs to make:
sample_data = pd.read_table(sample_data_file, 
                            sep = ',', comment = '#')

for index, cell_type in sample_data.iterrows():
    cell_id = sample_data.iloc[index,0]
    print(cell_id)
    network_file = f"{output_dir}/{cell_id}/narrow/full_network_includeprom.txt"
    print(f'running ananse influence using the network file {network_file}')
    net2 = "/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/outs_v1/ESC/full_network_includeprom.txt"
    print(f'compared to the network file {net2}')
    Path(f"{output_dir}/{cell_id}/narrow/no_sel_after_join_0604/ESC_to_{cell_id}_influence_500000").mkdir(parents=True, exist_ok=True) 
    file_name = sample_data.iloc[index,6]
    print(f'together with DEG file {file_name} and the output file influence.txt')

    if not os.path.exists(f"{output_dir}/{cell_id}/narrow/no_sel_after_join_0604/ESC_to_{cell_id}_influence_500000/ANANSE_influence.tsv"):
        sp.check_call(f'nice -15 ananse influence '
            f'-t {network_file} '
            f'-s {net2} '
            f'-d {file_name} '
            f'-i 500000 '
            f'--full-output '    
            f'-o {output_dir}/{cell_id}/narrow/no_sel_after_join_0604/ESC_to_{cell_id}_influence_500000/ANANSE_influence.tsv '
            f'-n 1 '
            f'2> {output_dir}/{cell_id}/narrow/no_sel_after_join_0604/ananse_influence_log_ESC.txt',shell = True)

LSC
running ananse influence using the network file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/outs_v1/LSC/narrow/full_network_includeprom.txt
compared to the network file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/outs_v1/ESC/full_network_includeprom.txt
together with DEG file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE/RNA_DE/20220401/LSC_ESC_pseudobulkpadj.tsv and the output file influence.txt
LESC
running ananse influence using the network file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/outs_v1/LESC/narrow/full_network_includeprom.txt
compared to the network file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/outs_v1/ESC/full_network_includeprom.txt
together with DEG file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE/RNA_DE/20220401/LESC_ESC_pseudobulkpadj.tsv and the output file influence.txt
CE
running ananse influence using the network file /ceph/rimlsfnwi/data/moldevbio/zho

In [11]:
# Generate union_file for no sel after join
all_interactions_set = set()
Path(f"{output_dir}/influence/").mkdir(parents=True, exist_ok=True) 
interaction_file = f'{output_dir}/influence/ESC_all_interactions_0604_500000_no_sel.txt'
            
for index, cell_type in sample_data.iterrows():
    cell_id = sample_data.iloc[index,0]
    diff_network = f'{output_dir}/{cell_id}/narrow/no_sel_after_join_0604/ESC_to_{cell_id}_influence_500000/ANANSE_influence_diffnetwork.tsv'
    rnet = pd.read_csv(
                    diff_network,
                    sep="\t",
                    dtype="float64",
                    converters={"source": str,
                               "target": str},
                )
    rnet['source_target'] = rnet['source'] + "—" + rnet['target']
    rnet_set = set(rnet['source_target'])
    all_interactions_set.update(rnet_set)
    print(len(all_interactions_set))

interaction_file = f'{output_dir}/influence/ESC_all_interactions_0604_500000_no_sel.txt'
interaction_df = pd.DataFrame(data=list(all_interactions_set))
interaction_df.to_csv(interaction_file, index=False, header = False)

448666
648352
850195
1018480
1122872
1259717
1371514


In [15]:
# NARROW peak files with less edges: 250000; perhaps less noise and still more relevant factors?
# cell population of interest against only ESC data
sample_data_file = '/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/Cell_types_files_v1_esccomp.csv'
#Lets loop over all the comparisons Ananse needs to make:
sample_data = pd.read_table(sample_data_file, 
                            sep = ',', comment = '#')

for index, cell_type in sample_data.iterrows():
    cell_id = sample_data.iloc[index,0]
    print(cell_id)
    network_file = f"{output_dir}/{cell_id}/narrow/full_network_includeprom.txt"
    print(f'running ananse influence using the network file {network_file}')
    net2 = "/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/outs_v1/ESC/full_network_includeprom.txt"
    print(f'compared to the network file {net2}')
    Path(f"{output_dir}/{cell_id}/narrow/ESC_to_{cell_id}_influence_250000_0604").mkdir(parents=True, exist_ok=True) 
    file_name = sample_data.iloc[index,6]
    print(f'together with DEG file {file_name} and the output file influence.txt')

    if not os.path.exists(f"{output_dir}/{cell_id}/narrow/ESC_to_{cell_id}_influence_250000_0604/ANANSE_influence.tsv"):
        sp.check_call(f'nice -15 ananse influence '
            f'-t {network_file} '
            f'-s {net2} '
            f'-d {file_name} '
            f'--select-after-join '
            f'-i 250000 '
            f'--full-output '    
            f'-o {output_dir}/{cell_id}/narrow/ESC_to_{cell_id}_influence_250000_0604/ANANSE_influence.tsv '
            f'-n 1 '
            f'2> {output_dir}/{cell_id}/narrow/ananse_influence_log_ESC_0604_250.txt',shell = True)

LSC
running ananse influence using the network file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/outs_v1/LSC/narrow/full_network_includeprom.txt
compared to the network file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/outs_v1/ESC/full_network_includeprom.txt
together with DEG file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE/RNA_DE/20220401/LSC_ESC_pseudobulkpadj.tsv and the output file influence.txt
LESC
running ananse influence using the network file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/outs_v1/LESC/narrow/full_network_includeprom.txt
compared to the network file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/outs_v1/ESC/full_network_includeprom.txt
together with DEG file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE/RNA_DE/20220401/LESC_ESC_pseudobulkpadj.tsv and the output file influence.txt
CE
running ananse influence using the network file /ceph/rimlsfnwi/data/moldevbio/zho

In [5]:
# NARROW peak files with less edges: 100000; perhaps less noise and still more relevant factors seems like it based on 250.000
# cell population of interest against only ESC data
sample_data_file = '/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/Cell_types_files_v1_esccomp.csv'
#Lets loop over all the comparisons Ananse needs to make:
sample_data = pd.read_table(sample_data_file, 
                            sep = ',', comment = '#')

for index, cell_type in sample_data.iterrows():
    cell_id = sample_data.iloc[index,0]
    print(cell_id)
    network_file = f"{output_dir}/{cell_id}/narrow/full_network_includeprom.txt"
    print(f'running ananse influence using the network file {network_file}')
    net2 = "/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/outs_v1/ESC/full_network_includeprom.txt"
    print(f'compared to the network file {net2}')
    Path(f"{output_dir}/{cell_id}/narrow/ESC_to_{cell_id}_influence_100000_0804").mkdir(parents=True, exist_ok=True) 
    file_name = sample_data.iloc[index,6]
    print(f'together with DEG file {file_name} and the output file influence.txt')

    if not os.path.exists(f"{output_dir}/{cell_id}/narrow/ESC_to_{cell_id}_influence_100000_0804/ANANSE_influence.tsv"):
        sp.check_call(f'nice -15 ananse influence '
            f'-t {network_file} '
            f'-s {net2} '
            f'-d {file_name} '
            f'--select-after-join '
            f'-i 100000 '
            f'--full-output '    
            f'-o {output_dir}/{cell_id}/narrow/ESC_to_{cell_id}_influence_100000_0804/ANANSE_influence.tsv '
            f'-n 1 '
            f'2> {output_dir}/{cell_id}/narrow/ananse_influence_log_ESC_0804_100.txt',shell = True)

LSC
running ananse influence using the network file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/outs_v1/LSC/narrow/full_network_includeprom.txt
compared to the network file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/outs_v1/ESC/full_network_includeprom.txt
together with DEG file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE/RNA_DE/20220401/LSC_ESC_pseudobulkpadj.tsv and the output file influence.txt
LESC
running ananse influence using the network file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/outs_v1/LESC/narrow/full_network_includeprom.txt
compared to the network file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/outs_v1/ESC/full_network_includeprom.txt
together with DEG file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE/RNA_DE/20220401/LESC_ESC_pseudobulkpadj.tsv and the output file influence.txt
CE
running ananse influence using the network file /ceph/rimlsfnwi/data/moldevbio/zho

In [14]:
# cell population of interest against only ESC data
sample_data_file = '/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/Cell_types_files_v1_esccomp.csv'
#Lets loop over all the comparisons Ananse needs to make:
sample_data = pd.read_table(sample_data_file, 
                            sep = ',', comment = '#')

for index, cell_type in sample_data.iterrows():
    cell_id = sample_data.iloc[index,0]
    print(cell_id)
    network_file = f"{output_dir}/{cell_id}/full_network_includeprom.txt"
    print(f'running ananse influence using the network file {network_file}')
    net2 = "/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/outs_v1/ESC/full_network_includeprom.txt"
    print(f'compared to the network file {net2}')
    Path(f"{output_dir}/{cell_id}/ESC_to_{cell_id}_influence_500000_0401").mkdir(parents=True, exist_ok=True) 
    file_name = sample_data.iloc[index,6]
    print(f'together with DEG file {file_name} and the output file influence.txt')

    if not os.path.exists(f"{output_dir}/{cell_id}/ESC_to_{cell_id}_influence_500000_0401/ANANSE_influence.tsv"):
        sp.check_call(f'nice -15 ananse influence '
            f'-t {network_file} '
            f'-s {net2} '
            f'-d {file_name} '
            f'--select-after-join '
            f'-i 500000 '
            f'--full-output '    
            f'-o {output_dir}/{cell_id}/ESC_to_{cell_id}_influence_500000_0401/ANANSE_influence.tsv '
            f'-n 1 '
            f'2> {output_dir}/{cell_id}/ananse_influence_log_ESC_0401.txt',shell = True)

LSC
running ananse influence using the network file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/outs_v1/LSC/full_network_includeprom.txt
compared to the network file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/outs_v1/ESC/full_network_includeprom.txt
together with DEG file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE/RNA_DE/20220401/LSC_ESC_pseudobulkpadj.tsv and the output file influence.txt
LESC
running ananse influence using the network file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/outs_v1/LESC/full_network_includeprom.txt
compared to the network file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/outs_v1/ESC/full_network_includeprom.txt
together with DEG file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE/RNA_DE/20220401/LESC_ESC_pseudobulkpadj.tsv and the output file influence.txt
CE
running ananse influence using the network file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/s

In [15]:
all_interactions_set = set()
Path(f"{output_dir}/influence/").mkdir(parents=True, exist_ok=True) 
interaction_file = f'{output_dir}/influence/ESC_all_interactions_0401_500000.txt'
            
for index, cell_type in sample_data.iterrows():
    cell_id = sample_data.iloc[index,0]
    diff_network = f'{output_dir}/{cell_id}/ESC_to_{cell_id}_influence_500000_0401/ANANSE_influence_diffnetwork.tsv'
    rnet = pd.read_csv(
                    diff_network,
                    sep="\t",
                    dtype="float64",
                    converters={"source": str,
                               "target": str},
                )
    rnet['source_target'] = rnet['source'] + "—" + rnet['target']
    rnet_set = set(rnet['source_target'])
    all_interactions_set.update(rnet_set)
    print(len(all_interactions_set))

interaction_file = f'{output_dir}/influence/ESC_all_interactions_0401_500000.txt'
interaction_df = pd.DataFrame(data=list(all_interactions_set))
interaction_df.to_csv(interaction_file, index=False, header = False)

500000
721884
870189
982089
1138593
1342377
1514043


In [None]:
# test dif interactions inbetween corneal populations
    if not os.path.exists(f"{output_dir}/{cell_id}/ESC_to_{cell_id}_influence_500000_0401/ANANSE_influence.tsv"):
        sp.check_call(f'nice -15 ananse influence '
            f'-t /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/outs_v1/LSC/full_network_includeprom.txt '
            f'-s /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/outs_v1/CF/full_network_includeprom.txt '
            f'-d /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE/RNA_DE/20220401/ '
            f'--select-after-join '
            f'-i 500000 '
            f'--full-output '    
            f'-o {output_dir}/{cell_id}/ESC_to_{cell_id}_influence_500000_0401/ANANSE_influence.tsv '
            f'-n 1 '
            f'2> {output_dir}/{cell_id}/ananse_influence_log_ESC_0401.txt',shell = True)

In [16]:
# cell population of interest against only ESC data 200.000 edges diffnetwork
sample_data_file = '/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_11032022/Cell_types_files_v1_esccomp.csv'
#Lets loop over all the comparisons Ananse needs to make:
sample_data = pd.read_table(sample_data_file, 
                            sep = ',', comment = '#')

for index, cell_type in sample_data.iterrows():
    cell_id = sample_data.iloc[index,0]
    print(cell_id)
    network_file = f"/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_11032022/outs_v1/{cell_id}/full_network_includeprom.txt"
    print(f'running ananse influence using the network file {network_file}')
    net2 = "/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_11032022/outs_v1/ESC/full_network_includeprom.txt"
    print(f'compared to the network file {net2}')
    Path(f"{output_dir}/{cell_id}/ESC_to_{cell_id}_influence_200000_3103").mkdir(parents=True, exist_ok=True) 
    file_name = sample_data.iloc[index,6]
    print(f'together with DEG file {file_name} and the output file influence.txt')

    if not os.path.exists(f"{output_dir}/{cell_id}/ESC_to_{cell_id}_influence_200000_3103/ANANSE_influence.tsv"):
        sp.check_call(f'nice -15 ananse influence '
            f'-t {network_file} '
            f'-s {net2} '
            f'-d {file_name} '
            f'--select-after-join '
            f'-i 200000 '
            f'--full-output '    
            f'-o {output_dir}/{cell_id}/ESC_to_{cell_id}_influence_200000_3103/ANANSE_influence.tsv '
            f'-n 1 '
            f'2> {output_dir}/{cell_id}/ananse_influence_log_ESC_3103.txt',shell = True)

LSC
running ananse influence using the network file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_11032022/outs_v1/LSC/full_network_includeprom.txt
compared to the network file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_11032022/outs_v1/ESC/full_network_includeprom.txt
together with DEG file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_16032022/RNA_DE/20220316/LSC_ESC_pseudobulkpadj.tsv and the output file influence.txt
LESC
running ananse influence using the network file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_11032022/outs_v1/LESC/full_network_includeprom.txt
compared to the network file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_11032022/outs_v1/ESC/full_network_includeprom.txt
together with DEG file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_16032022/RNA_DE/20220316/LESC_ESC_pseudobulkpadj.tsv and the output file influence.txt
CE
running ananse influence using the network file /ceph/rimlsfnwi/data/moldevbio

In [12]:
all_interactions_set = set()
Path(f"{output_dir}/influence/").mkdir(parents=True, exist_ok=True) 
interaction_file = f'{output_dir}/influence/ESC_all_interactions.txt'
            
for index, cell_type in sample_data.iterrows():
    cell_id = sample_data.iloc[index,0]
    diff_network = f'{output_dir}/{cell_id}/ESC_to_{cell_id}_influence_500000_3003/ANANSE_influence_diffnetwork.tsv'
    rnet = pd.read_csv(
                    diff_network,
                    sep="\t",
                    dtype="float64",
                    converters={"source": str,
                               "target": str},
                )
    rnet['source_target'] = rnet['source'] + "—" + rnet['target']
    rnet_set = set(rnet['source_target'])
    all_interactions_set.update(rnet_set)
    print(len(all_interactions_set))

interaction_file = f'{output_dir}/influence/ESC_all_interactions_3003.txt'
interaction_df = pd.DataFrame(data=list(all_interactions_set))
interaction_df.to_csv(interaction_file, index=False, header = False)

500000
801096
969015
1152289
1253554
1400970
1508259


In [16]:
# Rerun against ESC with interaction file for a more direct comparison

# cell population of interest against only ESC data
sample_data_file = '/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_11032022/Cell_types_files_v1_esccomp.csv'
#Lets loop over all the comparisons Ananse needs to make:
sample_data = pd.read_table(sample_data_file, 
                            sep = ',', comment = '#')

for index, cell_type in sample_data.iterrows():
    cell_id = sample_data.iloc[index,0]
    print(cell_id)
    network_file = f"{output_dir}/{cell_id}/full_network_includeprom.txt"
    print(f'running ananse influence using the network file {network_file}')
    net2 = "/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_11032022/outs_v1/ESC/full_network_includeprom.txt"
    print(f'compared to the network file {net2}')
    Path(f"{output_dir}/{cell_id}/ESC_to_{cell_id}_influence_500000_2503").mkdir(parents=True, exist_ok=True) 
    file_name = sample_data.iloc[index,6]
    print(f'together with DEG file {file_name} and the output file influence.txt')
    interaction_file = f'{output_dir}/influence/ESC_all_interactions.txt'
    
    if not os.path.exists(f"{output_dir}/{cell_id}/ESC_to_{cell_id}_influence_500000_2503/ANANSE_influence.tsv"):
        sp.check_call(f'nice -15 ananse influence '
            f'-t {network_file} '
            f'-s {net2} '
            f'-d {file_name} '
            f'-i 500000 '
            f'--full-output '
            f'-u {interaction_file} '
            f'-o {output_dir}/{cell_id}/ESC_to_{cell_id}_influence_500000_2503/ANANSE_influence.tsv '
            f'-n 1 '
            f'2> {output_dir}/{cell_id}/ananse_influence_log_ESC_intfile.txt',shell = True)

LSC
running ananse influence using the network file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_16032022/outs_v1/LSC/full_network_includeprom.txt
compared to the network file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_11032022/outs_v1/ESC/full_network_includeprom.txt
together with DEG file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_16032022/RNA_DE/20220316/LSC_ESC_pseudobulkpadj.tsv and the output file influence.txt


CalledProcessError: Command 'nice -15 ananse influence -t /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_16032022/outs_v1/LSC/full_network_includeprom.txt -s /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_11032022/outs_v1/ESC/full_network_includeprom.txt -d /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_16032022/RNA_DE/20220316/LSC_ESC_pseudobulkpadj.tsv -i 500000 --full-output -u /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_16032022/outs_v1/influence/ESC_all_interactions.txt -o /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_16032022/outs_v1/LSC/ESC_to_LSC_influence_500000_2503/ANANSE_influence.tsv -n 1 2> /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_16032022/outs_v1/LSC/ananse_influence_log_ESC_intfile.txt' returned non-zero exit status 2.

In [None]:
# Compare cell_types against other cells (normalized expression)
for index, cell_type in sample_data.iterrows(): 
    cell_id = sample_data.iloc[index,0]
    comparisons = sample_data.compare_with[index]
    if not cell_id.find("blobmn") != -1:
        #print(cell_id)
        #print(comparisons)
        network_file = f"{output_dir}/{cell_id}/full_network_includeprom.txt"
        print(f'running ananse influence using the network file {network_file}')
        net2 = f"{output_dir}/{comparisons}/full_network_includeprom.txt"
        print(f'compared to the network file {net2}')
        Path(f"{output_dir}/{cell_id}/{cell_id}_influence_500000_1403").mkdir(parents=True, exist_ok=True) 
        file_name = sample_data.iloc[index,6]
        print(f'together with DEG file {file_name} and the output file influence.txt')

        if not os.path.exists(f"{output_dir}/{cell_id}/{cell_id}_influence_500000_1403/influence_diffnetwork.txt"):
            sp.check_call(f'nice -15 ananse influence '
                f'-t {network_file} '
                f'-s {net2} '
                f'-d {file_name} '
                f'-i 500000 '
                f'--full-output '    
                f'-o {output_dir}/{cell_id}/{cell_id}_influence_500000_1403/ANANSE_influence.tsv '
                f'-n 1 '
                f'2> {output_dir}/{cell_id}/ananse_influence_{comparisons}to{cell_id}log_500.txt',shell = True)
                #f'--union-grn ' no union-grn anymore?

running ananse influence using the network file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_11032022/outs_v1/LSC/full_network_includeprom.txt
compared to the network file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_11032022/outs_v1/CF/full_network_includeprom.txt
together with DEG file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_11032022/RNA_DE/20220311/LSC_CF_pseudobulkpadj.tsv and the output file influence.txt
