In [1]:
import dask
import dask.dataframe as dd
from scipy import stats
import os
import sys
import pandas as pd
import subprocess as sp
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import seaborn as sns
import shutil
import glob
import gimmemotifs
from pathlib import Path
import qnorm
from sklearn import datasets, linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, average_precision_score

%matplotlib inline

In [2]:
def hue_regplot(data, x, y, hue, palette=None, **kwargs):
    from matplotlib.cm import get_cmap
    
    regplots = []
    
    levels = data[hue].unique()
    
    if palette is None:
        default_colors = get_cmap('tab10')
        palette = {k: default_colors(i) for i, k in enumerate(levels)}
    
    for key in levels:
        regplots.append(
            sns.regplot(
                x=x,
                y=y,
                fit_reg=False, 
                data=data[data[hue] == key],
                color=palette[key],
                **kwargs
            )
        )
    
    return regplots

In [3]:
# specifiy files for network generation intra comparison stromal and epi
output_dir = '/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_22042022/outs_v1'
Path(f"{output_dir}").mkdir(parents=True, exist_ok=True)       

#load the genome info
genome_path_size = "/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/genome/hg38/hg38.fa.sizes"
genome_path = "/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/genome/hg38/hg38.fa"

# cell population of interest against only ESC data
sample_data_file = '/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_22042022/Cell_types_files_epi_strom_v1_esccomp.csv'
#Lets loop over all the comparisons Ananse needs to make:
sample_data = pd.read_table(sample_data_file, 
                            sep = ',', comment = '#')

In [4]:
#Lets loop over all the comparisons Ananse needs to make:
sample_data = pd.read_table(sample_data_file, 
                            sep = ',', comment = '#')
sample_data

Unnamed: 0,cell_type,Accesibility_peakfiles,Merged_peakfiles,scATAC_BAMfiles,TPM_matrix,compare_with,count_table_files
0,epi,-,-,-,-,-,/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data...
1,stromal,-,-,-,-,-,/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data...


# Generate TF-TF interactions networks

In [5]:
# Generate the networks with weighted binding

for index, cell_type in sample_data.iterrows():
    cell_id = sample_data.iloc[index,0]
    print(cell_id)
    network_file = f"{output_dir}/{cell_id}/narrow/full_network_includeprom.txt"
    print(f'running ananse influence using the network file {network_file}')
    net2 = "/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/outs_v1/ESC/full_network_includeprom.txt"
    print(f'compared to the network file {net2}')
    Path(f"{output_dir}/{cell_id}/narrow/ESC_to_{cell_id}_influence_250000_1107").mkdir(parents=True, exist_ok=True) 
    file_name = sample_data.iloc[index,6]
    print(f'together with DEG file {file_name} and the output file influence.txt')

    if not os.path.exists(f"{output_dir}/{cell_id}/narrow/ESC_to_{cell_id}_influence_250000_1107/ANANSE_influence.tsv"):
        sp.check_call(f'nice -15 ananse influence '
            f'-t {network_file} '
            f'-s {net2} '
            f'-d {file_name} '
            f'--select-after-join '
            f'-i 250000 '
            f'--full-output '    
            f'-o {output_dir}/{cell_id}/narrow/ESC_to_{cell_id}_influence_250000_1107/ANANSE_influence.tsv '
            f'-n 1 '
            f'2> {output_dir}/{cell_id}/narrow/ananse_influence_log_ESC_1107.txt',shell = True)

epi
running ananse influence using the network file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_22042022/outs_v1/epi/narrow/full_network_includeprom.txt
compared to the network file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/outs_v1/ESC/full_network_includeprom.txt
together with DEG file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE/RNA_ESC_cpm/20220708/epi_ESC_pseudobulkpadj.tsv and the output file influence.txt
stromal
running ananse influence using the network file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_22042022/outs_v1/stromal/narrow/full_network_includeprom.txt
compared to the network file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_31032022/outs_v1/ESC/full_network_includeprom.txt
together with DEG file /ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE/RNA_ESC_cpm/20220708/stromal_ESC_pseudobulkpadj.tsv and the output file influence.txt


In [35]:
# Import the differential networks with weighted binding data

epi_dif = None
stormal_dif = None

for index, cell_type in sample_data.iterrows():
    cell_id = sample_data.iloc[index,0]
    net = pd.read_csv(f"{output_dir}/{cell_id}/narrow/ESC_to_{cell_id}_influence_250000_1107/ANANSE_influence_diffnetwork.tsv", sep='\t', header=0)
    if cell_id == "epi":
        epi_dif = net
    if cell_id == "stromal":
        stromal_dif = net
    print(net.head())

    source   target  weight_target  tf_expression_target  \
0  ONECUT1  FAM122C       0.347974                   0.0   
1     DLX1   YY1AP1       0.363925                   0.0   
2     E4F1   TRIM31       0.434956                   0.0   
3    SALL4    PDCL2       0.330029                   0.0   
4    ZFP28   KCNJ18       0.328050                   0.0   

   target_expression_target  weighted_binding_target  tf_activity_target  \
0                       0.0                 0.647033            0.744861   
1                       0.0                 0.860778            0.594921   
2                       0.0                 0.981661            0.758162   
3                       0.0                 0.478520            0.841596   
4                       0.0                 0.732996            0.579202   

   weight_source  tf_expression_source  target_expression_source  \
0       0.146112                   0.0                       0.0   
1       0.162063                   0.0        

In [36]:
# Load in the stromal and epi TFs
stromal_tf = pd.read_csv("/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/R/ANANSE_TF_TF/20220708/stromal_tfs.csv", sep=',', header=0)
stromal_tf = stromal_tf["x"].tolist()
epi_tf = pd.read_csv("/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/R/ANANSE_TF_TF/20220708/corneal_tfs.csv", sep=',', header=0)
epi_tf = epi_tf["x"].tolist()

print(stromal_tf)
print(epi_tf)

['AHR', 'ALX4', 'ATF3', 'ATOH8', 'BHLHE22', 'CEBPA', 'CEBPB', 'CEBPD', 'CREB3', 'CREB5', 'DDIT3', 'DLX5', 'DMRTA2', 'EGR3', 'EHF', 'EMX2', 'EPAS1', 'ESR1', 'FLI1', 'FOS', 'FOSB', 'FOSL2', 'FOXC1', 'FOXD1', 'FOXD2', 'FOXF2', 'FOXN1', 'FOXQ1', 'GLIS3', 'HES1', 'HES4', 'HES5', 'HES7', 'HIC1', 'HLF', 'HLX', 'HOXB2', 'HOXB3', 'HOXD8', 'HSF4', 'IRF1', 'IRX1', 'IRX3', 'IRX5', 'IRX6', 'JUN', 'KLF10', 'KLF2', 'KLF4', 'KLF5', 'KLF9', 'MAFB', 'MAFF', 'MEF2B', 'MEIS1', 'MESP1', 'MITF', 'MSX1', 'NPAS2', 'NR2F1', 'NR2F2', 'NR4A3', 'OSR1', 'OSR2', 'PITX1', 'PITX2', 'POU3F3', 'PPARG', 'PRDM1', 'RUNX1', 'SHOX', 'TBX18', 'TBX2', 'TBX3', 'TFAP2A', 'TFAP2B', 'TP63', 'TWIST1', 'TWIST2', 'VDR', 'ZBTB16', 'ZNF257']
['ASCL2', 'ATF3', 'BARX2', 'BATF', 'BCL6', 'BHLHE22', 'BHLHE40', 'BHLHE41', 'CEBPA', 'CEBPB', 'CEBPD', 'CREB5', 'DDIT3', 'DLX2', 'DLX3', 'DLX4', 'DLX5', 'DMRTA1', 'DMRTA2', 'EGR3', 'EGR4', 'EHF', 'ELF1', 'ELF3', 'ELF4', 'EMX2', 'EPAS1', 'FOS', 'FOSB', 'FOSL1', 'FOSL2', 'FOXA1', 'FOXC1', 'FOXD2', '

In [39]:
# subselect the TF-TF interactions and differential weighted binding

# stromal
stromal_net_wb = stromal_dif[stromal_dif['source'].isin(stromal_tf)]
stromal_net_wb = stromal_net_wb[stromal_net_wb['target'].isin(stromal_tf)]
stromal_net_wb = stromal_net_wb[["source","target","weighted_binding_source","weighted_binding_target"]]
stromal_net_wb["dif"] = stromal_net_wb["weighted_binding_target"] - stromal_net_wb["weighted_binding_source"]

# select interactions with at least 0.3 difference in binding
stromal_net_wb = stromal_net_wb[stromal_net_wb["dif"] > 0.3]

print(stromal_net_wb.sort_values(by=['dif'], ascending=False))

# epi
epi_net_wb = epi_dif[epi_dif['source'].isin(epi_tf)]
epi_net_wb = epi_net_wb[epi_net_wb['target'].isin(epi_tf)]
epi_net_wb = epi_net_wb[["source","target","weighted_binding_source","weighted_binding_target"]]
epi_net_wb["dif"] = epi_net_wb["weighted_binding_target"] - epi_net_wb["weighted_binding_source"]

# select interactions with at least 0.3 difference in binding
epi_net_wb = epi_net_wb[epi_net_wb["dif"] > 0.3]

print(epi_net_wb.sort_values(by=['dif'], ascending=False))

# saving the networks to csv files for import in cytoscape
stromal_dif_file = "/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_22042022/outs_v1/strom_dif.txt"
epi_dif_file = "/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_22042022/outs_v1/epi_dif.txt"

stromal_net_wb.to_csv(stromal_dif_file, index=False, header = True)
epi_net_wb.to_csv(epi_dif_file, index=False, header = True)

        source target  weighted_binding_source  weighted_binding_target  \
223325    TBX3   SHOX                 0.000000                 0.736156   
216971   CREB5   SHOX                 0.000000                 0.726663   
49097    FOXQ1   SHOX                 0.000000                 0.709839   
181589   MEIS1   SHOX                 0.000000                 0.709552   
114233    HES4   SHOX                 0.000000                 0.708128   
46734    FOXD2   SHOX                 0.000000                 0.707399   
226131   HOXD8   SHOX                 0.000000                 0.707381   
162295    IRF1   SHOX                 0.000000                 0.706895   
44748    FOXF2   SHOX                 0.000000                 0.705393   
213023   TBX18   SHOX                 0.000000                 0.705224   
214464    IRX5   SHOX                 0.000000                 0.700283   
233553     EHF   SHOX                 0.000000                 0.696411   
127804    MSX1   SHOX    

In [44]:
# subselect the TF-TF interactions and differential weighted binding V2

# stromal
stromal_net_wb = stromal_dif[stromal_dif['source'].isin(stromal_tf)]
stromal_net_wb = stromal_net_wb[stromal_net_wb['target'].isin(stromal_tf)]
stromal_net_wb = stromal_net_wb[["source","target","weighted_binding_source","weighted_binding_target"]]
stromal_net_wb["dif"] = stromal_net_wb["weighted_binding_source"] - stromal_net_wb["weighted_binding_target"]
stromal_net_wb = stromal_net_wb[stromal_net_wb["dif"] > 0]
print(stromal_net_wb.sort_values(by=['dif'], ascending=False))

# epi
epi_net_wb = epi_dif[epi_dif['source'].isin(epi_tf)]
epi_net_wb = epi_net_wb[epi_net_wb['target'].isin(epi_tf)]
epi_net_wb = epi_net_wb[["source","target","weighted_binding_source","weighted_binding_target"]]
epi_net_wb["dif"] = epi_net_wb["weighted_binding_source"] - epi_net_wb["weighted_binding_target"]


       source  target  weighted_binding_source  weighted_binding_target  \
25685    TP63    HES7                 0.929870                 0.861606   
29062    TP63   FOXQ1                 0.583285                 0.518303   
32094    TP63   NPAS2                 0.593190                 0.531125   
41979    TP63  ZNF257                 0.474092                 0.421945   
53395    TP63    EMX2                 0.525082                 0.484771   
33607   KLF10    IRF1                 0.967469                 0.928918   
34251   KLF10  TWIST1                 0.765371                 0.727457   
55929    TP63   CEBPD                 0.672645                 0.635110   
35047   KLF10    TBX3                 0.778969                 0.741848   
68232    TP63    OSR2                 0.611124                 0.576255   
38058   KLF10   PRDM1                 0.808969                 0.774898   
38548   KLF10   MEF2B                 0.775442                 0.741867   
41093   KLF10    ATF3    

In [73]:
# Import the differential networks with weighted binding data
epi_dif = pd.read_csv("/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_22042022/outs_v1/epi/narrow/stromal_to_epi_influence_250000_2604/ANANSE_influence_diffnetwork.tsv", sep='\t', header=0)
stromal_dif = pd.read_csv("/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_22042022/outs_v1/stromal/narrow/epi_to_stromal_influence_250000_2604/ANANSE_influence_diffnetwork.tsv", sep='\t', header=0)

In [128]:
# subselect the TF-TF interactions and differential weighted binding

# stromal
stromal_net_wb = stromal_dif[stromal_dif['source'].isin(stromal_tf)]
stromal_net_wb = stromal_net_wb[stromal_net_wb['target'].isin(stromal_tf)]
stromal_net_wb = stromal_net_wb[["source","target","weighted_binding_source","weighted_binding_target"]]
stromal_net_wb["dif"] = stromal_net_wb["weighted_binding_target"] - stromal_net_wb["weighted_binding_source"]

# select interactions with at least 0.3 difference in binding
stromal_net_wb = stromal_net_wb[stromal_net_wb["dif"] > 0.3]

print(stromal_net_wb.sort_values(by=['dif'], ascending=False))

# epi
epi_net_wb = epi_dif[epi_dif['source'].isin(epi_tf)]
epi_net_wb = epi_net_wb[epi_net_wb['target'].isin(epi_tf)]
epi_net_wb = epi_net_wb[["source","target","weighted_binding_source","weighted_binding_target"]]
epi_net_wb["dif"] = epi_net_wb["weighted_binding_target"] - epi_net_wb["weighted_binding_source"]

# select interactions with at least 0.3 difference in binding
epi_net_wb = epi_net_wb[epi_net_wb["dif"] > 0.3]

print(epi_net_wb.sort_values(by=['dif'], ascending=False))


# add the mode of regulation and coincidence from DoRoThea
# Load in the stromal and epi TFs
stromal_tf_dor = pd.read_csv("/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/R/ANANSE_TF_TF/20220711/net_stromal_nofilt.csv", sep=',', header=0)
epi_tf_dor = pd.read_csv("/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/R/ANANSE_TF_TF/20220711/net_epi_nofilt.csv", sep=',', header=0)


# saving the networks to csv files for import in cytoscape
stromal_dif_file = "/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_22042022/outs_v1/strom_dif_V2.txt"
epi_dif_file = "/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE_22042022/outs_v1/epi_dif_V2.txt"

stromal_net_wb.to_csv(stromal_dif_file, index=False, header = True)
epi_net_wb.to_csv(epi_dif_file, index=False, header = True)

       source  target  weighted_binding_source  weighted_binding_target  \
169861   MAFF    TBX2                 0.486925                 0.951482   
16290    KLF9  TWIST1                 0.391539                 0.809298   
230171  HOXB2    SHOX                 0.364191                 0.777265   
236110   MITF    MITF                 0.423617                 0.830940   
204774  CREB5    MITF                 0.410245                 0.807637   
248400   TP63    MITF                 0.373564                 0.769188   
211935   IRX3    MITF                 0.409339                 0.800929   
232198   MITF  TWIST1                 0.399762                 0.788770   
210090   IRX5    MITF                 0.409614                 0.797236   
152113   ESR1    MITF                 0.424050                 0.809536   
208124   IRX1    MITF                 0.408378                 0.791962   
21545    MAFF    MITF                 0.425201                 0.804845   
224931  NR4A3    SHOX    

In [137]:
# Load in the dataframes from DoRoThea and DEG

# Load in the stromal and epi TFs
stromal_tf_dor = pd.read_csv("/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/R/ANANSE_TF_TF/20220711/net_stromal_nofilt.csv", sep=',', header=0)
epi_tf_dor = pd.read_csv("/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/R/ANANSE_TF_TF/20220711/net_epi_nofilt.csv", sep=',', header=0)

stromal_tf_dor= stromal_tf_dor.rename(columns={"tf": "source"})
epi_tf_dor = epi_tf_dor.rename(columns={"tf": "source"})

print(epi_tf_dor)

DEG_epi_strom = pd.read_csv("/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE/RNA_intra_cpm/20220421/epi_stromal_pseudobulkpadj.tsv", sep='\t', header=0)
DEG_epi_strom= DEG_epi_strom.rename(columns={"resid": "source","log2FoldChange": "L2FC_source"})
print(DEG_epi_strom)

DEG_epi_strom_target = pd.read_csv("/ceph/rimlsfnwi/data/moldevbio/zhou/jarts/data/scANANSE/RNA_intra_cpm/20220421/epi_stromal_pseudobulkpadj.tsv", sep='\t', header=0)
DEG_epi_strom_target= DEG_epi_strom_target.rename(columns={"resid": "target","log2FoldChange": "L2FC_target"})
del DEG_epi_strom_target["padj"]
print(DEG_epi_strom_target)

     source confidence   target  mor
0     ASCL2          E     BATF    1
1     ASCL2          E     TBX2    1
2      ATF3          C    DDIT3    1
3      ATF3          C  BHLHE40   -1
4      ATF3          C    CREB5   -1
..      ...        ...      ...  ...
411  ZNF467          E   ZBTB16    1
412  ZNF467          E   ZNF524    1
413  ZNF524          E  BHLHE40    1
414  ZNF524          E    MESP1    1
415  ZNF524          E   ZNF467    1

[416 rows x 4 columns]
           source  L2FC_source      padj
0      AL627309.1     0.383232  0.784433
1      AL627309.5     1.544175  0.046602
2       LINC01409    -0.047582  0.943376
3          FAM87B    -2.267994       NaN
4       LINC01128    -0.787576  0.015315
...           ...          ...       ...
26952  AC007663.2     0.622308       NaN
26953  AL008723.2     0.000000       NaN
26954    Z99756.1     1.082226       NaN
26955  AL008718.2     1.082226       NaN
26956      ADGRG4    -2.002417       NaN

[26957 rows x 3 columns]
           tar

In [139]:
# Stromal
stromal_net_wb = pd.merge(stromal_net_wb, DEG_epi_strom,on='source', how ="left")
stromal_net_wb = pd.merge(stromal_net_wb, DEG_epi_strom_target,on='target', how ="left")
mergedRes = pd.merge(stromal_net_wb, stromal_tf_dor,on=['source','target'], how ="left")
mergedRes

#Add values for colors with sources that have no target DEG value
new_df = mergedRes[~mergedRes["source"].isin(mergedRes["target"])]
new_df["target"]=new_df["source"]
new_df["L2FC_target"]=new_df["L2FC_source"]

# Exclude all other data for these values
for i in new_df.columns.values:
    print(i)
    if i not in ["L2FC_target","target"]:
        new_df[i]=float('NaN')

mergedRes2 = mergedRes.append(new_df, ignore_index=True)

mergedRes2.to_csv(stromal_dif_file, index=False, header = True)

source
target
weighted_binding_source
weighted_binding_target
dif
L2FC_source_x
padj_x
L2FC_target_x
L2FC_source_y
padj_y
L2FC_target_y
L2FC_source_x
padj_x
L2FC_target_x
L2FC_source_y
padj_y
L2FC_target_y
L2FC_source
padj
L2FC_target
confidence
mor


In [140]:
# Epi
epi_net_wb = pd.merge(epi_net_wb, DEG_epi_strom,on='source', how ="left")
epi_net_wb = pd.merge(epi_net_wb, DEG_epi_strom_target,on='target', how ="left")
mergedRes = pd.merge(epi_net_wb, epi_tf_dor,on=['source','target'], how ="left")
mergedRes

#Add values for colors with sources that have no target DEG value
new_df = mergedRes[~mergedRes["source"].isin(mergedRes["target"])]
new_df["target"]=new_df["source"]
new_df["L2FC_target"]=new_df["L2FC_source"]

# Exclude all other data for these values
for i in new_df.columns.values:
    print(i)
    if i not in ["L2FC_target","target"]:
        new_df[i]=float('NaN')

mergedRes2 = mergedRes.append(new_df, ignore_index=True)

mergedRes2.to_csv(epi_dif_file, index=False, header = True)


source
target
weighted_binding_source
weighted_binding_target
dif
L2FC_source_x
padj_x
L2FC_target_x
L2FC_source_y
padj_y
L2FC_target_y
L2FC_source_x
padj_x
L2FC_target_x
L2FC_source_y
padj_y
L2FC_target_y
L2FC_source_x
padj_x
L2FC_target_x
L2FC_source_y
padj_y
L2FC_target_y
L2FC_source
padj
L2FC_target
confidence
mor
