## Libraries

In [None]:
import bioframe
import numpy as np
import pandas as pd
import gseapy as gp
import seaborn as sns
import matplotlib.pyplot as plt
import tqdm
import glob
import cooler
#bm = gp.Biomart()

pd.options.mode.chained_assignment = None # default='warn'

## Helper functions and files ops

In [None]:
# Load the GTF annotation, keep only the CDS records, cast the coordinates
# to integers and order the table by chromosome and start position.
gtf = "/home/carlos/Desktop/manuscripts/notebooks/unibind/GRCh38.gtf"
genes_all = (
    bioframe.read_table(gtf, schema='gtf')
    .query('feature=="CDS"')
    .astype({'start': int, 'end': int})
    .sort_values(by=['chrom', 'start'])
)

In [None]:
# use appris or mane

In [None]:
# Keep only the transcripts flagged as MANE Select in the APPRIS table
# ('MANE_Plus_Clinical' is currently not included) and collect their IDs.
df = pd.read_csv(
    "/home/carlos/Desktop/manuscripts/notebooks/RNA/appris_data.appris.txt",
    sep='\t',
)
df = df[df["MANE"].isin(['MANE_Select'])]
mane_tx_list = df['Transcript ID'].to_list()

In [None]:
# Annotate every CDS record with identifiers parsed from the GTF attribute
# string (text before the first "." only), then keep the MANE transcripts.
genes = genes_all.copy()
# Vectorised string ops propagate NaN for records that lack an attribute; the
# previous per-element `.split()` raised AttributeError on such records.
genes['gene_id'] = genes.attributes.str.extract(r'gene_id "(.*?)";', expand=False).str.split(".").str[0]
genes['tx_id'] = genes.attributes.str.extract(r'transcript_id "(.*?)";', expand=False).str.split(".").str[0]
# NOTE(review): the gene name is also cut at the first "." (unchanged behaviour);
# confirm this is intended for symbols that contain a dot.
genes['external_gene_name'] = genes.attributes.str.extract(r'gene_name "(.*?)";', expand=False).str.split(".").str[0]
genes = genes.loc[genes['tx_id'].isin(mane_tx_list)].sort_values(by=['chrom', 'start'])

In [None]:
# Reduce every transcript to a single CDS record: the first row of the group
# for "+" strand transcripts, the last row for "-" strand transcripts
# (rows were sorted by chrom/start beforehand).
tx_adjusted = []
for tx_id, tx_df in genes.groupby('tx_id'):
    if tx_df.strand.eq("+").any():
        tx_adjusted.append(tx_df.iloc[0])
    elif tx_df.strand.eq("-").any():
        tx_adjusted.append(tx_df.iloc[-1])

# Stack the selected rows back into a table and restore integer coordinates.
genes = (
    pd.concat(tx_adjusted, axis=1)
    .T
    .reset_index(drop=True)
    .astype({'start': int, 'end': int})
)

In [None]:
# Pick the Enrichr libraries of interest among all human libraries and
# download each one; `pathways` ends up as {library name: gene-set library}.
human = pd.Series(gp.get_library_name(organism='Human'))
library_mask = (
    human.str.contains("MSigDB_Hallmark_2020")
    | human.str.contains("GO_Biological_Process_2023")
    | human.str.contains("NCI-Nature_2016")
)
# Alternative selection without GO: "MSigDB_Hallmark_2020" | "NCI-Nature_2016"
pathways = human.loc[library_mask].reset_index(drop=True)
pathways = {
    pathway: gp.get_library(name=pathway, organism='Human')
    for pathway in pathways
}

In [None]:
def enrichrrr(
    degs_list: list, 
    pathways_dict: dict, 
    universe_list: list = None):
    """Run ``gp.enrichr`` once per gene-set library.

    Parameters
    ----------
    degs_list : list
        Gene list passed as ``gene_list``.
    pathways_dict : dict
        Mapping whose values are passed as ``gene_sets``, one call per entry
        (the keys are not used).
    universe_list : list, optional
        Passed as ``background``.

    Returns
    -------
    list
        The objects returned by ``gp.enrichr``, in the iteration order of
        ``pathways_dict``.
    """
    return [
        gp.enrichr(
            gene_list=degs_list,
            gene_sets=library,
            outdir=None,
            background=universe_list,
            verbose=False,
        )
        for library in pathways_dict.values()
    ]

In [None]:
def overlapper(
    current_degs_df: pd.DataFrame, 
    my_regions_df: pd.DataFrame, 
    all_genes: pd.DataFrame, 
    tss_coord_only: bool = True, # True: use the TSS coordinate only (point-wise); False: use a window around the TSS
    upstream: int =2000, 
    downstream: int =500,
    returnNames = True):
    """Select the DEGs whose TSS (or TSS window) overlaps the given regions.

    Parameters
    ----------
    current_degs_df : pd.DataFrame
        DEG table; 'ensembl_gene_id' is read, and 'external_gene_name' when
        ``returnNames`` is true.
    my_regions_df : pd.DataFrame
        Regions handed to ``bioframe.overlap``.
    all_genes : pd.DataFrame
        Gene table with 'start', 'end', 'strand' ("+" / "-") and 'gene_id'.
    tss_coord_only : bool
        If true every gene is reduced to one coordinate: 'start' on "+",
        'end' otherwise. If false a strand-aware window is built around it.
    upstream, downstream : int
        Window extents, used only when ``tss_coord_only`` is false.
    returnNames : bool
        If true return the unique non-null gene names, else the filtered table.

    Returns
    -------
    tuple
        ``(names or filtered DEG table, overlap table)``.
    """
    strand_oriented_genes = all_genes.copy()
    on_plus = (all_genes['strand'] == "+").to_numpy()
    gene_start = all_genes['start'].to_numpy()
    gene_end = all_genes['end'].to_numpy()

    if tss_coord_only:
        tss = np.where(on_plus, gene_start, gene_end)
        strand_oriented_genes['start'] = tss
        strand_oriented_genes['end'] = tss
    else:
        # Strand is encoded as "+" / "-". The previous comparison against the
        # integer 1 never matched, so every gene got the minus-strand window.
        strand_oriented_genes['start'] = np.where(
            on_plus, gene_start - upstream, gene_end - downstream)
        strand_oriented_genes['end'] = np.where(
            on_plus, gene_start + downstream, gene_end + upstream)

    my_regions_universe = bioframe.overlap(strand_oriented_genes, my_regions_df, how='inner')
    degs_filter = current_degs_df.loc[
        current_degs_df['ensembl_gene_id'].isin(my_regions_universe['gene_id'])
    ]

    if returnNames:
        return list(degs_filter.external_gene_name.dropna().unique()), my_regions_universe
    return degs_filter, my_regions_universe

In [None]:
def merge_enrs_into_common_df(res_1, res_2):
    """Pair up two enrichment runs library by library.

    ``res_1`` and ``res_2`` are parallel sequences of objects exposing a
    ``.results`` DataFrame with 'Term' and 'Adjusted P-value' columns.
    For each pair a frame with columns 'Term', 'logPadj_0_12' (from ``res_1``)
    and 'logPadj_0_60' (from ``res_2``) is produced, restricted to terms with
    adjusted P <= 0.05 in at least one run and ordered by the row-wise maximum
    of the two -log10 values (ascending). Position 0 has everything from
    "(GO" onwards removed from the term; position 2 additionally requires
    logPadj_0_60 >= 1.3 and drops the last three space-separated tokens.
    """
    merged_tables = []

    for db_idx, (enr_a, enr_b) in enumerate(zip(res_1, res_2)):
        ranked = []
        significant = []
        for enr in (enr_a, enr_b):
            table = enr.results.sort_values('Adjusted P-value')
            significant.append(table.loc[table['Adjusted P-value'] <= 0.05].Term)
            table['logPadj'] = -np.log10(table['Adjusted P-value'])
            ranked.append(table)

        merged = pd.merge(ranked[0], ranked[1], on='Term', suffixes=('_0_12', '_0_60'))
        if db_idx == 2:
            merged = merged.loc[merged['logPadj_0_60'] >= 1.3]
        merged = merged[['Term', 'logPadj_0_12', 'logPadj_0_60']]

        keep = merged.Term.isin(significant[0]) | merged.Term.isin(significant[1])
        merged = merged.loc[keep]
        merged.reset_index(drop=True, inplace=True)

        # order rows by the larger of the two -log10 values, smallest first
        row_order = merged.iloc[:, [1, 2]].max(axis=1).sort_values(ascending=True).index
        merged = merged.loc[row_order]

        if db_idx == 0:
            merged['Term'] = [term.split("(GO")[0] for term in merged.Term]
        if db_idx == 2:
            merged['Term'] = [" ".join(term.split(" ")[:-3]) for term in merged.Term]
        merged_tables.append(merged)

    return merged_tables


def merge_terms(df_list: list):
    """Return the distinct terms with adjusted P <= 0.05 in any of the tables.

    Each table needs 'Term' and 'Adjusted P-value' columns. The order of the
    returned list is not defined.
    """
    significant = set()
    for table in df_list:
        significant.update(table.loc[table['Adjusted P-value'] <= 0.05, 'Term'].unique())
    return list(significant)


def merge_enrs_into_common_df_2(res_list : list, names : list = None):
    """Combine several enrichment runs into one wide table per library.

    Parameters
    ----------
    res_list : list
        One entry per contrast. Each entry is a sequence of enrichment results
        (objects exposing a ``.results`` DataFrame with 'Term' and
        'Adjusted P-value'), indexed by the same library positions as
        ``res_list[0]``.
    names : list, optional
        Column label for each entry of ``res_list``; the entry position is
        used when omitted.

    Returns
    -------
    list of pandas.DataFrame
        One frame per library, indexed by term, holding -log10(adjusted P)
        in one column per contrast plus a 'Term' column with the line-wrapped
        term. Only terms with adjusted P <= 0.05 in at least one contrast are
        kept. Every contrast always gets a column (all-NaN when it contributes
        no rows), so callers can rely on a fixed column layout.

    Notes
    -----
    Uses the module-level helpers ``merge_terms`` and ``split_text_into_lines``.
    For the first library everything from "(GO" onwards is removed from the
    term; for the last library the final three space-separated tokens are dropped.
    """
    # Raises IndexError up front if `names` is shorter than `res_list`.
    labels = [names[k] if names is not None else k for k in range(len(res_list))]
    n_pathways = len(res_list[0])

    comparison_dfs_reformat = []
    for i in range(n_pathways):
        tables = [res[i].results for res in res_list]
        # Terms significant in at least one contrast for this library.
        merged_terms = merge_terms(tables)

        db_dfs = []
        for resIdx, table in enumerate(tables):
            # reset_index returns a new frame, so the column assignments below
            # never write into a slice of the original results table.
            curr_df = table.loc[table['Term'].isin(merged_terms)].reset_index(drop=True)
            curr_df['logPadj'] = -np.log10(curr_df['Adjusted P-value'])
            curr_df = curr_df.sort_values(by='logPadj', ascending=False)
            curr_df['whichRes'] = labels[resIdx]

            if i == 0:
                curr_df['Term'] = [term.split("(GO")[0] for term in curr_df.Term]
            if i == n_pathways - 1:
                curr_df['Term'] = [" ".join(term.split(" ")[:-3]) for term in curr_df.Term]

            db_dfs.append(curr_df)

        comp_df = pd.concat(db_dfs, axis=0)
        comp_df = comp_df.sort_values(by=['Term', 'logPadj'], ascending=False)
        comp_df = comp_df.pivot(index='Term', columns='whichRes', values='logPadj')
        # pivot only creates columns for labels that have rows; re-add the
        # missing ones (in pivot's sorted order) so positional access works.
        comp_df = comp_df.reindex(columns=sorted(set(labels)))
        comp_df['Term'] = [split_text_into_lines(term) for term in comp_df.index]
        comparison_dfs_reformat.append(comp_df)

    return comparison_dfs_reformat

def split_text_into_lines(text, max_line_length=30):
    """Wrap ``text`` at single spaces without ever splitting a word.

    Words are packed greedily; a line is closed when adding the next word
    would make it longer than ``max_line_length``. A word longer than the
    limit is kept whole on its own line.

    Returns the lines joined by a newline.
    """
    lines = []
    current = ""
    for word in text.split(" "):
        if len(current) + len(word) <= max_line_length:
            current += word + " "
        else:
            # Only close a line that holds something; otherwise an over-long
            # first word would produce a leading empty line.
            if current:
                lines.append(current)
            current = word + " "
    lines.append(current)  # the line still being built
    return "\n".join(line[:-1] for line in lines)  # drop each trailing space

def write_results(res, out):
    """Write the significant rows of ``res.results`` to ``out`` as TSV.

    Rows are ordered by 'Adjusted P-value' and limited to values <= 0.05.
    Nothing is written when no row passes the threshold.
    """
    ranked = res.results.sort_values('Adjusted P-value')
    significant = ranked[ranked['Adjusted P-value'] <= 0.05]
    if significant.empty:
        return
    significant.to_csv(out, sep='\t', index=False)

In [None]:
def get_anchors(regions_file):
    """Read a tab-separated loop table and return its unique anchors.

    The file must provide chrom1/start1/end1 and chrom2/start2/end2 columns.
    Both anchor sets are stacked (first anchors, then second anchors) into a
    chrom/start/end table, duplicates are removed and the index is renumbered.
    """
    loops = pd.read_csv(regions_file, sep="\t")
    first_anchors = loops[['chrom1', 'start1', 'end1']].rename(
        columns={'chrom1': 'chrom', 'start1': 'start', 'end1': 'end'})
    second_anchors = loops[['chrom2', 'start2', 'end2']].rename(
        columns={'chrom2': 'chrom', 'start2': 'start', 'end2': 'end'})
    stacked = pd.concat([first_anchors, second_anchors], axis=0)
    return stacked.drop_duplicates().reset_index(drop=True)

We gather anchors from all loops because an anchor may be shared between several loops.
There is also a possibility that a gene is regulated through another anchor pair that is not present in the list of anchors unique to a timepoint.
However, the anchors unique to a timepoint can still be used to estimate differential TF binding, which is a separate analysis.

In [None]:
# Differentially expressed gene tables, one per contrast against t0.
degs_0_12 = pd.read_csv("/home/carlos/Desktop/manuscripts/notebooks/RNA/t0-t12.degs.tsv", sep="\t")
degs_0_30 = pd.read_csv("/home/carlos/Desktop/manuscripts/notebooks/RNA/t0-t30.degs.tsv", sep="\t")
degs_0_60 = pd.read_csv("/home/carlos/Desktop/manuscripts/notebooks/RNA/t0-t60.degs.tsv", sep="\t")

# DESeq LRT table; its gene column is renamed to match the DEG tables' key.
deseq_lrt = pd.read_csv(
    "/home/carlos/Desktop/manuscripts/notebooks/RNA/all_deseq_lrt.tsv", sep="\t"
).rename(columns={'gene_id': 'ensembl_gene_id'})

## gnn res

In [None]:
# Background universe: genes overlapping any region evaluated by the GNN.
paths = [
    f"/home/carlos/Desktop/manuscripts/notebooks/gnn/t0_t12_results_{comp_now}.tsv"
    for comp_now in ["0_0", "1_0", "0_1", "1_1"]
]
all_dfs = [pd.read_csv(path, sep="\t") for path in paths]
# Only the first three columns of the stacked tables are used as intervals.
all_regions = pd.concat(all_dfs).iloc[:, :3]
universe_3d_df = bioframe.overlap(all_regions, genes, how='inner')
universe_3d_ids = list(universe_3d_df['gene_id_'].dropna().unique())
universe_3d_names = list(universe_3d_df['external_gene_name_'].dropna().unique())

In [None]:
# Component-wise analysis: for every GNN result file, run the enrichment of the
# DEGs (three contrasts) whose TSS overlaps the file's regions, write the
# significant terms and draw one bar chart per library.
comp_labels = ["comp1", "comp2", "comp3", "comp4"]
comp_files = ["t0_t12_results_0_0", "t0_t12_results_0_1", "t0_t12_results_1_0", "t0_t12_results_1_1"]
database_names = ["Gene Ontology\nBiological Process", "MSigDB\nHallmark", "NCI-Nature\nPID"]

# Component suffix -> UniBind result folder (only read when filter_w_unibind is on).
unibind_mapping = {
    "0_1": "0_1_vs_1_0",
    "1_0": "1_0_vs_0_1",
    "0_0": "0_0_vs_1_1",
    "1_1": "1_1_vs_0_0"
}

filter_11 = False          # extra column filter for the "1_1" component
filter_w_unibind = False   # restrict regions to those overlapping UniBind regions

for label, comp in zip(comp_labels, comp_files):
    regions = pd.read_csv(f"/home/carlos/Desktop/manuscripts/notebooks/gnn/{comp}.tsv", sep="\t")

    comp_name = comp[-3:]
    if comp_name == "1_1" and filter_11:
        regions = regions.loc[(regions["t0_q30-t12_q30"] == 0) & (regions["t12_q30-t0_q30"] == 0)]

    if filter_w_unibind:
        unibind_regions = pd.read_csv(f"/home/carlos/Desktop/manuscripts/notebooks/unibind/gnn_res/gnn_{unibind_mapping[comp_name]}/extracted_regions_merged.bed", sep="\t", header=None).iloc[:, :3]
        unibind_regions.columns = ['chrom', 'start', 'end']
        unibind_regions.start = unibind_regions.start.astype(int)
        unibind_regions.end = unibind_regions.end.astype(int)
        regions = bioframe.overlap(regions, unibind_regions, how='inner')

    # The overlap table depends only on `regions` and `genes`, so it is the
    # same for all three calls.
    degs_0_12_degs, uni = overlapper(degs_0_12, regions, genes)
    degs_0_30_degs, _ = overlapper(degs_0_30, regions, genes)
    degs_0_60_degs, _ = overlapper(degs_0_60, regions, genes)

    uni.to_csv(f"gnn_enrichr_results_comp_wise/{label}_universe.tsv", sep="\t", index=False)

    res_0_12 = enrichrrr(degs_0_12_degs, pathways, universe_3d_names)
    res_0_30 = enrichrrr(degs_0_30_degs, pathways, universe_3d_names)
    res_0_60 = enrichrrr(degs_0_60_degs, pathways, universe_3d_names)

    comparison_dfs = merge_enrs_into_common_df_2([res_0_12, res_0_30, res_0_60], ["0_12", "0_30", "0_60"])

    for res1, res2, res3, pathway in zip(res_0_12, res_0_30, res_0_60, pathways):
        write_results(res1, f"gnn_enrichr_results_comp_wise/{label}_{pathway}_0_12.tsv")
        write_results(res2, f"gnn_enrichr_results_comp_wise/{label}_{pathway}_0_30.tsv")
        write_results(res3, f"gnn_enrichr_results_comp_wise/{label}_{pathway}_0_60.tsv")

    for i, df in enumerate(comparison_dfs):
        if len(df) == 0:
            continue  # nothing significant for this library: no figure at all

        # Rank by the best -log10(adj. P) across contrasts and keep the top 20.
        df = df.loc[df.iloc[:, [0, 1, 2]].max(axis=1).sort_values(ascending=True).index]
        df = df.iloc[-20:, :]

        fig, ax = plt.subplots(1, 1, figsize=(10, 8))
        b = df.plot.barh(x='Term', ax=ax, color=['#A63446', "#F5B841", '#9DBBAE'])
        # Significance threshold (adjusted P = 0.05).
        b.axvline(x=-np.log10(0.05), color='black', linestyle='--', alpha=.9, linewidth=3)
        b.set_xlabel(f'-log$_{{10}}$ Adjusted P-value', fontsize=12)
        b.set_ylabel(database_names[i], fontsize=20)
        b.tick_params(axis='both', which='major', labelsize=10)
        b.tick_params(axis='both', which='minor', labelsize=10)

        db_name = database_names[i].replace("\n", "_")

        fig.set_tight_layout(True)
        fig.savefig(f"gnn_enrichr_results_comp_wise/{label}_{db_name}_pathways.svg")
        fig.savefig(f"gnn_enrichr_results_comp_wise/{label}_{db_name}_pathways.png", dpi=300, facecolor="white", edgecolor='none')
        # fig.clf() only empties the figure; close it so pyplot stops tracking it.
        plt.close(fig)

In [None]:
# comp_files = ["t0_t12_results_0_0", "t0_t12_results_0_1", "t0_t12_results_1_0", "t0_t12_results_1_1"]

# #regions = pd.concat([pd.read_csv(f"/home/carlos/Desktop/manuscripts/notebooks/gnn/{comp}.tsv", sep="\t") for comp in comp_files[:3]]) # combine comp1, comp2, comp3
# #label = "changed_regions"
# regions = pd.concat([pd.read_csv(f"/home/carlos/Desktop/manuscripts/notebooks/gnn/{comp}.tsv", sep="\t") for comp in comp_files[3:]])
# label = "unchanged_regions"

# degs_0_12_degs, _ = overlapper(degs_0_12, regions, genes)
# degs_0_30_degs, _ = overlapper(degs_0_30, regions, genes)
# degs_0_60_degs, _ = overlapper(degs_0_60, regions, genes)

# res_0_12 = enrichrrr(degs_0_12_degs, pathways, universe_3d_names)
# res_0_30 = enrichrrr(degs_0_30_degs, pathways, universe_3d_names)
# res_0_60 = enrichrrr(degs_0_60_degs, pathways, universe_3d_names)

# comparison_dfs = merge_enrs_into_common_df_2([res_0_12, res_0_30, res_0_60], ["0_12", "0_30", "0_60"])

# for res1, res2, res3, pathway in zip(res_0_12, res_0_30, res_0_60, pathways):
#     write_results(res1, f"gnn_enrichr_results_all_vs_all/{label}_{pathway}_0_12.tsv")
#     write_results(res2, f"gnn_enrichr_results_all_vs_all/{label}_{pathway}_0_30.tsv")
#     write_results(res3, f"gnn_enrichr_results_all_vs_all/{label}_{pathway}_0_60.tsv")

# fig, ax = plt.subplots(3, 1, figsize=(10, 20))
# plot_count = 0
# for i, (df, pathway) in enumerate(zip(comparison_dfs, pathways)):

#     if len(df) != 0:
#         df = df.loc[df.iloc[:,[0,1,2]].mean(axis=1).sort_values(ascending=True).index]
#         df = df.iloc[-20:, :]
#         plot_count += 1
#         b = df.plot.barh(x='Term', ax=ax[i], color=['#A63446', "#F5B841", '#9DBBAE'])

#         b.axvline(x=-np.log10(0.05), color='black', linestyle='--', alpha=0.6, linewidth=3)
#         b.set_xlabel(f'-log$_{{10}}$(Adjusted P-value)', fontsize=20)
#         b.set_ylabel(f'{pathway} Term', fontsize=20)

#     if plot_count != 0:
#         fig.suptitle(f"{label}", fontsize=30)
#         fig.set_tight_layout(True)
#         fig.savefig(f"gnn_enrichr_results_all_vs_all/{label}_pathways.svg")
#         fig.savefig(f"gnn_enrichr_results_all_vs_all/{label}_pathways.png", dpi=300, facecolor="white", edgecolor='none')

#         fig.clf()

### GNN plot all vs all / Uniq Common

In [None]:
# Enrichment for two pooled region sets: entry 0 of comparison_dfs_all_vs_all
# pools the first three result files ("changed_regions"), entry 1 uses the
# last file ("unchanged_regions").
comp_files = ["t0_t12_results_0_0", "t0_t12_results_0_1", "t0_t12_results_1_0", "t0_t12_results_1_1"]
region_groups = [
    ("changed_regions", comp_files[:3]),   # combine comp1, comp2, comp3
    ("unchanged_regions", comp_files[3:]),
]

comparison_dfs_all_vs_all = []
for label, group_files in region_groups:
    regions = pd.concat([pd.read_csv(f"/home/carlos/Desktop/manuscripts/notebooks/gnn/{comp}.tsv", sep="\t") for comp in group_files])

    degs_0_12_degs, _ = overlapper(degs_0_12, regions, genes)
    degs_0_30_degs, _ = overlapper(degs_0_30, regions, genes)
    degs_0_60_degs, _ = overlapper(degs_0_60, regions, genes)

    res_0_12 = enrichrrr(degs_0_12_degs, pathways, universe_3d_names)
    res_0_30 = enrichrrr(degs_0_30_degs, pathways, universe_3d_names)
    res_0_60 = enrichrrr(degs_0_60_degs, pathways, universe_3d_names)

    comparison_dfs = merge_enrs_into_common_df_2(
        [res_0_12, res_0_30, res_0_60], ["0_12", "0_30", "0_60"]
    )
    comparison_dfs_all_vs_all.append(comparison_dfs)


In [None]:
# Y-axis labels, looked up by library position when the plots are drawn.
database_names = [
    "Gene Ontology\nBiological Process",
    "MSigDB\nHallmark",
    "NCI-Nature\nPID",
]

In [24]:
# Changed regions uniq terms: terms present for the changed regions but not
# for the unchanged ones, one figure per library.
for idx, (changed, notChanged) in enumerate(zip(comparison_dfs_all_vs_all[0], comparison_dfs_all_vs_all[1])):
    common_terms = set(changed.index) & set(notChanged.index)
    df = changed.loc[~changed.index.isin(common_terms)]
    if df.empty:
        continue  # no unique term: do not create a figure

    # Rank by the best -log10(adj. P) across contrasts and keep the top 15.
    df = df.loc[df.iloc[:, [0, 1, 2]].max(axis=1).sort_values(ascending=True).index]
    df = df.iloc[-15:, :]
    df.columns = ['12min', '30min', '60min', 'Term']

    fig, ax = plt.subplots(1, 1, figsize=(10, 8))
    b = df.plot.barh(x='Term', ax=ax, color=['#A63446', "#F5B841", '#9DBBAE'])
    # Significance threshold (adjusted P = 0.05).
    b.axvline(x=-np.log10(0.05), color='black', linestyle='--', alpha=.9, linewidth=3)

    b.set_xlabel(f'-log$_{{10}}$ Adjusted P-value', fontsize=12)
    b.set_ylabel(database_names[idx], fontsize=20)
    b.tick_params(axis='both', which='major', labelsize=10)
    b.tick_params(axis='both', which='minor', labelsize=10)

    db_name = database_names[idx].replace("\n", "_")
    fig.set_tight_layout(True)
    fig.savefig(f"gnn_enrichr_results_all_vs_all_uc/{db_name}_pathways_changed_uniq_terms.svg")
    fig.savefig(f"gnn_enrichr_results_all_vs_all_uc/{db_name}_pathways_changed_uniq_terms.png", dpi=300, facecolor="white", edgecolor='none')
    # Close instead of fig.clf() so the figure is released by pyplot.
    plt.close(fig)

<Figure size 720x576 with 0 Axes>

<Figure size 720x576 with 0 Axes>

<Figure size 720x576 with 0 Axes>

In [22]:
# Unchanged regions uniq terms: terms present for the unchanged regions but
# not for the changed ones, one figure per library.
for idx, (changed, notChanged) in enumerate(zip(comparison_dfs_all_vs_all[0], comparison_dfs_all_vs_all[1])):
    common_terms = set(changed.index) & set(notChanged.index)
    df = notChanged.loc[~notChanged.index.isin(common_terms)]
    if df.empty:
        continue  # no unique term: do not create a figure

    # Rank by the best -log10(adj. P) across contrasts and keep the top 15.
    df = df.loc[df.iloc[:, [0, 1, 2]].max(axis=1).sort_values(ascending=True).index]
    df = df.iloc[-15:, :]
    df.columns = ['12min', '30min', '60min', 'Term']

    fig, ax = plt.subplots(1, 1, figsize=(10, 8))
    b = df.plot.barh(x='Term', ax=ax, color=['#A63446', "#F5B841", '#9DBBAE'])
    # Significance threshold (adjusted P = 0.05).
    b.axvline(x=-np.log10(0.05), color='black', linestyle='--', alpha=.9, linewidth=3)

    b.set_xlabel(f'-log$_{{10}}$ Adjusted P-value', fontsize=12)
    b.set_ylabel(database_names[idx], fontsize=20)
    b.tick_params(axis='both', which='major', labelsize=10)
    b.tick_params(axis='both', which='minor', labelsize=10)

    db_name = database_names[idx].replace("\n", "_")
    fig.set_tight_layout(True)
    fig.savefig(f"gnn_enrichr_results_all_vs_all_uc/{db_name}_pathways_unchanged_uniq_terms.svg")
    fig.savefig(f"gnn_enrichr_results_all_vs_all_uc/{db_name}_pathways_unchanged_uniq_terms.png", dpi=300, facecolor="white", edgecolor='none')
    # Close instead of fig.clf() so the figure is released by pyplot.
    plt.close(fig)

<Figure size 720x576 with 0 Axes>

<Figure size 720x576 with 0 Axes>

<Figure size 720x576 with 0 Axes>

In [23]:
# Changed regions common terms: terms found for both region sets, plotted with
# the values obtained for the changed regions; one figure per library.
for idx, (changed, notChanged) in enumerate(zip(comparison_dfs_all_vs_all[0], comparison_dfs_all_vs_all[1])):
    common_terms = set(changed.index) & set(notChanged.index)
    df = changed.loc[changed.index.isin(common_terms)]
    if df.empty:
        continue  # no shared term: do not create a figure

    # Rank by the best -log10(adj. P) across contrasts and keep the top 15.
    df = df.loc[df.iloc[:, [0, 1, 2]].max(axis=1).sort_values(ascending=True).index]
    df = df.iloc[-15:, :]
    df.columns = ['12min', '30min', '60min', 'Term']

    fig, ax = plt.subplots(1, 1, figsize=(10, 8))
    b = df.plot.barh(x='Term', ax=ax, color=['#A63446', "#F5B841", '#9DBBAE'])
    # Significance threshold (adjusted P = 0.05).
    b.axvline(x=-np.log10(0.05), color='black', linestyle='--', alpha=.9, linewidth=3)

    b.set_xlabel(f'-log$_{{10}}$ Adjusted P-value', fontsize=12)
    b.set_ylabel(database_names[idx], fontsize=20)
    b.tick_params(axis='both', which='major', labelsize=12)
    b.tick_params(axis='both', which='minor', labelsize=12)

    db_name = database_names[idx].replace("\n", "_")

    fig.set_tight_layout(True)
    fig.savefig(f"gnn_enrichr_results_all_vs_all_uc/{db_name}_pathways_changed_common_terms.svg")
    fig.savefig(f"gnn_enrichr_results_all_vs_all_uc/{db_name}_pathways_changed_common_terms.png", dpi=300, facecolor="white", edgecolor='none')
    # Close instead of fig.clf() so the figure is released by pyplot.
    plt.close(fig)

<Figure size 720x576 with 0 Axes>

<Figure size 720x576 with 0 Axes>

<Figure size 720x576 with 0 Axes>

In [None]:
# Not-changed regions common terms: terms found for both region sets, plotted
# with the values obtained for the unchanged regions; one figure per library.
for idx, (changed, notChanged) in enumerate(zip(comparison_dfs_all_vs_all[0], comparison_dfs_all_vs_all[1])):
    common_terms = set(changed.index) & set(notChanged.index)
    df = notChanged.loc[notChanged.index.isin(common_terms)]
    if df.empty:
        continue  # no shared term: do not create a figure

    # Rank by the best -log10(adj. P) across contrasts and keep the top 15.
    df = df.loc[df.iloc[:, [0, 1, 2]].max(axis=1).sort_values(ascending=True).index]
    df = df.iloc[-15:, :]
    df.columns = ['12min', '30min', '60min', 'Term']

    fig, ax = plt.subplots(1, 1, figsize=(10, 8))
    b = df.plot.barh(x='Term', ax=ax, color=['#A63446', "#F5B841", '#9DBBAE'])
    # Significance threshold (adjusted P = 0.05).
    b.axvline(x=-np.log10(0.05), color='black', linestyle='--', alpha=.9, linewidth=3)

    b.set_xlabel(f'-log$_{{10}}$ Adjusted P-value', fontsize=12)
    b.set_ylabel(database_names[idx], fontsize=20)
    b.tick_params(axis='both', which='major', labelsize=12)
    b.tick_params(axis='both', which='minor', labelsize=12)
    db_name = database_names[idx].replace("\n", "_")

    fig.set_tight_layout(True)
    fig.savefig(f"gnn_enrichr_results_all_vs_all_uc/{db_name}_pathways_unchanged_common_terms.svg")
    fig.savefig(f"gnn_enrichr_results_all_vs_all_uc/{db_name}_pathways_unchanged_common_terms.png", dpi=300, facecolor="white", edgecolor='none')
    # Close instead of fig.clf() so the figure is released by pyplot.
    plt.close(fig)
    

### GNN res, expression profile

In [None]:
def map_geneID_to_name(geneID):
    """Return the gene name of the first `genes` row whose gene_id equals
    ``geneID``, or None when no row matches (reads the module-level `genes`)."""
    matches = genes.loc[genes.gene_id == geneID, 'external_gene_name']
    if matches.empty:
        return None
    return matches.values[0]

def map_txID_to_name(txID):
    """Return the gene name of the first `genes` row whose tx_id equals
    ``txID``, or None when no row matches (reads the module-level `genes`)."""
    matches = genes.loc[genes.tx_id == txID, 'external_gene_name']
    if matches.empty:
        return None
    return matches.values[0]

In [None]:
# Sample order used for the expression table (three samples per group).
order = [
    'SU_100', 'SU_200', 'SU_300',
    'SU_112', 'SU_212', 'SU_312',
    'SU_130', 'SU_230', 'SU_330',
    'SU_160', 'SU_260', 'SU_360',
]

# Use the first sample to resolve a gene name for every retained transcript.
df = pd.read_csv("/home/carlos/Desktop/projects/rna-seq/quant/SU_100/quant.sf", sep="\t")
df.Name = df.Name.map(lambda name: name.split(".")[0])  # text before the first "."
df = df.loc[df.Name.isin(genes.tx_id.values)]
mapped_names = [map_txID_to_name(txID) for txID in df.Name.values]

In [None]:
# Collect the TPM column of every sample (restricted to transcripts present in
# `genes`) into one table with a column per sample.
series = []
for name in order:
    df = pd.read_csv(f"/home/carlos/Desktop/projects/rna-seq/quant/{name}/quant.sf", sep="\t")
    df.Name = df.Name.map(lambda tx: tx.split(".")[0])
    df = df.loc[df.Name.isin(genes.tx_id.values)]
    df = df.rename(columns={'TPM': name})
    series.append(df[name])

tcounts_df = pd.DataFrame(series).T
tcounts_df['geneName'] = mapped_names

# Average each run of three consecutive sample columns into one group column.
for group_idx, name in enumerate(["Control", "12min", "30min", "60min"]):
    i = 3 * group_idx
    tcounts_df[name] = tcounts_df.iloc[:, i : i + 3].mean(axis=1)

In [None]:
# Alternative gene panel:
# which_Genes = [
#     "ATF2", "ATF3", "ATF4",
#     "JUN", "JUNB", "JUND",
#     "FOS", "FOSL1", "FOSL2", "FOSB",
#     "MAF", "MAFB",
#     "TP53"]

which_Genes = "SURF1;POLH;GTF2B;ADCY6;BRF2;PRIM1;DGUOK;RNMT;SEC61A1;ZWINT;POLD1;RBX1;CDA;NELFE;RFC4".split(";")

# Long-format table: one row per (gene, group) holding the group mean taken
# from the first matching row of tcounts_df.
plot_df = pd.DataFrame(
    [
        {
            "geneName": gene_oi,
            "Mean": tcounts_df.loc[tcounts_df.geneName == gene_oi, name].values[0],
            "time": name,
        }
        for gene_oi in which_Genes
        for name in ["Control", "12min", "30min", "60min"]
    ],
    columns=["geneName", "Mean", "time"],
)

# One bar-plot panel per gene, side by side.
fig, ax = plt.subplots(figsize=(20, 10), ncols=len(which_Genes))
for i, gene_oi in enumerate(which_Genes):
    panel = ax[i]
    sns.barplot(x="time", y="Mean", data=plot_df.loc[plot_df.geneName == gene_oi], ax=panel)
    panel.set_title(gene_oi)
    panel.set_ylabel("Mean TPM")
    panel.set_xlabel("Time")
    panel.set_xticklabels(panel.get_xticklabels(), rotation=45, horizontalalignment='right')

## Loop anchors

In [None]:
# anchor lost at JUN

In [None]:
# Universe of loop anchors: unique anchors over all four loop files, and the
# genes that overlap them.
comp_files = ["t0_t12_common_loops1", "t0_t12_common_loops2", "t0_t12_specific_loops1", "t0_t12_specific_loops2"]

anchor_universe = pd.concat(
    [get_anchors(f"/home/carlos/Desktop/manuscripts/notebooks/loops/loops_anchors_data/{comp}.tsv") for comp in comp_files]
)
anchor_universe = anchor_universe.drop_duplicates().reset_index(drop=True)
anchor_universe_df = bioframe.overlap(anchor_universe, genes, how='inner')
anchor_universe_ids = list(anchor_universe_df['gene_id_'].dropna().unique())
anchor_universe_names = list(anchor_universe_df['external_gene_name_'].dropna().unique())

In [None]:

# Per loop file: enrichment of the DEGs whose TSS overlaps a loop anchor,
# written to TSV and drawn as a three-panel figure (one panel per library).
labels = ["common_loops1", "common_loops2", "specific_loops1", "specific_loops2"]
filter_w_unibind = False   # restrict anchors to those overlapping UniBind regions

for label, comp in zip(labels, comp_files):
    regions = get_anchors(f"/home/carlos/Desktop/manuscripts/notebooks/loops/loops_anchors_data/{comp}.tsv")
    if filter_w_unibind:
        unibind_regions = pd.read_csv(f"/home/carlos/Desktop/manuscripts/notebooks/unibind/anchor_res/t0_t12_2_1/extracted_regions_merged.bed", sep="\t", header=None).iloc[:, :3]
        unibind_regions.columns = ['chrom', 'start', 'end']
        unibind_regions.start = unibind_regions.start.astype(int)
        unibind_regions.end = unibind_regions.end.astype(int)
        regions = bioframe.overlap(regions, unibind_regions, how='inner')

    # overlapper returns (gene names, overlap table); only the names are the
    # gene list. Passing the whole tuple to enrichrrr was a bug.
    degs_0_12_degs, _ = overlapper(degs_0_12, regions, genes)
    degs_0_30_degs, _ = overlapper(degs_0_30, regions, genes)
    degs_0_60_degs, _ = overlapper(degs_0_60, regions, genes)

    res_0_12 = enrichrrr(degs_0_12_degs, pathways)#, anchor_universe_names)
    res_0_30 = enrichrrr(degs_0_30_degs, pathways)#, anchor_universe_names)
    res_0_60 = enrichrrr(degs_0_60_degs, pathways)#, anchor_universe_names)

    comparison_dfs = merge_enrs_into_common_df_2([res_0_12, res_0_30, res_0_60], ["0_12", "0_30", "0_60"])

    for res1, res2, res3, pathway in zip(res_0_12, res_0_30, res_0_60, pathways):
        write_results(res1, f"loops_enrichr_results/{label}_{pathway}_0_12.tsv")
        write_results(res2, f"loops_enrichr_results/{label}_{pathway}_0_30.tsv")
        write_results(res3, f"loops_enrichr_results/{label}_{pathway}_0_60.tsv")

    fig, ax = plt.subplots(3, 1, figsize=(10, 20))
    plot_count = 0
    for i, (df, pathway) in enumerate(zip(comparison_dfs, pathways)):
        print(len(df))
        if len(df) != 0:
            plot_count += 1
            b = df.plot.barh(x='Term', ax=ax[i], color=['#A63446', "#F5B841", '#9DBBAE'])
            # Significance threshold (adjusted P = 0.05).
            b.axvline(x=-np.log10(0.05), color='black', linestyle='--', alpha=0.6, linewidth=3)
            b.set_xlabel(f'-log$_{{10}}$(Adjusted P-value)', fontsize=20)
            b.set_ylabel(f'{pathway} Term', fontsize=20)
    # Save only when at least one panel was drawn.
    if plot_count != 0:
        fig.suptitle(f"{label}", fontsize=30)
        fig.set_tight_layout(True)
        fig.savefig(f"loops_enrichr_results/{label}_pathways.svg")
        fig.savefig(f"loops_enrichr_results/{label}_pathways.png", dpi=300, facecolor="white", edgecolor='none')

    # Close instead of fig.clf() so the figure is released by pyplot.
    plt.close(fig)

## Loop anchors all vs all

In [None]:
# For every pair of consecutive timepoints, compare the enrichment obtained
# from two anchor sets and plot the terms unique to each side.
# Example for t0 vs t12: "left" = common + specific_loops1 anchors,
# "right" = common + specific_loops2 anchors.
labels = ["common_loops", "specific_loops1", "specific_loops2"]
database_names = ["Gene Ontology\nBiological Process", "MSigDB\nHallmark", "NCI-Nature\nPID"]

for comp_name in ["t0_t12", "t12_t30", "t30_t60"]:#, "t0_t60", "t0_t30", "t12_t60"]:
    comparison_dfs_all_vs_all_loops = []

    # Enrichment for the left set (files 0 and 1) and the right set (files 0 and 2).
    for side, label_idx in (("left", [0, 1]), ("right", [0, 2])):
        regions = pd.concat([get_anchors(f"/home/carlos/Desktop/manuscripts/notebooks/loops/loops_anchors_data_mc_True_AR_0/{comp_name}_{labels[i]}.tsv") for i in label_idx]).drop_duplicates().reset_index(drop=True)

        degs_0_12_degs, _ = overlapper(degs_0_12, regions, genes)
        degs_0_30_degs, _ = overlapper(degs_0_30, regions, genes)
        degs_0_60_degs, _ = overlapper(degs_0_60, regions, genes)

        res_0_12 = enrichrrr(degs_0_12_degs, pathways)
        res_0_30 = enrichrrr(degs_0_30_degs, pathways)
        res_0_60 = enrichrrr(degs_0_60_degs, pathways)
        for res1, res2, res3, pathway in zip(res_0_12, res_0_30, res_0_60, pathways):
            write_results(res1, f"loops_enrichr_results/{pathway}_LOOP_{comp_name}_{side}_DEG_0_12.tsv")
            write_results(res2, f"loops_enrichr_results/{pathway}_LOOP_{comp_name}_{side}_DEG_0_30.tsv")
            write_results(res3, f"loops_enrichr_results/{pathway}_LOOP_{comp_name}_{side}_DEG_0_60.tsv")

        comparison_dfs = merge_enrs_into_common_df_2([res_0_12, res_0_30, res_0_60], ["0_12", "0_30", "0_60"])
        comparison_dfs_all_vs_all_loops.append(comparison_dfs)

    # One figure per library and side, showing the terms unique to that side.
    for idx, (left, right) in enumerate(zip(comparison_dfs_all_vs_all_loops[0], comparison_dfs_all_vs_all_loops[1])):
        common_terms = set(left.index) & set(right.index)
        for side, side_df in (("left", left), ("right", right)):
            df = side_df.loc[~side_df.index.isin(common_terms)]
            if df.empty:
                continue  # no unique term: do not create a figure

            # Rank by the best -log10(adj. P) across contrasts; keep the top 15.
            df = df.loc[df.iloc[:, [0, 1, 2]].max(axis=1).sort_values(ascending=True).index]
            df = df.iloc[-15:, :]
            df.columns = ['12min', '30min', '60min', 'Term']

            fig, ax = plt.subplots(1, 1, figsize=(10, 8))
            b = df.plot.barh(x='Term', ax=ax, color=['#A63446', "#F5B841", '#9DBBAE'])
            # Significance threshold (adjusted P = 0.05).
            b.axvline(x=-np.log10(0.05), color='black', linestyle='--', alpha=.9, linewidth=3)

            b.set_xlabel(f'-log$_{{10}}$ Adjusted P-value', fontsize=12)
            b.set_ylabel(database_names[idx], fontsize=20)
            b.tick_params(axis='both', which='major', labelsize=10)
            b.tick_params(axis='both', which='minor', labelsize=10)

            db_name = database_names[idx].replace("\n", "_")
            fig.set_tight_layout(True)
            #fig.savefig(f"loops_enrichr_results/{db_name}_pathways_{side}_uniq_terms_{comp_name}.svg")
            fig.savefig(f"loops_enrichr_results/{db_name}_pathways_{side}_uniq_terms_{comp_name}.png", dpi=300, facecolor="white", edgecolor='none')
            # Close instead of fig.clf() so the figure is released by pyplot.
            plt.close(fig)

## draw graphs

In [None]:
def annotate_loops(loop_file_path, all_genes, annotate="external_gene_name", flank=10_000):
    """Annotate every loop with the genes found near either of its anchors.

    A gene is reduced to a single position (its ``start`` on the "+" strand,
    its ``end`` otherwise) and counts as a hit when that position overlaps an
    anchor widened by ``flank`` on both sides.

    Parameters
    ----------
    loop_file_path : str
        Tab-separated loop table. The first six columns are read positionally
        as the left anchor (columns 0-2) and the right anchor (columns 3-5),
        each in chrom/start/end order.
    all_genes : pandas.DataFrame
        Gene table with ``chrom``, ``start``, ``end``, ``strand`` and the
        column named by ``annotate``. It is not modified.
    annotate : str
        Name of the gene column to collect; also the name of the column
        added to the returned table.
    flank : int
        Number of base pairs added on each side of every anchor.

    Returns
    -------
    pandas.DataFrame
        The loop table with one extra column named ``annotate`` holding the
        ";"-joined hits (left-anchor hits first, then right-anchor hits,
        duplicates kept) or "!!!" when no gene was found.
    """
    loop_df = pd.read_csv(loop_file_path, sep="\t")

    # Collapse each gene to one strand-aware position.
    strand_oriented_genes = all_genes.copy()
    gene_position = np.where(all_genes["strand"] == "+", all_genes["start"], all_genes["end"])
    strand_oriented_genes["start"] = gene_position
    strand_oriented_genes["end"] = gene_position

    # bioframe.overlap suffixes the columns of the second frame with "_".
    gene_col = f"{annotate}_"

    hits = []
    for anchor_cols in ([0, 1, 2], [3, 4, 5]):
        anchors = loop_df.iloc[:, anchor_cols].copy()
        anchors.columns = ["chrom", "start", "end"]
        # Remember which loop each anchor came from.
        anchors["indx"] = loop_df.index
        anchors["start"] = anchors["start"] - flank
        anchors["end"] = anchors["end"] + flank

        # how="left" keeps anchors without a gene as NaN rows; dropna removes them.
        overlaps = bioframe.overlap(anchors, strand_oriented_genes, how="left").dropna()
        hits.append(overlaps[["indx", gene_col]])

    hits = pd.concat(hits, ignore_index=True)

    if hits.empty:
        joined = pd.Series(dtype=object)
    else:
        hits["indx"] = hits["indx"].astype(int)
        # groupby keeps the row order inside each group, so left-anchor hits
        # stay ahead of right-anchor hits.
        joined = hits.groupby("indx", sort=False)[gene_col].agg(
            lambda names: ";".join(map(str, names))
        )

    labels = loop_df.index.to_series().map(joined).fillna("")
    # "!!!" is the sentinel downstream cells use to drop unannotated loops.
    loop_df[annotate] = labels.where(labels != "", "!!!")

    return loop_df

In [None]:
# Annotate the labelled Venn loop table with the gene names found within
# 10 kb of either anchor; loops without any gene get the "!!!" sentinel.
venn_df_annotated = annotate_loops("/home/carlos/Desktop/manuscripts/notebooks/loops/venn_df_Labeled.tsv", genes, annotate="external_gene_name", flank=10_000)

In [None]:
# Keep only loops that received at least one gene annotation ("!!!" marks none).
venn_df_annotated = venn_df_annotated.loc[venn_df_annotated.external_gene_name != "!!!"]
deseq_lrt_sorted = deseq_lrt.sort_values(by='padj').external_gene_name.values

# gene name -> padj of the gene's first row in deseq_lrt. A dict lookup replaces
# the per-gene array scan and boolean-mask selection while returning the same value.
first_rows = deseq_lrt.drop_duplicates(subset="external_gene_name")
padj_by_gene = dict(zip(first_rows.external_gene_name, first_rows.padj))

# prev collects one (gene, padj, label, coords) tuple per annotated gene that is
# also present in deseq_lrt; coords is (chrom1, start1, end1, chrom2, start2, end2).
prev = []
all_common = []

for _, row in venn_df_annotated.iterrows():
    coords = (row.chrom1, row.start1, row.end1, row.chrom2, row.start2, row.end2)
    for gene in row.external_gene_name.split(";"):
        if gene in padj_by_gene:
            prev.append((gene, padj_by_gene[gene], row['label'], coords))

In [None]:
# Turn the (gene, padj, label, coords) tuples in prev into a table with one row
# per gene/loop pair.
coord_cols = ["chrom1", "start1", "end1", "chrom2", "start2", "end2"]
column_names = ["gene", "padj", "label", *coord_cols]

records = [(gene, padj, label, *coords) for gene, padj, label, coords in prev]
# Transpose the records into per-column lists; an empty prev still yields all columns.
column_values = zip(*records) if records else ([] for _ in column_names)
prev_df = pd.DataFrame({name: list(values) for name, values in zip(column_names, column_values)})
prev_df.sort_values(by=["chrom1", "start1", "start2"], inplace=True)

# Drop every row whose loop coordinates also occur with the
# "1,2,3,4 intersection" label, whatever the row's own label is.
# A set of coordinate tuples replaces the rebuild-the-frame-per-row filter.
filter_coords_df = prev_df.loc[prev_df.label == '1,2,3,4 intersection']
shared_coords = set(filter_coords_df[coord_cols].itertuples(index=False, name=None))
keep = [
    coords not in shared_coords
    for coords in prev_df[coord_cols].itertuples(index=False, name=None)
]
prev_df = prev_df.loc[keep]

prev_df = prev_df.sort_values(by="padj")

topN = 20

# First topN distinct genes in order of increasing padj.
topGenes = prev_df.gene.unique()[:topN]

prev_df = prev_df.loc[prev_df.gene.isin(topGenes)]
prev_df

In [None]:
def parse_label(gene_df):
    """Parse the ``label`` column into (row position, list of ints) pairs.

    Each label is expected to start with a comma-separated list of integers
    followed by a space (e.g. "1,2 intersection"); only that leading list is
    parsed. The position is the row's 0-based position in ``gene_df``.
    """
    parsed = []
    for position, label in enumerate(gene_df.label.to_list()):
        members = label.split(" ")[0].split(",")
        parsed.append((position, [int(member) for member in members]))
    return parsed

def is_gene_in_coords(coords_df, all_genes):
    """Return the genes whose strand-aware position overlaps ``coords_df``.

    Every gene in ``all_genes`` is collapsed to a single position (``start``
    on the "+" strand, ``end`` otherwise) before overlapping.

    Parameters
    ----------
    coords_df : pandas.DataFrame
        Intervals with ``chrom``, ``start`` and ``end`` columns.
    all_genes : pandas.DataFrame
        Gene table with ``chrom``, ``start``, ``end`` and ``strand`` columns.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        Left-join overlap of ``coords_df`` with the collapsed genes, with rows
        containing any NaN removed, so intervals without a gene are dropped.
        Gene columns carry bioframe's "_" suffix.
    """
    strand_oriented_genes = all_genes.copy()

    # Vectorised instead of a row-wise apply: this function runs once per
    # graph node, each time over the whole gene table.
    gene_position = np.where(all_genes['strand'] == "+", all_genes['start'], all_genes['end'])
    strand_oriented_genes['start'] = gene_position
    strand_oriented_genes['end'] = gene_position

    return bioframe.overlap(coords_df, strand_oriented_genes, how='left').dropna()

import networkx as nx

# Build one undirected graph per gene in prev_df: nodes are anchor start
# coordinates, edges are loops, and each edge carries the list of timepoints
# parsed from the loop's label. Results are collected as (geneName, G) tuples.
graphs = []

for i, gene_df in prev_df.groupby('gene').__iter__():
    # Parallel lists describing every loop of this gene.
    anchors = {
        "coords": [],
        "timepoints": [],
        "pairs": [],
    }

    for r_idx, row in gene_df.iterrows():
        # chromName / geneName are reused after this loop and hold the values
        # of the last row of the group.
        # NOTE(review): this assumes all loops of a gene sit on one chromosome
        # (only chrom1 is read) - confirm.
        chromName = row.chrom1
        geneName = row.gene
        anchor1_coord = row.start1
        anchor2_coord = row.start2
        anchors["coords"].append(anchor1_coord)
        # Leading "1,2,..." part of the label -> list of ints.
        timepoint = [int(tp) for tp in row.label.split(" ")[0].split(",")]
        anchors["timepoints"].append(timepoint)
        anchors["pairs"].append(anchor2_coord)
    
    G = nx.Graph()
    for coord, timepoint, pair in zip(anchors["coords"], anchors["timepoints"], anchors["pairs"]):
        # NOTE(review): a repeated (coord, pair) replaces the earlier edge's
        # timepoint attribute rather than extending it.
        G.add_edge(coord, pair, timepoint=timepoint)

    tolerance = 10_000

    # Colour a node red when the gene's strand-aware position falls inside
    # [node - tolerance, node + 2 * tolerance], blue otherwise.
    # NOTE(review): the asymmetric window presumably covers a 10 kb bin plus a
    # 10 kb flank on each side - confirm.
    for node in G.nodes:
        coord_df = pd.DataFrame({"chrom":[chromName], "start": [node-tolerance], "end": [node+(tolerance*2)]})
        coord_df.start = coord_df.start.astype(int)
        coord_df.end = coord_df.end.astype(int)
        ov = is_gene_in_coords(coord_df, genes)

        if len(ov) != 0:
            if geneName in ov.external_gene_name_.values:
                G.nodes[node]['color'] = "red"
            else:
                G.nodes[node]['color'] = "blue"
        else:
            G.nodes[node]['color'] = "blue"

    import itertools

    nodes_to_remove = []

    # Merge nodes lying within `tolerance` of each other: the later node of a
    # pair hands its edges to the earlier one and is removed afterwards.
    # NOTE(review): node2 stays in G until the removal loop below, and there is
    # no check that node1 is not already in nodes_to_remove. Edges copied onto
    # a node that is itself removed later are therefore dropped with it.
    # NOTE(review): the kept node keeps its own colour; a "red" node2 does not
    # pass its colour on.
    for node1, node2 in itertools.combinations(G.nodes, 2):

        # If the absolute difference between the node coordinates is less than or equal to the tolerance
        if abs(node1 - node2) <= tolerance:
            # Merge the nodes by adding the edges of the second node to the first node
            for neighbor, data in G[node2].items():
                if G.has_edge(node1, neighbor):
                    # If the first node already has an edge to the neighbor, add the timepoints of the second node's edge to the first node's edge
                    # (+= extends the existing list in place, so timepoints may repeat)
                    G[node1][neighbor]['timepoint'] += data['timepoint']
                else:
                    # Otherwise, add the edge from the second node to the first node
                    # (the new edge stores the same list object as node2's edge)
                    G.add_edge(node1, neighbor, timepoint=data['timepoint'])
            
            # Add the second node to the list of nodes to be removed
            nodes_to_remove.append(node2)

    # Remove the nodes from the graph
    for node in nodes_to_remove:
        # A node can be listed more than once; skip it once it is gone.
        if node in G:
            G.remove_node(node)

    graphs.append((geneName, G))

In [None]:
# Draw one network figure per gene and save it as <geneName>_graph.png.
for geneName, G in graphs:
    timepoint_data = [set(data['timepoint']) for _, _, data in G.edges(data=True)]
    # Skip graphs that consist of a single edge present at all four timepoints.
    if timepoint_data == [{1, 2, 3, 4}]:
        continue

    fig, ax = plt.subplots(figsize=(20, 20))
    pos = nx.spring_layout(G, k=0.5, iterations=100)

    node_colors = [G.nodes[node]['color'] for node in G.nodes]

    # nx.draw does not draw labels unless asked to, so font settings are only
    # passed to the label calls below. Every call targets `ax` explicitly
    # instead of relying on the current pyplot axes.
    nx.draw(G, pos, ax=ax, node_size=1000, node_color=node_colors, edge_color='black', width=1)
    nx.draw_networkx_labels(G, pos, ax=ax, font_size=20, font_color='green')
    nx.draw_networkx_edge_labels(G, pos, ax=ax, edge_labels=nx.get_edge_attributes(G, 'timepoint'), font_size=18, font_color='red')
    fig.savefig(f"/home/carlos/Desktop/manuscripts/notebooks/loops/graphs/{geneName}_graph.png", dpi=300, facecolor="white", edgecolor='none')
    # Close rather than clear: a cleared figure stays registered with pyplot,
    # so one large figure per gene would otherwise pile up in memory.
    plt.close(fig)

In [None]:
def _first_value_by_gene(frame, value_col):
    """Map external_gene_name -> value_col, taken from each gene's first row in frame."""
    first_rows = frame.drop_duplicates(subset="external_gene_name")
    return dict(zip(first_rows["external_gene_name"], first_rows[value_col]))


# Two-panel figure sharing the gene axis: -log10 adjusted p-value on the left,
# log2 fold change at 12/30/60 min on the right.
fig, ax = plt.subplots(figsize=(12, 8), ncols=2, nrows=1, sharey="row",
                       gridspec_kw={"wspace": 0.1, 'width_ratios': [3, 4]})

topN_now = 20

deseq_lrt_intersect_topN = deseq_lrt.loc[deseq_lrt.external_gene_name.isin(topGenes)].sort_values(by='padj').head(topN_now)

top_gene_ids = deseq_lrt_intersect_topN.ensembl_gene_id.values
fc_12 = degs_0_12.loc[degs_0_12.ensembl_gene_id.isin(top_gene_ids)]
fc_30 = degs_0_30.loc[degs_0_30.ensembl_gene_id.isin(top_gene_ids)]
fc_60 = degs_0_60.loc[degs_0_60.ensembl_gene_id.isin(top_gene_ids)]

# One lookup per time point; genes missing from a table are plotted with fold change 0.
fold_change_lookups = [
    ("12min", _first_value_by_gene(fc_12, "log2FoldChange")),
    ("30min", _first_value_by_gene(fc_30, "log2FoldChange")),
    ("60min", _first_value_by_gene(fc_60, "log2FoldChange")),
]
padj_lookup = _first_value_by_gene(deseq_lrt_intersect_topN, "padj")

plot_dict_FC = {
    "geneName": [],
    "Fold Change": [],
    "Time Point": []
}

plot_dict_padj = {
    "geneName": [],
    "padj": [],
}

for gene in deseq_lrt_intersect_topN.external_gene_name.values:
    for time_point, fold_changes in fold_change_lookups:
        plot_dict_FC["geneName"].append(gene)
        plot_dict_FC["Fold Change"].append(fold_changes.get(gene, 0))
        plot_dict_FC["Time Point"].append(time_point)

    padj = padj_lookup[gene]
    # A padj of exactly 0 would give an infinite bar; cap it at 308, about the
    # largest -log10 a double-precision p-value can reach.
    minus_log10_padj = 308 if padj == 0 else -np.log10(padj)
    plot_dict_padj["geneName"].append(gene)
    plot_dict_padj["padj"].append(minus_log10_padj)


plot_df_FC = pd.DataFrame(plot_dict_FC)
plot_df_padj = pd.DataFrame(plot_dict_padj)

# A single colour is passed as `color`; `palette` is meant to be used together with `hue`.
b1 = sns.barplot(x="padj", y="geneName", data=plot_df_padj, ax=ax[0], color="#465775")
b1.set_ylabel(f"Top {topN_now} Genes", fontsize=12)
b1.set_xlabel("-log$_{10}$ Adjusted p-value", fontsize=12)

b2 = sns.barplot(x="Fold Change", y="geneName", hue="Time Point", data=plot_df_FC, ax=ax[1], palette=['#A63446', "#F5B841", '#9DBBAE'])
# The gene axis is shared with the left panel, so the right panel needs no y label.
b2.set_ylabel("")
b2.set_xlabel("log$_{2}$ Fold Change", fontsize=12)
b2.axvline(x=0, color='black', linestyle='-', lw=0.5)

for panel in (b1, b2):
    # Rotate through tick_params instead of set_xticklabels(get_xticklabels()):
    # the latter freezes the label texts before the final layout is known.
    panel.tick_params(axis='x', labelrotation=45)
    plt.setp(panel.get_xticklabels(), ha='right')
    panel.tick_params(axis='both', which='major', labelsize=12)
    panel.tick_params(axis='both', which='minor', labelsize=12)

# One major tick per gene, labelled with the gene name.
gene_names = plot_df_FC['geneName'].unique()
num_bars = len(gene_names)
y_ticks = np.arange(num_bars)

for panel in ax:
    panel.set_yticks(y_ticks)
    panel.set_yticklabels(gene_names)
    # Minor ticks halfway between genes carry the separator grid lines.
    panel.set_yticks(y_ticks + 0.5, minor=True)
    panel.grid(axis='y', which='minor', linestyle='-', alpha=1, lw=.5, color="black")

fig.tight_layout()
fig.savefig(f"/home/carlos/Desktop/manuscripts/notebooks/loops/graphs/FC_padj.png", dpi=300, facecolor="white", edgecolor='none')
fig.savefig(f"/home/carlos/Desktop/manuscripts/notebooks/loops/graphs/FC_padj.svg", facecolor="white", edgecolor='none')
# Close rather than clear so the figure is released once it has been written.
plt.close(fig)

In [None]:
# Print two coordinate strings for JUN: the 10 kb-aligned position containing
# the gene's strand-aware coordinate, and a window of +/- flank_now around it.
row = genes.loc[genes.external_gene_name == "JUN"]
chromName, strand = row.chrom.values[0], row.strand.values[0]
start, end = row.start.values[0], row.end.values[0]

flank_now = 5000

# "+" strand genes are anchored at their start, all others at their end.
anchor = start if strand == "+" else end

print(f"{chromName}:{anchor // 10_000 * 10_000}")
print(f"{chromName}:{anchor - flank_now}-{anchor + flank_now}")

## Get strength changes of loops

In [None]:
import cooltools
import os
resolution = 10_000

samples = ["t0", "t12", "t30", "t60"]
# One cooler per sample, opened at `resolution` inside the multi-resolution file.
clrs_ = [
    cooler.Cooler(
        f"/home/carlos/Desktop/manuscripts/notebooks/matrices/{sample}_q30.mcool::resolutions/{resolution}"
    )
    for sample in samples
]

# Chromosome arms of hg38, ordered like the chromosomes of the first cooler;
# they serve as the view for the intra-arm expected.
hg38_chromsizes = bioframe.fetch_chromsizes("hg38")
hg38_cens = bioframe.fetch_centromeres("hg38")
hg38_arms = bioframe.make_chromarms(hg38_chromsizes, hg38_cens)
hg38_arms = hg38_arms.set_index("chrom").loc[clrs_[0].chromnames].reset_index()

# The cache is read from and written to the same directory, one file per
# sample, so a freshly computed table is found again on the next run whatever
# the working directory is, and a partially filled cache is completed.
expected_dir = "/home/carlos/Desktop/manuscripts/notebooks/loops"

expected_ = []
for sample, sample_clr in zip(samples, clrs_):
    expected_path = os.path.join(expected_dir, f"expected_{sample}.{resolution}.csv")
    if os.path.exists(expected_path):
        exp_df = pd.read_csv(expected_path)
    else:
        # intra-arm expected
        # NOTE(review): nproc is not defined in this cell; it has to be set earlier in the notebook.
        exp_df = cooltools.expected_cis(
            sample_clr,
            view_df=hg38_arms,
            nproc=nproc,
        )
        exp_df.to_csv(expected_path, index=False)
    expected_.append(exp_df)

In [None]:
# For every gene graph build an (edges x 4) presence/absence matrix: row r,
# column t-1 is 1 when edge r lists timepoint t, and 0 otherwise.
heatmap_mtx_dict = {}

for geneName, G in graphs:
    edge_timepoints = [list(data['timepoint']) for _, _, data in G.edges(data=True)]

    heatmap_mtx = np.zeros((len(edge_timepoints), 4))
    for edge_row, timepoints in enumerate(edge_timepoints):
        for tp in timepoints:
            # Timepoints are 1-based, matrix columns 0-based.
            heatmap_mtx[edge_row, tp - 1] = 1

    if geneName == "JUN":
        print(heatmap_mtx)
    heatmap_mtx_dict[geneName] = heatmap_mtx

In [None]:
# def P2M(data):
#     total = np.nansum(data)
#     center_idx = data.shape[0] // 2
#     center_score = data[center_idx, center_idx]
#     return center_score / ((total - center_score) / (data.size - 1))

# def central_score(data, n=3):
#     c = data.shape[0] // 2
#     return np.nanmean(data[c - n // 2 : c + n // 2 + 1, c - n // 2 : c + n // 2 + 1])
    
    
# apa_flank = 100_000

# gene_name_loopDF_ = {}
# for geneName, G in graphs:

#     edges = G.edges(data=True)
#     timepoint_data = [set(data['timepoint']) for u, v, data in edges]
#     if [set([1,2,3,4])] == timepoint_data:
#         continue
#     filter_ = venn_df_annotated.external_gene_name.apply(lambda x: True if sum([1 for gene in x.split(";") if gene == geneName]) > 0 else False)
#     loop_df = venn_df_annotated.loc[filter_]
#     loop_df = loop_df.sort_values(by=['start1', 'start2', 'label']).reset_index(drop=True)
    

    
#     i1_i2 = [[] for i in range(len(loop_df))]

#     for i1, row1 in loop_df.iterrows():
#         for i2, row2 in loop_df.iterrows():
#             if i1 != i2:
#                 loop_1_start1 = row1.start1
#                 loop_1_start2 = row1.start2

#                 loop_1_range1 = range(loop_1_start1 - 10_000, loop_1_start1 + 20_000)
#                 loop_1_range2 = range(loop_1_start2 - 10_000, loop_1_start2 + 20_000)

#                 loop_2_start1 = row2.start1
#                 loop_2_start2 = row2.start2

#                 if loop_2_start1 in loop_1_range1 and loop_2_start2 in loop_1_range2:
#                     if loop_1_start1 == loop_2_start1 and loop_1_start2 == loop_2_start2:
#                         continue
#                     i1_i2[i1].append(i2)

#     loop_df['common_pairs'] = i1_i2

#     sample_idx_dict = {
#         0: "t0",
#         1: "t12",
#         2: "t30",
#         3: "t60"
#     }

#     for sample_idx, (clr, expected) in enumerate(zip(clrs_, expected_)): # for each sample
#         mtx_ = cooltools.pileup(clr, loop_df, view_df=hg38_arms, expected_df=expected, flank=apa_flank, nproc=1)
#         scores = [P2M(mtx_[:,:,i]) for i in range(mtx_.shape[2])]
#         loop_df[f"score_{sample_idx_dict[sample_idx]}"] = scores

    
#     loop_df.drop_duplicates(subset=["score_t0", "score_t12", "score_t30", "score_t60"], inplace=True)
    
#     if len(loop_df) != len(edges):
#         print(geneName)
#         print(edges)
#         print(loop_df)
    
#     loop_df.to_csv(f"/home/carlos/Desktop/manuscripts/notebooks/loops/graphs/{geneName}_loop_df.tsv", sep="\t", index=False)
    

#     graph_loops = {
#         "chrom1": [],
#         "start1": [],
#         "end1": [],
#         "chrom2": [],
#         "start2": [],
#         "end2": [],
#     }

#     chromName = loop_df.chrom1.values[0]

#     for edge in edges:
#         n1, n2 = edge[0], edge[1]

#         if n1 > n2:
#             n1, n2 = n2, n1
            
#         graph_loops["chrom1"].append(chromName)
#         graph_loops["start1"].append(n1)
#         graph_loops["end1"].append(n1+10_000)
#         graph_loops["chrom2"].append(chromName)
#         graph_loops["start2"].append(n2)
#         graph_loops["end2"].append(n2+10_000)

#         sample_idx_dict = {
#         0: "t0",
#         1: "t12",
#         2: "t30",
#         3: "t60"
#     }

    

#     graph_loops = pd.DataFrame(graph_loops)

#     for sample_idx, (clr, expected) in enumerate(zip(clrs_, expected_)): # for each sample
#         mtx_ = cooltools.pileup(clr, graph_loops, view_df=hg38_arms, expected_df=expected, flank=apa_flank, nproc=1)
#         scores = [P2M(mtx_[:,:,i]) for i in range(mtx_.shape[2])]
#         graph_loops[f"score_{sample_idx_dict[sample_idx]}"] = scores
    
#     graph_loops.to_csv(f"/home/carlos/Desktop/manuscripts/notebooks/loops/graphs/{geneName}_graph_df.tsv", sep="\t", index=False)
    
#     gene_name_loopDF_[geneName] = graph_loops.copy()

# genes_order = deseq_lrt_intersect_topN.external_gene_name.to_list()
# # sort gene_name_loopDF_ by genes_order, and get rid of duplicates
# gene_name_loopDF_ = {geneName: gene_name_loopDF_[geneName] for geneName in genes_order}

# heatmap_mtx_dict = {}

# for k,v in gene_name_loopDF_.items():
#     heatmap_mtx = v.iloc[:, -4:].values

#     #heatmap_mtx_scaled = (heatmap_mtx - heatmap_mtx.min()) / (heatmap_mtx.max() - heatmap_mtx.min())
#     #heatmap_mtx_scaled = heatmap_mtx / heatmap_mtx.max() 
    
#     #divide each to its max
#     heatmap_mtx_scaled = np.zeros_like(heatmap_mtx)
#     for i in range(heatmap_mtx.shape[0]):
#         for j in range(heatmap_mtx.shape[1]):
#             heatmap_mtx_scaled[i,j] = heatmap_mtx[i,j] / heatmap_mtx[i,:].max()
    
#     heatmap_mtx_dict[k] = heatmap_mtx_scaled

In [None]:
# Rebuild heatmap_mtx_dict so that its insertion order follows topGenes; the
# heatmap cell iterates the dict in this order. Raises KeyError if a gene in
# topGenes has no matrix.
heatmap_mtx_dict = {k: heatmap_mtx_dict[k] for k in topGenes}

In [None]:
from matplotlib.colors import LinearSegmentedColormap
#cmp = LinearSegmentedColormap.from_list("custom_cmp", ["#A63446", "white", "#14213D"], N=256)
# Two-colour map for the presence/absence matrices: white = 0, blue-grey = 1.
cmp = LinearSegmentedColormap.from_list("custom_cmp", ["white", "#465775"], N=256)

# One heatmap strip per gene, stacked without vertical gaps.
fig2, ax_hm = plt.subplots(figsize=(4, 8), ncols=1, nrows=topN, gridspec_kw={"hspace": 0}, sharex=True)
# With nrows == 1 plt.subplots returns a bare Axes; keep ax_hm indexable.
ax_hm = np.atleast_1d(ax_hm)

for idx, (k, v) in enumerate(heatmap_mtx_dict.items()):
    # Pin the colour scale to [0, 1]. With the automatic scale a matrix that
    # contains only ones has vmin == vmax and is drawn in the "absent" colour.
    sns.heatmap(v, ax=ax_hm[idx], cmap=cmp, vmin=0, vmax=1, cbar=False, yticklabels=False, xticklabels=False)


fig2.savefig(f"/home/carlos/Desktop/manuscripts/notebooks/loops/graphs/heatmap.png", dpi=300, facecolor="white", edgecolor='none')
fig2.savefig(f"/home/carlos/Desktop/manuscripts/notebooks/loops/graphs/heatmap.svg", facecolor="white", edgecolor='none')

In [None]:
# def create_colorbar(vmin_value, vmax_value, cmap, ax=None, orientation='vertical'):

#     cbar = plt.colorbar(plt.cm.ScalarMappable(cmap=cmap, norm=plt.Normalize(vmin=vmin_value, vmax=vmax_value)),
#                         orientation=orientation, cax=ax,)

#     if orientation == 'horizontal':
#         ax.set_yticks([])
#         ax.set_xticks([vmin_value, vmax_value])
#         cbar.ax.tick_params(labelsize=16)

#     elif orientation == 'vertical':
#         ax.set_xticks([])
#         ax.set_yticks([vmin_value, vmax_value])
#         cbar.ax.tick_params(labelsize=16)

#     plt.xticks(rotation=45, ha='right')

# fig, ax = plt.subplots(figsize=(8, 1))
# create_colorbar(0.25, 1, cmp, orientation='horizontal', ax=ax)

## TADs


In [None]:
# Boundary comparison table; the cells below read its idx1, idx2, sample1,
# sample2, case, bs1 and bs2 columns.
tads_df = pd.read_csv("/home/carlos/Desktop/manuscripts/notebooks/boundaries/boundary_switch_1_True.tsv", sep="\t")
# Cooler of the t0 sample at the 10000 bp resolution of the .mcool file.
clr = cooler.Cooler("/home/carlos/Desktop/manuscripts/notebooks/matrices/t0_q30.mcool::resolutions/10000")
# Full bin table of that cooler; idx1 / idx2 from tads_df are used below as
# positional row indices into it.
bins = clr.bins()[:]

In [None]:
# Background gene set: every gene overlapping any boundary bin named in
# tads_df (idx1 or idx2), after widening each bin by boundary_flanks.
boundary_flanks = 10_000

super_set_idx = list(set(tads_df.idx1.tolist() + tads_df.idx2.tolist()))

# Bins without a balancing weight are left out.
super_set_bins = bins.iloc[super_set_idx, :].reset_index(drop=True)
super_set_bins = super_set_bins.dropna(subset=['weight'])

super_set_bins['start'] = super_set_bins['start'] - boundary_flanks
super_set_bins['end'] = super_set_bins['end'] + boundary_flanks

boundary_universe = bioframe.overlap(super_set_bins, genes, how='inner')
boundary_universe_names = list(
    boundary_universe.external_gene_name_.dropna().unique()
)

In [None]:
# For each boundary case and each ordered pair of distinct samples: select the
# matching boundaries, collect the 0-vs-12 and 0-vs-60 DEGs around them, run
# the enrichment for every pathway library, write the significant terms and
# plot the 12 min vs 60 min comparison.
# idx_ collects the idx1 list of every selection, in iteration order.
idx_ = []
for case in ['Preserved', 'Lost']:
    for sample1 in ["Control", "12min", "30min", "60min"]:
        for sample2 in ["Control", "12min", "30min", "60min"]:
            if sample1 != sample2:
                pair_mask = (tads_df['sample1'] == sample1) & (tads_df['sample2'] == sample2)

                if case == 'Preserved':
                    # Copy so the new columns are written to an independent frame, not a slice of tads_df.
                    curr_df = tads_df.loc[pair_mask & tads_df['case'].isin(["Shifted", "Preserved"])].copy()
                    curr_df['bs_change'] = curr_df['bs2'].values - curr_df['bs1'].values
                    curr_df['bs_change_levels'] = pd.qcut(curr_df.bs_change, 4, labels=["1", "2", "3", "4"])

                    print(f"{case}_{sample1}_{sample2}")
                    print(curr_df.groupby('bs_change_levels')['bs_change'].mean())

                    # Keep only the top quartile of bs2 - bs1.
                    curr_df = curr_df.loc[curr_df['bs_change_levels'].isin(["4"])]
                else:
                    curr_df = tads_df.loc[pair_mask & (tads_df['case'] == "Lost")]

                idx_.append(curr_df.idx1.to_list())

                # Boundary bins widened by boundary_flanks on both sides.
                regions = bins.iloc[curr_df.idx1.to_list(), :].reset_index(drop=True)
                regions['start'] -= boundary_flanks
                regions['end'] += boundary_flanks

                degs_0_12_degs = overlapper(degs_0_12, regions, genes)
                degs_0_60_degs = overlapper(degs_0_60, regions, genes)

                res_0_12 = enrichrrr(degs_0_12_degs, pathways)#, boundary_universe_names)
                res_0_60 = enrichrrr(degs_0_60_degs, pathways)#, boundary_universe_names)

                # Compare the two results computed in this iteration. The loop
                # never computes a 30 min result, so the earlier res_0_30
                # argument was stale or undefined.
                comparison_dfs = merge_enrs_into_common_df(res_0_12, res_0_60)

                # Write the significant terms (adjusted p <= 0.05) per library and contrast.
                for res1, res2, pathway in zip(res_0_12, res_0_60, pathways):
                    df1 = res1.results.sort_values('Adjusted P-value')
                    df2 = res2.results.sort_values('Adjusted P-value')
                    df1_sig = df1.loc[df1['Adjusted P-value'] <= 0.05]
                    df2_sig = df2.loc[df2['Adjusted P-value'] <= 0.05]
                    if len(df1_sig) != 0:
                        df1_sig.to_csv(f"tads_enrichr_results/{case}_{sample1}_{sample2}_{pathway}_0_12.tsv", sep="\t", index=False)
                    if len(df2_sig) != 0:
                        df2_sig.to_csv(f"tads_enrichr_results/{case}_{sample1}_{sample2}_{pathway}_0_60.tsv", sep="\t", index=False)

                # One panel per pathway library; the figure is saved only if at least one panel has data.
                fig, ax = plt.subplots(3, 1, figsize=(10, 20))
                plot_count = 0
                for i, (df, pathway) in enumerate(zip(comparison_dfs, pathways)):
                    if len(df) != 0:
                        plot_count += 1
                        b = df.plot.barh(x='Term', ax=ax[i], color=['#A63446', '#9DBBAE'])
                        b.axvline(x=-np.log10(0.05), color='black', linestyle='--', alpha=0.6, linewidth=3)
                        b.set_xlabel(f'-log$_{{10}}$(Adjusted P-value)', fontsize=20)
                        b.set_ylabel(f'{pathway} Term', fontsize=20)
                if plot_count != 0:
                    fig.suptitle(f"{case}_{sample1}_{sample2}", fontsize=30)
                    fig.set_tight_layout(True)
                    fig.savefig(f"tads_enrichr_results/{case}_{sample1}_{sample2}_pathways.svg")
                    fig.savefig(f"tads_enrichr_results/{case}_{sample1}_{sample2}_pathways.png", dpi=300, facecolor="white", edgecolor='none')

                # Close rather than clear: the loop creates 24 figures, and a
                # cleared figure stays registered with pyplot.
                plt.close(fig)