## Libraries

In [86]:
import bioframe
import numpy as np
import pandas as pd
import gseapy as gp
import seaborn as sns
import matplotlib.pyplot as plt
import tqdm
import glob
import cooler
#bm = gp.Biomart()

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole
# notebook; several cells below assign into filtered DataFrames.
pd.options.mode.chained_assignment = None # default='warn'

## Helper functions and file operations

In [None]:
# Read the GTF annotation, keep only CDS features, cast the coordinates to
# integers and order the rows by chromosome and start position.
gtf = "/home/carlos/oldies/manuscripts/notebooks/unibind/GRCh38.gtf"
genes_all = (
    bioframe.read_table(gtf, schema='gtf')
    .query('feature=="CDS"')
    .astype({'start': int, 'end': int})
    .sort_values(by=['chrom', 'start'])
)

In [3]:
# use appris or mane

In [9]:
# Load the APPRIS table and keep the rows flagged as MANE_Select
# (MANE_Plus_Clinical is currently not included).
df = pd.read_csv(
    "/home/carlos/oldies/manuscripts/notebooks/RNA/appris_data.appris.txt",
    sep='\t',
)
df = df[df["MANE"].isin(['MANE_Select'])]
# Transcript IDs used below to subset the GTF-derived table.
mane_tx_list = list(df['Transcript ID'])

In [10]:
# Pull gene ID, transcript ID and gene name out of the GTF attribute string.
# Everything after the first "." is dropped from each value (this strips
# version suffixes; note it also shortens gene names that contain a dot).
genes = genes_all.copy()
genes['gene_id'] = (
    genes.attributes.str.extract(r'gene_id "(.*?)";', expand=False)
    .map(lambda value: value.split(".")[0])
)
genes['tx_id'] = (
    genes.attributes.str.extract(r'transcript_id "(.*?)";', expand=False)
    .map(lambda value: value.split(".")[0])
)
genes['external_gene_name'] = (
    genes.attributes.str.extract(r'gene_name "(.*?)";', expand=False)
    .map(lambda value: value.split(".")[0])
)
# Keep only the selected MANE transcripts, ordered by chromosome and start.
genes = genes[genes['tx_id'].isin(mane_tx_list)].sort_values(by=['chrom', 'start'])

In [11]:
# Reduce every transcript to a single CDS row. `genes` is sorted by start, so
# the first row of a group is the lowest-coordinate CDS and the last row the
# highest; "+" transcripts keep the first row, "-" transcripts the last.
tx_adjusted = []
for tx_id, tx_df in genes.groupby('tx_id'):
    if (tx_df.strand == "+").any():
        tx_adjusted.append(tx_df.iloc[0])
    elif (tx_df.strand == "-").any():
        tx_adjusted.append(tx_df.iloc[-1])

# Stack the selected rows back into a table; the transpose leaves object
# columns, so the coordinates are cast back to integers.
genes = pd.concat(tx_adjusted, axis=1).T.reset_index(drop=True)
genes = genes.astype({'start': int, 'end': int})

In [7]:
# List the Enrichr libraries available for human and keep the three used in
# this notebook.
human = pd.Series(gp.get_library_name(organism='Human'))
pathways = human[
    human.str.contains("MSigDB_Hallmark_2020")
    | human.str.contains("GO_Biological_Process_2023")
    | human.str.contains("NCI-Nature_2016")
].reset_index(drop=True)
# Alternative selection without the GO library:
#pathways = human.loc[human.str.contains("MSigDB_Hallmark_2020") | human.str.contains("NCI-Nature_2016") ].reset_index(drop=True)

# Download each selected library: library name -> gene-set dictionary.
pathways = dict(
    (pathway, gp.get_library(name=pathway, organism='Human'))
    for pathway in pathways
)

In [8]:
def enrichrrr(
    degs_list: list, 
    pathways_dict: dict, 
    universe_list: list = None):
    """Run ``gp.enrichr`` once per library and collect the result objects.

    Args:
        degs_list: Gene list passed as ``gene_list``.
        pathways_dict: Mapping of library name to gene sets; only the values
            are handed to ``gp.enrichr``.
        universe_list: Passed through as ``background``.

    Returns:
        A list with one ``gp.enrichr`` result per entry of ``pathways_dict``,
        in the dictionary's iteration order.
    """
    return [
        gp.enrichr(
            gene_list=degs_list,
            gene_sets=library,
            outdir=None,
            background=universe_list,
            verbose=False,
        )
        for library in pathways_dict.values()
    ]

In [9]:
def overlapper(
    current_degs_df: pd.DataFrame, 
    my_regions_df: pd.DataFrame, 
    all_genes: pd.DataFrame, 
    tss_coord_only: bool = True, # True if you want to use only the TSS coordinates (point-wise), False if you want to create a window around the TSS
    upstream: int =2000, 
    downstream: int =500,
    returnNames = True):
    """Select the DEGs whose TSS (or TSS window) overlaps the given regions.

    The TSS is taken as ``start`` for "+" strand rows of ``all_genes`` and as
    ``end`` for every other row.

    Args:
        current_degs_df: DEG table; must contain ``ensembl_gene_id`` (and
            ``external_gene_name`` when ``returnNames`` is true).
        my_regions_df: Regions handed to ``bioframe.overlap``.
        all_genes: Gene table with ``start``, ``end``, ``strand`` ("+"/"-")
            and ``gene_id`` columns.
        tss_coord_only: If true, each gene is reduced to the single TSS
            coordinate; otherwise a strand-aware window is built around it.
        upstream: Window size upstream of the TSS (window mode only).
        downstream: Window size downstream of the TSS (window mode only).
        returnNames: If true, return unique non-null gene names instead of
            the filtered DEG table.

    Returns:
        ``(degs, overlaps)`` where ``overlaps`` is the inner overlap between
        the TSS intervals and ``my_regions_df`` and ``degs`` is either the
        list of gene names or the filtered DEG DataFrame.
    """
    strand_oriented_genes = all_genes.copy()

    on_plus_strand = all_genes['strand'] == "+"
    tss = all_genes['start'].where(on_plus_strand, all_genes['end'])

    if tss_coord_only:
        strand_oriented_genes['start'] = tss
        strand_oriented_genes['end'] = tss
    else:
        # Strand is stored as "+"/"-"; the earlier comparison against the
        # integer 1 never matched, so "+" genes received the "-" window.
        # Upstream lies at lower coordinates on "+" and higher ones on "-".
        strand_oriented_genes['start'] = tss - np.where(on_plus_strand, upstream, downstream)
        strand_oriented_genes['end'] = tss + np.where(on_plus_strand, downstream, upstream)

    my_regions_universe = bioframe.overlap(strand_oriented_genes, my_regions_df, how='inner')
    degs_filter = current_degs_df.loc[current_degs_df['ensembl_gene_id'].isin(my_regions_universe['gene_id'])]

    if returnNames:
        return list(degs_filter.external_gene_name.dropna().unique()), my_regions_universe
    return degs_filter, my_regions_universe

In [10]:
def merge_enrs_into_common_df(res_1, res_2):
    """Pair up two lists of enrichment results into per-library comparison tables.

    For each position, the two ``.results`` tables are merged on ``Term`` and
    reduced to ``Term``, ``logPadj_0_12`` (from ``res_1``) and ``logPadj_0_60``
    (from ``res_2``), where logPadj is -log10 of the adjusted p-value. Only
    terms with adjusted p <= 0.05 in at least one input are kept, ordered by
    ascending row-wise maximum of the two logPadj columns.

    Position-specific handling: index 0 has everything from "(GO" onwards cut
    from the term; index 2 additionally requires ``logPadj_0_60 >= 1.3`` and
    drops the last three space-separated tokens of the term.

    Returns:
        A list of DataFrames, one per paired result.
    """
    def _ranked(enr):
        # Sorted copy of the result table, its significant terms, plus logPadj.
        table = enr.results.sort_values('Adjusted P-value')
        significant = table.loc[table['Adjusted P-value'] <= 0.05].Term
        table['logPadj'] = -np.log10(table['Adjusted P-value'])
        return table, significant

    merged_per_library = []
    for lib_idx, (first, second) in enumerate(zip(res_1, res_2)):
        left, left_sig = _ranked(first)
        right, right_sig = _ranked(second)

        merged = left.merge(right, on='Term', suffixes=('_0_12', '_0_60'))
        if lib_idx == 2:
            merged = merged.loc[merged['logPadj_0_60'] >= 1.3]
        merged = merged[['Term', 'logPadj_0_12', 'logPadj_0_60']]

        in_either = merged.Term.isin(left_sig) | merged.Term.isin(right_sig)
        merged = merged.loc[in_either].reset_index(drop=True)

        # Order rows by the larger of the two logPadj values, smallest first.
        strongest = merged[['logPadj_0_12', 'logPadj_0_60']].max(axis=1)
        merged = merged.loc[strongest.sort_values(ascending=True).index]

        if lib_idx == 0:
            merged['Term'] = [term.split("(GO")[0] for term in merged.Term]
        if lib_idx == 2:
            merged['Term'] = [" ".join(term.split(" ")[:-3]) for term in merged.Term]
        merged_per_library.append(merged)

    return merged_per_library


def merge_terms(df_list: list):
    """Return the distinct terms with adjusted p-value <= 0.05 in any table.

    Args:
        df_list: DataFrames with ``Term`` and ``Adjusted P-value`` columns.

    Returns:
        A list of unique terms; the order is not defined.
    """
    significant = set()
    for table in df_list:
        passing = table['Adjusted P-value'] <= 0.05
        significant.update(table.loc[passing, 'Term'].unique())
    return list(significant)


def merge_enrs_into_common_df_2(res_list : list, names : list = None):
    """Combine several enrichment runs into one wide logPadj table per library.

    For every library position, the terms significant (adjusted p <= 0.05) in
    at least one run are collected, each run is restricted to those terms, and
    the runs are pivoted into a table indexed by term with one logPadj
    (-log10 adjusted p) column per run plus a line-wrapped ``Term`` column.

    The first library has everything from "(GO" onwards cut from each term and
    the last library drops the final three space-separated tokens.

    Args:
        res_list: One list of enrichment results per run; all runs are indexed
            with the positions of the first run.
        names: Optional column labels for the runs; run indices are used when
            omitted.

    Returns:
        A list of pivoted DataFrames, one per library position.
    """
    n_pathways = len(res_list[0])
    last_idx = n_pathways - 1

    stacked_per_library = []
    for lib_idx in range(n_pathways):
        # Terms that pass the threshold in any of the runs for this library.
        significant_tables = []
        for res in res_list:
            table = res[lib_idx].results
            significant_tables.append(table.loc[table['Adjusted P-value'] <= 0.05])
        merged_terms = merge_terms(significant_tables)

        per_run = []
        for run_idx, res in enumerate(res_list):
            table = res[lib_idx].results
            table = table.loc[table['Term'].isin(merged_terms)].reset_index(drop=True)
            table = table.assign(logPadj=-np.log10(table['Adjusted P-value']))
            table = table.sort_values(by='logPadj', ascending=False)
            table['whichRes'] = run_idx if names is None else names[run_idx]

            if lib_idx == 0:
                table['Term'] = [term.split("(GO")[0] for term in table.Term]
            if lib_idx == last_idx:
                table['Term'] = [" ".join(term.split(" ")[:-3]) for term in table.Term]

            per_run.append(table)

        stacked_per_library.append(pd.concat(per_run, axis=0))

    reformatted = []
    for stacked in stacked_per_library:
        stacked = stacked.sort_values(by=['Term', 'logPadj'], ascending=False)
        # One row per term, one logPadj column per run.
        wide = stacked.pivot(index='Term', columns='whichRes', values='logPadj')
        wide['Term'] = [split_text_into_lines(term) for term in wide.index]
        reformatted.append(wide)

    return reformatted

def split_text_into_lines(text, max_line_length=30):
    """Wrap ``text`` at spaces so lines stay within ``max_line_length``.

    Words are never split; a single word longer than the limit is placed on a
    line of its own.

    Args:
        text: String to wrap; it is split on single spaces.
        max_line_length: Maximum number of characters per line.

    Returns:
        The wrapped text with lines joined by newlines.
    """
    lines = []
    line = ""
    for word in text.split(" "):
        # `line` carries a trailing space, which counts towards the limit
        # check for the next word.
        if len(line) + len(word) <= max_line_length:
            line += word + " "
        else:
            # `line` is only empty when the very first word is already too
            # long; skip it so the result does not start with a blank line.
            if line:
                lines.append(line)
            line = word + " "
    lines.append(line)  # append the last line
    return "\n".join(line[:-1] for line in lines)  # remove last space from all

def write_results(res, out):
    """Write the significant rows of an enrichment result to a TSV file.

    Rows of ``res.results`` with adjusted p-value <= 0.05 are written to
    ``out``, sorted by adjusted p-value. No file is written when no row
    passes the threshold.
    """
    ranked = res.results.sort_values('Adjusted P-value')
    significant = ranked[ranked['Adjusted P-value'] <= 0.05]
    if len(significant) == 0:
        return
    significant.to_csv(out, sep='\t', index=False)

In [11]:
def get_anchors(regions_file):
    """Read a tab-separated BEDPE-style file and return its unique anchors.

    The ``chrom1/start1/end1`` and ``chrom2/start2/end2`` column triples are
    stacked into one ``chrom/start/end`` table (first triple first) and
    duplicate rows are removed.

    Returns:
        A DataFrame of unique anchors with a fresh 0..n-1 index.
    """
    bedpe = pd.read_csv(regions_file, sep="\t")
    first_anchors = bedpe[['chrom1', 'start1', 'end1']].rename(
        columns={'chrom1': 'chrom', 'start1': 'start', 'end1': 'end'}
    )
    second_anchors = bedpe[['chrom2', 'start2', 'end2']].rename(
        columns={'chrom2': 'chrom', 'start2': 'start', 'end2': 'end'}
    )
    stacked = pd.concat([first_anchors, second_anchors], axis=0)
    return stacked.drop_duplicates().reset_index(drop=True)

We gather anchors from all loops because an anchor may be shared between several loops.
There is also a possibility that a gene is regulated by another anchor pair that is not present in the list of anchors unique to a timepoint.
However, the anchors unique to a timepoint can still be used to estimate differential TF binding, which is a separate analysis.

In [11]:
# DEG tables for the three comparisons (file names indicate t0 vs t12, t60
# and t30).
degs_0_12 = pd.read_csv(
    "/home/carlos/oldies/manuscripts/notebooks/RNA/t0-t12.degs.tsv", sep="\t"
)
degs_0_60 = pd.read_csv(
    "/home/carlos/oldies/manuscripts/notebooks/RNA/t0-t60.degs.tsv", sep="\t"
)
degs_0_30 = pd.read_csv(
    "/home/carlos/oldies/manuscripts/notebooks/RNA/t0-t30.degs.tsv", sep="\t"
)

# Full DESeq table; its gene_id column is renamed to ensembl_gene_id, the
# column name the DEG filtering in this notebook looks up.
deseq_lrt = pd.read_csv(
    "/home/carlos/oldies/manuscripts/notebooks/RNA/all_deseq_lrt.tsv", sep="\t"
).rename(columns={'gene_id': 'ensembl_gene_id'})

In [None]:
# search specific genes
#
# Locate the 10 kb bin that contains the TSS of one gene and print the rows
# of every component result file that start at that bin.
gene_oi_ENS = "ENSG00000012061"
df_curr = genes.loc[genes.gene_id == gene_oi_ENS]
if df_curr.empty:
    # Fail with a clear message instead of an IndexError further down.
    raise ValueError(f"{gene_oi_ENS} not found in genes")

gene_strand = df_curr.strand.to_list()[0]
if gene_strand not in ("+", "-"):
    # Previously this case silently left chr_name/start undefined or stale.
    raise ValueError(f"Unexpected strand {gene_strand!r} for {gene_oi_ENS}")

bin_size = 10_000
chr_name = df_curr.chrom.to_list()[0]
# TSS is `end` for "-" strand genes and `start` for "+" strand genes.
tss = df_curr.end.to_list()[0] if gene_strand == "-" else df_curr.start.to_list()[0]
start = int(tss) // bin_size * bin_size  # round down to the bin boundary
end = start + bin_size

comp_labels = ["comp1", "comp2", "comp3", "comp4"]
comp_files = ["t0_t12_results_0_0", "t0_t12_results_0_1", "t0_t12_results_1_0", "t0_t12_results_1_1"]

for label, comp in zip(comp_labels, comp_files):
    regions = pd.read_csv(f"/home/carlos/oldies/manuscripts/notebooks/gnn/{comp}.tsv", sep="\t")

    oi_df = regions.loc[(regions.chrom == chr_name) & (regions.start == start)]
    if len(oi_df) > 0:
        print(f"Processing {label}")
        print(oi_df)

## gnn res

In [13]:
# Build the background universe: every gene whose interval in `genes`
# overlaps one of the regions listed in the four component result files.
paths = [f"/home/carlos/oldies/manuscripts/notebooks/gnn/t0_t12_results_{comp_now}.tsv" for comp_now in ["0_0", "1_0", "0_1", "1_1"]]
all_dfs = [pd.read_csv(path, sep="\t") for path in paths]
# This is the GNN evaluated regions; only the first three columns are kept.
all_regions = pd.concat(all_dfs).iloc[:, :3]
universe_3d_df = bioframe.overlap(all_regions, genes, how='inner')
# Columns coming from `genes` carry a trailing "_" after the overlap.
universe_3d_ids = list(universe_3d_df['gene_id_'].dropna().unique())
universe_3d_names = list(universe_3d_df['external_gene_name_'].dropna().unique())

In [47]:
# Per-component enrichment: for each result file, find the DEGs whose TSS
# overlaps its regions, run Enrichr for the three time comparisons, write the
# significant terms and save one bar plot per library.
comp_labels = ["comp1", "comp2", "comp3", "comp4"]
comp_files = ["t0_t12_results_0_0", "t0_t12_results_0_1", "t0_t12_results_1_0", "t0_t12_results_1_1"]

# Optional filters; both are switched off.
filter_11 = False
filter_w_unibind = False

# Component suffix -> name fragment of the UniBind result directory that is
# read when filter_w_unibind is enabled.
unibind_mapping = {
    "0_1": "0_1_vs_1_0",
    "1_0": "1_0_vs_0_1",
    "0_0": "0_0_vs_1_1",
    "1_1": "1_1_vs_0_0"
}

# y-axis labels, indexed by library position.
# NOTE(review): assumes `pathways` iterates in this same order — confirm.
database_names = ["Gene Ontology\nBiological Process", "MSigDB\nHallmark", "NCI-Nature\nPID"]

for label, comp in zip(comp_labels, comp_files):
    regions = pd.read_csv(f"/home/carlos/oldies/manuscripts/notebooks/gnn/{comp}.tsv", sep="\t")

    comp_name = comp[-3:]
    if comp_name == "1_1" and filter_11:
        regions = regions.loc[(regions["t0_q30-t12_q30"] == 0) & (regions["t12_q30-t0_q30"] == 0)]

    if filter_w_unibind:
        unibind_regions = pd.read_csv(f"/home/carlos/oldies/manuscripts/notebooks/unibind/gnn_res/gnn_{unibind_mapping[comp_name]}/extracted_regions_merged.bed", sep="\t", header=None).iloc[:, :3]
        unibind_regions.columns = ['chrom', 'start', 'end']
        unibind_regions.start = unibind_regions.start.astype(int)
        unibind_regions.end = unibind_regions.end.astype(int)
        regions = bioframe.overlap(regions, unibind_regions, how='inner')

    # `uni` depends only on regions and genes, so all three calls return the
    # same table.
    degs_0_12_degs, uni = overlapper(degs_0_12, regions, genes)
    degs_0_30_degs, uni = overlapper(degs_0_30, regions, genes)
    degs_0_60_degs, uni = overlapper(degs_0_60, regions, genes)

    uni.to_csv(f"gnn_enrichr_results_comp_wise/{label}_universe.tsv", sep="\t", index=False)

    res_0_12 = enrichrrr(degs_0_12_degs, pathways, universe_3d_names)
    res_0_30 = enrichrrr(degs_0_30_degs, pathways, universe_3d_names)
    res_0_60 = enrichrrr(degs_0_60_degs, pathways, universe_3d_names)

    comparison_dfs = merge_enrs_into_common_df_2([res_0_12, res_0_30, res_0_60], ["0_12", "0_30", "0_60"])

    for res1, res2, res3, pathway in zip(res_0_12, res_0_30, res_0_60, pathways):
        write_results(res1, f"gnn_enrichr_results_comp_wise/{label}_{pathway}_0_12.tsv")
        write_results(res2, f"gnn_enrichr_results_comp_wise/{label}_{pathway}_0_30.tsv")
        write_results(res3, f"gnn_enrichr_results_comp_wise/{label}_{pathway}_0_60.tsv")

    for i, (df, pathway) in enumerate(zip(comparison_dfs, pathways)):
        if len(df) == 0:
            continue  # nothing to plot for this library

        # Rank terms by their strongest logPadj across the three time
        # comparisons and keep the top 20.
        df = df.loc[df.iloc[:,[0,1,2]].max(axis=1).sort_values(ascending=True).index]
        df = df.iloc[-20:, :]

        fig, ax = plt.subplots(1, 1, figsize=(10, 8))
        b = df.plot.barh(x='Term', ax=ax, color=['#A63446', "#F5B841", '#9DBBAE'])
        # Significance threshold (adjusted p = 0.05).
        b.axvline(x=-np.log10(0.05), color='black', linestyle='--', alpha=.9, linewidth=3)

        b.set_xlabel(f'-log$_{{10}}$ Adjusted P-value', fontsize=12)
        b.set_ylabel(database_names[i], fontsize=20)

        # change font size of x and y ticks
        b.tick_params(axis='both', which='both', labelsize=10)

        db_name = database_names[i].replace("\n", "_")

        fig.set_tight_layout(True)
        fig.savefig(f"gnn_enrichr_results_comp_wise/{label}_{db_name}_pathways.svg")
        fig.savefig(f"gnn_enrichr_results_comp_wise/{label}_{db_name}_pathways.png", dpi=300, facecolor="white", edgecolor='none')

        # Close the figure so figures do not accumulate across iterations.
        plt.close(fig)

In [None]:
# comp_files = ["t0_t12_results_0_0", "t0_t12_results_0_1", "t0_t12_results_1_0", "t0_t12_results_1_1"]

# #regions = pd.concat([pd.read_csv(f"/home/carlos/oldies/manuscripts/notebooks/gnn/{comp}.tsv", sep="\t") for comp in comp_files[:3]]) # combine comp1, comp2, comp3
# #label = "changed_regions"
# regions = pd.concat([pd.read_csv(f"/home/carlos/oldies/manuscripts/notebooks/gnn/{comp}.tsv", sep="\t") for comp in comp_files[3:]])
# label = "unchanged_regions"

# degs_0_12_degs, _ = overlapper(degs_0_12, regions, genes)
# degs_0_30_degs, _ = overlapper(degs_0_30, regions, genes)
# degs_0_60_degs, _ = overlapper(degs_0_60, regions, genes)

# res_0_12 = enrichrrr(degs_0_12_degs, pathways, universe_3d_names)
# res_0_30 = enrichrrr(degs_0_30_degs, pathways, universe_3d_names)
# res_0_60 = enrichrrr(degs_0_60_degs, pathways, universe_3d_names)

# comparison_dfs = merge_enrs_into_common_df_2([res_0_12, res_0_30, res_0_60], ["0_12", "0_30", "0_60"])

# for res1, res2, res3, pathway in zip(res_0_12, res_0_30, res_0_60, pathways):
#     write_results(res1, f"gnn_enrichr_results_all_vs_all/{label}_{pathway}_0_12.tsv")
#     write_results(res2, f"gnn_enrichr_results_all_vs_all/{label}_{pathway}_0_30.tsv")
#     write_results(res3, f"gnn_enrichr_results_all_vs_all/{label}_{pathway}_0_60.tsv")

# fig, ax = plt.subplots(3, 1, figsize=(10, 20))
# plot_count = 0
# for i, (df, pathway) in enumerate(zip(comparison_dfs, pathways)):

#     if len(df) != 0:
#         df = df.loc[df.iloc[:,[0,1,2]].mean(axis=1).sort_values(ascending=True).index]
#         df = df.iloc[-20:, :]
#         plot_count += 1
#         b = df.plot.barh(x='Term', ax=ax[i], color=['#A63446', "#F5B841", '#9DBBAE'])

#         b.axvline(x=-np.log10(0.05), color='black', linestyle='--', alpha=0.6, linewidth=3)
#         b.set_xlabel(f'-log$_{{10}}$(Adjusted P-value)', fontsize=20)
#         b.set_ylabel(f'{pathway} Term', fontsize=20)

#     if plot_count != 0:
#         fig.suptitle(f"{label}", fontsize=30)
#         fig.set_tight_layout(True)
#         fig.savefig(f"gnn_enrichr_results_all_vs_all/{label}_pathways.svg")
#         fig.savefig(f"gnn_enrichr_results_all_vs_all/{label}_pathways.png", dpi=300, facecolor="white", edgecolor='none')

#         fig.clf()

### GNN plot all vs all / Uniq Common

In [40]:
# Collects the comparison tables of the two region groups: the "changed"
# tables are appended here, the "unchanged" ones by the following cell.
comparison_dfs_all_vs_all = []

comp_files = ["t0_t12_results_0_0", "t0_t12_results_0_1", "t0_t12_results_1_0", "t0_t12_results_1_1"]

# combine comp1, comp2, comp3
regions = pd.concat(
    [
        pd.read_csv(f"/home/carlos/oldies/manuscripts/notebooks/gnn/{comp}.tsv", sep="\t")
        for comp in comp_files[:3]
    ]
)
label = "changed_regions"

# DEG names overlapping the regions, one list per time comparison.
degs_0_12_degs, degs_0_30_degs, degs_0_60_degs = (
    overlapper(degs_table, regions, genes)[0]
    for degs_table in (degs_0_12, degs_0_30, degs_0_60)
)

# Enrichment of each DEG list against every library.
res_0_12, res_0_30, res_0_60 = (
    enrichrrr(deg_names, pathways, universe_3d_names)
    for deg_names in (degs_0_12_degs, degs_0_30_degs, degs_0_60_degs)
)

for pathway, res1, res2, res3 in zip(pathways, res_0_12, res_0_30, res_0_60):
    write_results(res1, f"gnn_enrichr_results_all_vs_all/{label}_{pathway}_0_12.tsv")
    write_results(res2, f"gnn_enrichr_results_all_vs_all/{label}_{pathway}_0_30.tsv")
    write_results(res3, f"gnn_enrichr_results_all_vs_all/{label}_{pathway}_0_60.tsv")

comparison_dfs = merge_enrs_into_common_df_2(
    [res_0_12, res_0_30, res_0_60],
    ["0_12", "0_30", "0_60"],
)
comparison_dfs_all_vs_all.append(comparison_dfs)


In [41]:
# Same analysis for the last component file (the "unchanged" regions).
regions = pd.concat(
    [
        pd.read_csv(f"/home/carlos/oldies/manuscripts/notebooks/gnn/{comp}.tsv", sep="\t")
        for comp in comp_files[3:]
    ]
)
label = "unchanged_regions"

# DEG names overlapping the regions, one list per time comparison.
degs_0_12_degs, degs_0_30_degs, degs_0_60_degs = (
    overlapper(degs_table, regions, genes)[0]
    for degs_table in (degs_0_12, degs_0_30, degs_0_60)
)

# Enrichment of each DEG list against every library.
res_0_12, res_0_30, res_0_60 = (
    enrichrrr(deg_names, pathways, universe_3d_names)
    for deg_names in (degs_0_12_degs, degs_0_30_degs, degs_0_60_degs)
)

for pathway, res1, res2, res3 in zip(pathways, res_0_12, res_0_30, res_0_60):
    write_results(res1, f"gnn_enrichr_results_all_vs_all/{label}_{pathway}_0_12.tsv")
    write_results(res2, f"gnn_enrichr_results_all_vs_all/{label}_{pathway}_0_30.tsv")
    write_results(res3, f"gnn_enrichr_results_all_vs_all/{label}_{pathway}_0_60.tsv")

comparison_dfs = merge_enrs_into_common_df_2(
    [res_0_12, res_0_30, res_0_60],
    ["0_12", "0_30", "0_60"],
)
comparison_dfs_all_vs_all.append(comparison_dfs)

In [42]:
# Display labels for the enrichment libraries: used as y-axis labels in the
# plots below and, with the newline replaced by "_", in output file names.
# NOTE(review): indexed by library position, so this presumably has to match
# the order of the comparison tables — confirm.
database_names = ["Gene Ontology\nBiological Process", "MSigDB\nHallmark", "NCI-Nature\nPID"]

In [None]:
# Changed regions uniq terms
#
# For each library, plot and save the terms that appear only in the
# "changed regions" comparison table (top 15 by strongest logPadj).
for idx, (changed, notChanged) in enumerate(zip(comparison_dfs_all_vs_all[0], comparison_dfs_all_vs_all[1])):
    # find common terms
    common_terms = set(changed.index) & set(notChanged.index)
    # remove common terms from changed
    df = changed.loc[~changed.index.isin(common_terms)]
    if len(df) == 0:
        continue  # nothing to plot; do not create an unused figure

    # Rank by the strongest logPadj across the three time columns.
    df = df.loc[df.iloc[:,[0,1,2]].max(axis=1).sort_values(ascending=True).index]
    # Copy before relabelling so the slice is not modified in place.
    df = df.iloc[-15:, :].copy()
    df.columns = ['12min', '30min', '60min', 'Term']

    fig, ax = plt.subplots(1, 1, figsize=(10, 8))
    b = df.plot.barh(x='Term', ax=ax, color=['#A63446', "#F5B841", '#9DBBAE'])
    # Significance threshold (adjusted p = 0.05).
    b.axvline(x=-np.log10(0.05), color='black', linestyle='--', alpha=.9, linewidth=3)

    b.set_xlabel(f'-log$_{{10}}$ Adjusted P-value', fontsize=12)
    b.set_ylabel(database_names[idx], fontsize=20)

    # change font size of x and y ticks
    b.tick_params(axis='both', which='both', labelsize=10)

    db_name = database_names[idx].replace("\n", "_")
    df.to_csv(f"gnn_enrichr_results_all_vs_all_uc/{db_name}_pathways_changed_uniq_terms.tsv", sep='\t', index=False)
    fig.set_tight_layout(True)
    fig.savefig(f"gnn_enrichr_results_all_vs_all_uc/{db_name}_pathways_changed_uniq_terms.svg")
    fig.savefig(f"gnn_enrichr_results_all_vs_all_uc/{db_name}_pathways_changed_uniq_terms.png", dpi=300, facecolor="white", edgecolor='none')

    # Close the figure so figures do not accumulate across iterations.
    plt.close(fig)

In [None]:
# Unchanged regions uniq terms
#
# For each library, plot and save the terms that appear only in the
# "unchanged regions" comparison table (top 15 by strongest logPadj).
for idx, (changed, notChanged) in enumerate(zip(comparison_dfs_all_vs_all[0], comparison_dfs_all_vs_all[1])):
    common_terms = set(changed.index) & set(notChanged.index)
    # remove common terms from notChanged
    df = notChanged.loc[~notChanged.index.isin(common_terms)]
    if len(df) == 0:
        continue  # nothing to plot; do not create an unused figure

    # Rank by the strongest logPadj across the three time columns.
    df = df.loc[df.iloc[:,[0,1,2]].max(axis=1).sort_values(ascending=True).index]
    # Copy before relabelling so the slice is not modified in place.
    df = df.iloc[-15:, :].copy()
    df.columns = ['12min', '30min', '60min', 'Term']

    fig, ax = plt.subplots(1, 1, figsize=(10, 8))
    b = df.plot.barh(x='Term', ax=ax, color=['#A63446', "#F5B841", '#9DBBAE'])
    # Significance threshold (adjusted p = 0.05).
    b.axvline(x=-np.log10(0.05), color='black', linestyle='--', alpha=.9, linewidth=3)

    b.set_xlabel(f'-log$_{{10}}$ Adjusted P-value', fontsize=12)
    b.set_ylabel(database_names[idx], fontsize=20)

    # change font size of x and y ticks
    b.tick_params(axis='both', which='both', labelsize=10)

    db_name = database_names[idx].replace("\n", "_")
    df.to_csv(f"gnn_enrichr_results_all_vs_all_uc/{db_name}_pathways_unchanged_uniq_terms.tsv", sep='\t', index=False)
    fig.set_tight_layout(True)
    fig.savefig(f"gnn_enrichr_results_all_vs_all_uc/{db_name}_pathways_unchanged_uniq_terms.svg")
    fig.savefig(f"gnn_enrichr_results_all_vs_all_uc/{db_name}_pathways_unchanged_uniq_terms.png", dpi=300, facecolor="white", edgecolor='none')

    # Close the figure so figures do not accumulate across iterations.
    plt.close(fig)

In [None]:
# Changed regions common terms
#
# For each library, plot and save the terms shared by both comparison
# tables, using the values from the "changed regions" table (top 15).
for idx, (changed, notChanged) in enumerate(zip(comparison_dfs_all_vs_all[0], comparison_dfs_all_vs_all[1])):
    common_terms = set(changed.index) & set(notChanged.index)

    df = changed.loc[changed.index.isin(common_terms)]
    if len(df) == 0:
        continue  # nothing to plot; do not create an unused figure

    # Rank by the strongest logPadj across the three time columns.
    df = df.loc[df.iloc[:,[0,1,2]].max(axis=1).sort_values(ascending=True).index]
    # Copy before relabelling so the slice is not modified in place.
    df = df.iloc[-15:, :].copy()
    df.columns = ['12min', '30min', '60min', 'Term']

    fig, ax = plt.subplots(1, 1, figsize=(10, 8))
    b = df.plot.barh(x='Term', ax=ax, color=['#A63446', "#F5B841", '#9DBBAE'])
    # Significance threshold (adjusted p = 0.05).
    b.axvline(x=-np.log10(0.05), color='black', linestyle='--', alpha=.9, linewidth=3)

    b.set_xlabel(f'-log$_{{10}}$ Adjusted P-value', fontsize=12)
    b.set_ylabel(database_names[idx], fontsize=20)

    # change font size of x and y ticks
    b.tick_params(axis='both', which='both', labelsize=12)

    db_name = database_names[idx].replace("\n", "_")
    df.to_csv(f"gnn_enrichr_results_all_vs_all_uc/{db_name}_pathways_changed_common_terms.tsv", sep='\t', index=False)
    fig.set_tight_layout(True)
    fig.savefig(f"gnn_enrichr_results_all_vs_all_uc/{db_name}_pathways_changed_common_terms.svg")
    fig.savefig(f"gnn_enrichr_results_all_vs_all_uc/{db_name}_pathways_changed_common_terms.png", dpi=300, facecolor="white", edgecolor='none')

    # Close the figure so figures do not accumulate across iterations.
    plt.close(fig)

In [None]:
# not Changed regions common terms
#
# For each library, plot and save the terms shared by both comparison
# tables, using the values from the "unchanged regions" table (top 15).
for idx, (changed, notChanged) in enumerate(zip(comparison_dfs_all_vs_all[0], comparison_dfs_all_vs_all[1])):
    common_terms = set(changed.index) & set(notChanged.index)
    # keep only the common terms of notChanged
    df = notChanged.loc[notChanged.index.isin(common_terms)]
    if len(df) == 0:
        continue  # nothing to plot; do not create an unused figure

    # Rank by the strongest logPadj across the three time columns.
    df = df.loc[df.iloc[:,[0,1,2]].max(axis=1).sort_values(ascending=True).index]
    # Copy before relabelling so the slice is not modified in place.
    df = df.iloc[-15:, :].copy()
    df.columns = ['12min', '30min', '60min', 'Term']

    fig, ax = plt.subplots(1, 1, figsize=(10, 8))
    b = df.plot.barh(x='Term', ax=ax, color=['#A63446', "#F5B841", '#9DBBAE'])
    # Significance threshold (adjusted p = 0.05).
    b.axvline(x=-np.log10(0.05), color='black', linestyle='--', alpha=.9, linewidth=3)

    b.set_xlabel(f'-log$_{{10}}$ Adjusted P-value', fontsize=12)
    b.set_ylabel(database_names[idx], fontsize=20)

    # change font size of x and y ticks
    b.tick_params(axis='both', which='both', labelsize=12)
    db_name = database_names[idx].replace("\n", "_")

    fig.set_tight_layout(True)
    df.to_csv(f"gnn_enrichr_results_all_vs_all_uc/{db_name}_pathways_unchanged_common_terms.tsv", sep='\t', index=False)
    fig.savefig(f"gnn_enrichr_results_all_vs_all_uc/{db_name}_pathways_unchanged_common_terms.svg")
    fig.savefig(f"gnn_enrichr_results_all_vs_all_uc/{db_name}_pathways_unchanged_common_terms.png", dpi=300, facecolor="white", edgecolor='none')

    # Close the figure so figures do not accumulate across iterations.
    plt.close(fig)
    

### GNN res, expression profile

In [None]:
def map_geneID_to_name(geneID):
    """Return the gene name of ``geneID`` from the global ``genes`` table.

    Returns ``None`` when no row has this ``gene_id``; with several matching
    rows the first one is used.
    """
    matching_names = genes.external_gene_name[genes.gene_id == geneID]
    if len(matching_names) == 0:
        return None
    return matching_names.values[0]

def map_txID_to_name(txID):
    """Return the gene name of transcript ``txID`` from the global ``genes`` table.

    Returns ``None`` when no row has this ``tx_id``; with several matching
    rows the first one is used.
    """
    matching_names = genes.external_gene_name[genes.tx_id == txID]
    if len(matching_names) == 0:
        return None
    return matching_names.values[0]

In [None]:
# Sample directories in the column order used for the TPM table below; the
# grouping there takes consecutive triplets.
order = ['SU_100', 'SU_200', 'SU_300', 'SU_112', 'SU_212', 'SU_312', 'SU_130', 'SU_230', 'SU_330', 'SU_160', 'SU_260', 'SU_360']

# Use the first sample to derive the gene name of every retained transcript.
# NOTE(review): this assumes all quant.sf files list transcripts in the same
# order as SU_100 — confirm.
df = pd.read_csv(f"/home/carlos/oldies/projects/rna-seq/quant/SU_100/quant.sf", sep="\t")
df.Name = df.Name.apply(lambda x: x.split(".")[0])  # drop the version suffix
df = df.loc[df.Name.isin(genes.tx_id.values)]

# One lookup table instead of scanning `genes` twice per transcript. The
# first row per transcript wins, as before. Every remaining Name is present
# in genes.tx_id because of the filter above.
tx_to_name = genes.drop_duplicates(subset='tx_id').set_index('tx_id')['external_gene_name']
mapped_names = df.Name.map(tx_to_name).tolist()

In [None]:
# Collect the TPM column of every sample, restricted to the transcripts
# present in `genes`, as one Series named after the sample.
series = []

for name in order:
    df = pd.read_csv(f"/home/carlos/oldies/projects/rna-seq/quant/{name}/quant.sf", sep="\t")
    df.Name = [tx.split(".")[0] for tx in df.Name]  # drop the version suffix
    df = df.loc[df.Name.isin(genes.tx_id.values)]
    series.append(df['TPM'].rename(name))

# Rows are transcripts (aligned on the row index of the quant files),
# columns are the samples in `order`.
tcounts_df = pd.DataFrame(series).T

tcounts_df['geneName'] = mapped_names

# Average each block of three consecutive sample columns into one column
# per condition.
for i, name in zip(range(0, 12, 3), ["Control", "12min", "30min", "60min"]):
    tcounts_df[name] = tcounts_df.iloc[:, i : i + 3].mean(axis=1)

In [None]:
# Alternative gene panel:
# which_Genes = [
#     "ATF2", "ATF3", "ATF4", 
#     "JUN", "JUNB", "JUND",
#     "FOS", "FOSL1", "FOSL2", "FOSB", 
#     "MAF", "MAFB",
#     "TP53"]

which_Genes = "SURF1;POLH;GTF2B;ADCY6;BRF2;PRIM1;DGUOK;RNMT;SEC61A1;ZWINT;POLD1;RBX1;CDA;NELFE;RFC4".split(";")

# Long-format table: one row per (gene, condition) holding the mean TPM of
# the first matching row in tcounts_df.
plot_df = pd.DataFrame(
    [
        {
            "geneName": gene_oi,
            "Mean": tcounts_df.loc[tcounts_df.geneName == gene_oi, name].values[0],
            "time": name,
        }
        for gene_oi in which_Genes
        for name in ["Control", "12min", "30min", "60min"]
    ],
    columns=["geneName", "Mean", "time"],
)

# One bar-plot panel per gene, side by side.
fig, ax = plt.subplots(figsize=(20, 10), ncols=len(which_Genes))
for i, gene_oi in enumerate(which_Genes):
    panel = ax[i]
    sns.barplot(x="time", y="Mean", data=plot_df.loc[plot_df.geneName == gene_oi], ax=panel)
    panel.set_title(gene_oi)
    panel.set_ylabel("Mean TPM")
    panel.set_xlabel("Time")
    panel.set_xticklabels(panel.get_xticklabels(), rotation=45, horizontalalignment='right')