In [1]:
import pandas as pd
import numpy as np
from multiprocessing import Pool
from functools import partial
import glob
import os
import plotly.express as px
import math
import matplotlib.pyplot as plt
from matplotlib import colors
from scipy import stats, special
from statsmodels.stats import multitest
import statsmodels.api as sm
import statsmodels.formula.api as smf
import plotly.io as pio
import seaborn as sns

from functools import reduce

# Keep text in exported SVG figures as text (not converted to paths), so labels stay editable.
plt.rcParams['svg.fonttype'] = 'none'

# Root URL under which all input tables used by this notebook are read.
# NOTE(review): the value ends with "/" and every path in this notebook is built as
# f"{base_dir}/...", so the requested URLs contain "//" — confirm the server accepts that.
base_dir = "http://bartzabel.ls.manchester.ac.uk/orozcolab/SNP2Mechanism/"

In [2]:
# Expression QTL tables, read in the order CD4 then CD8.
# The "nominal" files are the *_merged.txt tables, the "permuted" files are the *_FDR.txt tables
# (presumably QTLtools nominal / permutation passes — confirm).
RNA_nominal_CD4, RNA_nominal_CD8 = (
    pd.read_csv(f"{base_dir}/QTLs/RNA/RNA_nominal_{cell_type}_merged.txt", sep=" ")
    for cell_type in ("CD4", "CD8")
)
RNA_permuted_CD4, RNA_permuted_CD8 = (
    pd.read_csv(f"{base_dir}/QTLs/RNA/RNA_permuted_{cell_type}_FDR.txt", sep=" ")
    for cell_type in ("CD4", "CD8")
)

In [3]:
# Chromatin-accessibility (ATAC) QTL tables, read in the order CD4 then CD8.
# The "nominal" files are the *_merged.txt tables, the "permuted" files are the *_FDR.txt tables.
ATAC_nominal_CD4, ATAC_nominal_CD8 = (
    pd.read_csv(f"{base_dir}/QTLs/ATAC/ATAC_nominal_{cell_type}_merged.txt", sep=" ")
    for cell_type in ("CD4", "CD8")
)
ATAC_permuted_CD4, ATAC_permuted_CD8 = (
    pd.read_csv(f"{base_dir}/QTLs/ATAC/ATAC_permuted_{cell_type}_FDR.txt", sep=" ")
    for cell_type in ("CD4", "CD8")
)

In [4]:
# Insulation-score QTL tables (stored under QTLs/HiC), read in the order CD4 then CD8.
# The "nominal" files are the *_merged.txt tables, the "permuted" files are the *_FDR.txt tables.
ins_nominal_CD4, ins_nominal_CD8 = (
    pd.read_csv(f"{base_dir}/QTLs/HiC/ins_nominal_{cell_type}_merged.txt", sep=" ")
    for cell_type in ("CD4", "CD8")
)
ins_permuted_CD4, ins_permuted_CD8 = (
    pd.read_csv(f"{base_dir}/QTLs/HiC/ins_permuted_{cell_type}_FDR.txt", sep=" ")
    for cell_type in ("CD4", "CD8")
)

In [5]:
# Chromatin-loop QTL tables (stored under QTLs/HiC), read in the order CD4 then CD8.
# The "nominal" files are the *_merged.txt tables, the "permuted" files are the *_FDR.txt tables.
loop_nominal_CD4, loop_nominal_CD8 = (
    pd.read_csv(f"{base_dir}/QTLs/HiC/loop_nominal_{cell_type}_merged.txt", sep=" ")
    for cell_type in ("CD4", "CD8")
)
loop_permuted_CD4, loop_permuted_CD8 = (
    pd.read_csv(f"{base_dir}/QTLs/HiC/loop_permuted_{cell_type}_FDR.txt", sep=" ")
    for cell_type in ("CD4", "CD8")
)

In [6]:
# SECURITY NOTE: unpickling can execute arbitrary code and this file is fetched over plain HTTP.
# Only run this cell against a server you trust (or switch to a data-only format such as parquet).
gtf_annotation_df = pd.read_pickle(f"{base_dir}/PsA_cleaned_analysis/gencode_gtf.pickle")

# Keep protein-coding transcript records and drop the columns that are empty for all of them.
# The explicit copy makes the column assignments below act on an independent frame.
gtf_transcripts = (
    gtf_annotation_df[
        (gtf_annotation_df["feature"] == "transcript")
        & (gtf_annotation_df["transcript_type"] == "protein_coding")
    ]
    .dropna(axis=1, how='all')
    .copy()
)

# Strip everything after the first "." so identifiers lose their version suffix.
gtf_transcripts["gene_id"] = gtf_transcripts["gene_id"].str.split(".").str[0]
gtf_transcripts["transcript_id"] = gtf_transcripts["transcript_id"].str.split(".").str[0]

# TSS position: "start" on the + strand, "end" otherwise (vectorised instead of a row-wise apply).
gtf_transcripts["TSS_start"] = (
    gtf_transcripts["start"]
    .where(gtf_transcripts["strand"] == "+", gtf_transcripts["end"])
    .astype("int64")
)

## Calculate the overlap between expression QTLs (eQTLs) and chromatin-accessibility QTLs at the promoters

In [7]:
from intervaltree import Interval, IntervalTree
def annotated_peaks(target_ATAC, tss_window=2500):
    """Attach to each ATAC QTL row the genes that have a TSS close to its peak.

    A peak is linked to a gene when the peak interval ``phe_from:phe_to``
    overlaps ``TSS_start - tss_window : TSS_start + tss_window`` for any record
    of that gene in the module-level ``gtf_transcripts`` table (intervaltree
    treats both as half-open intervals).

    Parameters
    ----------
    target_ATAC : pandas.DataFrame
        QTL table containing at least ``phe_id``, ``phe_chr``, ``phe_from``
        and ``phe_to``.
    tss_window : int, default 2500
        Half-width in bp of the window placed around every TSS. Must be
        positive, because intervaltree rejects empty intervals.

    Returns
    -------
    pandas.DataFrame
        The rows of ``target_ATAC`` whose peak overlaps at least one TSS
        window, with an extra ``gene_id`` column. A row is repeated once per
        distinct overlapping gene. Empty when nothing overlaps.
    """
    # Overlaps only need to be computed once per distinct peak.
    unique_peaks = target_ATAC[['phe_id', 'phe_chr', 'phe_from', 'phe_to']].drop_duplicates()

    # (phe_id, gene_id) pairs from all chromosomes. Building one DataFrame with explicit
    # columns at the end avoids the KeyError the per-chromosome frames raised whenever a
    # chromosome had no overlap, and avoids repeated concatenation.
    peak_gene_pairs = []

    for chromosome in unique_peaks['phe_chr'].unique():
        peaks_chr = unique_peaks[unique_peaks['phe_chr'] == chromosome]
        transcripts_chr = (
            gtf_transcripts[gtf_transcripts['seqname'] == chromosome]
            .drop_duplicates(subset=['TSS_start', 'gene_id'])
        )

        # One interval per distinct (TSS, gene) on this chromosome, carrying the gene id as payload.
        tree = IntervalTree()
        for transcript in transcripts_chr.itertuples():
            tree.add(Interval(transcript.TSS_start - tss_window,
                              transcript.TSS_start + tss_window,
                              transcript.gene_id))

        for peak in peaks_chr.itertuples(index=False):
            overlapping = tree[peak.phe_from:peak.phe_to]
            # dict.fromkeys keeps each gene once per peak (several TSSs of a gene may overlap).
            for gene_id in dict.fromkeys(hit.data for hit in overlapping):
                peak_gene_pairs.append((peak.phe_id, gene_id))

    peak_to_gene = pd.DataFrame(peak_gene_pairs, columns=['phe_id', 'gene_id'])

    # Bring the gene annotation back onto every QTL row and keep only annotated rows.
    result = pd.merge(target_ATAC, peak_to_gene, how='left', on=['phe_id'])
    return result[result["gene_id"].notnull()]

In [8]:
def get_values_RNA_ATAC_promoter(ref, target, fdr_threshold=0.1, nominal_threshold=0.01, concordance_threshold=0.01):
    """Print how often significant eQTLs share effect direction with promoter ATAC QTLs.

    ``target`` is annotated with ``annotated_peaks`` (peaks near a TSS), then the
    rows of ``ref`` with ``FDR < fdr_threshold`` are left-merged to it on the
    same variant (``var_id``) and gene (``ref.phe_id == target.gene_id``).
    A gene can have several TSSs and several peaks, so one eQTL may match many rows.

    Printed, in order:
      1. merged rows with ``nom_pval_A < nominal_threshold`` / number of significant ``ref`` rows
         (can exceed 1 because the left merge emits one row per matching peak);
      2. merged rows with ``nom_pval_B < nominal_threshold`` / number of significant ``ref`` rows;
      3. number of rows with ``nom_pval_B < concordance_threshold`` whose slopes have equal sign;
      4. number of such rows whose slopes have different sign;
      5. concordant / (concordant + discordant).

    Parameters
    ----------
    ref : pandas.DataFrame
        eQTL table with ``phe_id``, ``var_id``, ``nom_pval``, ``slope`` and ``FDR``.
    target : pandas.DataFrame
        ATAC QTL table accepted by ``annotated_peaks`` that also has ``var_id``,
        ``nom_pval`` and ``slope``.
    fdr_threshold, nominal_threshold, concordance_threshold : float
        Cut-offs described above; the defaults reproduce the original hard-coded values.

    Returns
    -------
    None
    """
    annotated_target = annotated_peaks(target)

    # Filter once instead of re-evaluating the FDR mask for every statistic.
    significant = ref[ref["FDR"] < fdr_threshold]
    A_B_merged = significant[["phe_id", "var_id", "nom_pval", "slope"]].merge(
        annotated_target[["phe_id", "var_id", "nom_pval", "slope", "gene_id"]],
        suffixes=("_A", "_B"),
        left_on=["var_id", "phe_id"],
        right_on=["var_id", "gene_id"],
        how="left",
    )

    # NaN denominator (prints "nan") instead of ZeroDivisionError when nothing passes the FDR cut-off.
    n_significant = len(significant) or float("nan")
    print(len(A_B_merged[A_B_merged["nom_pval_A"] < nominal_threshold]) / n_significant)
    print(len(A_B_merged[A_B_merged["nom_pval_B"] < nominal_threshold]) / n_significant)

    # Direction of effect is only compared where the target association is itself supported.
    df = A_B_merged[A_B_merged["nom_pval_B"] < concordance_threshold]
    concordant_mask = np.sign(df['slope_A']) == np.sign(df['slope_B'])
    discordant_mask = np.sign(df['slope_A']) != np.sign(df['slope_B'])

    num_concordant = concordant_mask.sum()
    num_discordant = discordant_mask.sum()
    num_compared = num_concordant + num_discordant

    print(f"Number of concordant rows: {num_concordant}")
    print(f"Number of discordant rows: {num_discordant}")
    print(f"ratio: {num_concordant / num_compared if num_compared else float('nan')}")

In [9]:
# CD8: significant eQTLs (ref) against nominal ATAC QTLs for peaks near the gene's TSS (target).
get_values_RNA_ATAC_promoter(ref=RNA_permuted_CD8, target=ATAC_nominal_CD8)

1.0497203231821006
0.3766314481044127
Number of concordant rows: 536
Number of discordant rows: 70
ratio: 0.8844884488448845


In [10]:
# CD4: significant eQTLs (ref) against nominal ATAC QTLs for peaks near the gene's TSS (target).
get_values_RNA_ATAC_promoter(ref=RNA_permuted_CD4, target=ATAC_nominal_CD4)

1.0541871921182266
0.39704433497536945
Number of concordant rows: 354
Number of discordant rows: 49
ratio: 0.8784119106699751


## Calculate the overlap between expression QTLs (eQTLs) and insulation-score QTLs for the domain overlapping the promoter
This also works for loops, in which case every gene whose TSS lies between the loop anchors is considered.

In [11]:
from intervaltree import Interval, IntervalTree
def annotated_ins(target_ins, tss_flank=1):
    """Attach to each QTL row the genes whose TSS falls inside the phenotype interval.

    Every record of the module-level ``gtf_transcripts`` table contributes the
    interval ``TSS_start - tss_flank : TSS_start + tss_flank``; a phenotype is
    linked to a gene when its whole span ``phe_from:phe_to`` overlaps that
    interval (intervaltree treats both as half-open). The notebook uses this for
    insulation-score domains and for complete loop spans.

    Parameters
    ----------
    target_ins : pandas.DataFrame
        QTL table containing at least ``phe_id``, ``phe_chr``, ``phe_from``
        and ``phe_to``.
    tss_flank : int, default 1
        Number of bp added on each side of the TSS. Must be positive, because
        intervaltree rejects empty intervals.

    Returns
    -------
    pandas.DataFrame
        The rows of ``target_ins`` whose interval contains at least one TSS,
        with an extra ``gene_id`` column. A row is repeated once per distinct
        gene. Empty when nothing overlaps.
    """
    # Overlaps only need to be computed once per distinct phenotype interval.
    unique_phenotypes = target_ins[['phe_id', 'phe_chr', 'phe_from', 'phe_to']].drop_duplicates()

    # (phe_id, gene_id) pairs from all chromosomes. Building one DataFrame with explicit
    # columns at the end avoids the KeyError the per-chromosome frames raised whenever a
    # chromosome had no overlap, and avoids repeated concatenation.
    phenotype_gene_pairs = []

    for chromosome in unique_phenotypes['phe_chr'].unique():
        phenotypes_chr = unique_phenotypes[unique_phenotypes['phe_chr'] == chromosome]
        transcripts_chr = (
            gtf_transcripts[gtf_transcripts['seqname'] == chromosome]
            .drop_duplicates(subset=['TSS_start', 'gene_id'])
        )

        # One short interval per distinct (TSS, gene), carrying the gene id as payload.
        tree = IntervalTree()
        for transcript in transcripts_chr.itertuples():
            tree.add(Interval(transcript.TSS_start - tss_flank,
                              transcript.TSS_start + tss_flank,
                              transcript.gene_id))

        for phenotype in phenotypes_chr.itertuples(index=False):
            overlapping = tree[phenotype.phe_from:phenotype.phe_to]
            # dict.fromkeys keeps each gene once per phenotype (a gene may have several TSSs).
            for gene_id in dict.fromkeys(hit.data for hit in overlapping):
                phenotype_gene_pairs.append((phenotype.phe_id, gene_id))

    phenotype_to_gene = pd.DataFrame(phenotype_gene_pairs, columns=['phe_id', 'gene_id'])

    # Bring the gene annotation back onto every QTL row and keep only annotated rows.
    result = pd.merge(target_ins, phenotype_to_gene, how='left', on=['phe_id'])
    return result[result["gene_id"].notnull()]

In [12]:
def get_values_RNA_ins_promoter(ref, target, fdr_threshold=0.1, nominal_threshold=0.01, concordance_threshold=0.0001):
    """Print how often significant eQTLs share effect direction with QTLs of intervals spanning the TSS.

    ``target`` is annotated with ``annotated_ins`` (phenotype intervals that
    contain a TSS), then the rows of ``ref`` with ``FDR < fdr_threshold`` are
    left-merged to it on the same variant (``var_id``) and gene
    (``ref.phe_id == target.gene_id``). A gene can have several TSSs and be
    covered by several intervals, so one eQTL may match many rows.

    Printed, in order:
      1. merged rows with ``nom_pval_A < nominal_threshold`` / number of significant ``ref`` rows
         (can exceed 1 because the left merge emits one row per matching interval);
      2. merged rows with ``nom_pval_B < nominal_threshold`` / number of significant ``ref`` rows;
      3. number of rows with ``nom_pval_B < concordance_threshold`` whose slopes have equal sign;
      4. number of such rows whose slopes have different sign;
      5. concordant / (concordant + discordant).

    Parameters
    ----------
    ref : pandas.DataFrame
        eQTL table with ``phe_id``, ``var_id``, ``nom_pval``, ``slope`` and ``FDR``.
    target : pandas.DataFrame
        QTL table accepted by ``annotated_ins`` that also has ``var_id``,
        ``nom_pval`` and ``slope``.
    fdr_threshold, nominal_threshold, concordance_threshold : float
        Cut-offs described above; the defaults reproduce the original hard-coded values.

    Returns
    -------
    None
    """
    annotated_target = annotated_ins(target)

    # Filter once instead of re-evaluating the FDR mask for every statistic.
    significant = ref[ref["FDR"] < fdr_threshold]
    A_B_merged = significant[["phe_id", "var_id", "nom_pval", "slope"]].merge(
        annotated_target[["phe_id", "var_id", "nom_pval", "slope", "gene_id"]],
        suffixes=("_A", "_B"),
        left_on=["var_id", "phe_id"],
        right_on=["var_id", "gene_id"],
        how="left",
    )

    # NaN denominator (prints "nan") instead of ZeroDivisionError when nothing passes the FDR cut-off.
    n_significant = len(significant) or float("nan")
    print(len(A_B_merged[A_B_merged["nom_pval_A"] < nominal_threshold]) / n_significant)
    print(len(A_B_merged[A_B_merged["nom_pval_B"] < nominal_threshold]) / n_significant)

    # Direction of effect is only compared where the target association is itself supported.
    df = A_B_merged[A_B_merged["nom_pval_B"] < concordance_threshold]
    concordant_mask = np.sign(df['slope_A']) == np.sign(df['slope_B'])
    discordant_mask = np.sign(df['slope_A']) != np.sign(df['slope_B'])

    num_concordant = concordant_mask.sum()
    num_discordant = discordant_mask.sum()
    num_compared = num_concordant + num_discordant

    print(f"Number of concordant rows: {num_concordant}")
    print(f"Number of discordant rows: {num_discordant}")
    print(f"ratio: {num_concordant / num_compared if num_compared else float('nan')}")

In [13]:
# CD8: significant eQTLs (ref) against nominal insulation-score QTLs whose interval contains the TSS (target).
get_values_RNA_ins_promoter(ref=RNA_permuted_CD8, target=ins_nominal_CD8)

1.0957116221255438
0.40087010565568676
Number of concordant rows: 265
Number of discordant rows: 33
ratio: 0.889261744966443


In [14]:
# CD4: significant eQTLs (ref) against nominal insulation-score QTLs whose interval contains the TSS (target).
get_values_RNA_ins_promoter(ref=RNA_permuted_CD4, target=ins_nominal_CD4)

1.1192118226600984
0.44532019704433495
Number of concordant rows: 222
Number of discordant rows: 16
ratio: 0.9327731092436975


In [15]:
# CD8: same comparison with loops as target; a gene is linked when its TSS lies anywhere in the loop span.
get_values_RNA_ins_promoter(ref=RNA_permuted_CD8, target=loop_nominal_CD8)

1.4058421379738968
0.7675574891236793
Number of concordant rows: 144
Number of discordant rows: 67
ratio: 0.6824644549763034


In [16]:
# CD4: same comparison with loops as target; a gene is linked when its TSS lies anywhere in the loop span.
get_values_RNA_ins_promoter(ref=RNA_permuted_CD4, target=loop_nominal_CD4)

1.4561576354679804
0.8453201970443349
Number of concordant rows: 127
Number of discordant rows: 45
ratio: 0.7383720930232558


## Modified version that only considers promoters overlapping the loop anchors

In [17]:
from intervaltree import Interval, IntervalTree
def annotated_loop_anchors(target_loop, tss_window=2500, anchor_size=5000):
    """Attach to each loop QTL row the genes that have a TSS near either loop anchor.

    The two anchors of a loop are taken as ``phe_from : phe_from + anchor_size``
    and ``phe_to - anchor_size : phe_to``. A gene is linked to the loop when
    either anchor overlaps ``TSS_start - tss_window : TSS_start + tss_window``
    for any record of that gene in the module-level ``gtf_transcripts`` table
    (intervaltree treats all of these as half-open intervals).

    Parameters
    ----------
    target_loop : pandas.DataFrame
        Loop QTL table containing at least ``phe_id``, ``phe_chr``,
        ``phe_from`` and ``phe_to``.
    tss_window : int, default 2500
        Half-width in bp of the window placed around every TSS. Must be
        positive, because intervaltree rejects empty intervals.
    anchor_size : int, default 5000
        Width in bp of the anchor assumed at each end of the loop.

    Returns
    -------
    pandas.DataFrame
        The rows of ``target_loop`` with at least one gene at an anchor, with
        an extra ``gene_id`` column. A row is repeated once per distinct gene.
        Empty when nothing overlaps.
    """
    # Overlaps only need to be computed once per distinct loop.
    unique_loops = target_loop[['phe_id', 'phe_chr', 'phe_from', 'phe_to']].drop_duplicates()

    # (phe_id, gene_id) pairs from all chromosomes. Building one DataFrame with explicit
    # columns at the end avoids the KeyError the per-chromosome frames raised whenever a
    # chromosome had no overlap, and avoids repeated concatenation.
    loop_gene_pairs = []

    for chromosome in unique_loops['phe_chr'].unique():
        loops_chr = unique_loops[unique_loops['phe_chr'] == chromosome]
        transcripts_chr = (
            gtf_transcripts[gtf_transcripts['seqname'] == chromosome]
            .drop_duplicates(subset=['TSS_start', 'gene_id'])
        )

        # One interval per distinct (TSS, gene) on this chromosome, carrying the gene id as payload.
        tree = IntervalTree()
        for transcript in transcripts_chr.itertuples():
            tree.add(Interval(transcript.TSS_start - tss_window,
                              transcript.TSS_start + tss_window,
                              transcript.gene_id))

        for loop in loops_chr.itertuples(index=False):
            hits_anchor_a = tree[loop.phe_from:loop.phe_from + anchor_size]
            hits_anchor_b = tree[loop.phe_to - anchor_size:loop.phe_to]
            # Anchor A hits first, then anchor B; dict.fromkeys keeps each gene once per loop.
            anchor_hits = list(hits_anchor_a) + list(hits_anchor_b)
            for gene_id in dict.fromkeys(hit.data for hit in anchor_hits):
                loop_gene_pairs.append((loop.phe_id, gene_id))

    loop_to_gene = pd.DataFrame(loop_gene_pairs, columns=['phe_id', 'gene_id'])

    # Bring the gene annotation back onto every QTL row and keep only annotated rows.
    result = pd.merge(target_loop, loop_to_gene, how='left', on=['phe_id'])
    return result[result["gene_id"].notnull()]

In [18]:
def get_values_RNA_loop_anchors_promoter(ref, target, fdr_threshold=0.1, nominal_threshold=0.01, concordance_threshold=0.0001):
    """Print how often significant eQTLs share effect direction with QTLs of loops anchored at the promoter.

    ``target`` is annotated with ``annotated_loop_anchors`` (loops with a TSS
    window at either anchor), then the rows of ``ref`` with
    ``FDR < fdr_threshold`` are left-merged to it on the same variant
    (``var_id``) and gene (``ref.phe_id == target.gene_id``). One eQTL may
    match several loops.

    Printed, in order:
      1. merged rows with ``nom_pval_A < nominal_threshold`` / number of significant ``ref`` rows
         (can exceed 1 because the left merge emits one row per matching loop);
      2. merged rows with ``nom_pval_B < nominal_threshold`` / number of significant ``ref`` rows;
      3. number of rows with ``nom_pval_B < concordance_threshold`` whose slopes have equal sign;
      4. number of such rows whose slopes have different sign;
      5. concordant / (concordant + discordant).

    Parameters
    ----------
    ref : pandas.DataFrame
        eQTL table with ``phe_id``, ``var_id``, ``nom_pval``, ``slope`` and ``FDR``.
    target : pandas.DataFrame
        Loop QTL table accepted by ``annotated_loop_anchors`` that also has
        ``var_id``, ``nom_pval`` and ``slope``.
    fdr_threshold, nominal_threshold, concordance_threshold : float
        Cut-offs described above; the defaults reproduce the original hard-coded values.

    Returns
    -------
    None
    """
    annotated_target = annotated_loop_anchors(target)

    # Filter once instead of re-evaluating the FDR mask for every statistic.
    significant = ref[ref["FDR"] < fdr_threshold]
    A_B_merged = significant[["phe_id", "var_id", "nom_pval", "slope"]].merge(
        annotated_target[["phe_id", "var_id", "nom_pval", "slope", "gene_id"]],
        suffixes=("_A", "_B"),
        left_on=["var_id", "phe_id"],
        right_on=["var_id", "gene_id"],
        how="left",
    )

    # NaN denominator (prints "nan") instead of ZeroDivisionError when nothing passes the FDR cut-off.
    n_significant = len(significant) or float("nan")
    print(len(A_B_merged[A_B_merged["nom_pval_A"] < nominal_threshold]) / n_significant)
    print(len(A_B_merged[A_B_merged["nom_pval_B"] < nominal_threshold]) / n_significant)

    # Direction of effect is only compared where the target association is itself supported.
    df = A_B_merged[A_B_merged["nom_pval_B"] < concordance_threshold]
    concordant_mask = np.sign(df['slope_A']) == np.sign(df['slope_B'])
    discordant_mask = np.sign(df['slope_A']) != np.sign(df['slope_B'])

    num_concordant = concordant_mask.sum()
    num_discordant = discordant_mask.sum()
    num_compared = num_concordant + num_discordant

    print(f"Number of concordant rows: {num_concordant}")
    print(f"Number of discordant rows: {num_discordant}")
    print(f"ratio: {num_concordant / num_compared if num_compared else float('nan')}")

In [19]:
# CD8: significant eQTLs (ref) against nominal loop QTLs with the gene's TSS at a loop anchor (target).
get_values_RNA_loop_anchors_promoter(ref=RNA_permuted_CD8, target=loop_nominal_CD8)


1.0863890615289
0.23430702299564948
Number of concordant rows: 68
Number of discordant rows: 14
ratio: 0.8292682926829268


In [20]:
# CD4: significant eQTLs (ref) against nominal loop QTLs with the gene's TSS at a loop anchor (target).
get_values_RNA_loop_anchors_promoter(ref=RNA_permuted_CD4, target=loop_nominal_CD4)

1.1014778325123153
0.270935960591133
Number of concordant rows: 52
Number of discordant rows: 7
ratio: 0.8813559322033898


## Finally, repeat the analyses above using ATAC peaks rather than genes

In [21]:
from intervaltree import Interval, IntervalTree
def annotated_loop_ATAC(target_loop, ref_ATAC, peak_flank=5000):
    """Attach to each QTL row the ATAC peaks that lie within (or near) its phenotype interval.

    Every distinct peak in ``ref_ATAC`` contributes the interval
    ``phe_from - peak_flank : phe_to + peak_flank``; a phenotype of
    ``target_loop`` is linked to the peak when its whole span
    ``phe_from:phe_to`` overlaps that interval (intervaltree treats both as
    half-open).

    Parameters
    ----------
    target_loop : pandas.DataFrame
        QTL table containing at least ``phe_id``, ``phe_chr``, ``phe_from``
        and ``phe_to``.
    ref_ATAC : pandas.DataFrame
        Peak table with ``phe_id``, ``phe_chr``, ``phe_from`` and ``phe_to``;
        only the first row per ``phe_id`` is used.
    peak_flank : int, default 5000
        Number of bp by which each peak is extended on both sides.

    Returns
    -------
    pandas.DataFrame
        The rows of ``target_loop`` linked to at least one peak, with an extra
        ``peak_ids`` column holding one peak id per row. A row is repeated once
        per distinct peak. Empty when nothing overlaps.
    """
    # Overlaps only need to be computed once per distinct phenotype interval.
    unique_loops = target_loop[['phe_id', 'phe_chr', 'phe_from', 'phe_to']].drop_duplicates()

    # (phe_id, peak id) pairs from all chromosomes. Collecting them in one list replaces the
    # former `len(...) > 1` guard, which silently dropped chromosomes with exactly one pair,
    # and lets the final merge work even when no pair exists at all.
    loop_peak_pairs = []

    for chromosome in unique_loops['phe_chr'].unique():
        loops_chr = unique_loops[unique_loops['phe_chr'] == chromosome]
        peaks_chr = (
            ref_ATAC[ref_ATAC['phe_chr'] == chromosome]
            .drop_duplicates(subset=['phe_id'])
        )

        # One padded interval per peak on this chromosome, carrying the peak id as payload.
        tree = IntervalTree()
        for peak in peaks_chr.itertuples():
            tree.add(Interval(peak.phe_from - peak_flank, peak.phe_to + peak_flank, peak.phe_id))

        for loop in loops_chr.itertuples(index=False):
            overlapping = tree[loop.phe_from:loop.phe_to]
            # dict.fromkeys keeps each peak once per phenotype.
            for peak_id in dict.fromkeys(hit.data for hit in overlapping):
                loop_peak_pairs.append((loop.phe_id, peak_id))

    loop_to_peak = pd.DataFrame(loop_peak_pairs, columns=['phe_id', 'peak_ids'])

    # Bring the peak annotation back onto every QTL row and keep only annotated rows.
    result = pd.merge(target_loop, loop_to_peak, how='left', on=['phe_id'])
    return result[result["peak_ids"].notnull()]

In [22]:
def get_values_ATAC_loop(ref, target, fdr_threshold=0.1, nominal_threshold=0.01, concordance_threshold=0.0001, adj_pval_threshold=0.1):
    """Print how often significant ATAC QTLs share effect direction with QTLs of intervals covering the peak.

    ``target`` is annotated with ``annotated_loop_ATAC`` using the peaks of
    ``ref`` with ``adj_beta_pval < adj_pval_threshold``. The rows of ``ref``
    with ``FDR < fdr_threshold`` are then left-merged to it on the same variant
    (``var_id``) and peak (``ref.phe_id == target.peak_ids``). One ATAC QTL may
    match several target phenotypes.

    NOTE(review): peak selection uses ``adj_beta_pval`` while the merge uses
    ``FDR``; this is kept as in the original — confirm that it is intended.

    Printed, in order:
      1. merged rows with ``nom_pval_A < nominal_threshold`` / number of significant ``ref`` rows
         (can exceed 1 because the left merge emits one row per matching phenotype);
      2. merged rows with ``nom_pval_B < nominal_threshold`` / number of significant ``ref`` rows;
      3. number of rows with ``nom_pval_B < concordance_threshold`` whose slopes have equal sign;
      4. number of such rows whose slopes have different sign;
      5. concordant / (concordant + discordant).

    Parameters
    ----------
    ref : pandas.DataFrame
        ATAC QTL table with ``phe_id``, ``phe_chr``, ``phe_from``, ``phe_to``,
        ``var_id``, ``nom_pval``, ``slope``, ``FDR`` and ``adj_beta_pval``.
    target : pandas.DataFrame
        QTL table accepted by ``annotated_loop_ATAC`` that also has ``var_id``,
        ``nom_pval`` and ``slope``.
    fdr_threshold, nominal_threshold, concordance_threshold, adj_pval_threshold : float
        Cut-offs described above; the defaults reproduce the original hard-coded values.

    Returns
    -------
    None
    """
    annotated_target = annotated_loop_ATAC(target, ref[ref["adj_beta_pval"] < adj_pval_threshold])

    # Filter once instead of re-evaluating the FDR mask for every statistic.
    significant = ref[ref["FDR"] < fdr_threshold]
    A_B_merged = significant[["phe_id", "var_id", "nom_pval", "slope"]].merge(
        annotated_target[["phe_id", "var_id", "nom_pval", "slope", "peak_ids"]],
        suffixes=("_A", "_B"),
        left_on=["var_id", "phe_id"],
        right_on=["var_id", "peak_ids"],
        how="left",
    )

    # NaN denominator (prints "nan") instead of ZeroDivisionError when nothing passes the FDR cut-off.
    n_significant = len(significant) or float("nan")
    print(len(A_B_merged[A_B_merged["nom_pval_A"] < nominal_threshold]) / n_significant)
    print(len(A_B_merged[A_B_merged["nom_pval_B"] < nominal_threshold]) / n_significant)

    # Direction of effect is only compared where the target association is itself supported.
    df = A_B_merged[A_B_merged["nom_pval_B"] < concordance_threshold]
    concordant_mask = np.sign(df['slope_A']) == np.sign(df['slope_B'])
    discordant_mask = np.sign(df['slope_A']) != np.sign(df['slope_B'])

    num_concordant = concordant_mask.sum()
    num_discordant = discordant_mask.sum()
    num_compared = num_concordant + num_discordant

    print(f"Number of concordant rows: {num_concordant}")
    print(f"Number of discordant rows: {num_discordant}")
    print(f"ratio: {num_concordant / num_compared if num_compared else float('nan')}")

In [23]:
# CD8: significant ATAC QTLs (ref) against nominal loop QTLs whose full span covers the padded peak (target).
get_values_ATAC_loop(ref=ATAC_permuted_CD8, target=loop_nominal_CD8)

1.5587075435695203
0.9694695331382776
Number of concordant rows: 1438
Number of discordant rows: 404
ratio: 0.7806731813246471


In [24]:
# CD4: significant ATAC QTLs (ref) against nominal loop QTLs whose full span covers the padded peak (target).
get_values_ATAC_loop(ref=ATAC_permuted_CD4, target=loop_nominal_CD4)

1.5830318974021704
1.0108516935218679
Number of concordant rows: 1170
Number of discordant rows: 306
ratio: 0.7926829268292683


In [25]:
# CD8: same comparison with insulation-score QTLs as target (intervals overlapping the padded peak).
get_values_ATAC_loop(ref=ATAC_permuted_CD8, target=ins_nominal_CD8)

1.1052029003943518
0.465335199084086
Number of concordant rows: 1813
Number of discordant rows: 118
ratio: 0.9388917659243915


In [26]:
# CD4: same comparison with insulation-score QTLs as target (intervals overlapping the padded peak).
get_values_ATAC_loop(ref=ATAC_permuted_CD4, target=ins_nominal_CD4)

1.113942781979612
0.495396251233147
Number of concordant rows: 1476
Number of discordant rows: 98
ratio: 0.9377382465057179


In [27]:
from intervaltree import Interval, IntervalTree
def annotated_loop_anchors_ATAC(target_loop, ref_ATAC, peak_flank=5000, anchor_size=5000):
    """Attach to each loop QTL row the ATAC peaks located at (or near) either loop anchor.

    The two anchors of a loop are taken as ``phe_from : phe_from + anchor_size``
    and ``phe_to - anchor_size : phe_to``. Every distinct peak in ``ref_ATAC``
    contributes the interval ``phe_from - peak_flank : phe_to + peak_flank``;
    a loop is linked to the peak when either anchor overlaps that interval
    (intervaltree treats all of these as half-open).

    Parameters
    ----------
    target_loop : pandas.DataFrame
        Loop QTL table containing at least ``phe_id``, ``phe_chr``,
        ``phe_from`` and ``phe_to``.
    ref_ATAC : pandas.DataFrame
        Peak table with ``phe_id``, ``phe_chr``, ``phe_from`` and ``phe_to``;
        only the first row per ``phe_id`` is used.
    peak_flank : int, default 5000
        Number of bp by which each peak is extended on both sides.
    anchor_size : int, default 5000
        Width in bp of the anchor assumed at each end of the loop.

    Returns
    -------
    pandas.DataFrame
        The rows of ``target_loop`` linked to at least one peak, with an extra
        ``peak_ids`` column holding one peak id per row. A row is repeated once
        per distinct peak. Empty when nothing overlaps.
    """
    # Overlaps only need to be computed once per distinct loop.
    unique_loops = target_loop[['phe_id', 'phe_chr', 'phe_from', 'phe_to']].drop_duplicates()

    # (phe_id, peak id) pairs from all chromosomes. Collecting them in one list replaces the
    # former `len(...) > 1` guard, which silently dropped chromosomes with exactly one pair,
    # and lets the final merge work even when no pair exists at all.
    loop_peak_pairs = []

    for chromosome in unique_loops['phe_chr'].unique():
        loops_chr = unique_loops[unique_loops['phe_chr'] == chromosome]
        peaks_chr = (
            ref_ATAC[ref_ATAC['phe_chr'] == chromosome]
            .drop_duplicates(subset=['phe_id'])
        )

        # One padded interval per peak on this chromosome, carrying the peak id as payload.
        tree = IntervalTree()
        for peak in peaks_chr.itertuples():
            tree.add(Interval(peak.phe_from - peak_flank, peak.phe_to + peak_flank, peak.phe_id))

        for loop in loops_chr.itertuples(index=False):
            hits_anchor_a = tree[loop.phe_from:loop.phe_from + anchor_size]
            hits_anchor_b = tree[loop.phe_to - anchor_size:loop.phe_to]
            # Anchor A hits first, then anchor B; dict.fromkeys keeps each peak once per loop.
            anchor_hits = list(hits_anchor_a) + list(hits_anchor_b)
            for peak_id in dict.fromkeys(hit.data for hit in anchor_hits):
                loop_peak_pairs.append((loop.phe_id, peak_id))

    loop_to_peak = pd.DataFrame(loop_peak_pairs, columns=['phe_id', 'peak_ids'])

    # Bring the peak annotation back onto every QTL row and keep only annotated rows.
    result = pd.merge(target_loop, loop_to_peak, how='left', on=['phe_id'])
    return result[result["peak_ids"].notnull()]

In [28]:
def get_values_ATAC_loop_anchors(ref, target, fdr_threshold=0.1, nominal_threshold=0.01, concordance_threshold=0.0001, adj_pval_threshold=0.1):
    """Print how often significant ATAC QTLs share effect direction with QTLs of loops anchored at the peak.

    ``target`` is annotated with ``annotated_loop_anchors_ATAC`` using the
    peaks of ``ref`` with ``adj_beta_pval < adj_pval_threshold``. The rows of
    ``ref`` with ``FDR < fdr_threshold`` are then left-merged to it on the same
    variant (``var_id``) and peak (``ref.phe_id == target.peak_ids``). One ATAC
    QTL may match several loops.

    NOTE(review): peak selection uses ``adj_beta_pval`` while the merge uses
    ``FDR``; this is kept as in the original — confirm that it is intended.

    Printed, in order:
      1. merged rows with ``nom_pval_A < nominal_threshold`` / number of significant ``ref`` rows
         (can exceed 1 because the left merge emits one row per matching loop);
      2. merged rows with ``nom_pval_B < nominal_threshold`` / number of significant ``ref`` rows;
      3. number of rows with ``nom_pval_B < concordance_threshold`` whose slopes have equal sign;
      4. number of such rows whose slopes have different sign;
      5. concordant / (concordant + discordant).

    Parameters
    ----------
    ref : pandas.DataFrame
        ATAC QTL table with ``phe_id``, ``phe_chr``, ``phe_from``, ``phe_to``,
        ``var_id``, ``nom_pval``, ``slope``, ``FDR`` and ``adj_beta_pval``.
    target : pandas.DataFrame
        Loop QTL table accepted by ``annotated_loop_anchors_ATAC`` that also
        has ``var_id``, ``nom_pval`` and ``slope``.
    fdr_threshold, nominal_threshold, concordance_threshold, adj_pval_threshold : float
        Cut-offs described above; the defaults reproduce the original hard-coded values.

    Returns
    -------
    None
    """
    annotated_target = annotated_loop_anchors_ATAC(target, ref[ref["adj_beta_pval"] < adj_pval_threshold])

    # Filter once instead of re-evaluating the FDR mask for every statistic.
    significant = ref[ref["FDR"] < fdr_threshold]
    A_B_merged = significant[["phe_id", "var_id", "nom_pval", "slope"]].merge(
        annotated_target[["phe_id", "var_id", "nom_pval", "slope", "peak_ids"]],
        suffixes=("_A", "_B"),
        left_on=["var_id", "phe_id"],
        right_on=["var_id", "peak_ids"],
        how="left",
    )

    # NaN denominator (prints "nan") instead of ZeroDivisionError when nothing passes the FDR cut-off.
    n_significant = len(significant) or float("nan")
    print(len(A_B_merged[A_B_merged["nom_pval_A"] < nominal_threshold]) / n_significant)
    print(len(A_B_merged[A_B_merged["nom_pval_B"] < nominal_threshold]) / n_significant)

    # Direction of effect is only compared where the target association is itself supported.
    df = A_B_merged[A_B_merged["nom_pval_B"] < concordance_threshold]
    concordant_mask = np.sign(df['slope_A']) == np.sign(df['slope_B'])
    discordant_mask = np.sign(df['slope_A']) != np.sign(df['slope_B'])

    num_concordant = concordant_mask.sum()
    num_discordant = discordant_mask.sum()
    num_compared = num_concordant + num_discordant

    print(f"Number of concordant rows: {num_concordant}")
    print(f"Number of discordant rows: {num_discordant}")
    print(f"ratio: {num_concordant / num_compared if num_compared else float('nan')}")

In [29]:
# CD8: significant ATAC QTLs (ref) against nominal loop QTLs with the padded peak at a loop anchor (target).
get_values_ATAC_loop_anchors(ref=ATAC_permuted_CD8, target=loop_nominal_CD8)

1.1289912224907772
0.28088029512784635
Number of concordant rows: 686
Number of discordant rows: 64
ratio: 0.9146666666666666


In [30]:
# CD4: significant ATAC QTLs (ref) against nominal loop QTLs with the padded peak at a loop anchor (target).
get_values_ATAC_loop_anchors(ref=ATAC_permuted_CD4, target=loop_nominal_CD4)

1.134988490628083
0.3003946070371588
Number of concordant rows: 596
Number of discordant rows: 56
ratio: 0.9141104294478528


In [31]:
def get_values_SAME(ref, target, fdr_threshold=0.1, nominal_threshold=0.01, concordance_threshold=0.0001):
    """Print how well significant QTLs of ``ref`` replicate for the same phenotype in ``target``.

    The rows of ``ref`` with ``FDR < fdr_threshold`` are left-merged to
    ``target`` on the same variant and phenotype (``var_id``, ``phe_id``);
    ``_A`` columns come from ``ref`` and ``_B`` columns from ``target``.

    Printed, in order:
      1. merged rows with ``nom_pval_A < nominal_threshold`` / number of significant ``ref`` rows;
      2. merged rows with ``nom_pval_B < nominal_threshold`` / number of significant ``ref`` rows;
      3. number of rows with ``nom_pval_B < concordance_threshold`` whose slopes have equal sign;
      4. number of such rows whose slopes have different sign;
      5. concordant / (concordant + discordant).
    The fractions are printed as ``nan`` when their denominator is zero.

    Parameters
    ----------
    ref : pandas.DataFrame
        QTL table with ``phe_id``, ``var_id``, ``nom_pval``, ``slope`` and ``FDR``.
    target : pandas.DataFrame
        QTL table with ``phe_id``, ``var_id``, ``nom_pval`` and ``slope``.
    fdr_threshold, nominal_threshold, concordance_threshold : float
        Cut-offs described above; the defaults reproduce the original hard-coded values.

    Returns
    -------
    None
    """
    # Filter once instead of re-evaluating the FDR mask for every statistic.
    significant = ref[ref["FDR"] < fdr_threshold]
    A_B_merged = significant[["phe_id", "var_id", "nom_pval", "slope"]].merge(
        target[["phe_id", "var_id", "nom_pval", "slope"]],
        suffixes=("_A", "_B"),
        on=["var_id", "phe_id"],
        how="left",
    )

    # NaN denominator (prints "nan") instead of ZeroDivisionError when nothing passes the FDR cut-off.
    n_significant = len(significant) or float("nan")
    print(len(A_B_merged[A_B_merged["nom_pval_A"] < nominal_threshold]) / n_significant)
    print(len(A_B_merged[A_B_merged["nom_pval_B"] < nominal_threshold]) / n_significant)

    # Direction of effect is only compared where the target association is itself supported.
    df = A_B_merged[A_B_merged["nom_pval_B"] < concordance_threshold]
    concordant_mask = np.sign(df['slope_A']) == np.sign(df['slope_B'])
    discordant_mask = np.sign(df['slope_A']) != np.sign(df['slope_B'])

    num_concordant = concordant_mask.sum()
    num_discordant = discordant_mask.sum()
    num_compared = num_concordant + num_discordant

    print(f"Number of concordant rows: {num_concordant}")
    print(f"Number of discordant rows: {num_discordant}")
    print(f"ratio: {num_concordant / num_compared if num_compared else float('nan')}")

In [32]:
# Cross-cell-type check: significant CD8 eQTLs looked up (same gene, same variant) in the CD4 nominal table.
get_values_SAME(ref=RNA_permuted_CD8, target=RNA_nominal_CD4)

1.0
0.7638284648850218
Number of concordant rows: 773
Number of discordant rows: 0
ratio: 1.0


In [33]:
# Cross-cell-type check: significant CD4 eQTLs looked up (same gene, same variant) in the CD8 nominal table.
get_values_SAME(ref=RNA_permuted_CD4, target=RNA_nominal_CD8)

1.0
0.8354679802955665
Number of concordant rows: 677
Number of discordant rows: 0
ratio: 1.0


In [34]:
# Cross-cell-type check: significant CD8 ATAC QTLs looked up (same peak, same variant) in the CD4 nominal table.
get_values_SAME(ref=ATAC_permuted_CD8, target=ATAC_nominal_CD4)

1.0
0.7832336852817707
Number of concordant rows: 4601
Number of discordant rows: 0
ratio: 1.0


In [35]:
# Cross-cell-type check: significant CD4 ATAC QTLs looked up (same peak, same variant) in the CD8 nominal table.
get_values_SAME(ref=ATAC_permuted_CD4, target=ATAC_nominal_CD8)

1.0
0.8452811575139757
Number of concordant rows: 4291
Number of discordant rows: 0
ratio: 1.0


In [36]:
# Cross-cell-type check: significant CD8 insulation-score QTLs looked up in the CD4 nominal table.
get_values_SAME(ref=ins_permuted_CD8, target=ins_nominal_CD4)

1.0
0.7264606470304201
Number of concordant rows: 3761
Number of discordant rows: 0
ratio: 1.0


In [37]:
# Cross-cell-type check: significant CD4 insulation-score QTLs looked up in the CD8 nominal table.
get_values_SAME(ref=ins_permuted_CD4, target=ins_nominal_CD8)

1.0
0.7525069637883008
Number of concordant rows: 3689
Number of discordant rows: 0
ratio: 1.0


In [38]:
# Cross-cell-type check: significant CD8 loop QTLs looked up in the CD4 nominal table.
get_values_SAME(ref=loop_permuted_CD8, target=loop_nominal_CD4)

1.0
0.8582677165354331
Number of concordant rows: 514
Number of discordant rows: 0
ratio: 1.0


In [39]:
# Cross-cell-type check: significant CD4 loop QTLs looked up in the CD8 nominal table.
get_values_SAME(ref=loop_permuted_CD4, target=loop_nominal_CD8)

1.0
0.9050086355785838
Number of concordant rows: 460
Number of discordant rows: 0
ratio: 1.0
