# script to identify the loops that are differentially interacting

In [1]:
import pandas as pd
import numpy as np
import hicstraw 
from multiprocessing import Pool
from functools import partial
import glob
import os
import plotly.express as px
import math
import matplotlib.pyplot as plt
from matplotlib import colors
from pandarallel import pandarallel
import cooler
import cooltools
import pybedtools as pbed
pandarallel.initialize()
from scipy import stats, special
from statsmodels.stats import multitest
import statsmodels.api as sm
import statsmodels.formula.api as smf
import plotly.io as pio
import seaborn as sns
# Scratch directory handed to pybedtools for its temporary files; it is
# created here if it does not exist yet.
pybedtools_tmp_dir = "/mnt/iusers01/jw01/mdefscs4/scratch/temp_pybedtools/"
os.makedirs(pybedtools_tmp_dir, exist_ok=True)
pbed.helpers.set_tempdir(pybedtools_tmp_dir)

# Root of the analysis tree; the input and output paths below are built from it.
base_dir = "/mnt/jw01-aruk-home01/projects/psa_functional_genomics/PsA_cleaned_analysis"

# NOTE(review): presumably the hg38 chromosome-size file for bedtools -- confirm.
bed_genome_file = "/mnt/iusers01/jw01/mdefscs4/hg38.genome"

# 'none' keeps text as text (not outlines) in exported SVG figures.
plt.rcParams.update({"svg.fonttype": "none"})

INFO: Pandarallel will run on 28 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## first do CD4 vs CD8

In [2]:
# Inputs for the CD4-vs-CD8 comparison: the aggregated, normalized loop table
# (pickled DataFrame) and the Hi-C sample metadata (first CSV column is the index).
cd4_cd8_counts_pickle = f"{base_dir}/HiC_analysis/extracting_loop_counts/aggregated_counts/aggregated_normalized_loops_CD4_CD8.pk"
hic_metadata_csv = f"{base_dir}/metadata/cleaned_HiC_metadata.csv"

loops_analysed = pd.read_pickle(cd4_cd8_counts_pickle)
metadata_hic = pd.read_csv(hic_metadata_csv, index_col=0)

In [12]:
# Reshape the loop table to long format: one row per (loop, sample) pair with
# the sample's value in "interaction_strength". Every column that is not a
# loop descriptor is treated as a sample column.
loop_id_columns = ['chrA', 'A_start', 'A_end', 'chrB', 'B_start', 'B_end', 'FDR', 'DETECTION_SCALE', 'distance_bin']
sample_columns = loops_analysed.columns.difference(loop_id_columns)
loops_analysed_melted = loops_analysed.melt(
    id_vars=loop_id_columns,
    value_vars=sample_columns,
    var_name="sample",
    value_name="interaction_strength",
)

# Attach the per-sample metadata. The inner join keeps only samples whose
# column name matches a "folder_name" entry in the metadata.
loops_analysed_melted = pd.merge(
    loops_analysed_melted,
    metadata_hic,
    how="inner",
    left_on="sample",
    right_on="folder_name",
)

In [6]:
# this is the test we're doing for each loop.
# basically fit a linear model. We tried different parameters, using robust etc, but none make a big difference.
# biggest differences are in how you actually retrieve and normalize the loops prior to this step.
def calculate_ols_test_CD4_CD8(df, formula = "interaction_strength~cell_type+female_sex"):
    """Fit an OLS model to one loop's samples and summarise the CD8 effect.

    Args:
        df: rows for a single loop (one per sample), holding at least
            "interaction_strength" and the columns named in ``formula``.
        formula: model formula passed to ``smf.ols``.

    Returns:
        pd.Series with
        - "p_value": p-value of the ``cell_type[T.CD8]`` coefficient,
        - "base_mean": plain mean of "interaction_strength" over ``df``,
        - "CD4_mean": the model intercept,
        - "CD8_mean": intercept plus the ``cell_type[T.CD8]`` coefficient.

    NOTE(review): the intercept is the fitted value at the reference level of
    every term, so with the default formula "CD4_mean"/"CD8_mean" are
    presumably estimates at the baseline of ``female_sex`` rather than raw
    group means -- confirm this is the intended reading.
    """
    # The coefficient name is hard-coded: it only exists when CD4 is the
    # reference level and CD8 is present in the data.
    cd8_term = "cell_type[T.CD8]"

    fitted = smf.ols(formula, data=df).fit()
    cd4_level = fitted.params["Intercept"]

    summary = {
        "p_value": fitted.pvalues[cd8_term],
        "base_mean": df["interaction_strength"].mean(),
        "CD4_mean": cd4_level,
        "CD8_mean": cd4_level + fitted.params[cd8_term],
    }
    return pd.Series(summary)

In [13]:
# Run the per-loop test: keep only CD4/CD8 samples, group the long table by
# loop coordinates (plus distance bin) and fit one model per group in parallel.
cd4_cd8_rows = loops_analysed_melted["cell_type"].isin(["CD4", "CD8"])
loop_key_columns = ['chrA', 'A_start', 'A_end', 'chrB', 'B_start', 'B_end', 'distance_bin']
loops_by_key = loops_analysed_melted.loc[cd4_cd8_rows].groupby(loop_key_columns)
loops_analysed_p_val = loops_by_key.parallel_apply(calculate_ols_test_CD4_CD8)

In [14]:
# apply FDR correction (Benjamini-Hochberg); only the adjusted p-values,
# element [1] of the result, are kept -- alpha only affects the discarded
# reject mask.
loops_analysed_p_val = loops_analysed_p_val.reset_index()

# Correct only over loops that have a defined p-value. fdrcorrection does not
# handle NaN: a single NaN sorts last and propagates through its cumulative
# minimum, turning every adjusted p-value into NaN. Loops without a p-value
# keep FDR = NaN.
has_p_value = loops_analysed_p_val["p_value"].notna()
loops_analysed_p_val["FDR"] = np.nan
if has_p_value.any():
    loops_analysed_p_val.loc[has_p_value, "FDR"] = multitest.fdrcorrection(
        loops_analysed_p_val.loc[has_p_value, "p_value"], alpha = 0.1)[1]

In [15]:
# Save the per-loop statistics (coordinates, p-value, means, FDR) as CSV.
cd4_cd8_results_csv = f"{base_dir}/HiC_analysis/output_results/DE_loops_CD8_vs_CD4.csv"
loops_analysed_p_val.to_csv(cd4_cd8_results_csv)

## then do CD8 vs CD8_SF

In [3]:
# Aggregated, normalized loop table for the CD8-vs-CD8_SF comparison
# (pickled DataFrame). The sample metadata loaded earlier is reused.
cd8_cd8sf_counts_pickle = f"{base_dir}/HiC_analysis/extracting_loop_counts/aggregated_counts/aggregated_normalized_loops_CD8_CD8SF.pk"
loops_analysed = pd.read_pickle(cd8_cd8sf_counts_pickle)

In [4]:
# Long format: one row per (loop, sample), value in "interaction_strength".
# Columns that are not loop descriptors are taken to be per-sample columns.
loop_id_columns = ['chrA', 'A_start', 'A_end', 'chrB', 'B_start', 'B_end', 'FDR', 'DETECTION_SCALE', 'distance_bin']
sample_columns = loops_analysed.columns.difference(loop_id_columns)
loops_analysed_melted = loops_analysed.melt(
    id_vars=loop_id_columns,
    value_vars=sample_columns,
    var_name="sample",
    value_name="interaction_strength",
)

# Annotate each row with its sample's metadata; samples without a matching
# "folder_name" are dropped by the inner join.
loops_analysed_melted = pd.merge(
    loops_analysed_melted,
    metadata_hic,
    how="inner",
    left_on="sample",
    right_on="folder_name",
)

In [5]:
def calculate_ols_test_CD8_CD8SF(df, formula = "interaction_strength~cell_type+female_sex"):
    """Fit an OLS model to one loop's samples and summarise the CD8_SF effect.

    Only usable for the CD8 vs CD8_SF comparison: the coefficient name
    ``cell_type[T.CD8_SF]`` is hard-coded, so CD8 must be the reference level
    and CD8_SF must be present in ``df``.

    Args:
        df: rows for a single loop (one per sample), holding at least
            "interaction_strength" and the columns named in ``formula``.
        formula: model formula passed to ``smf.ols``.

    Returns:
        pd.Series with
        - "p_value": p-value of the ``cell_type[T.CD8_SF]`` coefficient,
        - "base_mean": plain mean of "interaction_strength" over ``df``,
        - "CD8_mean": the model intercept,
        - "CD8_SF_mean": intercept plus the ``cell_type[T.CD8_SF]`` coefficient.
    """
    sf_term = "cell_type[T.CD8_SF]"

    fitted = smf.ols(formula, data=df).fit()
    coefficients = fitted.params

    return pd.Series({
        "p_value": fitted.pvalues[sf_term],
        "base_mean": df["interaction_strength"].mean(),
        "CD8_mean": coefficients["Intercept"],
        "CD8_SF_mean": coefficients["Intercept"] + coefficients[sf_term],
    })

In [None]:
# Run the per-loop test: keep only CD8/CD8_SF samples, group by loop
# coordinates (plus distance bin) and fit one model per group in parallel.
cd8_cd8sf_rows = loops_analysed_melted["cell_type"].isin(["CD8", "CD8_SF"])
loop_key_columns = ['chrA', 'A_start', 'A_end', 'chrB', 'B_start', 'B_end', 'distance_bin']
loops_by_key = loops_analysed_melted.loc[cd8_cd8sf_rows].groupby(loop_key_columns)
loops_analysed_p_val = loops_by_key.parallel_apply(calculate_ols_test_CD8_CD8SF)

In [None]:
# apply FDR correction (Benjamini-Hochberg); only the adjusted p-values,
# element [1] of the result, are kept -- alpha only affects the discarded
# reject mask.
loops_analysed_p_val = loops_analysed_p_val.reset_index()

# Correct only over loops that have a defined p-value. fdrcorrection does not
# handle NaN: a single NaN sorts last and propagates through its cumulative
# minimum, turning every adjusted p-value into NaN. Loops without a p-value
# keep FDR = NaN.
has_p_value = loops_analysed_p_val["p_value"].notna()
loops_analysed_p_val["FDR"] = np.nan
if has_p_value.any():
    loops_analysed_p_val.loc[has_p_value, "FDR"] = multitest.fdrcorrection(
        loops_analysed_p_val.loc[has_p_value, "p_value"], alpha = 0.1)[1]

In [None]:
# Save the per-loop statistics (coordinates, p-value, means, FDR) as CSV.
cd8_cd8sf_results_csv = f"{base_dir}/HiC_analysis/output_results/DE_loops_CD8_vs_CD8SF.csv"
loops_analysed_p_val.to_csv(cd8_cd8sf_results_csv)

## simple permutation test to check that the procedure does not produce false discoveries

In [3]:
# Reload the CD4/CD8 loop table and the sample metadata from disk, replacing
# whatever the previous section left in these two variables.
cd4_cd8_counts_pickle = f"{base_dir}/HiC_analysis/extracting_loop_counts/aggregated_counts/aggregated_normalized_loops_CD4_CD8.pk"
hic_metadata_csv = f"{base_dir}/metadata/cleaned_HiC_metadata.csv"

loops_analysed = pd.read_pickle(cd4_cd8_counts_pickle)
metadata_hic = pd.read_csv(hic_metadata_csv, index_col=0)

In [4]:
# Restrict the permutation test to loops whose chrA is "1"
# (NOTE(review): presumably to keep the runtime manageable -- confirm).
loops_analysed = loops_analysed[loops_analysed["chrA"] == "1"]

# Keep only CD4/CD8 samples. Take an explicit copy so that any later column
# assignment on metadata_hic acts on an independent frame instead of a slice
# of the full metadata table (avoids pandas' SettingWithCopyWarning).
metadata_hic = metadata_hic[metadata_hic["cell_type"].isin(["CD4","CD8"])].copy()

In [None]:
# Negative control: repeat the CD4-vs-CD8 test 40 times with shuffled
# cell-type labels and print, for each permutation, how many loops reach
# FDR < 0.1.

# The long-format loop table does not depend on the labels, so build it once
# instead of once per permutation.
loop_id_columns = ['chrA', 'A_start', 'A_end', 'chrB', 'B_start', 'B_end', 'FDR', 'DETECTION_SCALE', 'distance_bin']
loops_long = pd.melt(loops_analysed, id_vars=loop_id_columns,
        value_vars=loops_analysed.columns.difference(loop_id_columns),
        var_name="sample", value_name="interaction_strength")

for _ in range(40):
    # randomly permute the cell_type column.
    # The shuffled values are assigned by position (.to_numpy()). Assigning
    # the shuffled Series itself would re-align it on index labels: with a
    # metadata index that is not exactly 0..n-1 (it comes from the CSV and has
    # been filtered) some samples would receive NaN labels and be silently
    # dropped from the test. The shuffle is applied to a copy, so metadata_hic
    # keeps the real labels.
    permuted_metadata = metadata_hic.copy()
    permuted_metadata["cell_type"] = permuted_metadata["cell_type"].sample(frac=1).to_numpy()

    # add info about the sample from the (permuted) metadata
    loops_analysed_melted = loops_long.merge(permuted_metadata, left_on = "sample", right_on = "folder_name", how = "inner")

    # apply the test to all loops
    loops_analysed_p_val = loops_analysed_melted[loops_analysed_melted["cell_type"].isin(["CD4","CD8"])].groupby(
        ['chrA', 'A_start', 'A_end', 'chrB', 'B_start', 'B_end', 'distance_bin']).parallel_apply(
        calculate_ols_test_CD4_CD8)

    # apply FDR correction
    loops_analysed_p_val = loops_analysed_p_val.reset_index()
    loops_analysed_p_val["FDR"] = multitest.fdrcorrection(loops_analysed_p_val["p_value"], alpha = 0.1)[1]
    print(len(loops_analysed_p_val[loops_analysed_p_val["FDR"] < 0.1]))