# script to identify the bins that have a differential insulation score

In [2]:
import pandas as pd
import numpy as np
import hicstraw 
from multiprocessing import Pool
from functools import partial
import glob
import os
import plotly.express as px
import math
import matplotlib.pyplot as plt
from matplotlib import colors
from pandarallel import pandarallel
import cooler
import cooltools
import pybedtools as pbed
pandarallel.initialize()
from scipy import stats, special
from statsmodels.stats import multitest
import statsmodels.api as sm
import statsmodels.formula.api as smf
import plotly.io as pio
import seaborn as sns
# Scratch directory handed to pybedtools for its temporary files; created up
# front so set_tempdir() is given an existing path.
os.makedirs("/mnt/iusers01/jw01/mdefscs4/scratch/temp_pybedtools/", exist_ok = True)
pbed.helpers.set_tempdir("/mnt/iusers01/jw01/mdefscs4/scratch/temp_pybedtools/")
# Path to an hg38 genome file. NOTE(review): not referenced by any cell visible
# in this part of the notebook - confirm it is still needed.
bed_genome_file = "/mnt/iusers01/jw01/mdefscs4/hg38.genome"

# Matplotlib SVG font handling; presumably set so that text in exported SVGs
# stays editable as text - TODO confirm against the matplotlib rcParams docs.
plt.rcParams['svg.fonttype'] = 'none'

# Project root; every input and output path used below is built from it.
base_dir = "/mnt/jw01-aruk-home01/projects/psa_functional_genomics/PsA_cleaned_analysis"

INFO: Pandarallel will run on 28 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
import sys
sys.path.append(f"{base_dir}/data_functions")
from quantile_norm import quantile_normalize

In [4]:
# Per-sample Hi-C metadata table; the first CSV column is used as the index.
# Later cells read its "folder_name" column (sample identifier) and, after
# merging it onto the scores, "cell_type" and the model covariate "female_sex".
metadata_hic = pd.read_csv(f"{base_dir}/metadata/cleaned_HiC_metadata.csv", index_col = 0)

## read the tables from the precomputed folder

In [5]:
# Load the precomputed insulation table of every sample listed in the metadata,
# keyed by the sample's folder name. Columns at positions 1 and 4 are forced to
# string dtype and the first CSV column becomes the index.
sample_names = [meta_row["folder_name"] for _, meta_row in metadata_hic.iterrows()]
all_df = {
    sample: pd.read_csv(
        f"{base_dir}/HiC_analysis/insulation_score/ins_tables/{sample}.csv.gz",
        dtype={1: str, 4: str},
        index_col=0,
    )
    for sample in sample_names
}

In [6]:
def merge_dfs(
    dataframes_dict,
    score_col='log2_insulation_score_100000',
    key_cols=('chrom', 'start', 'end', 'region'),
):
    """Combine per-sample tables into one wide table of scores.

    Parameters
    ----------
    dataframes_dict : dict
        Maps a sample name to that sample's DataFrame. Every DataFrame must
        contain all of *key_cols* plus *score_col*.
    score_col : str
        Name of the column holding the score to collect from each sample.
    key_cols : sequence of str
        Columns that identify a bin; they are used as the merge keys.

    Returns
    -------
    pandas.DataFrame
        The key columns followed by one column per sample (named after the
        dictionary key). ``pd.merge`` is called with its default join type, so
        only bins present in every sample's table are kept.

    Raises
    ------
    ValueError
        If *dataframes_dict* is empty.
    """
    if not dataframes_dict:
        raise ValueError("merge_dfs needs at least one dataframe to merge")

    key_cols = list(key_cols)
    # The first table only provides the initial set of bins; its scores are
    # added inside the loop like those of every other sample.
    first_df = next(iter(dataframes_dict.values()))
    merged_df = first_df[key_cols]
    for key, df in dataframes_dict.items():
        # Rename the score column to the sample name before merging so each
        # merge adds exactly one, uniquely named, column.
        sample_scores = df[key_cols + [score_col]].rename(columns={score_col: key})
        merged_df = pd.merge(merged_df, sample_scores, on=key_cols)
    return merged_df

# Wide table of insulation scores: the bin columns (chrom, start, end, region)
# followed by one score column per sample loaded into all_df.
merged_df = merge_dfs(all_df)

In [7]:
# Apply quantile normalization to the insulation scores across samples.
# Columns 0-3 are the bin columns (chrom, start, end, region), so everything
# from position 4 onward is a per-sample score column. quantile_normalize comes
# from the project's quantile_norm module, which is not visible here.
merged_df.iloc[:,4:] = quantile_normalize(merged_df.iloc[:,4:])

In [8]:
# Prefix the chromosome names with "chr", discard the "region" column and
# write the normalised wide table to disk (row index included).
merged_df = merged_df.assign(chrom="chr" + merged_df["chrom"]).drop(columns=["region"])
merged_df.to_csv(
    f"{base_dir}/HiC_analysis/insulation_score/aggregated_norm_ins_scores.csv.gz"
)

## run differential analysis test
## CD4 vs CD8

In [15]:
# Reshape the wide table to long format: one row per (bin, sample).
# The sample columns are selected by name instead of by a fixed positional
# offset: once "region" has been dropped from merged_df the sample columns
# start at position 3, so `merged_df.columns[4:]` silently left the first
# sample out of every downstream test.
id_cols = ['chrom', 'start', 'end']
non_sample_cols = set(id_cols) | {'region'}
melted_df = merged_df.melt(
    id_vars=id_cols,
    value_vars=[col for col in merged_df.columns if col not in non_sample_cols],
    var_name='sample',
    value_name='log2_insulation_score_100000',
)

In [16]:
# Attach the per-sample metadata to every long-format row by matching the
# melted "sample" label against the metadata "folder_name" column. merge() is
# called with its default join type, so rows whose sample has no metadata
# entry are dropped.
melted_df = melted_df.merge(metadata_hic, left_on = "sample", right_on = "folder_name")

In [26]:
def calculate_ols_test_CD4_CD8(df, formula = "log2_insulation_score_100000 ~ cell_type + female_sex"):
    """Fit an OLS model on one bin and report the p-value of the CD8 term.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format rows of a single bin. Must contain the
        ``log2_insulation_score_100000`` column and every variable named in
        *formula*.
    formula : str
        Model formula passed to ``smf.ols``.

    Returns
    -------
    pandas.Series
        ``p_value``: p-value of the ``cell_type[T.CD8]`` coefficient, or NaN
        when the fitted model has no such term.
        ``base_mean``: mean score over the rows of *df*.
        Both entries are ``None`` when fewer than 8 rows have a non-missing
        score; in that case no model is fitted.
    """
    score_col = "log2_insulation_score_100000"
    term = "cell_type[T.CD8]"
    min_observations = 8

    if df[score_col].notna().sum() < min_observations:
        return pd.Series({"p_value": None, "base_mean": None})

    fit = smf.ols(formula, data=df).fit()
    return pd.Series({
        # .get() rather than [] so a bin whose model lacks the CD8 term yields
        # a missing p-value instead of a KeyError that would abort the whole
        # groupby/apply run.
        "p_value": fit.pvalues.get(term, float("nan")),
        "base_mean": df[score_col].mean(),
    })

In [27]:
# Restrict to CD4 and CD8 samples, then fit the model once per bin in parallel.
is_cd4_or_cd8 = melted_df["cell_type"].isin(["CD4","CD8"])
melted_df_subset = melted_df.loc[is_cd4_or_cd8]
bins_cd4_cd8 = melted_df_subset.groupby(['chrom', 'start', 'end'])
melted_df_test_p_val = bins_cd4_cd8.parallel_apply(calculate_ols_test_CD4_CD8)

In [28]:
# Turn the (chrom, start, end) group index back into ordinary columns.
melted_df_test_p_val = melted_df_test_p_val.reset_index()
# Bins without a test result come back with a missing p-value; count them as
# non-significant (p = 1) so they still take part in the FDR correction.
# Only p_value is filled: a blanket fillna(1) on the whole frame also replaced
# the missing base_mean of those bins with a made-up value of 1.
melted_df_test_p_val["p_value"] = melted_df_test_p_val["p_value"].fillna(1).astype(float)

# Keep the corrected p-values (second element returned by fdrcorrection).
melted_df_test_p_val["FDR"] = multitest.fdrcorrection(melted_df_test_p_val["p_value"], alpha = 0.1)[1]

In [31]:
# Save the per-bin CD4-vs-CD8 test results as CSV. The row index is written as
# the first column because index=False is not passed.
melted_df_test_p_val.to_csv(f"{base_dir}/HiC_analysis/output_results/DE_ins_CD4_vs_CD8.csv")

In [32]:
# Add -ln(p) as a score column and export bin coordinates plus that score as a
# tab-separated file (header row included, row index omitted).
melted_df_test_p_val["log_pval"] = np.negative(np.log(melted_df_test_p_val["p_value"]))
bed_columns = ["chrom", "start", "end", "log_pval"]
bed_table = melted_df_test_p_val[bed_columns]
bed_table.to_csv(
    f"{base_dir}/HiC_analysis/output_results/DE_ins_CD4_vs_CD8.bed",
    sep="\t",
    index=False,
)

## CD8 vs CD8SF

In [34]:
def calculate_ols_test_CD8_CD8SF(df, formula = "log2_insulation_score_100000 ~ cell_type + female_sex"):
    """Fit an OLS model on one bin and report the p-value of the CD8_SF term.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format rows of a single bin. Must contain the
        ``log2_insulation_score_100000`` column and every variable named in
        *formula*.
    formula : str
        Model formula passed to ``smf.ols``.

    Returns
    -------
    pandas.Series
        ``p_value``: p-value of the ``cell_type[T.CD8_SF]`` coefficient, or
        NaN when the fitted model has no such term.
        ``base_mean``: mean score over the rows of *df*.
        Both entries are ``None`` when fewer than 8 rows have a non-missing
        score; in that case no model is fitted.
    """
    score_col = "log2_insulation_score_100000"
    term = "cell_type[T.CD8_SF]"
    min_observations = 8

    if df[score_col].notna().sum() < min_observations:
        return pd.Series({"p_value": None, "base_mean": None})

    fit = smf.ols(formula, data=df).fit()
    return pd.Series({
        # .get() rather than [] so a bin whose model lacks the CD8_SF term
        # yields a missing p-value instead of a KeyError that would abort the
        # whole groupby/apply run.
        "p_value": fit.pvalues.get(term, float("nan")),
        "base_mean": df[score_col].mean(),
    })

In [35]:
# Restrict to CD8 and CD8_SF samples, then fit the model once per bin in parallel.
is_cd8_or_cd8sf = melted_df["cell_type"].isin(["CD8","CD8_SF"])
melted_df_subset = melted_df.loc[is_cd8_or_cd8sf]
bins_cd8_cd8sf = melted_df_subset.groupby(['chrom', 'start', 'end'])
melted_df_test_p_val = bins_cd8_cd8sf.parallel_apply(calculate_ols_test_CD8_CD8SF)

In [36]:
# Turn the (chrom, start, end) group index back into ordinary columns.
melted_df_test_p_val = melted_df_test_p_val.reset_index()
# Bins without a test result come back with a missing p-value; count them as
# non-significant (p = 1) so they still take part in the FDR correction.
# Only p_value is filled: a blanket fillna(1) on the whole frame also replaced
# the missing base_mean of those bins with a made-up value of 1.
melted_df_test_p_val["p_value"] = melted_df_test_p_val["p_value"].fillna(1).astype(float)

# Keep the corrected p-values (second element returned by fdrcorrection).
melted_df_test_p_val["FDR"] = multitest.fdrcorrection(melted_df_test_p_val["p_value"], alpha = 0.1)[1]

In [38]:
# Save the per-bin CD8-vs-CD8_SF test results as CSV. The row index is written
# as the first column because index=False is not passed.
melted_df_test_p_val.to_csv(f"{base_dir}/HiC_analysis/output_results/DE_ins_CD8_vs_CD8SF.csv")

In [39]:
# Add -ln(p) as a score column and export bin coordinates plus that score as a
# tab-separated file (header row included, row index omitted).
melted_df_test_p_val["log_pval"] = np.negative(np.log(melted_df_test_p_val["p_value"]))
bed_columns = ["chrom", "start", "end", "log_pval"]
bed_table = melted_df_test_p_val[bed_columns]
bed_table.to_csv(
    f"{base_dir}/HiC_analysis/output_results/DE_ins_CD8_vs_CD8SF.bed",
    sep="\t",
    index=False,
)