# The reviewers requested a beta-binomial model instead of a simple binomial test.
This requires apeglm, which is an R package. This script converts the counts so that they are ready for apeglm.

In [1]:
import pandas as pd
import numpy as np
from multiprocessing import Pool
from functools import partial
import glob
import os
import math
import scipy
import seaborn as sns
import matplotlib.pyplot as plt
import gzip
import io
import pickle
import pybedtools as pbed
from functools import reduce

# Root URL that the metadata path is appended to when the metadata CSV is read.
# NOTE(review): this value ends with "/", and the metadata path is joined with
# another "/", so the request URL contains "//" - confirm the server accepts it.
base_dir = "http://bartzabel.ls.manchester.ac.uk/orozcolab/SNP2Mechanism/"

In [2]:
# Read the cleaned HiC metadata table; the first CSV column becomes the index.
# Its "cell_type" and "folder_name" columns are used to pick sample columns.
metadata_url = f"{base_dir}/metadata/cleaned_HiC_metadata.csv"
metadata_hic = pd.read_csv(metadata_url, index_col=0)

In [3]:
# Load the aggregated per-SNP data for each cell type.
# Each object is iterated below as a sequence of {snp: {key: count}} mappings.
# Context managers make sure the file handles are closed after loading.
# NOTE: pickle can execute arbitrary code on load; only use with trusted files.
with open(".local/aggregated_data_CD4_slop10kb_separatepval.pk", "rb") as handle:
    data_CD4 = pickle.load(handle)
with open(".local/aggregated_data_CD8_slop10kb_separatepval.pk", "rb") as handle:
    data_CD8 = pickle.load(handle)

In [4]:
# Collect the CD4 SNPs whose record has more than 10 entries
# (original note: 4 always there, 6 for 3 samples).
all_snps = [
    snp
    for chromosome_snps in data_CD4
    for snp, values in chromosome_snps.items()
    if len(values) > 10
]

In [5]:
# Empty (all-NaN) matrices: one row per retained SNP, one column per sample
# whose metadata cell_type is "CD4" (columns are the folder_name values).
cd4_samples = metadata_hic.loc[metadata_hic["cell_type"] == "CD4", "folder_name"].to_list()
tot_counts_CD4 = pd.DataFrame(index=all_snps, columns=cd4_samples)
ase_counts_CD4 = pd.DataFrame(index=all_snps, columns=cd4_samples)

In [6]:
# Fill the per-sample CD4 matrices from the aggregated per-SNP records.
# For every "<sample>_0" key, the "_0" count goes into ase_counts_CD4 and the
# sum of the "_0" and "<sample>_1" counts goes into tot_counts_CD4.
for chromosome_snps in data_CD4:
    for snp, values in chromosome_snps.items():
        if len(values) <= 10:  # 4 always there, 6 for 3 samples
            continue
        for k, v in values.items():
            if not k.endswith("_0"):
                continue
            # Strip only the trailing "_0". str.split("_0")[0] would cut the
            # name at the first "_0" anywhere in it (e.g. "S_01_0" -> "S").
            sample = k[: -len("_0")]
            ase_counts_CD4.at[snp, sample] = int(v)
            tot_counts_CD4.at[snp, sample] = int(v + values[sample + "_1"])

In [30]:
# Sum the per-sample CD4 matrices across samples (row-wise) and export one
# pair of counts per SNP: REF_counts is the summed ase_counts_CD4 row,
# ALT_counts is the summed total minus that value.
tot_merged_CD4 = tot_counts_CD4.sum(axis=1)
ase_merged_CD4 = ase_counts_CD4.sum(axis=1)
total_allele_counts_CD4 = ase_merged_CD4.to_frame(name="REF_counts")
total_allele_counts_CD4["ALT_counts"] = tot_merged_CD4 - ase_merged_CD4
total_allele_counts_CD4.to_csv(".local/hic_goingtoR/hic_CD4_allele_counts.csv")

In [15]:
# Write the per-sample CD4 matrices (total and "_0"-allele counts) to CSV.
cd4_tot_csv = ".local/hic_goingtoR/hic_CD4_tot_counts.csv"
cd4_ase_csv = ".local/hic_goingtoR/hic_CD4_ASE_counts.csv"
tot_counts_CD4.to_csv(cd4_tot_csv)
ase_counts_CD4.to_csv(cd4_ase_csv)

In [19]:
# Collect the CD8 SNPs whose record has more than 10 entries
# (original note: 4 always there, 6 for 3 samples).
all_snps = [
    snp
    for chromosome_snps in data_CD8
    for snp, values in chromosome_snps.items()
    if len(values) > 10
]

In [20]:
# Empty (all-NaN) matrices: one row per retained SNP, one column per sample
# whose metadata cell_type is "CD8" (columns are the folder_name values).
cd8_samples = metadata_hic.loc[metadata_hic["cell_type"] == "CD8", "folder_name"].to_list()
tot_counts_CD8 = pd.DataFrame(index=all_snps, columns=cd8_samples)
ase_counts_CD8 = pd.DataFrame(index=all_snps, columns=cd8_samples)

In [21]:
# Fill the per-sample CD8 matrices from the aggregated per-SNP records.
# For every "<sample>_0" key, the "_0" count goes into ase_counts_CD8 and the
# sum of the "_0" and "<sample>_1" counts goes into tot_counts_CD8.
for chromosome_snps in data_CD8:
    for snp, values in chromosome_snps.items():
        if len(values) <= 10:  # 4 always there, 6 for 3 samples
            continue
        for k, v in values.items():
            if not k.endswith("_0"):
                continue
            # Strip only the trailing "_0". str.split("_0")[0] would cut the
            # name at the first "_0" anywhere in it (e.g. "S_01_0" -> "S").
            sample = k[: -len("_0")]
            ase_counts_CD8.at[snp, sample] = int(v)
            tot_counts_CD8.at[snp, sample] = int(v + values[sample + "_1"])


In [22]:
# Sum the per-sample CD8 matrices across samples (row-wise) and export one
# pair of counts per SNP: REF_counts is the summed ase_counts_CD8 row,
# ALT_counts is the summed total minus that value.
tot_merged_CD8 = tot_counts_CD8.sum(axis=1)
ase_merged_CD8 = ase_counts_CD8.sum(axis=1)
total_allele_counts_CD8 = ase_merged_CD8.to_frame(name="REF_counts")
total_allele_counts_CD8["ALT_counts"] = tot_merged_CD8 - ase_merged_CD8
total_allele_counts_CD8.to_csv(".local/hic_goingtoR/hic_CD8_allele_counts.csv")

In [19]:
# Write the per-sample CD8 matrices (total and "_0"-allele counts) to CSV.
cd8_tot_csv = ".local/hic_goingtoR/hic_CD8_tot_counts.csv"
cd8_ase_csv = ".local/hic_goingtoR/hic_CD8_ASE_counts.csv"
tot_counts_CD8.to_csv(cd8_tot_csv)
ase_counts_CD8.to_csv(cd8_ase_csv)

In [31]:
# Gather every SNP key from both datasets (CD4 first, then CD8), unfiltered.
# Duplicates are possible at this point.
all_snps = [
    snp
    for dataset in (data_CD4, data_CD8)
    for chromosome_snps in dataset
    for snp, _ in chromosome_snps.items()
]

In [32]:
# De-duplicate the SNP identifiers and put them in sorted order.
all_snps = sorted(set(all_snps))

In [33]:
# Empty (all-NaN) matrices: one row per SNP, one column per folder_name in
# the metadata table, regardless of cell type.
all_samples = metadata_hic["folder_name"].to_list()
tot_counts_ALL = pd.DataFrame(index=all_snps, columns=all_samples)
ase_counts_ALL = pd.DataFrame(index=all_snps, columns=all_samples)

In [34]:
# Fill the combined matrices from both datasets (CD4 first, then CD8).
# No per-SNP filtering happens here; every record is written.
# For every "<sample>_0" key, the "_0" count goes into ase_counts_ALL and the
# sum of the "_0" and "<sample>_1" counts goes into tot_counts_ALL.
for dataset in (data_CD4, data_CD8):
    for chromosome_snps in dataset:
        for snp, values in chromosome_snps.items():
            for k, v in values.items():
                if not k.endswith("_0"):
                    continue
                # Strip only the trailing "_0". str.split("_0")[0] would cut
                # the name at the first "_0" in it (e.g. "S_01_0" -> "S").
                sample = k[: -len("_0")]
                ase_counts_ALL.at[snp, sample] = int(v)
                tot_counts_ALL.at[snp, sample] = int(v + values[sample + "_1"])

In [35]:
# Keep only SNP rows with at least this many non-missing sample values,
# then write both combined matrices to CSV.
min_non_missing = 6
tot_counts_ALL = tot_counts_ALL.dropna(thresh=min_non_missing)
ase_counts_ALL = ase_counts_ALL.dropna(thresh=min_non_missing)
tot_counts_ALL.to_csv(".local/hic_goingtoR/hic_ALL_tot_counts.csv")
ase_counts_ALL.to_csv(".local/hic_goingtoR/hic_ALL_ASE_counts.csv")

In [36]:
# Sum the combined matrices across samples (row-wise) and export one pair of
# counts per SNP: REF_counts is the summed ase_counts_ALL row, ALT_counts is
# the summed total minus that value.
tot_merged_ALL = tot_counts_ALL.sum(axis=1)
ase_merged_ALL = ase_counts_ALL.sum(axis=1)
total_allele_counts_ALL = ase_merged_ALL.to_frame(name="REF_counts")
total_allele_counts_ALL["ALT_counts"] = tot_merged_ALL - ase_merged_ALL
total_allele_counts_ALL.to_csv(".local/hic_goingtoR/hic_ALL_allele_counts.csv")