In [1]:
import numpy as np

np.seterr(divide="ignore", invalid="ignore")
import matplotlib.pyplot as plt

%matplotlib inline
import pandas as pd
import os, subprocess

import cooler
import cooltools.lib.plotting
from cooltools.api.saddle import saddle_strength

import cooltools
import bioframe
import multiprocess as mp

from tqdm import tqdm
import multiprocessing

import seaborn as sns

import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

  @numba.jit  # (nopython=True)
  @numba.jit  # (nopython=True)


In [4]:
# Bin size selecting the `/resolutions/<resolution>` group inside the .mcool
# files opened in this notebook (presumably base pairs, i.e. 100 kb -- confirm).
resolution = 100_000
# Number of worker processes, used both for the multiprocessing pool passed to
# cooltools.eigs_cis and as `nproc` for cooltools.expected_cis.
NPROC = 8

In [5]:
# Open the t0 contact matrix at the configured resolution.
clr = cooler.Cooler(
    f"/home/carlos/Desktop/manuscripts/notebooks/matrices/t0_q30.mcool::/resolutions/{resolution}"
)

# One view region per chromosome, spanning it from 0 to its full length and
# named after the chromosome itself.
view_df = pd.DataFrame(
    dict(
        chrom=clr.chromnames,
        start=0,
        end=clr.chromsizes.values,
        name=clr.chromnames,
    )
)

# Materialise the complete bin table; the signal tracks below are built on
# top of its chrom/start/end columns.
bins = clr.bins()[:]

In [None]:
# Use RNAPII data for phasing
#
# Builds `RNAPII`: the Hi-C bin coordinates plus one "RNAPII" column holding
# the bigWig signal summarised per bin and rescaled with MinMaxScaler.

import bbi
from sklearn.preprocessing import MinMaxScaler

RNAPII = bins[["chrom", "start", "end"]].copy()

# One summary value per bin interval (bins=1).
stacks = bbi.stackup(
    "/home/carlos/Desktop/manuscripts/notebooks/matrices/ENCFF144IVU.bigWig",
    RNAPII["chrom"],
    RNAPII["start"],
    RNAPII["end"],
    bins=1,
)

# Rescale the signal column before attaching it to the bin table.
x_norm = MinMaxScaler().fit_transform(stacks)
RNAPII.loc[:, "RNAPII"] = x_norm

In [None]:
# GC content per bin, loaded from a precomputed tab-separated table.
# The commented-out lines record how that table was produced (GC fraction of
# every bin computed with bioframe from the GRCh38 FASTA, then saved); re-enable
# them to regenerate the file if `bins` or the resolution changes.
# hg38_genome = bioframe.load_fasta('../matrices/GRCh38.fa')
# gc_cov = bioframe.frac_gc(bins[['chrom', 'start', 'end']], hg38_genome)
# gc_cov.to_csv('hg38_gc_cov_100kb.tsv',index=False,sep='\t')

# NOTE(review): later cells read a "GC" column from this frame and index it
# positionally alongside the eigenvector table, so its rows are assumed to be
# in the same order as `bins` -- confirm when regenerating the file.
GC = pd.read_csv("hg38_gc_cov_100kb.tsv", sep="\t")

In [None]:
# Builds `RNASEQ`: the Hi-C bin coordinates plus one rescaled RNA-seq signal
# column per time point ("t0", "t12", "t30", "t60").
RNASEQ = bins[["chrom", "start", "end"]].copy()

# Read each merged bigWig, one summary value per bin. The file-name codes
# ("00", 12, 30, 60) differ from the column labels assigned below.
stacks_ = []
for sampleCode in ["00", 12, 30, 60]:
    stacks_.append(
        bbi.stackup(
            f"/home/carlos/Desktop/manuscripts/notebooks/compartments/merged_t{sampleCode}.bw",
            RNASEQ["chrom"],
            RNASEQ["start"],
            RNASEQ["end"],
            bins=1,
        )
    )

# Rescale every track independently and store it under its time-point label.
for sampleCode, stack in zip(["t0", "t12", "t30", "t60"], stacks_):
    x_norm = MinMaxScaler().fit_transform(stack)
    RNASEQ.loc[:, sampleCode] = x_norm

In [None]:
# Phasing track to use: "RNAPII", "RNASEQ" or "GC".
# `phasing` becomes a private copy of the chosen table so later cells can use
# it without touching the original track frames.

phasing_track = "RNAPII"

if phasing_track == "RNAPII":
    phasing = RNAPII.copy()
elif phasing_track == "GC":
    phasing = GC.copy()
elif phasing_track == "RNASEQ":
    phasing = RNASEQ.copy()
else:
    # Fail loudly: without this branch a typo would leave `phasing` undefined
    # (or, worse, silently stale from a previous run of this cell).
    raise ValueError(
        f"Unknown phasing_track {phasing_track!r}; expected 'RNAPII', 'GC' or 'RNASEQ'"
    )


In [None]:

# For every time point, compute the cis eigenvectors (phased with the selected
# track) and the cis expected.
#
# Outputs, one entry per element of `samples` and in the same order:
#   eigen_data -- return values of cooltools.eigs_cis
#   expected_  -- return values of cooltools.expected_cis
samples = [0, 12, 30, 60]
eigen_data = []
expected_ = []
n_eig = 3

# Iterating through tqdm advances the bar and closes it when the loop ends.
for sample in tqdm(samples):
    # NOTE: this rebinds the notebook-level `clr`; after the loop it refers to
    # the last sample's cooler.
    clr = cooler.Cooler(
        f"/home/carlos/Desktop/manuscripts/notebooks/matrices/t{sample}_q30.mcool::/resolutions/{resolution}"
    )

    if phasing_track == "RNASEQ":
        # The RNA-seq table carries one signal column per time point; hand
        # eigs_cis only the column that belongs to this sample.
        phasing_now = phasing[["chrom", "start", "end", f"t{sample}"]].copy()
    else:
        phasing_now = phasing

    # The context manager shuts the worker pool down after each sample; the
    # pool was previously created on every iteration and never released.
    with mp.Pool(NPROC) as pool:
        cis_eigs = cooltools.eigs_cis(
            clr,
            phasing_now,
            view_df=view_df,
            n_eigs=n_eig,
            map=pool.map,
            sort_metric="pearsonr",
        )
    eigen_data.append(cis_eigs)

    expected = cooltools.expected_cis(clr=clr, view_df=view_df, nproc=NPROC)
    expected_.append(expected)

In [None]:
# Correlate every eigenvector of every chromosome and time point with the
# phasing track, and save the result as one long table with the columns
# [<track>_corr, chrom, sample, Eig_vector].
#
# NOTE: within each (sample, chromosome) group the rows are ordered by
# decreasing correlation, NOT by eigenvector index -- consumers of the TSV
# must match rows on (chrom, sample, Eig_vector) rather than by position.

# Make sure the output directory exists before writing into it.
os.makedirs("evs", exist_ok=True)

e_vect_dfs = []
for i, e_dfs_ in enumerate(eigen_data):
    eigvec_table = e_dfs_[1]  # second element of the eigs_cis result: per-bin eigenvectors
    sample_label = f"t{samples[i]}"
    # Column of `phasing` holding the signal for this sample: per-time-point
    # for RNASEQ, otherwise the column named after the track itself.
    track_col = sample_label if phasing_track == "RNASEQ" else phasing_track

    for chrName in clr.chromnames:
        # Both slices are independent of the eigenvector index, so take them once.
        e_df = eigvec_table.loc[eigvec_table.chrom == chrName]
        bw_now = phasing.loc[phasing["chrom"] == chrName, track_col].to_numpy(dtype=float)

        e_vect = []
        for eig_idx in range(1, n_eig + 1):
            e_vector = e_df[f"E{eig_idx}"].to_numpy(dtype=float)

            # Keep only bins where BOTH the eigenvector and the track are
            # defined; a single NaN in either input makes np.corrcoef return NaN.
            valid = np.isfinite(e_vector) & np.isfinite(bw_now)
            if valid.sum() < 2:
                # Not enough data points for a correlation (e.g. a chromosome
                # whose bins are all masked).
                corr = np.nan
            else:
                corr = np.corrcoef(e_vector[valid], bw_now[valid])[0, 1]
            e_vect.append((corr, chrName, sample_label, eig_idx))

        # Rank by correlation, highest first; NaN correlations go last so the
        # ordering is well defined.
        e_vect = sorted(
            e_vect,
            key=lambda x: -np.inf if np.isnan(x[0]) else x[0],
            reverse=True,
        )

        # Sanity check: warn when the best-correlated eigenvector is within 0.1
        # of another one AND the correlation ranking disagrees with the
        # E1, E2, ... order (one warning per such close pair).
        by_index = sorted(e_vect, key=lambda x: x[3], reverse=False)
        for j in range(1, len(e_vect)):
            val = np.abs(e_vect[0][0] - e_vect[j][0])
            if val <= 0.1 and e_vect != by_index:
                print("!!!!!!!!!!!!! Not Equal !!!!!!!!!!!!!")
                print("\n")

        # make dataframe
        e_vect = pd.DataFrame(e_vect, columns=[f"{phasing_track}_corr", "chrom", "sample", "Eig_vector"])
        e_vect["Eig_vector"] = "E" + e_vect["Eig_vector"].astype(str)
        e_vect_dfs.append(e_vect)

e_vect_dfs = pd.concat(e_vect_dfs)
e_vect_dfs.to_csv(f"evs/eigenvector_correlation_{phasing_track}.tsv", sep="\t", index=False)

In [7]:
# Build one summary table per time point that places the GC, RNASEQ and RNAPII
# correlations of every (chromosome, eigenvector) pair side by side, and write
# it as both TSV and Excel.
samples = [0, 12, 30, 60]

# Scaffold: three rows (E1, E2, E3) per chromosome, in chromosome order.
df_base = pd.DataFrame(
    {
        "chrom": [chromName for chromName in clr.chromnames for _ in range(3)],
        "Eig_vector": [f"E{eig}" for _ in range(len(clr.chromnames)) for eig in range(1, 4)],
    }
)

for sample in samples:
    sampleName = f"t{sample}"
    df_per_sample = df_base.copy()
    for df_name in ["GC", "RNASEQ", "RNAPII"]:
        corr_col = f"{df_name}_corr"
        # Read chrom as text so the join keys have the same dtype as the scaffold.
        df = pd.read_csv(
            f"evs/eigenvector_correlation_{df_name}.tsv", sep="\t", dtype={"chrom": str}
        )
        df = df.loc[df["sample"] == sampleName, ["chrom", "Eig_vector", corr_col]]
        # Join on the keys instead of copying values by position: the rows of
        # the per-track tables are ordered by correlation rank, so a positional
        # copy would attach values to the wrong eigenvector label whenever that
        # ranking differs from E1, E2, E3. A left merge keeps the scaffold's row
        # order; `validate` guards against duplicated keys in either table.
        df_per_sample = df_per_sample.merge(
            df, on=["chrom", "Eig_vector"], how="left", validate="one_to_one"
        )

    # reorder columns
    df_per_sample = df_per_sample[['Eig_vector', 'chrom', 'GC_corr', 'RNASEQ_corr', 'RNAPII_corr']]
    df_per_sample.to_csv(f"evs/eigenvector_correlation_{sampleName}.tsv", sep="\t", index=False)
    df_per_sample.to_excel(f"evs/eigenvector_correlation_{sampleName}.xlsx", index=False)