In [1]:
from pathlib import Path
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import itertools

from dask import dataframe as dd
from functools import reduce
import matplotlib.pyplot as plt
import pickle
from scipy.stats import wasserstein_distance
from scipy.spatial.distance import pdist, squareform
import scipy.signal
from scipy.ndimage import uniform_filter1d
from sklearn.decomposition import PCA

from matplotlib.backends.backend_pdf import PdfPages
plt.rcParams['figure.dpi'] = 300
import sys
import plotly.express as px
import matplotlib.backends.backend_pdf

In [4]:
# find current working directoy
cwd_path = Path().absolute()
data_root = cwd_path.parent.parent / "data"
experiment = "CellCulture"
chrom_list = [f"chr{x}" for x in range(1, 20)]
chrom_list.append("chrX")


In [5]:
def prep_distance_mtx(df, reference_df, use_markers):

    df = df[~df[name].str.contains("X")]
    # get the right order of eigen
    final_df = df.merge(reference_df)

    # get the coordinates
    coord = final_df[use_markers].values
    coord_info = final_df[["100kb name", "chrom", "100kb bin"]]
    # calculate distance matrix
    mtx = squareform(pdist(coord))

    return (mtx, coord_info, final_df["eigen"].values)

def calc_eigen(chrom, mtx, coord_info, ref_eigen, how = "intra"):
    pos_mask = (coord_info["chrom"] == chrom).values
    neg_mask = ~pos_mask
    
    if how == "intra":
        select_vector = mtx[:, pos_mask][pos_mask, :]
        pca = PCA(n_components=1)
        pca.fit(select_vector.T)
        count_1eg = pca.components_[0]
        
        # decide sign
        corr = np.corrcoef(count_1eg, ref_eigen[pos_mask])[0,1]
        sign = np.sign(corr)
        return sign * count_1eg, ref_eigen[pos_mask], coord_info.iloc[pos_mask, :]

    if how == "inter":
        select_vector = mtx[:, neg_mask][pos_mask, :]
        pca = PCA(n_components=1)
        pca.fit(select_vector.T)
        count_1eg = pca.components_[0]
        
        # decide sign
        corr = np.corrcoef(count_1eg, ref_eigen[pos_mask])[0,1]
        sign = np.sign(corr)
        return sign * count_1eg, ref_eigen[pos_mask], coord_info.iloc[pos_mask, :]

In [6]:
# read all celltypes information
cell_types = ["E14", "NMuMG"]
res = "100kb"
name = f"{res} name"
df_dict = {}
for ct in cell_types:
    # start a container for dataframes
    file = glob.glob(str(data_root / experiment / "IF_ensemble"/ f"*{res}*{ct}.csv"))[0]
    # read in all information
    df_dict[ct] = pd.read_csv(file)
    
# read in HiC eigen for sign correction
eigen_dict = {}
eigen_dict["E14"] = pd.read_csv(str(data_root / experiment / "AB_compartment" / "ES_eigen_100kb.csv"))            
eigen_dict["NMuMG"] = pd.read_csv(str(data_root / experiment / "AB_compartment" / "NMuMG_eigen_100kb.csv"))

### Calculate eigen values using combination of markers, here H3K27ac, H3K27me3, H3K9me3, LaminB1


In [7]:
calc_eigens = {}
for ct in cell_types:
    use_markers = ["H3K27ac", "H3K27me3", "H3K9me3", "LaminB1"]
    mtx, coord_info, ref_eg = prep_distance_mtx(df_dict[ct], eigen_dict[ct], use_markers) 
    dfs = []
    for chrom in chrom_list[:-1]:
        count_eg, HiC_eg, sub_df = calc_eigen(chrom, mtx, coord_info, ref_eg, how = "intra")
        sub_df["Fish_eigen"] = count_eg
        sub_df["HiC_eigen"] = HiC_eg
        dfs.append(sub_df)
    df = pd.concat(dfs)
    calc_eigens[ct] = df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_df["Fish_eigen"] = count_eg
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_df["HiC_eigen"] = HiC_eg
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_df["Fish_eigen"] = count_eg
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = val

KeyboardInterrupt: 