In [2]:
import pandas as pd
import numpy as np
import cooler
import matplotlib.pyplot as plt
import matplotlib.colors
from matplotlib.patches import Patch
import scipy.io


from sklearn.decomposition import PCA
from scipy import stats
import networkx as nx
from sklearn.metrics import jaccard_score

import os
import sys
from importlib import reload

import filters
import pore_c_utils as pcu
import binning_utils as binning
import plotting as hicPlot

In [3]:
assembly = pcu.loadAssembly("GRCm39_Assembly.txt")

# RefSeq accession -> "chr"-prefixed chromosome label
chromLabels = [f"chr{c}" for c in assembly['Chromosome']]
chromDict = dict(zip(assembly['RefSeq accession'], chromLabels))

# inverse mapping (chromosome label -> RefSeq accession) for easy lookup
chromDict_r = {}
for accession, label in chromDict.items():
    chromDict_r[label] = accession

# chromosome labels in the order they appear in the assembly table
chromOrder = [*chromDict.values()]
chromOrder

['chr1',
 'chr2',
 'chr3',
 'chr4',
 'chr5',
 'chr6',
 'chr7',
 'chr8',
 'chr9',
 'chr10',
 'chr11',
 'chr12',
 'chr13',
 'chr14',
 'chr15',
 'chr16',
 'chr17',
 'chr18',
 'chr19',
 'chrX',
 'chrY']

In [4]:
# location of the multi-cell cooler (.scool) file used throughout the notebook
scHICfilepath = "/nfs/turbo/umms-indikar/shared/projects/poreC/nagano_2017_data/nagano_1MB_raw_BALANCED.scool"

# paths of the individual coolers stored inside that file
cellList = [*cooler.fileops.list_coolers(scHICfilepath)]
print(f"{len(cellList)=}")

len(cellList)=3882


In [8]:
sampleSize = 25
chromosome = ['chr2']
# draw `sampleSize` distinct cells from the .scool file
sample = np.random.choice(cellList, sampleSize, replace=False)

# contact matrix restricted to `chromosome`, one entry per sampled cell
hic = {}

for s in sample:
    # short label built from the first two "_"-separated fields of the cell
    # path; it is only used for the progress message because it is NOT unique
    # across cells (several sampled cells print the same label)
    filetag = "_".join(s.split("_")[0:2]).replace("/", "")
    A, hicChromInfo, hicIndex = pcu.loadNagano2017SingleCell(scHICfilepath, s, chromOrder, balance=False)
    chrHicInd = pcu.getIndices(hicChromInfo, hicIndex, chromosome, 'region')

    # reindex the matrix
    A = A[chrHicInd, :][:, chrHicInd]
    print(f"{filetag=} {A.shape=}")

    # key by the full cell name: keying by `filetag` silently overwrote
    # earlier cells that shared the same label, leaving fewer than
    # `sampleSize` matrices in `hic`
    cellId = str(s).replace("/", "")
    hic[cellId] = A

# every sampled cell must have its own entry
assert len(hic) == len(sample), f"expected {len(sample)} cells, got {len(hic)}"

print('done.')

filetag='Diploid_25' A.shape=(182, 182)
filetag='Diploid_7' A.shape=(182, 182)
filetag='Diploid_9' A.shape=(182, 182)
filetag='Diploid_5' A.shape=(182, 182)
filetag='Diploid_13' A.shape=(182, 182)
filetag='Diploid_22' A.shape=(182, 182)
filetag='Diploid_3' A.shape=(182, 182)
filetag='Diploid_8' A.shape=(182, 182)
filetag='Diploid_25' A.shape=(182, 182)
filetag='Diploid_23' A.shape=(182, 182)
filetag='Diploid_21' A.shape=(182, 182)
filetag='Diploid_22' A.shape=(182, 182)
filetag='Diploid_12' A.shape=(182, 182)
filetag='Diploid_7' A.shape=(182, 182)
filetag='Diploid_26' A.shape=(182, 182)
filetag='Diploid_3' A.shape=(182, 182)
filetag='Diploid_14' A.shape=(182, 182)
filetag='Diploid_7' A.shape=(182, 182)
filetag='Diploid_26' A.shape=(182, 182)
filetag='Diploid_32' A.shape=(182, 182)
filetag='Diploid_21' A.shape=(182, 182)
filetag='Diploid_25' A.shape=(182, 182)
filetag='Diploid_7' A.shape=(182, 182)
filetag='Diploid_4' A.shape=(182, 182)
filetag='Diploid_9' A.shape=(182, 182)
done.


In [30]:
# fraction of each cell's (unique, symmetric) contacts to keep
retain = 0.001
# binarized, down-sampled contact matrix per cell, keyed like `hic`
sparseSample = {}


for runId, A in hic.items():

    # 1 where a contact was observed, 0 elsewhere
    Abinary = np.where(A > 0, 1, 0)
    downSampled = Abinary.copy()

    # List every contact exactly once: take the upper triangle (diagonal
    # included) of the symmetrized matrix. Sampling from ALL nonzero entries
    # counts each off-diagonal contact twice, and because both (i, j) and
    # (j, i) are cleared below, that removed ~75% of the contacts no matter
    # what `retain` was set to.
    contactIndices = np.argwhere(np.triu(np.maximum(Abinary, Abinary.T)) == 1)

    # number of unique contacts to drop so that a `retain` fraction is left
    removeN = int((1 - retain) * len(contactIndices))

    removeIdx = np.random.choice(len(contactIndices), removeN, replace=False)
    rows = contactIndices[removeIdx, 0]
    cols = contactIndices[removeIdx, 1]

    # remove random contacts (both mirrored entries, to keep the map symmetric)
    downSampled[rows, cols] = 0
    downSampled[cols, rows] = 0

    sparseSample[runId] = downSampled



print('done')

done
