In [1]:
import pandas as pd
import numpy as np
import cooler
import matplotlib.pyplot as plt

import os
import sys
import pore_c_utils as pcu
from importlib import reload

In [2]:
resolution = 1000000 # 1MB

In [25]:
assembly = pcu.loadAssembly("GRCm39_Assembly.txt")

chromDict = dict(zip(assembly['RefSeq accession'],assembly['Chromosome'].apply(lambda x : f"chr{x}")))
chromDict

{'NC_000067.7': 'chr1',
 'NC_000068.8': 'chr2',
 'NC_000069.7': 'chr3',
 'NC_000070.7': 'chr4',
 'NC_000071.7': 'chr5',
 'NC_000072.7': 'chr6',
 'NC_000073.7': 'chr7',
 'NC_000074.7': 'chr8',
 'NC_000075.7': 'chr9',
 'NC_000076.7': 'chr10',
 'NC_000077.7': 'chr11',
 'NC_000078.7': 'chr12',
 'NC_000079.7': 'chr13',
 'NC_000080.7': 'chr14',
 'NC_000081.7': 'chr15',
 'NC_000082.7': 'chr16',
 'NC_000083.7': 'chr17',
 'NC_000084.7': 'chr18',
 'NC_000085.7': 'chr19',
 'NC_000086.8': 'chrX',
 'NC_000087.8': 'chrY'}

In [4]:
# load pore data coolers 
scPoreCfilepath = "/nfs/turbo/umms-indikar/shared/tools/Pore-C-Snakemake/results_ES5to12/matrix/"

cellNums = [1, 2, 4, 5, 6, 7] # dropping cell 3 due to sparsity
filetags = [f"run0{x}" for x in cellNums]

porec = {}

for cell in filetags:
    
    fullPath = f"{scPoreCfilepath}NlaIII_{cell}_GRCm39_unphased.matrix.mcool"
    clr = cooler.Cooler(f'{fullPath}::resolutions/{resolution}')
    
    porec[cell] = clr
    
porec.keys()

dict_keys(['run01', 'run02', 'run04', 'run05', 'run06', 'run07'])

In [6]:
scHICfilepath = f"/nfs/turbo/umms-indikar/shared/projects/poreC/nagano_2017_data/nagano_1MB_raw_BALANCED.scool"
cellList = list(cooler.fileops.list_coolers(scHICfilepath))

sampleSize = len(porec) # sample 6 randomly
sample = np.random.choice(cellList, sampleSize, replace=False)

hic = {}

for s in sample:
    filetag = "_".join(s.split("_")[0:2]).replace("/", "")

    clr = cooler.Cooler(f"{scHICfilepath}::{s}")
    
    hic[filetag] = clr
    
hic.keys()

dict_keys(['Diploid_3', 'Diploid_13', 'Diploid_10', 'Diploid_23', 'Diploid_12'])

In [41]:
hKey = list(hic.keys())[0]
pKey = list(porec.keys())[0]


def getChromosomeInfo(coolerObject):
    newRows = []
    for chrom in coolerObject.chromnames:
        chromStart, chromEnd =  coolerObject.extent(chrom)
        
        row = {
            'region' : chrom,
            'start' : chromStart,
            'end' : chromEnd,
            'length' : chromEnd - chromStart,
        }
        
        newRows.append(row)
    return pd.DataFrame(newRows)
    

hicChrom = getChromosomeInfo(hic[hKey])
print(f"{hicChrom.shape=}")

hicChrom

hicChrom.shape=(35, 4)


Unnamed: 0,region,start,end,length
0,chr10,0,130,130
1,chr11,130,252,122
2,chr12,252,374,122
3,chr13,374,495,121
4,chr13_random,495,496,1
5,chr14,496,622,126
6,chr15,622,726,104
7,chr16,726,825,99
8,chr16_random,825,826,1
9,chr17,826,922,96


In [42]:
porecChrom = getChromosomeInfo(porec[pKey])

porecChrom['region'] = porecChrom['region'].astype(str)

# translate refseq to chromosome names
porecChrom['chromName'] = porecChrom['region'].map(chromDict) 

# WARNING: dropping non chromosomal reagions
porecChrom = porecChrom[porecChrom['chromName'].notna()].reset_index(drop=True)

porecChrom

Unnamed: 0,region,start,end,length,chromName
0,NC_000067.7,0,196,196,chr1
1,NC_000068.8,202,384,182,chr2
2,NC_000069.7,384,544,160,chr3
3,NC_000070.7,544,701,157,chr4
4,NC_000071.7,702,854,152,chr5
5,NC_000072.7,859,1009,150,chr6
6,NC_000073.7,1009,1154,145,chr7
7,NC_000074.7,1155,1286,131,chr8
8,NC_000075.7,1286,1411,125,chr9
9,NC_000076.7,1411,1542,131,chr10


In [None]:
break

In [None]:
poreCclr = poreC['run01']

poreCRegions = {}

for i in poreCclr.chromnames:
    if 'NC' in i:
        extent = poreCclr.extent(i)
        poreCRegions[i] = extent
    
print(f"{len(poreCRegions)=}")

# get the indices of all chromosome positions
porecIndices = []

for k, v in poreCRegions.items():
    binRange = list(np.arange(v[0], v[1]))
    porecIndices += binRange
    
print(f"{len(porecIndices)=}")

In [None]:
hiCclr = hiC[filetag]

hiCRegions = {}

for i in hiCclr.chromnames:
    if not 'random' in i:
        extent = hiCclr.extent(i)
        hiCRegions[i] = extent
    
print(f"{len(hiCRegions)=}")

# get the indices of all chromosome positions
hicIndices = []

for k, v in hiCRegions.items():
    binRange = list(np.arange(v[0], v[1]))
    hicIndices += binRange
    
print(f"{len(hicIndices)=}")

In [None]:
zippedKeys = list(zip(poreC.keys(), hiC.keys()))


for pKey, hKey in zippedKeys:
    print("------------------------")
    print(f"{pKey=}")
    print(f"{hKey=}")
    
    pMat = poreC[pKey].matrix(balance=True)[:]
#     hMat = hiC[hKey].matrix(balance=True)[:]
    
    print(f"{pMat.shape}")
    
    test = pMat[chromIndices,:][:,chromIndices]
    print(f"{test.shape}")
    
#     print(f"{hMat.shape}")
    
    break
    

In [None]:
plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 10, 10

zippedKeys = list(zip(poreC.keys(), hiC.keys()))

for pKey, hKey in zippedKeys:
    
    print("------------------------")
    print(f"{pKey=}")
    print(f"{hKey=}")
    pMat = poreC[pKey].matrix(balance=True)[:]
    hMat = hiC[hKey].matrix(balance=True)[:]
    
    # binarize both
    pMatBinary = np.where(pMat > 0, 1, 0)
    hMatBinary = np.where(hMat > 0, 1, 0)
    
    
    sumPoreC = np.triu(pMatBinary).sum()
    sumPoreCDiag = np.diag(pMatBinary).sum()
    
    sumhiC = np.triu(hMatBinary).sum()
    sumhiCDiag = np.diag(hMatBinary).sum()
    
    print()
    print(f"{sumPoreC=}")
    print(f"{sumPoreCDiag=}")
    print(f"{sumPoreCDiag/sumPoreC=:.3f}")
    
    print()
    print(f"{sumhiC=}")
    print(f"{sumhiCDiag=}")
    print(f"{sumhiCDiag/sumPoreC=:.3f}")
    

#     fig, ax = plt.subplots(1, 2) 
    
#     ax[0].matshow(pMatBinary, cmap='binary')
#     ax[0].set_title(f'scPore-C Contacts: {pKey}', y=1.2)

#     ax[1].matshow(hMatBinary, cmap='binary')
#     ax[1].set_title(f'scHi-C Contacts: {hKey}', y=1.2)
    
#     plt.show()
    

In [None]:
plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 10, 10

zippedKeys = list(zip(poreC.keys(), hiC.keys()))

for pKey, hKey in zippedKeys:
    
    print("------------------------")
    print(f"{pKey=}")
    print(f"{hKey=}")
    pMat = poreC[pKey].matrix(balance=True)[:]
    hMat = hiC[hKey].matrix(balance=True)[:]
    
    # binarize both
    pMatBinary = np.where(pMat > 0, 1, 0)
    hMatBinary = np.where(hMat > 0, 1, 0)
    
    A = np.zeros(hMat.shape)
    
    # truncate the y chrome of the porec data
    n = hMat.shape[0]
    pMatBinary = pMatBinary[0:n, 0:n]
    
    # get upper and lower triangles
    pMatTri = np.triu(pMatBinary)
    hMatTri = np.tril(hMatBinary)
    
    A += pMatTri
    A += hMatTri
    
    ABinary = np.where(A > 0, 1, 0)
    
    plt.matshow(ABinary, cmap='binary')
    plt.title(f'Raw scPore-C Contacts {pKey}')
    plt.text((n//2), n+100, f"Raw scHi-C Contacts {hKey}", 
             horizontalalignment='center', fontsize=12)
    
    
    plt.show()

In [None]:


psuedoPoreC = np.zeros(poreC['run01'].shape)

for k,v in poreC.items():
    psuedoPoreC += v.matrix(balance=True)[:]
    

# kinda hacky
psuedoHiC = np.zeros(hiC[list(hiC.keys())[0]].shape)
    
for k,v in hiC.items():
    psuedoHiC += v.matrix(balance=True)[:]

print(f"{psuedoPoreC.shape=}")
print(f"{psuedoHiC.shape=}")


In [None]:
pMatBinary = np.where(psuedoPoreC > 0, 1, 0)
hMatBinary = np.where(psuedoHiC > 0, 1, 0)

A = np.zeros(psuedoHiC.shape)

# truncate the y chrome of the porec data
n = psuedoHiC.shape[0]
pMatBinary = pMatBinary[0:n, 0:n]

# get upper and lower triangles
pMatTri = np.triu(pMatBinary)
hMatTri = np.tril(hMatBinary)

A += pMatTri
A += hMatTri

ABinary = np.where(A > 0, 1, 0)

plt.matshow(ABinary, cmap='binary')
plt.title(f'Raw scPore-C Contacts Ensemble')
plt.text((n//2), n+100, f"Raw scHi-C Contacts Ensemble", 
         horizontalalignment='center', fontsize=12)


In [None]:
plt.close()
plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 3, 3

# chromosome 2
zippedKeys = list(zip(poreC.keys(), hiC.keys()))

for pKey, hKey in zippedKeys:
    
    print("------------------------")
    print(f"{pKey=}")
    print(f"{hKey=}")
    pMat = poreC[pKey].matrix(balance=True).fetch('NC_000068.8')[:]
    hMat = hiC[hKey].matrix(balance=True).fetch('chr2')[:]
    
    print(f"{pMat.shape}")
    print(f"{hMat.shape}")
    
    
    # binarize both
    pMatBinary = np.where(pMat > 0, 1, 0)
    hMatBinary = np.where(hMat > 0, 1, 0)
    
    # get upper and lower triangles
    pMatTri = np.triu(pMatBinary)
    hMatTri = np.tril(hMatBinary)
    
    A = np.zeros(pMat.shape)
    
    A += pMatTri
    A += hMatTri
    
    ABinary = np.where(A > 0, 1, 0)
    
    plt.matshow(ABinary, cmap='binary')
    plt.title(f'Raw scPore-C Contacts {pKey}')
    plt.text((n//2), n+100, f"Raw scHi-C Contacts {hKey}", 
             horizontalalignment='center', fontsize=12)
    
    plt.show()
    
    break

In [None]:
# A = np.zeros(psuedoHiC.shape)

# # truncate the y chrome of the porec data
# n = psuedoHiC.shape[0]
# psuedoPoreC = psuedoPoreC[0:n, 0:n]

# # get upper and lower triangles
# pMatTri = np.triu(psuedoPoreC)
# hMatTri = np.tril(psuedoHiC)

# A += pMatTri
# A += hMatTri

# plt.matshow(A, cmap='Reds')
# plt.title(f'Raw scPore-C Contacts Ensemble')
# plt.text((n//2), n+100, f"Raw scHi-C Contacts Ensemble", 
#          horizontalalignment='center', fontsize=12)