# Notebook for data inspection

Imports

In [2]:
import time
import os
import importlib

from pprint import pprint
from collections import defaultdict
from pathlib import Path
from more_itertools import grouper
from itertools import islice, chain, zip_longest, repeat

import numpy as np
import pandas as pd

import matplotlib as mpl
from matplotlib import pyplot as plt
from matplotlib import gridspec
import matplotlib.axes as ax
import seaborn as sns

import scipy
from scipy import io
from scipy.signal import convolve
from scipy.sparse import coo_matrix, csr_matrix
from scipy.stats import pearsonr
from scipy.cluster.hierarchy import cut_tree

from sklearn.utils import shuffle
from sklearn.preprocessing import OneHotEncoder

from Modules import utils, plot_utils

Useful global variables

In [3]:
# Re-import the local helper module so edits made to it on disk are picked up
# without restarting the kernel.
importlib.reload(utils)

# Root folders for reading inputs and writing outputs (currently the same).
data_dir = '../shared_folder'
writing_dir = '../shared_folder'
# writing_dir='../'

# Default matplotlib colour cycle, exposed as a plain list of colour values.
prop_cycle = plt.rcParams['axes.prop_cycle']
colors = prop_cycle.by_key()['color']

## Read checks

Bombyx long reads vs short reads

In [None]:
# Load the Bombyx long-read and short-read arrays; each archive stores its
# content under the 'reads' key.
with np.load(f'{data_dir}/Bombyx/raw_data/X_long.npz') as long_archive:
    long_reads = long_archive['reads']

with np.load(f'{data_dir}/Bombyx/raw_data/X_reads.npz') as short_archive:
    reads = short_archive['reads']

In [None]:
def _total_length_checked(read_array):
    """Return the summed length of all reads.

    Raises AssertionError on the first read that carries trailing whitespace
    (i.e. whose length changes after ``rstrip()``).
    """
    running_total = 0
    for read in read_array:
        assert len(read) == len(read.rstrip())
        running_total += len(read)
    return running_total


# Compare the total number of characters in the long reads and the short reads.
total = _total_length_checked(long_reads)
print(total)
total2 = _total_length_checked(reads)
print(total2)

Total number of reads vs number of reads without Ns in training data

In [None]:
# Load the control reads of the Judith-H3K9me3 experiment and report how many
# there are.
with np.load(f'{data_dir}/Judith-H3K9me3/raw_data/Control_reads.npz') as control_archive:
    reads = control_archive['reads']
print(reads.shape)

# Optional follow-up (disabled): count the reads left after dropping those
# containing N.
# sequences = utils.remove_reads_with_N(reads, tolerance=0)
# print(len(sequences))

(40956444,)


Total number of reads vs number of reads without Ns in post-selected data

In [None]:
# Identify which post-selection result file to inspect.
data = 'CENPT'
model_name = 'model_inception2'
threshold = 0.75

# Load the reads saved for this dataset/model/threshold combination.
with np.load(f'{data_dir}/{data}/results/{model_name}/seqs_{data}_over_{threshold}_with_{model_name}.npz') as selected_archive:
    reads = selected_archive['reads']
print(reads.shape)

# Compare against the number of reads kept by the N filter with zero tolerance
# (presumably this drops every read containing at least one N; see
# utils.remove_reads_with_N to confirm).
sequences = utils.remove_reads_with_N(reads, tolerance=0)
print(len(sequences))

Read length check

In [None]:
# Tally reads per length and show the tally ordered from the largest key down.
length_counts = utils.check_read_lengths(reads)
dico = dict(sorted(length_counts.items(), reverse=True))
print('{read_length: nb_reads}')
print(dico)

Duplicate checks, in total vs without Ns

In [None]:
# Use the test split of the 30M H3K9me3 dataset as input for the duplicate checks.
with np.load(f'{data_dir}/H3K9me3/dataset30M.npz') as dataset_archive:
    reads = dataset_archive['x_test']

In [None]:
# Alternative input (disabled): deduplicated HEK293-ZFAT control reads, with
# the N-filtered version stored in `sequences`.
# with np.load(f'{data_dir}/HEK293-ZFAT/raw_data/Control_reads_deduped.npz') as f:
#     reads = f['reads']
# print(reads.shape)
# sequences = utils.remove_reads_with_N(reads, tolerance=0)
# print(len(sequences))

# First pass: count how many times each read occurs.
print('Looking for read duplicates')
dico = utils.find_duplicates(
    reads,
    one_hot=False,
    batch_size=50_000_000,
    print_freq=20_000_000,
)
values = list(dico.values())

# Second pass: run the same counting on the counts themselves, giving the
# number of unique reads found at each duplicate level.
print('Looking for duplicate level duplicates')
level_counts = utils.find_duplicates(
    values,
    batch_size=100_000_000,
    print_freq=20_000_000,
)
dico2 = dict(sorted(level_counts.items(), reverse=True))
print('{duplicate_level: nb_unique_reads}')
print(dico2)
print('top 5 duplicate level reads:', list(dico2)[:5])

# Same analysis on the N-filtered `sequences` (disabled).
# print('Looking for read duplicates')
# dico = utils.find_duplicates(sequences, one_hot=False, batch_size=50_000_000, print_freq=20_000_000)
# values = list(dico.values())
# print('Looking for duplicate level duplicates')
# dico2 = utils.find_duplicates(values, batch_size=100_000_000, print_freq=20_000_000)
# dico2 = dict(sorted(dico2.items(), reverse=True))
# print('{duplicate_level: nb_unique_reads}')
# print(dico2)
# print('top 5 duplicate level reads:', list(dico2.keys())[:5])

Looking for read duplicates
1 batches
Processing batch 0
seq 20000000/40956444 duplicates
seq 40000000/40956444 duplicates
seq 40956444/40956444 duplicates
Looking for duplicate level duplicates
1 batches
Processing batch 0
seq 20000000/39066493 duplicates
seq 39066493/39066493 duplicates
{duplicate_level: nb_unique_reads}
{126: 1, 125: 1, 121: 1, 112: 1, 110: 1, 109: 1, 108: 1, 104: 2, 99: 1, 98: 2, 97: 2, 96: 1, 95: 1, 94: 2, 93: 1, 92: 1, 91: 1, 90: 1, 89: 1, 87: 1, 85: 2, 84: 1, 82: 2, 81: 4, 80: 2, 79: 3, 78: 1, 77: 2, 76: 2, 75: 2, 74: 6, 73: 8, 72: 4, 71: 7, 70: 7, 69: 5, 68: 5, 67: 11, 66: 10, 65: 11, 64: 13, 63: 16, 62: 10, 61: 16, 60: 18, 59: 22, 58: 17, 57: 21, 56: 32, 55: 19, 54: 35, 53: 43, 52: 33, 51: 32, 50: 52, 49: 46, 48: 35, 47: 46, 46: 53, 45: 53, 44: 61, 43: 86, 42: 87, 41: 89, 40: 88, 39: 105, 38: 101, 37: 122, 36: 142, 35: 152, 34: 173, 33: 171, 32: 203, 31: 197, 30: 230, 29: 261, 28: 315, 27: 326, 26: 388, 25: 389, 24: 457, 23: 529, 22: 573, 21: 705, 20: 804, 19:

In [None]:
# Collect every read whose recorded count in `dico` is exactly 110, then
# confirm the count of the first one.
selected_reads = []
for read, count in dico.items():
    if count == 110:
        selected_reads.append(read)
print(selected_reads)
first_selected = selected_reads[0]
print(dico[first_selected])

['CAGACTTTACAAACAGAGTGTTTCCTAACTGCTCTATGAAAAGAAAGGTTAAACTCTGTGAGTTGAACGCACACATCACAAAGGAGTTTCTGAGAATCATT']
110


Human telomere sequences check

In [None]:
# Human telomere sequences: the listed reads cover the different phases of the
# telomeric repeat on both strands.
seq_list = [
    'CTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC',
    'CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAA',
    'CCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTA',
    'ACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCT',
    'TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACC',
    'AACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCC',
    'GGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTA',
    'GTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAG',
    'TTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGG',
    'TAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGG',
    'AGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGT',
    'GGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTT'
]

# Report how often each telomeric read occurs in `dico`; reads that are absent
# are reported with a count of 0.
for seq in seq_list:
    print(dico.get(seq, 0), seq)

Plot duplicate levels

In [None]:
# Plot the duplicate-level histogram on log-log axes.
#
# The histogram is held in `dico2` ({duplicate_level: nb_unique_reads}), not in
# `dico`: `dico` is keyed by the read strings themselves, so plotting it would
# put tens of millions of sequences on the x axis instead of duplicate levels.
x = list(dico2.keys())    # duplicate levels
y = list(dico2.values())  # number of unique reads observed at each level
plt.plot(x, y)
plt.yscale('log')
plt.xscale('log')
plt.ylabel('occurences')
plt.xlabel('duplicate level')
plt.show()
plt.close()

Inspect simple dataset labels

In [None]:
# Load the label arrays of the three splits of the simple dataset.
with np.load(f'{data_dir}/Judith-H3K9me3/dataset.npz') as dataset:
    y_train, y_valid, y_test = (
        dataset[key] for key in ('y_train', 'y_valid', 'y_test')
    )

# Split sizes.
n_train, n_valid, n_test = len(y_train), len(y_valid), len(y_test)
print('total number of reads:', n_train + n_valid + n_test)
print('train:', n_train, 'valid:', n_valid, 'test:', n_test)

# Number of labels equal to 1 in each split.
print(
    'pos_train:', np.count_nonzero(y_train == 1),
    'pos_valid', np.count_nonzero(y_valid == 1),
    'pos_test', np.count_nonzero(y_test == 1),
)

total number of reads: 60000000
train: 42000000 valid: 9000000 test: 9000000
pos_train: 21000000 pos_valid 4500000 pos_test 4500000


Inspect sharded dataset

In [6]:
# Load one training shard: read identifiers and their one-hot encodings.
with np.load(f'{data_dir}/H3K9me3-GSE175752/sharded_dataset/train_12.npz') as shard:
    ids, one_hots = shard['ids'], shard['one_hots']

In [7]:
# Show the array shapes, then preview the first 11 entries as
# "<read id> <decoded sequence>".
print(ids.shape)
print(one_hots.shape)
for read_id, encoded_read in islice(zip(ids, one_hots), 11):
    print(read_id, utils.one_hot_decode(encoded_read))

(14803404,)
(14803404, 76, 4)
@SRR14678338.22955188 22955188 length=76 ATAAATATGTATTGAGCTTTTTCTATATGCCAGCAACTAGTGGAAATATAGCATTCAACAAGACAAACATTGTCCT
@SRR14678332.23065510 23065510 length=76 TGCTTGACTTCTTGTCTCTAGTATTTGAAATCTTCCTTGCATATGATTGTCTCATTACCTTCCTAAAATCTAGTTC
@SRR14678339.536513 536513 length=76 AGTATGAAAGAATAGAAGTGTTTCTCATAGCATCATGTCATCCTTCTGGAACCTAAGCACGTTCTAGTGAGAATGG
@SRR14678336.26889983 26889983 length=76 CTCTTTTGACATCAAACCCTTTCCCCTTTTTTCTGCGCCATGTGTGAAGGACAGGCAAATGGTCTGAGAAGAGAAC
@SRR14678339.17193672 17193672 length=76 GTGGTGGTGCGTGCTTCTAATCCCAGCTACTCAGGAGGCTGAGGCAGAAGAACCGCTTGAACCGGGAGGCAGAGGT
@SRR14678330.13389704 13389704 length=76 GTTGTCATTTCCAATTAAGCAAGGGTTGTAATACAGGGAAGTGTGTTTCTAAAATTGTGAAATTGTTCTTATCTAT
@SRR14678339.20987926 20987926 length=76 AAGAATGCCTTTAAGCAATTTTCTGCCCTGGGTGGGCCAGGTGTTCCTTGCCCTCATTCTGGTAAACCCACAACCT
@SRR14678330.21345252 21345252 length=76 GTAGTTTGGATATTTGTCCTGCAAATCGCAAGTTGAAATTTGATCCCCACTTTTGGACTTAGAGCCTATTGAGAGG
@SRR14678342.6755082 6755082 l

In [6]:
# Chromosome identifier -> sequence accession for the hg38 assembly.
hg38_chr_names = {
    1: 'NC_000001.11',
    2: 'NC_000002.12',
    3: 'NC_000003.12',
    4: 'NC_000004.12',
    5: 'NC_000005.10',
    6: 'NC_000006.12',
    7: 'NC_000007.14',
    8: 'NC_000008.11',
    9: 'NC_000009.12',
    10: 'NC_000010.11',
    11: 'NC_000011.10',
    12: 'NC_000012.12',
    13: 'NC_000013.11',
    14: 'NC_000014.9',
    15: 'NC_000015.10',
    16: 'NC_000016.10',
    17: 'NC_000017.11',
    18: 'NC_000018.10',
    19: 'NC_000019.10',
    20: 'NC_000020.11',
    21: 'NC_000021.9',
    22: 'NC_000022.11',
    'X': 'NC_000023.11',
    'Y': 'NC_000024.10',
}

# Print the shape of the one-hot encoded sequence stored for every chromosome.
for chr_id in hg38_chr_names:
    genome_file = f'{data_dir}/Human/assembly/GRCh38/chr{chr_id}.npz'
    with np.load(genome_file) as genome_archive:
        one_hot = genome_archive['one_hot_genome']
    print(chr_id, one_hot.shape)

1 (248956422, 4)
2 (242193529, 4)
3 (198295559, 4)
4 (190214555, 4)
5 (181538259, 4)
6 (170805979, 4)
7 (159345973, 4)
8 (145138636, 4)
9 (138394717, 4)
10 (133797422, 4)
11 (135086622, 4)
12 (133275309, 4)
13 (114364328, 4)
14 (107043718, 4)
15 (101991189, 4)
16 (90338345, 4)
17 (83257441, 4)
18 (80373285, 4)
19 (58617616, 4)
20 (64444167, 4)
21 (46709983, 4)
22 (50818468, 4)
X (156040895, 4)
Y (57227415, 4)
