# Background creation and parameter exploration

## Explored
- Creation of a flat background/seq
- Influential parameters in creating a flat background/seq

## Goal
- Obtain a standard set of parameters for creating a flat seq.

In [1]:
# Importing libraries

from basenji import dataset, seqnn, dna_io,stream
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import bioframe 
import pandas as pd
import numpy as np 
import os
import json
from io import StringIO
import seaborn as sns
import akita_utils 
import tensorflow as tf
import pysam
import h5py

from skimage.measure import block_reduce

# Getting genomic data

In [2]:
# Path to the mm10 reference genome FASTA; reused by later cells that need
# to read genomic sequence or pass the genome location to helper functions.
genome_fasta = '/project/fudenber_735/genomes/mm10/mm10.fa'
# Open FASTA handle used to fetch sequence by (chrom, start, end).
genome_open = pysam.Fastafile(genome_fasta)

# Loading akita model

In [3]:
# Hide all CUDA devices from TensorFlow so the model runs on CPU.
os.environ["CUDA_VISIBLE_DEVICES"] = '-1'
print(tf.__version__)

head_i = 1 #mouse
model_num = 1 #which fold to use

# Build the paths with os.path.join so no duplicated '/' separators appear.
base_dir = '/project/fudenber_735/tensorflow_models/akita/v2/models/'
model_dir = os.path.join(base_dir, f"f{model_num}c0", "train")
model_file = os.path.join(model_dir, f"model{head_i}_best.h5")
params_file = os.path.join(model_dir, "params.json")

with open(params_file) as params_open:
    params = json.load(params_open)
params_model = params['model']
params_train = params['train']

seq_length = params_model['seq_length']
params_model['verbose'] = False
seqnn_model = seqnn.SeqNN(params_model)

print('built')
seqnn_model.restore(model_file, head_i=head_i)
print('restored')

hic_diags = params_model['diagonal_offset']
try:
    # Preferred location of the cropping value: second-to-last trunk entry.
    target_crop = params_model['trunk'][-2]['cropping']
except (KeyError, IndexError, TypeError):
    # Only a missing/misshaped trunk entry triggers the fallback; any other
    # error (e.g. KeyboardInterrupt) now propagates instead of being hidden.
    target_crop = params_model['target_crop']

# 2048 is the divisor hard-coded in the original expressions
# (presumably bp per output bin — TODO confirm against the model params).
bin_size = 2048
target_map_size = seq_length // bin_size - target_crop * 2
# Bins left after excluding the first `hic_diags` diagonals.
n_diag_bins = target_map_size - hic_diags
# Triangular number; n * (n + 1) is always even, so integer division is exact.
target_length_cropped = n_diag_bins * (n_diag_bins + 1) // 2
# Use the model's diagonal offset rather than a hard-coded 2.
triu_tup = np.triu_indices(target_map_size, hic_diags)
target_map_size, target_length_cropped, triu_tup[0].shape

2.4.1


2022-12-04 22:58:13.664961: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


built
restored


(512, 130305, (130305,))

# Fetching chromosome data to explore background creation

In [4]:
# NOTE(review): the sequences BED file is read via `read_chromsizes`, so the
# values of the resulting 'length' column are used below as window *start*
# coordinates (the column is renamed to 'start') — confirm this is intended.
chromsizes = bioframe.read_chromsizes('/project/fudenber_735/tensorflow_models/akita/v2/data/mm10/sequences.bed')
dframe = pd.DataFrame(chromsizes)
# Every window is 1310720 bp wide (end - start in the table displayed below).
dframe['end'] = dframe['length'] + 1310720
dframe = dframe.reset_index()
# Non in-place rename keeps the cell idempotent on re-run.
dframe = dframe.rename(columns={'index': 'chrom', 'length': 'start'})
# Reuse `genome_fasta` instead of repeating the genome path literal.
df = bioframe.frac_gc(dframe, bioframe.load_fasta(genome_fasta), return_input=True)

df

Unnamed: 0,chrom,start,end,GC
0,chr1,40415232,41725952,0.407777
1,chr1,42708992,44019712,0.424985
2,chr1,42053632,43364352,0.424847
3,chr1,41725952,43036672,0.411407
4,chr1,43036672,44347392,0.417725
...,...,...,...,...
6033,chrX,99241984,100552704,0.424162
6034,chrX,120532992,121843712,0.367963
6035,chrX,139360256,140670976,0.407839
6036,chrX,128737280,130048000,0.370827


In [5]:
# visualizing the gc content distribution

# plt.hist( df['GC'].values, 100);
# np.percentile(df['GC'].dropna().values, np.linspace(1,99,5))

# Generating a sample for down stream analysis from GC content distribution

In [6]:
# For each of 50 evenly spaced GC percentiles (1st..99th), keep the first
# row of `df` whose GC value lies within +/- `error` of that percentile.
# Percentiles with no row inside the tolerance contribute nothing.
error = 0.0001
gc_values = df['GC'].values
gc_percentiles = np.percentile(df['GC'].dropna().values, np.linspace(1,99,50))

super_set = []
for gc in gc_percentiles:
    within_tol = (gc_values >= gc - error) & (gc_values <= gc + error)
    hits = np.flatnonzero(within_tol)
    if hits.size > 0:
        super_set.append(int(hits[0]))

# Drop duplicates (two percentiles may select the same row).
super_set = list(set(super_set))
print(f'Whole distribution: {super_set}')
sample_set = super_set
print(f'Sampled datapoints: {sample_set}')

Whole distribution: [128, 1, 771, 135, 778, 1034, 267, 11, 2570, 399, 273, 22, 538, 283, 285, 803, 41, 173, 1072, 560, 570, 315, 572, 190, 1216, 706, 462, 81, 850, 600, 345, 218, 352, 1633, 353, 229, 1510, 102, 360, 235, 496, 754, 114, 242, 1909, 375, 2424, 249, 1147, 1406]
Sampled datapoints: [128, 1, 771, 135, 778, 1034, 267, 11, 2570, 399, 273, 22, 538, 283, 285, 803, 41, 173, 1072, 560, 570, 315, 572, 190, 1216, 706, 462, 81, 850, 600, 345, 218, 352, 1633, 353, 229, 1510, 102, 360, 235, 496, 754, 114, 242, 1909, 375, 2424, 249, 1147, 1406]


# Making predictions for the sampled data

In [7]:
# One-hot encode the genomic sequence of every sampled window, then run the
# model on all of them. The FASTA handle opened earlier (`genome_open`) is
# reused; previously a new handle was opened per window and never closed.
seqs_1hot = []

for ind in set(sample_set):
    chrom, start, end, gc = df.iloc[ind][['chrom','start','end','GC']]
    seq = genome_open.fetch(chrom, start, end).upper()
    seqs_1hot.append(dna_io.dna_1hot(seq))

# Inputs and outputs get distinct names so `predictions` always refers to
# model output, in the iteration order of `set(sample_set)`.
predictions = seqnn_model.predict(np.array(seqs_1hot), batch_size=6)

2022-12-04 23:00:33.103945: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2022-12-04 23:00:33.104613: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2100000000 Hz


# Creating flat background seqs from sampled data

In [8]:
# Parameters shared by every downstream analysis cell.
max_iters, batch_size = 10, 5
shuffle_k, ctcf_thresh = 8, 8
scores_thresh, scores_pixelwise_thresh = 5500, .05

# Rows of `df` selected for the analysis, in `set(sample_set)` order.
new_dataframe = df.iloc[list(set(sample_set))]
# ---------------------------------------------------------------

# Value grids compared further down:
#   shuffle_set     - shuffling basepairs to sample for comparison
#   ctcf_thresh_set - number of ctcf motifs to sample for comparison
shuffle_set = [2,4,8]
ctcf_thresh_set = [8,15,30]

In [None]:
# Gather the keyword arguments first so the settings used for flat-sequence
# creation can be read (and inspected) in one place.
flat_seq_kwargs = {
    "seqnn_model": seqnn_model,
    "genome_fasta": genome_fasta,
    "seq_length": seq_length,
    "dataframe": new_dataframe,
    "max_iters": max_iters,
    "batch_size": batch_size,
    "shuffle_k": shuffle_k,
    "ctcf_thresh": ctcf_thresh,
    "scores_thresh": scores_thresh,
    "scores_pixelwise_thresh": scores_pixelwise_thresh,
}
flat_seqs = akita_utils.create_flat_seqs(**flat_seq_kwargs)

no success but last iteration kept, final time 112.16254425048828
success: best seq, thresh 6570.0  pixelwise 0.0459 time 22.11513113975525


  return umr_sum(a, axis, dtype, out, keepdims, initial, where)


no success but last iteration kept, final time 110.52381920814514
no success but last iteration kept, final time 112.13505816459656
success: best seq, thresh 5540.0  pixelwise 0.04678 time 21.53683829307556
success: best seq, thresh 6692.0  pixelwise 0.04953 time 33.164093017578125
success: best seq, thresh 5944.0  pixelwise 0.047 time 10.661543130874634
no success but last iteration kept, final time 106.16006755828857
no success but last iteration kept, final time 108.46061110496521
success: best seq, thresh 6820.0  pixelwise 0.04813 time 30.193559646606445
no success but last iteration kept, final time 107.43526458740234
no success but last iteration kept, final time 109.7754316329956
no success but last iteration kept, final time 108.247891664505
no success but last iteration kept, final time 107.8652548789978
no success but last iteration kept, final time 108.85998511314392
no success but last iteration kept, final time 107.58494901657104
success: best seq, thresh 7788.0  pixelwise

# Visualizing the created flat seqs' contact maps

In [None]:
target_ind = 0  # index of the model target (last axis of the dense map) to display
vmin=-0.4; vmax=0.4


def _plot_contact_map(pred, ax, title):
    """Draw one predicted contact map as a heatmap on `ax`.

    Parameters
    ----------
    pred : upper-triangular prediction for a single sequence, passed to
        `akita_utils.ut_dense` together with `hic_diags`.
        (presumably shaped (n_pixels, n_targets) — TODO confirm)
    ax : matplotlib Axes to draw on.
    title : str, title shown above the heatmap.
    """
    # Select the target with `target_ind`; the previous code indexed this axis
    # with the sample counter, which shows a different target for every sample
    # and fails once the counter exceeds the number of targets.
    dense_map = akita_utils.ut_dense(pred, hic_diags)[..., target_ind]
    # TEMP: reduce resolution
    dense_map = block_reduce(dense_map, (2, 2), np.mean)
    sns.heatmap(
        dense_map,
        ax=ax,
        center=0,
        vmin=vmin,
        vmax=vmax,
        cmap="RdBu_r",
        xticklabels=False,
        yticklabels=False,
    )
    ax.set_title(title)


# One figure per sampled window: original genomic prediction on the left,
# prediction for the generated flat sequence on the right.
# NOTE(review): assumes `predictions` and `flat_seqs` are in the same order
# (both derive from `set(sample_set)`) and that element 1 of each `flat_seqs`
# entry is its prediction — confirm against `create_flat_seqs`.
for ti in range(len(sample_set)):
    fig, axs = plt.subplots(1, 2, figsize=(12, 4))
    _plot_contact_map(predictions[ti], axs[0], f"sample {ti}: genomic sequence")
    _plot_contact_map(flat_seqs[ti][1], axs[1], f"sample {ti}: flat sequence")
    plt.show()
    # Release the figure so dozens of open figures do not accumulate.
    plt.close(fig)

# General distribution of scores

In [None]:
# Score distributions for every (sampled window, shuffle k) combination.
# Keys are (row index into `df`, k).
scores_before = {}

for ind in sample_set:
    # Single-row frame for this window. A dedicated name is used so the
    # shared `new_dataframe` (input of the flat-seq creation cell) is not
    # overwritten as a side effect of running this cell.
    window_df = df.iloc[[ind]]
    for k in shuffle_set:
        scores_before[ind, k] = akita_utils.custom_calculate_scores(
            seqnn_model=seqnn_model,
            genome_fasta=genome_fasta,
            seq_length=seq_length,
            dataframe=window_df,
            max_iters=max_iters,
            batch_size=batch_size,
            shuffle_k=k,
            ctcf_thresh=ctcf_thresh,
            scores_thresh=scores_thresh,
            scores_pixelwise_thresh=scores_pixelwise_thresh,
        )
        
# scores_before

# Visualizing the scores' distribution for the created flat seqs

In [None]:
# Grid of KDE plots: one row per sampled window, one column per shuffle k.
fig = plt.figure(figsize=(6* len(shuffle_set) , 6 *  len(sample_set) ), constrained_layout=True)
spec = fig.add_gridspec(ncols=len(shuffle_set), nrows=len(sample_set), hspace=0.1, wspace=0.1)

for row, ind in enumerate(sample_set):
    gc = df.iloc[ind]['GC']
    for col, k in enumerate(shuffle_set):
        ax1 = fig.add_subplot(spec[row, col])

        # Pool the scores of every entry. The previous `x =+ i` was parsed as
        # `x = +i`, so only the last entry was ever plotted.
        # np.ravel accepts scalars, lists and arrays alike.
        pooled_scores = [s for batch in scores_before[ind, k] for s in np.ravel(batch)]

        kde_df_before = pd.DataFrame(pooled_scores, columns=["score"])
        sns.kdeplot(data=kde_df_before, x="score", bw_adjust=.2, fill=True, ax=ax1)
        ax1.set_title(f'GC_{gc} k_{k} score distributions')

plt.show()
plt.close(fig)

# Some observations

As the GC content decreases, lower scores (which are preferred) are obtained.
Generally, the lowest scores are obtained with k=8.

This implies that k=8 is the best option over a wide range of the GC content distribution, and it will therefore be used further on.

In [None]:
# shuffle_set = [4,8]

# Now we focus on how to mask out some CTCF motifs to lower the structure in our maps (this is discussed in detail in the mutation method notebook)


# Some observations

There are three ways to go about this:
- just randomly shuffle the whole seq
- scan for CTCF and randomly shuffle the motifs
- scan for CTCF and manually shuffle the motifs

## RESULTS
In terms of scores, all these cases tend to produce the same distribution.
In terms of time to a successful iteration, there are slight differences, as observed in the mutation method notebook. This could depend strongly on the GC content of the seq under investigation.

# Now we try different thresholds to the minimum CTCF motifs in a seq above which we start the shuffling process (ctcf_thresh)


In [None]:
# Score distributions for every (sampled window, CTCF motif threshold)
# combination, with masking enabled. Keys are (row index into `df`, threshold).
ctcf_thresh_after = {}

for ind in sample_set:
    # Dedicated name: do not overwrite the shared `new_dataframe`.
    window_df = df.iloc[[ind]]
    for thresh in ctcf_thresh_set:
        ctcf_thresh_after[ind, thresh] = akita_utils.custom_calculate_scores(
            seqnn_model=seqnn_model,
            genome_fasta=genome_fasta,
            seq_length=seq_length,
            dataframe=window_df,
            max_iters=max_iters,
            batch_size=batch_size,
            # k chosen in the parameter cell (8) instead of a repeated literal.
            shuffle_k=shuffle_k,
            # The swept parameter: minimum CTCF motif count.
            ctcf_thresh=thresh,
            scores_thresh=scores_thresh,
            scores_pixelwise_thresh=scores_pixelwise_thresh,
            masking=True,
        )

# Visualizing the scores' distribution for the created seqs

In [None]:
# Grid of KDE plots: one row per sampled window, one column per CTCF threshold.
fig1 = plt.figure(figsize=(6* len(ctcf_thresh_set) , 6 *  len(sample_set) ), constrained_layout=True)
spec1 = fig1.add_gridspec(ncols=len(ctcf_thresh_set), nrows=len(sample_set), hspace=0.1, wspace=0.1)

for row, ind in enumerate(sample_set):
    gc = df.iloc[ind]['GC']
    for col, thresh in enumerate(ctcf_thresh_set):
        ax1 = fig1.add_subplot(spec1[row, col])

        # Pool the scores of every entry. The previous `x =+ i` was parsed as
        # `x = +i`, so only the last entry was ever plotted.
        pooled_scores = [s for batch in ctcf_thresh_after[ind, thresh] for s in np.ravel(batch)]

        kde_df_after = pd.DataFrame(pooled_scores, columns=["score"])
        sns.kdeplot(data=kde_df_after, x="score", bw_adjust=.2, fill=True, ax=ax1)
        # The swept value is the CTCF threshold, not the scores threshold.
        ax1.set_title(f'GC_{gc} ctcf_thresh_{thresh}')

plt.show()
plt.close(fig1)

# Some observations


Flat map creation has little or no dependence on the CTCF threshold and is more dependent on GC content.

# Now we focus on effect of masking and how the basepairs are shuffled.  

In [None]:
# Score distributions for every (sampled window, shuffle k) combination with
# masking disabled. Keys are (row index into `df`, k).
ctcf_shuffle_after = {}

for ind in sample_set:
    # Dedicated name: do not overwrite the shared `new_dataframe`.
    window_df = df.iloc[[ind]]
    for k in shuffle_set:
        ctcf_shuffle_after[ind, k] = akita_utils.custom_calculate_scores(
            seqnn_model=seqnn_model,
            genome_fasta=genome_fasta,
            seq_length=seq_length,
            dataframe=window_df,
            max_iters=max_iters,
            batch_size=batch_size,
            shuffle_k=k,
            ctcf_thresh=ctcf_thresh,
            scores_thresh=scores_thresh,
            scores_pixelwise_thresh=scores_pixelwise_thresh,
            masking=False,
        )

# Visualizing the resultant score distributions.

In [None]:
# Grid of KDE plots comparing `ctcf_shuffle_after` ('after') with
# `scores_before` ('before'): one row per sampled window, one column per k.
fig = plt.figure(figsize=(6* len(shuffle_set) , 6 *  len(sample_set) ), constrained_layout=True)
spec = fig.add_gridspec(ncols=len(shuffle_set), nrows=len(sample_set), hspace=0.1, wspace=0.1)

for row, ind in enumerate(sample_set):
    gc = df.iloc[ind]['GC']
    for col, k in enumerate(shuffle_set):
        ax1 = fig.add_subplot(spec[row, col])

        # Pool the scores of every entry. The previous `x =+ i` was parsed as
        # `x = +i`, so only the last entry of each collection was plotted.
        pooled_after = [s for batch in ctcf_shuffle_after[ind, k] for s in np.ravel(batch)]
        pooled_before = [s for batch in scores_before[ind, k] for s in np.ravel(batch)]

        kde_df_after = pd.DataFrame(pooled_after, columns=["score"])
        kde_df_before = pd.DataFrame(pooled_before, columns=["score"])
        sns.kdeplot(data=kde_df_after, x="score", bw_adjust=.2, label='after', fill=True, ax=ax1)
        sns.kdeplot(data=kde_df_before, x="score", bw_adjust=.2, label='before', fill=True, ax=ax1)
        ax1.legend()
        ax1.set_title(f'GC_{gc} k_{k} before and after masking')

plt.show()
plt.close(fig)

# Final observations

The first parameter to focus on when designing a background seq is the GC content of the initial seq.
Seqs with lower GC content are preferred.

The second parameter is the number of basepairs shuffled together (k); k=8 is preferred.

The third is whether masking is necessary. According to preliminary results, the difference in scores and in time to a successful iteration seems to be minimal, but further analysis is underway.