In [1]:
import pandas as pd
import pysam
import numpy as np
import akita_utils
import seaborn as sns
import bioframe as bf

In [2]:
from akita_utils.dna_utils import scan_motif, dna_1hot
from akita_utils.format_io import read_jaspar_to_numpy, read_rmsk
from akita_utils.tsv_gen_utils import filter_by_overlap_num

2023-10-13 10:50:22.125445: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/home1/smaruj/software/GSL/lib:/home1/smaruj/software/HTSLIB/lib
2023-10-13 10:50:22.125493: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


# reading tsv with all mouse ctcf motifs overlapping genomic boundaries

In [3]:
# Tab-separated table of mouse (mm10) CTCF motifs overlapping genomic boundaries.
all_ctcf_path = "/project/fudenber_735/tensorflow_models/akita/v2/analysis/to_insert_boundaries.motifs.ctcf.mm10.tsv"
# read_table uses a tab delimiter by default.
df = pd.read_table(all_ctcf_path)

Since the set of motifs is the same for all backgrounds, we can focus on a subset of the table corresponding to one background (e.g. the 0th background)

In [4]:
# Columns kept for clarity; every other column of the table is dropped.
columns_to_keep = [
    "boundary_index",
    "chrom",
    "boundary_end",
    "index",
    "num_ctcf",
    "span",
    "boundary_start",
    "strand",
    "start",
    "end",
]

# Select the rows of background 0 and the columns above in a single step.
df = df.loc[df["background_index"] == 0, columns_to_keep]

# filtering

## 1. by ctcf

In [5]:
# JASPAR table for motif MA0139.1 (mm10), parsed with bioframe's "jaspar" schema.
jaspar_file = "/project/fudenber_735/motifs/mm10/jaspar/MA0139.1.tsv.gz"
jaspar_df = bf.read_table(jaspar_file, schema="jaspar")

# First filter: sites in df are checked against the JASPAR table with max_overlap_num=1.
filtered_df = filter_by_overlap_num(df, filter_df=jaspar_df, max_overlap_num=1)

In [6]:
# Number of sites removed by the CTCF-overlap filter (rows before minus rows after).
df.shape[0] - filtered_df.shape[0]

2351

2351 sites have been excluded from the analysis since they overlap other CTCF sites.

## 2. by rmsk (a table of repetitive elements, e.g. SINE, LINE, etc.)

In [7]:
# rmsk table (repeat elements) for mm10, loaded with akita_utils' read_rmsk helper.
rmsk_file = "/project/fudenber_735/genomes/mm10/database/rmsk.txt.gz"
rmsk_df = read_rmsk(rmsk_file)

In [8]:
# Second filter: the already CTCF-filtered sites are now checked against the rmsk table.
# The call uses expand_window=20 and restricts the working columns to chrom/start/end.
# NOTE: this cell reassigns filtered_df from itself, so it is meant to be run once per kernel.
filtered_df = filter_by_overlap_num(
    filtered_df,
    rmsk_df,
    expand_window=20,
    working_df_cols=["chrom", "start", "end"],
)

In [9]:
# Number of sites remaining after both filters.
filtered_df.shape[0]

7560

More than half of the sites have been filtered out.    
It's okay since we prefer to work with a smaller set of high-quality sites (meaning that the effect of their insertion or disruption is not affected by overlapping CTCF sites or repetitive genomic elements).

# adding seq_id

In [10]:
# Sequential identifier (0, 1, 2, ...) for every remaining site, so that the same
# CTCF site can be identified between experiments.
seq_id = list(range(len(filtered_df)))
filtered_df["seq_id"] = seq_id

# tsv saving

In [12]:
# Save the filtered sites as a tab-separated file, without the DataFrame index.
filtered_df.to_csv(
    "./filtered_base_mouse_ctcf.tsv",
    sep="\t",
    index=False,
)

# ATTENTION   
## All the code below should be removed at some point in the future.   
## It has been a part of the development process.

# ctcf motifs scanning

In [None]:
# The strand annotation of each site comes from the original annotation in the JASPAR database.
# Since we want to study a potentially orientation-dependent phenomenon, let's double-check that our (local) CTCF-scanning method assigns the same strand as JASPAR's method.

In [13]:
# Path to the mm10 genome FASTA; opened later with pysam.
genome_file = "/project/fudenber_735/genomes/mm10/mm10.fa"

# reading CTCF motif from the jaspar file
# NOTE(review): in the cells visible here, get_forward_score / get_reverse_score call
# read_jaspar_to_numpy() themselves by default, so they do not read this module-level matrix.
motif_matrix = read_jaspar_to_numpy()

In [14]:
def _max_scan_score(s, strand, flank_bp=20, motif_matrix=None):
    """Return the maximum motif-scan score around one site for a given strand.

    Shared implementation behind get_forward_score and get_reverse_score.

    Parameters
    ----------
    s : row-like object
        Must expose ``chrom``, ``start`` and ``end`` attributes
        (e.g. a row passed by ``DataFrame.apply(..., axis=1)``).
    strand : str
        Forwarded unchanged to ``scan_motif`` as its ``strand`` argument
        ("forward" and "reverse" are the values used in this notebook).
    flank_bp : int, default 20
        Number of base pairs added on each side of the site before scanning.
    motif_matrix : optional
        Preloaded motif matrix. When None (the default) the matrix is loaded
        with ``read_jaspar_to_numpy()`` on every call, as before; passing a
        preloaded matrix avoids that repeated load when scoring many rows.

    Returns
    -------
    The ``.max()`` of whatever ``scan_motif`` returns for the fetched window.

    Notes
    -----
    Reads the sequence through the module-level ``genome_open`` handle, which
    must be open when this function is called.
    """
    if motif_matrix is None:
        motif_matrix = read_jaspar_to_numpy()

    # Clamp at 0: a site closer than flank_bp to the chromosome start would
    # otherwise request a negative coordinate, which pysam rejects.
    window_start = max(0, s.start - flank_bp)
    window_end = s.end + flank_bp

    sequence = genome_open.fetch(s.chrom, window_start, window_end).upper()
    return scan_motif(dna_1hot(sequence), motif_matrix, strand=strand).max()


def get_forward_score(s, flank_bp=20, motif_matrix=None):
    """Maximum forward-strand scan score for site ``s`` (see _max_scan_score)."""
    return _max_scan_score(s, "forward", flank_bp, motif_matrix)


def get_reverse_score(s, flank_bp=20, motif_matrix=None):
    """Maximum reverse-strand scan score for site ``s`` (see _max_scan_score)."""
    return _max_scan_score(s, "reverse", flank_bp, motif_matrix)


def get_scan_strand(s):
    """Return "+" when the forward scan score is at least the reverse one, else "-".

    ``s`` must expose ``forward_scan`` and ``reverse_scan``; ties resolve to "+".
    """
    forward_wins = s.forward_scan >= s.reverse_scan
    return "+" if forward_wins else "-"


def check_mismatch(s):
    """Return 1 if the annotated strand differs from the scanned strand, else 0.

    ``s`` must expose ``strand`` and ``scan_strand``.
    """
    return int(s.strand != s.scan_strand)


def get_rf_diff(s):
    """Return the absolute difference between the forward and reverse scan scores.

    ``s`` must expose ``forward_scan`` and ``reverse_scan``.
    """
    return abs(s.forward_scan - s.reverse_scan)

In [15]:
# get_forward_score / get_reverse_score read the genome through the module-level
# name `genome_open`, so it has to be bound before the apply calls below.
genome_open = pysam.Fastafile(genome_file)

try:
    # Per-site maximum scan score on each strand (the only steps that touch the FASTA).
    filtered_df.loc[:, "forward_scan"] = filtered_df.apply(get_forward_score, axis=1)
    filtered_df.loc[:, "reverse_scan"] = filtered_df.apply(get_reverse_score, axis=1)
finally:
    # Release the FASTA handle even if one of the scans raises.
    genome_open.close()

# Derived columns: strand implied by the scan, whether it disagrees with the
# annotated strand (1/0), and the absolute forward/reverse score difference.
filtered_df.loc[:, "scan_strand"] = filtered_df.apply(get_scan_strand, axis=1)
filtered_df.loc[:, "mismatch"] = filtered_df.apply(check_mismatch, axis=1)
filtered_df.loc[:, "rf_diff"] = filtered_df.apply(get_rf_diff, axis=1)

2023-10-13 10:52:52.966495: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/home1/smaruj/software/GSL/lib:/home1/smaruj/software/HTSLIB/lib
2023-10-13 10:52:52.966540: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2023-10-13 10:52:52.966588: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (b02-07.hpc.usc.edu): /proc/driver/nvidia/version does not exist
2023-10-13 10:52:52.966933: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Do we have any sites with mismatches?

In [16]:
# Sites whose annotated strand disagrees with the scan-derived strand (mismatch flag set to 1).
mismatch_df = filtered_df.loc[filtered_df["mismatch"].eq(1)]

In [17]:
# seq_id values of the mismatched sites, as a plain Python list.
mismatched_ids = mismatch_df["seq_id"].tolist()

In [18]:
# Number of sites whose annotated strand differs from the strand inferred by the local scan.
len(mismatched_ids)

54

In [19]:
# Write one mismatched seq_id per line.
# A generator is used instead of a top-level `for id in ...` loop: at notebook top level
# such a loop would rebind the builtin name `id` in the kernel namespace for every later cell.
with open("/home1/smaruj/akita_utils/bin/insert_virtual_dots_vs_boundaries/analysis/mismatched_ids.txt", "w") as outfile:
    outfile.writelines(str(seq) + "\n" for seq in mismatched_ids)