In [1]:
import pandas as pd
import pysam
import numpy as np
import akita_utils

import seaborn as sns

import bioframe as bf

In [2]:
# Tab-separated table of CTCF sites whose annotated strand disagrees with our scan.
mismatched_strand_path = "./filtered_base_mouse_ctcf_mismatched.tsv"
# read_table uses a tab separator by default.
mismatched_strand_df = pd.read_table(mismatched_strand_path)

In [3]:
# Keep only the coordinate, strand and score columns used by the cells below.
mismatched_strand_df = mismatched_strand_df.drop(
    labels=[
        "boundary_index",
        "boundary_end",
        "index",
        "num_ctcf",
        "span",
        "boundary_start",
        "count",
        "count.1",
        "rf_diff",
        "nf_forward_reverse",
    ],
    axis="columns",
)

In [4]:
# Preview the first five rows of the trimmed table.
mismatched_strand_df.iloc[:5]

Unnamed: 0,chrom,strand,start,end,forward_scan,reverse_scan,scan_strand,mismatch,nf_forward,nf_reverse,nf_strand_scan
0,chr1,+,128367145,128367164,9.578156,9.731974,-,1,9.578156,9.230873,+
1,chr1,+,160197509,160197528,9.499476,9.555386,-,1,9.499476,6.302327,+
2,chr1,-,161735161,161735180,9.41067,9.245034,+,1,8.55164,9.245034,-
3,chr1,+,184790806,184790825,9.374499,9.45561,-,1,9.374499,8.901645,+
4,chr2,-,3840133,3840152,9.989558,9.828651,+,1,9.661364,9.828651,-


In [5]:
# Column explanation:
#
# "strand" is taken from our tsv; it is the same as the JASPAR strand.
# "mismatch" = 1 means that the JASPAR (tsv) strand and the strand from our scanning disagree.
# "scan_strand" is the strand from our scanning within windows with 20 bp flanks around the site.
# "nf_strand_scan" is the strand from our scanning without flanks.

In [6]:
# Load the FIMO hits for the sequences scanned without flanks.
#
# fimo.tsv ends with "#"-prefixed footer lines. The original code removed them by
# dropping hard-coded row labels 52-54, which raises KeyError or silently discards
# real hits as soon as the number of reported hits changes. Skipping comment lines
# at parse time is independent of the hit count.
# NOTE(review): assumes rows 52-54 were exactly those "#" footer lines and that no
# data field contains "#" -- confirm against the file.
fimo_no_flanks = pd.read_csv(
    "./fimo_out_no_flanks/fimo.tsv",
    sep="\t",
    comment="#",
)

In [7]:
# Load the FIMO hits for the sequences scanned with flanks.
#
# fimo.tsv ends with "#"-prefixed footer lines. The original code removed them by
# dropping hard-coded row labels 62-64, which raises KeyError or silently discards
# real hits as soon as the number of reported hits changes. Skipping comment lines
# at parse time is independent of the hit count.
# NOTE(review): assumes rows 62-64 were exactly those "#" footer lines and that no
# data field contains "#" -- confirm against the file.
fimo_with_flanks = pd.read_csv(
    "./fimo_out_with_flanks/fimo.tsv",
    sep="\t",
    comment="#",
)

In [8]:
def get_chr(id):
    """Return the chromosome name encoded in a sequence identifier.

    The identifier is treated as underscore-separated ``key:value`` fields;
    the value part of the first field is returned as a string.

    Raises:
        IndexError: if the first field contains no ``:``.
    """
    # Only the text before the first underscore matters here.
    chrom_field = id.split("_", 1)[0]
    return chrom_field.split(":")[1]


def get_start(id):
    """Return the start coordinate encoded in a sequence identifier as an int.

    The identifier is treated as underscore-separated ``key:value`` fields;
    the value part of the second field is converted with ``int``.

    Raises:
        IndexError: if the second field is missing or contains no ``:``.
        ValueError: if the value is not an integer literal.
    """
    start_field = id.split("_")[1]
    start_text = start_field.split(":")[1]
    return int(start_text)


def get_end(id):
    """Return the end coordinate encoded in a sequence identifier as an int.

    The identifier is treated as underscore-separated ``key:value`` fields;
    the value part of the third field is converted with ``int``.

    Raises:
        IndexError: if the third field is missing or contains no ``:``.
        ValueError: if the value is not an integer literal.
    """
    end_field = id.split("_")[2]
    end_text = end_field.split(":")[1]
    return int(end_text)

# FIMO no flanks

In [9]:
# Parse chromosome / start / end out of each FIMO sequence name into their own columns.
for column, parser in (("chrom", get_chr), ("start", get_start), ("end", get_end)):
    fimo_no_flanks[column] = fimo_no_flanks["sequence_name"].apply(parser)

In [10]:
# Reduce the FIMO (no flanks) table to the interval columns plus the reported strand.
sim_fimo_no_flanks = fimo_no_flanks.loc[:, ["start", "end", "chrom", "strand"]]

In [11]:
import bioframe as bf

In [12]:
# Outer-join our sites with the FIMO (no flanks) hits on genomic interval overlap.
summary_df = bf.overlap(
    df1=mismatched_strand_df,
    df2=sim_fimo_no_flanks,
    how="outer",
)

In [13]:
# Remove the mismatch flag and the FIMO-side interval columns ("_" suffix);
# only the FIMO strand is kept from the joined frame.
summary_df = summary_df.drop(
    labels=["mismatch", "start_", "end_", "chrom_"],
    axis="columns",
)

In [14]:
# Rename so each column states its source and flank size:
# "_20" = our scan with 20 bp flanks, "_0" = without flanks, "fimo_0" = FIMO without flanks.
column_renames = {
    "strand": "jaspar_strand",
    "forward_scan": "forward_scan_20",
    "reverse_scan": "reverse_scan_20",
    "scan_strand": "strand_scan_20",
    "nf_forward": "forward_scan_0",
    "nf_reverse": "reverse_scan_0",
    "nf_strand_scan": "strand_scan_0",
    "strand_": "strand_fimo_0",
}
summary_df = summary_df.rename(column_renames, axis="columns")

In [15]:
# The fasta file has 54 sequences, so FIMO didn't detect motifs in 2 sequences. Which ones?

In [16]:
# Rows whose FIMO (no flanks) strand is neither "+" nor "-", i.e. sites without a FIMO hit.
summary_df[~summary_df["strand_fimo_0"].isin(["+", "-"])]

Unnamed: 0,chrom,jaspar_strand,start,end,forward_scan_20,reverse_scan_20,strand_scan_20,forward_scan_0,reverse_scan_0,strand_scan_0,strand_fimo_0
12,chr3,-,64921722,64921741,9.862562,9.840314,+,9.339435,9.840314,-,
30,chr11,+,95673551,95673570,9.429882,9.577741,-,9.429882,6.174221,+,


# FIMO with flanks

In [17]:
# Parse chromosome / start / end out of each FIMO sequence name into their own columns.
for column, parser in (("chrom", get_chr), ("start", get_start), ("end", get_end)):
    fimo_with_flanks[column] = fimo_with_flanks["sequence_name"].apply(parser)

In [18]:
# Reduce the FIMO (with flanks) table to the interval columns plus the reported strand.
sim_fimo_with_flanks = fimo_with_flanks.loc[:, ["start", "end", "chrom", "strand"]]

In [19]:
# Outer-join the running summary with the FIMO (with flanks) hits on interval overlap.
summary_df = bf.overlap(
    df1=summary_df,
    df2=sim_fimo_with_flanks,
    how="outer",
)

In [20]:
# Remove the FIMO-side interval columns ("_" suffix); only the FIMO strand is kept.
summary_df = summary_df.drop(
    labels=["start_", "end_", "chrom_"],
    axis="columns",
)

In [21]:
# Label the joined FIMO strand as coming from the with-flanks run.
summary_df = summary_df.rename({"strand_": "strand_fimo_20"}, axis="columns")

In [22]:
# Preview the first ten rows of the combined summary.
summary_df.iloc[:10]

Unnamed: 0,chrom,jaspar_strand,start,end,forward_scan_20,reverse_scan_20,strand_scan_20,forward_scan_0,reverse_scan_0,strand_scan_0,strand_fimo_0,strand_fimo_20
0,chr8,+,16716934,16716953,9.44511,9.661276,-,9.44511,9.107311,+,+,+
1,chr8,+,18942052,18942071,9.268708,9.296381,-,9.268708,7.789968,+,+,+
2,chr8,+,18942052,18942071,9.268708,9.296381,-,9.268708,7.789968,+,+,-
3,chr8,-,71375791,71375810,9.752677,9.415325,+,8.987117,9.415325,-,-,-
4,chr8,-,71375791,71375810,9.752677,9.415325,+,8.987117,9.415325,-,-,+
5,chr15,+,74759468,74759487,9.199394,9.331507,-,9.199394,6.699575,+,+,+
6,chr15,-,88295524,88295543,9.679678,9.673682,+,9.257872,9.673682,-,-,-
7,chr15,-,100456055,100456074,9.477926,9.245367,+,7.40713,9.245367,-,-,-
8,chr3,-,11083624,11083643,9.798263,9.584266,+,9.069188,9.584266,-,-,-
9,chr3,-,54790591,54790610,9.845497,9.788777,+,8.895056,9.788777,-,-,-


In [29]:
# Side-by-side view of every strand call, ordered by start coordinate.
strand_columns = [
    "chrom",
    "start",
    "jaspar_strand",
    "strand_scan_20",
    "strand_scan_0",
    "strand_fimo_0",
    "strand_fimo_20",
]
only_strands = summary_df[strand_columns].sort_values("start")

In [30]:
# First 31 rows of the strand comparison (lowest start coordinates).
only_strands.iloc[:31]

Unnamed: 0,chrom,start,jaspar_strand,strand_scan_20,strand_scan_0,strand_fimo_0,strand_fimo_20
24,chr16,3743928,-,+,-,-,+
23,chr16,3743928,-,+,-,-,-
59,chr2,3840133,-,+,-,-,-
41,chr18,5157953,-,+,-,-,-
34,chr11,5956078,-,+,-,-,-
8,chr3,11083624,-,+,-,-,-
0,chr8,16716934,+,-,+,+,+
61,chr2,18221059,-,+,-,-,-
60,chr2,18221059,-,+,-,-,-
1,chr8,18942052,+,-,+,+,+


In [31]:
# Last 31 rows of the strand comparison (highest start coordinates).
only_strands.iloc[-31:]

Unnamed: 0,chrom,start,jaspar_strand,strand_scan_20,strand_scan_0,strand_fimo_0,strand_fimo_20
32,chr12,86999732,+,-,+,+,+
33,chr12,86999732,+,-,+,+,-
6,chr15,88295524,-,+,-,-,-
51,chr7,89903554,+,-,+,+,+
27,chr5,91413459,+,-,+,+,+
36,chr11,95673551,+,-,+,,
44,chr9,96887023,-,+,+,-,-
52,chr7,97344938,-,+,-,-,-
7,chr15,100456055,-,+,-,-,-
53,chr7,100881076,+,-,+,+,+
