### How many bidirectional CTCF positions are there in the genome? 

In [1]:
import pandas as pd
import pyranges as pr

ctcf_motifs = pr.PyRanges(pd.read_table("data/ctcf/MA0139.1.tsv", 
                                        names=["Chromosome", "Start", "End", "Name", "Score", "Score1", "Strand"]), int64=True) \
                                            .merge(strand=True, count=True)
pos_motifs = ctcf_motifs.subset(lambda df: df.Strand == "+")
rev_motifs = ctcf_motifs.subset(lambda df: df.Strand == "-")

# shows there are a roughly equal ratio of positive to negative motifs
len(pos_motifs)/(len(pos_motifs) + len(rev_motifs))

# roughly 2.3% (n=9373)
bidirectional = pos_motifs.intersect(rev_motifs, strandedness=False)

### What proportion of bidirectional CTCF sites are actually bound? 

In [27]:
from AnalysisTools.ctcf_site_tools import ctcf_chip

# About 14.6% (n=1365)
print(len(bidirectional.intersect(ctcf_chip))/len(bidirectional))
bidirectional_bound = ctcf_chip.intersect(bidirectional)

0.14563106796116504


### What percent of all motifs bind?

In [3]:
from AnalysisTools.ctcf_site_tools import ctcf_motif

# Only 4.4%
print(len(ctcf_motif.intersect(ctcf_chip))/len(ctcf_motif))

0.043776598259166286


A higher proportion of bidirectional motifs bind CTCF than single direction motifs

### What proportion of all bound CTCF motifs have asymmetrical CpG sites? 

In [4]:
from concurrent import futures
from AnalysisTools import ctcf_site_tools as ctcf

root_path = "data/duplex_data/"
files = ["cbm2/CBM_2_rep1.masked.bed", "cbm2/CBM_2_rep2.masked.bed",
         "cbm3/CBM_3_rep1.sorted.bam.bed", "cbm3/CBM_3_rep2.sorted.bam.bed"]

file_paths = [root_path + file for file in files]

print("Loading data")
with futures.ProcessPoolExecutor(len(file_paths)) as load_executor:
    all_duplex_modbeds = [load_executor.submit(ctcf.read_merge, path, False, replicate+1) for replicate, path in enumerate(file_paths)]
    all_duplex_modbeds = [modbed.result() for modbed in all_duplex_modbeds]
    all_duplex_reads = pd.concat(modbed.pattern_df for modbed in all_duplex_modbeds)

Loading data
Found 12318787 sites in data/duplex_data/cbm3/CBM_3_rep1.sorted.bam.bed
Found 12513589 sites in data/duplex_data/cbm2/CBM_2_rep1.masked.bed
Found 12632176 sites in data/duplex_data/cbm2/CBM_2_rep2.masked.bed
Found 12607984 sites in data/duplex_data/cbm3/CBM_3_rep2.sorted.bam.bed


In [17]:
all_duplex_sites = all_duplex_reads.groupby(["Chromosome", "Start", "End"], observed=True)\
    .sum(numeric_only=True)\
        .drop(columns="Replicate")\
        .query("readCount >= 5")\
        .reset_index()\
        .eval("Asymmetrical = MH + HM + CM + MC + CH + HC")

In [62]:
asymmetrical_containing = pr.PyRanges(all_duplex_sites.loc[all_duplex_sites.eval("Asymmetrical > 0")], int64=True)
asymmetrical_majority = pr.PyRanges(all_duplex_sites.loc[all_duplex_sites.eval("Asymmetrical >= (readCount/2)")], int64=True)
asymmetrical_minority = pr.PyRanges(all_duplex_sites.loc[all_duplex_sites.eval("Asymmetrical >= (readCount/3)")], int64=True)

In [22]:
len(ctcf_chip.count_overlaps(asymmetrical_containing).subset(lambda df: df.NumberOverlaps > 0))/len(ctcf_chip)

0.629910793406983

63.0% of all CTCF peaks contain at least one asymmetrical CpG site across reads. 

In [23]:
len(ctcf_chip.count_overlaps(asymmetrical_majority).subset(lambda df: df.NumberOverlaps > 0))/len(ctcf_chip)

0.061233532226620195

But only 6.1% overlap a majority asymmetrical CpG site. 

### What proportion of bound bidirectional CTCF motifs have asymmetrical CpG sites? 

In [29]:
len(bidirectional_bound.count_overlaps(asymmetrical_containing).subset(lambda df: df.NumberOverlaps > 0))/len(bidirectional_bound)

0.13772893772893774

In [101]:
bidirectional_bound.count_overlaps(asymmetrical_containing).subset(lambda df: df.NumberOverlaps > 0)

Unnamed: 0,Chromosome,Start,End,FoldDifference,pValue,qValue,NumberOverlaps
0,chr1,6929426,6929442,12.43020,32.8458,29.72820,1
1,chr1,7467272,7467289,19.57860,79.1246,75.49170,1
2,chr1,16063959,16063975,13.57840,66.2908,62.81250,1
3,chr1,19126743,19126759,7.35032,18.3673,15.43370,2
4,chr1,40926953,40926970,18.44570,49.4407,46.14660,1
...,...,...,...,...,...,...,...
183,chrY,90772968,90772985,3.56775,12.0592,9.23583,2
184,chrY,90772701,90772718,11.30250,65.2424,61.77560,2
185,chrY,90772968,90772985,11.30250,65.2424,61.77560,2
186,chrY,90775856,90775873,10.47740,54.5542,51.20800,1


Only 13.8% of bidirectional CTCF sites overlap a CpG site that contains asymmetrical reads

In [30]:
len(bidirectional_bound.count_overlaps(asymmetrical_majority).subset(lambda df: df.NumberOverlaps > 0))/len(bidirectional_bound)

0.003663003663003663

In [38]:
bidirectional_bound.count_overlaps(asymmetrical_majority).subset(lambda df: df.NumberOverlaps > 0)

Unnamed: 0,Chromosome,Start,End,FoldDifference,pValue,qValue,NumberOverlaps
0,chr2,129380134,129380151,5.24912,12.6201,9.78457,1
1,chr8,122774309,122774326,6.08308,11.6627,8.8466,1
2,chr12,90696550,90696567,19.34,57.5721,54.1911,1
3,chr15,86289932,86289938,32.567,159.763,154.445,1
4,chr18,24837242,24837259,4.14462,7.99138,5.26802,1


Very few 0.3% of bidirectional CTCF sites contain asymmetrical majority positions. 

In [64]:
len(bidirectional_bound.count_overlaps(asymmetrical_minority).subset(lambda df: df.NumberOverlaps > 0))/len(bidirectional_bound)

0.00805860805860806

### How many bidirectional bound CTCF sites overlap majority C:C positions? 

In [44]:
cc_majority = pr.PyRanges(all_duplex_sites.loc[all_duplex_sites.eval("CC >= (readCount/2)")], int64=True)

len(bidirectional_bound.count_overlaps(cc_majority).subset(lambda df: df.NumberOverlaps > 0))/len(bidirectional_bound)

0.2703296703296703

27% (n=369) are majority CC

### How many bidirectional bound CTCF sites overlap majority 5mC:5mC positions? 

In [47]:
mm_majority = pr.PyRanges(all_duplex_sites.loc[all_duplex_sites.eval("MM >= (readCount/2)")], int64=True)

len(bidirectional_bound.count_overlaps(mm_majority).subset(lambda df: df.NumberOverlaps > 0))/len(bidirectional_bound)

0.012454212454212455

### What are bidirectional motifs made of? 

In [96]:
all_bidirectional_sites = pr.PyRanges(all_duplex_sites, int64=True).intersect(bidirectional_bound).as_df()

In [100]:
all_bidirectional_sites["Asymmetrical"].sum()/all_bidirectional_sites["readCount"].sum()

0.044693237796686075

In [None]:
for col in ["CC",	"CH",	"CM",	"HC",	"HH",	"HM",	"MC",	"MH",	"MM"]:
    all_bidirectional_sites[col] = all_bidirectional_sites.eval(f"({col}/readCount)*100")
    
all_bidirectional_sites[["CC",	"CH",	"CM",	"HC",	"HH",	"HM",	"MC",	"MH",	"MM"]].mean()

In [61]:
all_bound_sites = pr.PyRanges(all_duplex_sites, int64=True).intersect(ctcf_chip).as_df()

for col in ["CC",	"CH",	"CM",	"HC",	"HH",	"HM",	"MC",	"MH",	"MM"]:
    all_bound_sites[col] = all_bound_sites.eval(f"({col}/readCount)*100")

all_bound_sites[["CC",	"CH",	"CM",	"HC",	"HH",	"HM",	"MC",	"MH",	"MM"]].mean()

Pattern
CC    84.936788
CH     1.033573
CM     1.039718
HC     1.031736
HH     1.640138
HM     1.832340
MC     1.019602
MH     1.808567
MM     5.657538
dtype: float64

Basically identical to the average for CTCF bound motifs. Slightly more C:C

### Of the bound bidirectional motifs where asymmetrical reads are at least present, what genes are associated? 

In [138]:
genes = pr.PyRanges(pr.read_bed("feature_references/genes/mm39_GENCODEVM33_bed8.bed").as_df(), int64=True)

bidirectional_intersecting_genes = pd.Series(genes.join(asymmetrical_containing.intersect(bidirectional_bound))\
    .as_df()["Name"].unique())

In [139]:
bidirectional_intersecting_genes.to_csv("data/ctcf/bidirectional_asymmetrical_genes.txt", index=False, header=False)

In [140]:
bidirectional_intersecting_genes

0         Cd247
1          St18
2          Dgkd
3       Gm33667
4         Gpa33
         ...   
97       Bcap31
98     Slc25a53
99      Gm15726
100     Gm15247
101     Gm21860
Length: 102, dtype: object

Of the bound bidirectional motifs where CC is the majority state, what genes are associated? 

In [120]:
cc_intersecting_genes = pd.Series(genes.join(cc_majority.intersect(bidirectional_bound))\
    .as_df()["Name"].unique())\
    .to_csv("data/ctcf/bidirectional_cc_genes.txt", index=False, header=False)