# Compiling the BAF/NuRD targets based on SMARCA4 and HDAC1 K562 ChIP-seq data from ENCODE:

# INPUT: 

    * NONE, will be downloaded.

# OUTPUT: 

    * targets/

# Environment Setup

In [None]:
# Working directory; changing into it makes the relative paths below resolve from there.
work_dir = './'
import os
os.chdir( work_dir )

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

# Directory holding the bedtools executable used by the `bedtools intersect` cells.
bedtools_dir = 'third_party_tools/bedtools2/bin/'

# data_dir: where downloads and all derived target files are written.
data_dir = 'targets/'
# data_dir2: location of the Ensembl GTF read in the "Determining gene TSS" section.
data_dir2 = '231009_index/'
# data_dir3 / data_dir4: not referenced by any cell visible in this notebook.
data_dir3 = 'edit_capture/data/DE_out_parse_corrected2/superbseq/'
data_dir4 = 'edit_capture/data/DE_out_parse_corrected2/sample/'
# Outputs go to the same directory as the downloaded data.
out_dir = data_dir

In [None]:
!mkdir {out_plots}

In [None]:
!mkdir {data_dir}

# Downloading the data

In [None]:
# SMARCA4 K562 ChIP-seq peaks.
!wget https://www.encodeproject.org/files/ENCFF267OGF/@@download/ENCFF267OGF.bed.gz -O {data_dir}smarca4_k562_peaks.bed.gz
!echo y | gzip -d {data_dir}smarca4_k562_peaks.bed.gz

In [None]:
!head -3 {data_dir}smarca4_k562_peaks.bed

In [None]:
!cut -f1,2,3 {data_dir}smarca4_k562_peaks.bed | sort -k1,1 -k2,2n | uniq > {data_dir}smarca4_k562_peaks.regions.bed

In [None]:
!head -3 {data_dir}smarca4_k562_peaks.regions.bed

In [None]:
!wget https://www.encodeproject.org/files/ENCFF432KJA/@@download/ENCFF432KJA.bed.gz -O {data_dir}hdac1_k562_peaks.bed.gz
!echo y | gzip -d {data_dir}hdac1_k562_peaks.bed.gz

In [None]:
!cut -f1,2,3 {data_dir}hdac1_k562_peaks.bed | sort -k1,1 -k2,2n | uniq > {data_dir}hdac1_k562_peaks.regions.bed

In [None]:
!head -3 {data_dir}hdac1_k562_peaks.regions.bed

In [None]:
!wget https://www.encodeproject.org/files/ENCFF652ZZF/@@download/ENCFF652ZZF.bed.gz -O {data_dir}hdac2_k562_peaks.bed.gz
!echo y | gzip -d {data_dir}hdac2_k562_peaks.bed.gz

In [None]:
!cut -f1,2,3 {data_dir}hdac2_k562_peaks.bed | sort -k1,1 -k2,2n | uniq > {data_dir}hdac2_k562_peaks.regions.bed

In [None]:
!head -3 {data_dir}hdac2_k562_peaks.regions.bed

***Also ATAC-seq to narrow to accessible genes!***

In [None]:
!wget https://www.encodeproject.org/files/ENCFF333TAT/@@download/ENCFF333TAT.bed.gz -O {data_dir}atac_k562_peaks.bed.gz
!echo y | gzip -d {data_dir}atac_k562_peaks.bed.gz

In [None]:
!cut -f1,2,3 {data_dir}atac_k562_peaks.bed | sort -k1,1 -k2,2n | uniq > {data_dir}atac_k562_peaks.regions.bed

In [None]:
!head -3 {data_dir}atac_k562_peaks.regions.bed

In [None]:
!wget https://www.encodeproject.org/files/ENCFF985QBS/@@download/ENCFF985QBS.bed.gz -O {data_dir}chd4_k562_peaks.bed.gz
!echo y | gzip -d {data_dir}chd4_k562_peaks.bed.gz

In [None]:
!cut -f1,2,3 {data_dir}chd4_k562_peaks.bed | sort -k1,1 -k2,2n | uniq > {data_dir}chd4_k562_peaks.regions.bed

In [None]:
!head -3 {data_dir}chd4_k562_peaks.regions.bed

***NRF1 targets!!!***

In [None]:
!wget https://www.encodeproject.org/files/ENCFF259YUE/@@download/ENCFF259YUE.bed.gz -O {data_dir}nrf1_k562_peaks.bed.gz
!echo y | gzip -d {data_dir}nrf1_k562_peaks.bed.gz

In [None]:
!cut -f1,2,3 {data_dir}nrf1_k562_peaks.bed | sort -k1,1 -k2,2n | uniq > {data_dir}nrf1_k562_peaks.regions.bed

In [None]:
!head -3 {data_dir}nrf1_k562_peaks.regions.bed

# Determining gene TSS

In [None]:
!zcat {data_dir2}Homo_sapiens.GRCh38.110.gtf.gz | grep "	gene	" | cut -f1,4,5,7,9 > {data_dir}hg38_genes.txt
!head -3 {data_dir}hg38_genes.txt

In [None]:
#### Parsing to get transcription start sites!
# Reads the gene table written by the previous cell (tab-separated columns:
# chromosome, start, end, strand, GTF attribute string) and writes one BED line
# per gene: a region of `window` bp either side of the TSS, named by gene_name
# (or by gene_id when the gene has no name).
window = 200


def _gtf_attribute(attributes, key):
    """Return the value of attribute `key` in a GTF attribute string, or None."""
    for field in attributes.split(';'):
        name, _, value = field.strip().partition(' ')
        if name == key:
            return value.strip().strip('"')
    return None


def _tss_bed_line(line, window):
    """Turn one gene-table line into a BED line spanning TSS +/- `window`.

    Raises ValueError when the line carries neither gene_name nor gene_id.
    """
    info_split = line.rstrip('\n').split('\t')

    #### Positional information.
    # Ensembl names the mitochondrial chromosome "MT"; a plain "chr" prefix
    # would give "chrMT", which cannot match UCSC-style "chrM".
    # NOTE(review): assumes the peak files use UCSC-style names - confirm.
    chr_ = 'chrM' if info_split[0] == 'MT' else f'chr{info_split[0]}'

    # The TSS is the gene start on the + strand and the gene end otherwise.
    tss_pos = int(info_split[1]) if info_split[3] == '+' else int(info_split[2])

    # Clamp at 0 so genes close to the start of a sequence never produce a
    # negative BED start coordinate.
    region_start, region_end = max(0, tss_pos - window), tss_pos + window

    #### Gene information
    gene_name = _gtf_attribute(info_split[-1], 'gene_name')
    if gene_name is None:
        gene_name = _gtf_attribute(info_split[-1], 'gene_id')
    if gene_name is None:
        raise ValueError(f'No gene_name or gene_id in line: {line!r}')

    return f'{chr_}\t{region_start}\t{region_end}\t{gene_name}\n'


#### Writing out the information.
# `with` closes both files even if a line fails to parse.
with open(f'{data_dir}hg38_genes.txt', 'r') as gene_info, \
        open(f'{data_dir}hg38_tss.bed', 'w') as tss_bed:
    for line in gene_info:
        if not line.strip():  # tolerate blank lines
            continue
        tss_bed.write(_tss_bed_line(line, window))

In [None]:
!cat {data_dir}hg38_tss.bed | sort -k1,1 -k2,2n | uniq > {data_dir}hg38_tss.sorted.bed

In [None]:
!head {data_dir}hg38_tss.sorted.bed

In [None]:
!wc -l {data_dir}hg38_tss.sorted.bed

***Looking good!***

# Intersecting to get the genes!

***First let's get genes with an open promoter***

BAF complex: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4968166/

In [None]:
!{bedtools_dir}bedtools intersect -wa -a {data_dir}hg38_tss.sorted.bed -b {data_dir}atac_k562_peaks.regions.bed | sort -k1,1 -k2,2n | uniq > {data_dir}atac_k562_genes.bed
!head {data_dir}atac_k562_genes.bed

In [None]:
!wc -l {data_dir}atac_k562_genes.bed

***Now getting smarca4 marked genes with open promoters***

In [None]:
!{bedtools_dir}bedtools intersect -wa -a {data_dir}atac_k562_genes.bed -b {data_dir}smarca4_k562_peaks.regions.bed | sort -k1,1 -k2,2n | uniq > {data_dir}smarca4_k562_genes.bed
!head {data_dir}smarca4_k562_genes.bed

In [None]:
!cat {data_dir}smarca4_k562_genes.bed | cut -f4 | sort | uniq > {data_dir}smarca4_k562_genes.txt
!wc -l {data_dir}smarca4_k562_genes.txt

Seems like a reasonable number!

***Checking these in the ENCODE genome browser verified that they do have SMARCA4 K562 ChIP-seq peaks at their promoters!***

***Much more reasonable number!!!***

***Try with chd4 only***

In [None]:
!{bedtools_dir}bedtools intersect -wa -a {data_dir}hg38_tss.sorted.bed -b {data_dir}chd4_k562_peaks.regions.bed | sort -k1,1 -k2,2n | uniq > {data_dir}chd4_k562_genes.bed
!head {data_dir}chd4_k562_genes.bed

In [None]:
!cat {data_dir}chd4_k562_genes.bed | cut -f4 | sort | uniq > {data_dir}chd4_k562_genes.txt
!wc -l {data_dir}chd4_k562_genes.txt

In [None]:
# Same command as the open-promoter cell earlier in this section: it rebuilds
# atac_k562_genes.bed from the same two input files.
!{bedtools_dir}bedtools intersect -wa -a {data_dir}hg38_tss.sorted.bed -b {data_dir}atac_k562_peaks.regions.bed | sort -k1,1 -k2,2n | uniq > {data_dir}atac_k562_genes.bed
!head {data_dir}atac_k562_genes.bed

# Candidate targets based on enhancer - gene associations!!

Speaking with Alex, he said the SMARCA4 ChIP-seq peaks that intersect with promoters would likely enrich for p-BAF, which is one of the 3 BAF complexes and is the more promoter-associated one. SMARCA4 is found in all BAF complexes. However, ARID1A is found in the more enhancer-associated BAF complexes, so it would be worth finding gene targets based on enhancers as well. So I will also try to get a set of targets based on enhancers!

Reading about this, this resource appears very useful, and has cell-line / cell type specific targets:

    https://academic.oup.com/nar/article/48/D1/D58/5628925

Specifically, can get the K562 enhancer - gene associations from this file (NOTE checked the coordinates in the genome browser and look to correspond to hg19!):

    http://www.enhanceratlas.org/data/AllEPs/hs/K562_EP.txt
    

In [None]:
!wget http://www.enhanceratlas.org/data/AllEPs/hs/K562_EP.txt -O {data_dir}K562_EP.hg19.txt

In [None]:
!head -3 {data_dir}K562_EP.hg19.txt

In [None]:
#### Let's determine a cutoff for a good association first!
enhancer_gene = pd.read_csv(f'{data_dir}K562_EP.hg19.txt', sep='\t', header=None)
enhancer_gene.head(3)

In [None]:
scores = enhancer_gene[1].values
scores

In [None]:
plt.hist(scores, bins=100)
plt.show()

In [None]:
score_cutoff = 2
sum(scores >= score_cutoff)

In [None]:
#### Let's load it and reformat it...
# Each input line is "<enhancer>$<gene>$...<TAB><score>", where <enhancer>
# looks like "chr:start-end_suffix". Links scoring at least `score_cutoff`
# are written out as BED lines: chr, start, end, gene.
# `with` closes both files even if a line fails to parse.
with open(f'{data_dir}K562_EP.hg19.txt', 'r') as enh_to_gene_file, \
        open(f'{data_dir}K562_EP.hg19.bed', 'w') as enh_to_gene_bed_file:
    for line in enh_to_gene_file:
        if not line.strip():  # tolerate blank lines
            continue

        fields = line.split('\t')  # split once, reuse below
        if float(fields[1]) < score_cutoff:  # Only keep confident links.
            continue

        info = fields[0].split('$')
        enh, gene = info[0], info[1]

        enh_chr, enh_span = enh.split(':')[:2]
        enh_start, enh_end = enh_span.split('-')[:2]
        enh_end = enh_end.split('_')[0]  # drop the "_suffix" after the end coordinate

        enh_to_gene_bed_file.write(f'{enh_chr}\t{enh_start}\t{enh_end}\t{gene}\n')

In [None]:
# Preview the filtered hg19 enhancer - gene BED file.
!head {data_dir}K562_EP.hg19.bed

In [None]:
# File size of the BED file.
!ls -s -h {data_dir}K562_EP.hg19.bed

In [None]:
#### Used liftOver to convert to hg38 coords: https://genome.ucsc.edu/cgi-bin/hgLiftOver
# Manual step: no cell in this notebook creates K562_EP.hg19.lifted-to-hg38.bed,
# so it has to be placed in data_dir before running the cells below.
!head {data_dir}K562_EP.hg19.lifted-to-hg38.bed

In [None]:
#### Removing the extra column added by liftOver
# Keeps columns 1-4 (chr, start, end, gene).
!cut -f1-4 {data_dir}K562_EP.hg19.lifted-to-hg38.bed > {data_dir}K562_EP.hg19.lifted-to-hg38.sub.bed

In [None]:
!head -3 {data_dir}K562_EP.hg19.lifted-to-hg38.sub.bed

In [None]:
!{bedtools_dir}bedtools intersect -wa -a {data_dir}K562_EP.hg19.lifted-to-hg38.sub.bed -b {data_dir}atac_k562_peaks.regions.bed | sort -k1,1 -k2,2n | uniq > {data_dir}K562_EP.hg19.lifted-to-hg38.sub.atac.bed
!head -3 {data_dir}K562_EP.hg19.lifted-to-hg38.sub.atac.bed


In [None]:
!wc -l {data_dir}K562_EP.hg19.lifted-to-hg38.sub.atac.bed

In [None]:
!{bedtools_dir}bedtools intersect -wa -a {data_dir}K562_EP.hg19.lifted-to-hg38.sub.atac.bed -b {data_dir}smarca4_k562_peaks.regions.bed | sort -k1,1 -k2,2n | uniq > {data_dir}smarca4_k562_genes.enhancer.bed
!head {data_dir}smarca4_k562_genes.enhancer.bed

In [None]:
!wc -l {data_dir}smarca4_k562_genes.enhancer.bed

In [None]:
!cat {data_dir}smarca4_k562_genes.enhancer.bed | cut -f4 | sort | uniq > {data_dir}smarca4_k562_genes.enhancer.genes.txt 
!wc -l {data_dir}smarca4_k562_genes.enhancer.genes.txt

In [None]:
!head {data_dir}smarca4_k562_genes.enhancer.genes.txt

In [None]:
### Checking overlap with the promoter genes...
def _read_gene_set(path):
    """Return the set of lines in `path` (newline stripped), closing the file afterwards."""
    with open(path, 'r') as handle:
        return {line.strip('\n') for line in handle}


smarca4_prom_genes = _read_gene_set(f'{data_dir}smarca4_k562_genes.txt')
smarca4_enh_genes = _read_gene_set(f'{data_dir}smarca4_k562_genes.enhancer.genes.txt')

# Sizes of the promoter-based and enhancer-based gene sets.
len(smarca4_prom_genes), len(smarca4_enh_genes)

In [None]:
from matplotlib_venn import venn2

# Two-set Venn diagram: promoter-based vs enhancer-based SMARCA4 gene sets.
plt.figure(figsize=(8, 8))
venn = venn2(
    subsets=[smarca4_prom_genes, smarca4_enh_genes],
    set_labels=('Smarca4 promoter genes', 'Smarca4 enhancer genes'),
)

# Title and render.
plt.title("Overlap between SMARCA4 targets by promoter versus by enhancer")
plt.show()

***Some level of overlap, but they do look like largely unique sets of genes! Good to know***

# NRF1 targets

In [None]:
!{bedtools_dir}bedtools intersect -wa -a {data_dir}hg38_tss.sorted.bed -b {data_dir}nrf1_k562_peaks.regions.bed | sort -k1,1 -k2,2n | uniq > {data_dir}nrf1_k562_peaks.genes.bed
!head {data_dir}nrf1_k562_peaks.genes.bed


In [None]:
!cut -f4 {data_dir}nrf1_k562_peaks.genes.bed | sort | uniq > {data_dir}nrf1_k562_peaks.genes.txt

In [None]:
!head {data_dir}nrf1_k562_peaks.genes.txt

***Trying to subset to open chromatin***

In [None]:
# Keep the NRF1-bound TSS windows that overlap an entry of atac_k562_genes.bed.
# NOTE(review): atac_k562_genes.bed itself contains TSS windows (those overlapping
# an ATAC peak), so this is a window-vs-window overlap: an NRF1 window is kept when
# it overlaps ANY open window, including a neighbouring gene's. Confirm this is
# intended, versus intersecting with atac_k562_peaks.regions.bed directly.
!{bedtools_dir}bedtools intersect -wa -a {data_dir}nrf1_k562_peaks.genes.bed -b {data_dir}atac_k562_genes.bed | sort -k1,1 -k2,2n | uniq > {data_dir}nrf1_k562_peaks.genes.open.bed
!head {data_dir}nrf1_k562_peaks.genes.open.bed


In [None]:
# Column 4 is the gene name; write the unique names, one per line.
!cut -f4 {data_dir}nrf1_k562_peaks.genes.open.bed | sort | uniq > {data_dir}nrf1_k562_peaks.genes.open.txt

In [None]:
!head {data_dir}nrf1_k562_peaks.genes.open.txt