In [1]:
#### IMPORT LIBS AND FXS ####
import os, sys, glob, subprocess
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib import ticker
import re

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
#### ALIGN AND CALL PEAKS ####
# This cell only assembles the lines of a SLURM batch script (`bash_lines`);
# the tools named in those lines run when the script is submitted, not here.

## PARAMS
# DATA
# R1 FASTQs are looked up exactly one directory below `data_dir`.
data_dir = "/groups/doudna/projects/mtrinidad_projects/ChIPSeq_HSKW/KMW_060324_ChIP/FASTQS/"
r1_suffix = "_L001_R1_001.fastq.gz"
r2_suffix = "_L001_R2_001.fastq.gz"
# Sorted so the generated script is identical from run to run (glob order is
# arbitrary). `recursive=True` was dropped: it only matters for "**" patterns.
r1s = sorted(glob.glob(f"{data_dir}*/*{r1_suffix}"))
# Unique sample names = FASTQ file name up to the "_S<digits>_" token.
bases = sorted({re.split(r"_S[\d]*_", x.split("/")[-1])[0] for x in r1s})

# ALIGNMENTS
bam_suffix = "_sorted.bam"
bam_dir = "/groups/doudna/projects/mtrinidad_projects/ChIPSeq_HSKW/KMW_060324_ChIP/Alignments/"
# makedirs(exist_ok=True): no exists()/mkdir() race, and missing parents are created too
os.makedirs(bam_dir, exist_ok=True)

# PEAK-CALLING
peak_dir = "/groups/doudna/projects/mtrinidad_projects/ChIPSeq_HSKW/KMW_060324_ChIP/Peaks/"
os.makedirs(peak_dir, exist_ok=True)
# Full path of the macs2 executable (despite the `_dir` name)
macs2_dir = "/home/mtrinidad/miniconda3/envs/ChIP/bin/macs2"

# THREADS
# Kept as a literal shell variable: it is expanded by bash inside the SLURM job
threads = "$SLURM_CPUS_ON_NODE"

# REFERENCES
# Passed to `bowtie2 -x` in the alignment step
ref_dir = "/groups/doudna/team_resources/shared_databases/human_reference_genomes/hg38/GENCODE_GRCh38p14_v44/GRCh38_PRI_bt"

# BACKGROUND SUBTRACTION BAMS: exploratory peak calling setup (Publication uses pseudoreplicates to explore enrichment. See PseudoRep_and_RPKM.ipynb)
# Keys are sample names as produced by the "_S<digits>_" split; values are lists
# of control BAM paths handed to `macs2 callpeak -c`.
# NOTE(review): these control paths carry no "_S<n>" token, whereas the alignment
# loop names its BAMs "<fastq name up to the lane suffix>" + bam_suffix - confirm
# that these control files actually exist under bam_dir when the job runs.
background_sample_dict = {
                          'ChIP__KMW_060324_dSpRYapo_UCLA':[f"{bam_dir}ChIP__KMW_060324_dSpRYapo_input_UCLA{bam_suffix}"],
                          'ChIP__KMW_060324_dSpRY_UCLA':[f"{bam_dir}ChIP__KMW_060324_dSpRYapo_input_UCLA{bam_suffix}"],
                          'ChIP__KMW_060324_dWTapo_UCLA':[f"{bam_dir}ChIP__KMW_060324_dWTapo_input_UCLA{bam_suffix}"],
                          'ChIP__KMW_060324_dWT_UCLA':[f"{bam_dir}ChIP__KMW_060324_dWTapo_input_UCLA{bam_suffix}"]
                        }

# SBATCH HEADER AND BASH SCRIPT
# `bash_lines` accumulates every line of the script; it is written to
# `bash_script` at the end of the cell.
bash_script = "Alignments_and_Peak_Calling.sh"
bash_lines = ["#!/bin/bash", "#SBATCH -p standard", "#SBATCH --job-name ALIGN_MACS2", 
               "#SBATCH -o %j.out", "#SBATCH -e %j.err", "## ACTIVATE CONDA", 
              'eval "$(conda shell.bash hook)"', 'conda activate ChIP']


## ALIGN WITH BOWTIE
# For every R1 FASTQ, append to the script: a bowtie2 | samtools view | samtools sort
# pipeline that writes a sorted BAM, followed by a samtools index call.
# `bams` collects the sorted-BAM paths for the later steps.
bams = []
bash_lines.append("#### ALIGN WITH BOWTIE ####")
print("#### ALIGN WITH BOWTIE #### ")
for r1 in r1s:
    fastq_name = os.path.basename(r1)
    base = re.split(r"_S[\d]*_", fastq_name)[0]
    # re.escape: the suffix is literal text, so its "." must not act as a regex wildcard
    sample_id = re.split(re.escape(r1_suffix), fastq_name)[0]
    # Swap only the trailing suffix; str.replace() would also rewrite any earlier
    # occurrence of the suffix text elsewhere in the path.
    r2 = r1[:-len(r1_suffix)] + r2_suffix if r1.endswith(r1_suffix) else r1.replace(r1_suffix, r2_suffix)
    sorted_bam = f"{bam_dir}{sample_id}{bam_suffix}"
    bams.append(sorted_bam)
    bash_lines.append(f"## {sample_id}")
    bowtie_align = (f"bowtie2 -x {ref_dir} -1 {r1} -2 {r2} -p {threads} "
                    f"--no-discordant --local --no-mixed --maxins 1000 | " # Minimize large gaps (confuses MACS2)
                    f"samtools view -bSh - | samtools sort -@ {threads} -o {sorted_bam}")
    bash_lines.append(bowtie_align)
    index_str = f"samtools index {sorted_bam} -@ {threads}"
    bash_lines.append(index_str)

## CALL PEAKS
# Append one `macs2 callpeak` command per non-input BAM, using the control BAM(s)
# listed for that sample in `background_sample_dict`.
bash_lines.append(f"#### CALL PEAKS ####")
p_val = 0.01
print(f"#### CALL PEAKS ####", bams)
for bam in bams:
    bam_name = os.path.basename(bam)
    # Input libraries serve as background only. Test the file name rather than
    # the whole path so a directory name can never trigger the skip.
    if "input_" in bam_name:
        continue
    base = re.split(r"_S[\d]*_", bam_name)[0]
    # re.escape: bam_suffix is literal text, its "." must not act as a regex wildcard
    sample_id = re.split(re.escape(bam_suffix), bam_name)[0]
    if base not in background_sample_dict:
        # Same exception type as the bare dict lookup, but says what to fix
        raise KeyError(f"No control BAM configured for sample '{base}' ({bam}); "
                       f"add an entry to background_sample_dict")
    input_ctrl_bam = background_sample_dict[base]
    for ctrl_bam in input_ctrl_bam:
        if ctrl_bam not in bams:
            # Not fatal: the file may have been prepared outside this script
            print(f"WARNING: control BAM {ctrl_bam} is not produced by the alignment step; "
                  f"it must already exist when the job runs")
    bash_lines.append(f"## {sample_id}")
    print(f"## {sample_id}")
    
    # stderr of each run is redirected to <sample_id>_macs2.log inside peak_dir.
    # NOTE(review): reads are aligned as pairs above while MACS2 is given "-f BAM";
    # MACS2 also offers "-f BAMPE" for paired-end input - confirm this is intended.
    macs2_peaks = (f"{macs2_dir} callpeak -t {bam} -c {' '.join(input_ctrl_bam)} "
                   f"-g hs -n {sample_id} -f BAM -B --SPMR -p {p_val} --outdir {peak_dir}  "
                   f"2> {peak_dir}{sample_id}_macs2.log")
    bash_lines.append(macs2_peaks)

## MAKE BIGWIGS
# Append two bamCoverage commands per BAM (CPM-normalised without duplicates, and
# raw with duplicates). The CPM/deduplicated bigWig paths are collected in
# `dedup_bws` because the correlation step below consumes them.
bash_lines.append(f"#### MAKE BIGWIGS ####")
# Single source of truth for the CPM/deduplicated bigWig suffix: used to name the
# files here and to strip it again when building correlation labels.
bw_suffix = "_CPM_Norm_DEDUP.bigWig"
dedup_bws = []

for bam in bams:
    bam_name = os.path.basename(bam)
    base = re.split(r"_S[\d]*_", bam_name)[0]
    # re.escape: bam_suffix is literal text, its "." must not act as a regex wildcard
    sample_id = re.split(re.escape(bam_suffix), bam_name)[0]
    bash_lines.append(f"## {sample_id}")
    
    # CPM DEDUPLICATED
    dedup_bw = f"{bam_dir}{sample_id}{bw_suffix}"
    bw_dedup = (f"bamCoverage -b {bam} -o {dedup_bw} --normalizeUsing CPM "
                f"--numberOfProcessors {threads} --ignoreDuplicates --binSize 1")
    bash_lines.append(bw_dedup)
    # Previously never recorded, which left multiBigwigSummary with no input files
    dedup_bws.append(dedup_bw)
    
    # RAW WITH DUPLICATES
    dup_raw_bw = f"{bam_dir}{sample_id}_RAW_DUP.bigWig"
    bw_dup = (f"bamCoverage -b {bam} -o {dup_raw_bw} "
                    f"--numberOfProcessors {threads} --binSize 1")
    bash_lines.append(bw_dup)
    

## CORRELATION
bash_lines.append(f"#### CORRELATION ####")

# GET NPZ
npz = f"{bam_dir}MultiBamSummary_vDeDupBWs.npz"
counts = f"{bam_dir}MultiBamSummary_Counts_vDeDupBWs.npz"  # defined but not used by the commands below
# One label per bigWig: the file name with the bigWig suffix removed
labels = [os.path.basename(x).replace(bw_suffix, "") for x in dedup_bws]

make_npz = (f'multiBigwigSummary bins -b {" ".join(dedup_bws)} --numberOfProcessors {threads} '
            f'--labels {" ".join(labels)} --outFileName {npz}')

# PLOT
# The PNG and the matrix are given without a directory, i.e. relative to the
# working directory of the job.
get_corr = (f"plotCorrelation -in {npz} "
            "--corMethod pearson "
            "--skipZeros "
            "--plotTitle 'Pearson Correlation of BAM Coverage' "
            "--whatToPlot heatmap "
            "-o DeDupBWs_PearsonCorr_Scores.png "
            "--outFileCorMatrix DeDupBWs_PearsonCorr_Scores.tab "
            "--plotNumbers "
            "--colorMap RdYlBu " 
            "--removeOutliers " 
        )

if dedup_bws:
    bash_lines.append(make_npz)
    bash_lines.append(get_corr)
else:
    # No BAMs were found, so a multiBigwigSummary call would have an empty -b list
    print("WARNING: no bigWigs to correlate; correlation commands not added to the script")

# WRITE FILE
# write() rather than writelines(): the joined script is one string, and
# writelines() on a string iterates it character by character. The trailing
# newline terminates the last command line of the script.
with open(bash_script, "w") as f:
    f.write("\n".join(bash_lines) + "\n")


#os.system(f"sbatch {bash_script}") # JID:471871