# HiCExplorer pipeline

**Versions**  
bwa: 0.7.13  
samtools: 1.3.1  
java: 1.8.0_211  
HiCExplorer: 2.2  
Trimmomatic: 0.39  

In [None]:
#!/bin/bash

#SBATCH --partition=main    # Partition (job queue)
#SBATCH --job-name=HiC         # Assign an 8-character name to your job, no spaces
#SBATCH --nodes=1                # Number of compute nodes
#SBATCH --ntasks=1               # Processes (usually = cores) on each node
#SBATCH --cpus-per-task=28       # Threads per process (or per core)
#SBATCH --export=ALL             # Export your current environment settings to the job environment
#SBATCH --time=12:00:00
#SBATCH --mem=128000
#SBATCH --output=slurm-%A_%a.out
#SBATCH --array=1-10

# Hi-C processing for one sample per array task:
#   trim reads -> split reads (perl helper) -> align each mate with bwa mem
#   -> build the contact matrix -> diagnostic plot and matrix correction.
# Input reads are expected as <task id>.R1.fastq.gz / <task id>.R2.fastq.gz
# inside $DIR; every output file is prefixed with the array task id.

DIR=/dup_*
REF=/references

BIN=5000

# Sample prefix; abort immediately when not started as an array task.
ID=${SLURM_ARRAY_TASK_ID:?this script must be submitted as a Slurm array job}
# Follow the CPU allocation instead of repeating the number by hand.
THREADS=${SLURM_CPUS_PER_TASK:-28}

module load bwa
module load samtools
module load java

# Stop at the first failing command (including failures inside a pipeline) so
# that later steps never run on missing or truncated intermediate files.
set -eo pipefail

# Build the reference indexes only when they are missing, so array tasks do
# not keep rewriting files that other tasks are reading.
# NOTE(review): on a first run, tasks that start at the same moment can still
# race here -- build the indexes once before submitting the array.
[ -s "$REF.fai" ] || samtools faidx "$REF"
[ -s "$REF.sa" ] || bwa index "$REF"

# DIR holds a glob pattern and is left unquoted so the shell expands it.
# NOTE(review): the pattern has to match exactly one directory.
cd $DIR
cp /pkg/Trimmomatic-0.39/adapters/TruSeq3-PE.fa .

# TRIM READS: will produce files ending in _1P.fastq.gz, _2P.fastq.gz, _1U.fastq.gz, and _2U.fastq.gz
# Options go before the input files, as in Trimmomatic's usage synopsis;
# options given after the inputs risk being read as positional output names.
java -jar /pkg/Trimmomatic-0.39/trimmomatic-0.39.jar PE \
    -threads "$THREADS" -baseout "${ID}.fastq.gz" \
    "${ID}.R1.fastq.gz" "${ID}.R2.fastq.gz" \
    ILLUMINACLIP:TruSeq3-PE.fa:2:30:10 LEADING:10 TRAILING:10 SLIDINGWINDOW:4:15 MINLEN:36

# Perl script takes the Trimmomatic output; the alignment step below reads
# the split_<id>.P1.fa and split_<id>.P2.fa files expected from it.
perl /combined_cut_nt.pl "${ID}_1P.fastq.gz" "${ID}_2P.fastq.gz" "${ID}_1U.fastq.gz" "${ID}_2U.fastq.gz" "$ID"

# ALIGN split_<id>.P1.fa and split_<id>.P2.fa WITH BWA
# Each mate is aligned on its own; bwa's stderr goes to a per-mate log.
for mate in 1 2; do
    bwa mem -t "$THREADS" -A1 -B4 -E50 -L0 "$REF" "split_${ID}.P${mate}.fa" 2>"mate_${ID}_R${mate}.log" \
        | samtools view -Shb - > "${ID}.R${mate}.bam"
done

# Delete split files (only reached when both alignments succeeded)
rm "split_${ID}.P"*.fa

# BUILD MATRIX
hicBuildMatrix --samFiles "${ID}.R1.bam" "${ID}.R2.bam" --binSize "$BIN" \
    --outBam "${ID}_hicMat.bam" --minMappingQuality 20 \
    -o "${ID}_${BIN}bp.h5" --QCfolder "${ID}_hicQC" > "${ID}.buildmatrix.log" 2>&1

# CORRECT MATRIX

hicCorrectMatrix diagnostic_plot -m "${ID}_${BIN}bp.h5" --plotName "${ID}.diagnostic_plot.png" > "${ID}.plot.log" 2>&1
hicCorrectMatrix correct --filterThreshold -2 2 --perchr -m "${ID}_${BIN}bp.h5" \
    -o "${ID}_hic_corrected_matrix.h5" > "${ID}.correct_matrix.log" 2>&1


Find TADs in replicates

In [None]:
#!/bin/bash

#SBATCH --partition=main    # Partition (job queue)
#SBATCH --job-name=findtads         # Assign an 8-character name to your job, no spaces
#SBATCH --nodes=1                # Number of compute nodes
#SBATCH --ntasks=1               # Processes (usually = cores) on each node
#SBATCH --cpus-per-task=28       # Threads per process (or per core)
#SBATCH --export=ALL             # Export your current environment settings to the job environment
#SBATCH --time=12:00:00
#SBATCH --mem=128000
#SBATCH --output=slurm-%j.out

# Call TADs on the corrected matrix of every species/replicate combination.
# This is a single (non-array) job: it loops over all combinations itself, so
# no --array directive is set and the log file is named after the job id.

# Use the CPUs granted by Slurm; fall back to one processor outside Slurm.
THREADS=${SLURM_CPUS_PER_TASK:-1}

# Remember failures without skipping the remaining, independent runs.
status=0

for species in dmel dtri; do
    for rep in dup_1 dup_2; do
        hicFindTADs --matrix "/${rep}/${species}_hic_corrected_matrix.h5" \
            --minDepth 50000 --maxDepth 200000 \
            --correctForMultipleTesting fdr \
            --numberOfProcessors "$THREADS" \
            --outPrefix "./${rep}/${species}" \
            || { echo "hicFindTADs failed for ${rep}/${species}" >&2; status=1; }
    done
done

# Non-zero exit status when any of the runs failed.
exit "$status"

Python script that plots the TAD separation scores of two replicates against each other and computes their Spearman correlation.

In [None]:
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats
import sys

# Usage: <script> <table.tsv> <label>
#   table.tsv : tab-separated file without a header row
#   label     : prefix used for the plot title and the output PNG file name
if len(sys.argv) < 3:
    sys.exit("usage: " + sys.argv[0] + " <table.tsv> <label>")

dup1 = sys.argv[1]
SP = sys.argv[2]

df1 = pd.read_csv(dup1, sep='\t', header=None)

# Columns 3 and 7 (0-based) hold the two series that are compared --
# presumably the TAD separation scores of the two replicates; verify against
# the layout of the input table.
df1S = df1.loc[:, 3]
df2S = df1.loc[:, 7]

# Keep the result under its own name so the imported `stats` module is not
# overwritten by the correlation result.
correlation = stats.spearmanr(df1S, df2S)

# Label the outputs with the user-supplied SP argument (not the `sp` scipy
# module alias, whose repr is not a usable title or file name).
title = str(SP) + "_" + str(correlation)
png = str(SP) + ".png"

plt.scatter(df1S, df2S)
plt.title(title)
plt.savefig(png)