## Annotating the DHS Masterlist with Repeated Regions

 
1.	Download RepeatMasker and DHS Masterlist Files
2.	Map RepeatMasker to DHS Masterlist and echo the overlap and mapped-element size
3.	For each DHS, choose the mapped element with the largest overlap; if several elements tie, choose the one with the largest fraction of overlap (overlap size / element size)
4.	Rename the class annotation to SINE, LINE, LTR, Simple_repeat, DNA, Low_complexity, or Others (Others includes anything not already named)
5.	Annotate DHSs by repeat family within the SINE, LINE, LTR, and DNA classes

In [12]:
# Establish the working directory: every later cell resolves its relative
# paths against this results directory.
import os

os.chdir(
    "/home/nasi4/proj/encode3/DHS_Annotations/Repeated_Regions/results_streamline"
)

In [110]:
%%bash

##Establish Paths to Tools##
##Work in progress. Right now will just use the path to the tool (bedops/sort-bed)
# Environment variables read by the Modules 3.2.10 installation under /net/module.
export MODULE_VERSION=3.2.10 MODULE_VERSION_STACK=3.2.10
export MODULESHOME=/net/module/Modules/3.2.10
export MODULEPATH=/net/module/Modules/versions:/net/module/Modules/3.2.10/modulefiles:/net/module/Modules/modulefiles
# Start with no modules loaded.
export LOADEDMODULES=
# Shared-library search path pointing at the Tcl 8.5.19 install.
export LD_LIBRARY_PATH=/net/module/sw/tcl/8.5.19/lib/

# module ARGS...
# Shell front end for the Modules `modulecmd` binary.
# `modulecmd bash ARGS...` writes shell code to stdout; that code has to be
# eval'ed in the current shell for a `module load ...` to take effect, so the
# output is evaluated here instead of just being printed.
# The binary is looked up under $MODULESHOME, falling back to the 3.2.10
# install path when MODULESHOME is unset or empty.
module(){
    eval "$("${MODULESHOME:-/net/module/Modules/3.2.10}/bin/modulecmd" bash "$@")"
}

##Establish Directories and Key Files
fdir=/home/nasi4/proj/encode3/DHS_Annotations/Repeated_Regions/files
masterlist=${fdir}/masterlist_DHSs_733samples_WM20180608_all_indexIDs.txt


wdir=/home/nasi4/proj/encode3/DHS_Annotations/Repeated_Regions/results_streamline
# Stop if the results directory is unreachable; otherwise masterlist.bed3
# would be written into whatever directory the cell happened to start in.
cd "$wdir" || { echo "cannot cd to $wdir" >&2; exit 1; }

# Keep only the first three columns of the masterlist as a BED3 file.
cut -f1-3 "$masterlist" > masterlist.bed3


## DHS Masterlist

In [114]:
%%bash
# Show the first ten DHS intervals of the BED3 masterlist.
dhs="masterlist.bed3"
head "${dhs}"

chr1	16140	16200
chr1	51868	52040
chr1	57280	57354
chr1	66370	66482
chr1	79100	79231
chr1	79430	79497
chr1	79580	79760
chr1	87220	87295
chr1	88220	88360
chr1	88700	88814


## Repeats

In [116]:
%%bash
# Show the header line and first records of the repeats table.
repeats="/home/nasi4/proj/encode3/DHS_Annotations/Repeated_Regions/files/repeats.txt"
head "${repeats}"

#genoName	genoStart	genoEnd	strand	repName	repClass	repFamily
chr1	67108753	67109046	+	L1P5	LINE	L1
chr1	8388315	8388618	-	AluY	SINE	Alu
chr1	25165803	25166380	+	L1MB5	LINE	L1
chr1	33554185	33554483	-	AluSc	SINE	Alu
chr1	41942894	41943205	-	AluY	SINE	Alu
chr1	50331336	50332274	+	HAL1	LINE	L1
chr1	58719764	58720546	+	L2a	LINE	L2
chr1	75496057	75497775	+	L1MA9	LINE	L1
chr1	92274205	92275925	+	L2	LINE	L2


In [119]:
%%bash
#Make directory for intermediate files
mkdir -p extra_files

# drop_uncertain_classes
# Filter stdin to stdout, removing every line that contains one of the literal
# strings "LTR?", "DNA?", "RC?" or "SINE?".
# The patterns are quoted and matched as fixed strings: left unquoted, the
# shell may glob-expand them against files in the working directory, which
# changes the pattern and can make grep read a file instead of stdin.
drop_uncertain_classes() {
    grep -v -F -e 'LTR?' -e 'DNA?' -e 'RC?' -e 'SINE?'
}

#Map the repeats.txt with the masterlist
mapped=repeats_mapped_with_overlapPlusExtra.bed
if [ -f "extra_files/${mapped}" ]; then
    echo "repeats_mapped_with_overlapPlusExtra.bed exists"
    # The following cells read the mapped file from the working directory,
    # while the clean-up step moves it into extra_files/. Restore a working
    # copy so a re-run does not continue with a missing input.
    if [ ! -f "${mapped}" ]; then
        cp -- "extra_files/${mapped}" "${mapped}"
    fi
else
    repeats=/home/nasi4/proj/encode3/DHS_Annotations/Repeated_Regions/files/repeats.txt
    dhs=masterlist.bed3
    bedops_bin=/net/module/sw/bedops/2.4.37-typical/bin

    # Skip the header line, sort, drop the uncertain classes, then for every
    # DHS with at least one overlapping repeat emit:
    #   DHS | mapped elements | overlap sizes | mapped-element sizes
    tail -n +2 "${repeats}" \
    | "${bedops_bin}/sort-bed" - \
    | drop_uncertain_classes \
    | "${bedops_bin}/bedmap" --echo --echo-map --echo-overlap-size --echo-map-size --skip-unmapped --ec "${dhs}" - \
    > "${mapped}"
fi

## DHS Masterlist Mapped to Repeats
Includes all mapped elements

In [120]:
%%bash
# Preview the bedmap output and report its line count.
mapped_bed="repeats_mapped_with_overlapPlusExtra.bed"
head "${mapped_bed}"
wc -l "${mapped_bed}"

chr1	51868	52040|chr1	51584	51880	+	AluYj4	SINE	Alu|12|296
chr1	66370	66482|chr1	66157	66632	+	(AT)n	Simple_repeat	Simple_repeat|112|475
chr1	79100	79231|chr1	78890	79850	+	L1PREC2	LINE	L1|131|960
chr1	79430	79497|chr1	78890	79850	+	L1PREC2	LINE	L1|67|960
chr1	79580	79760|chr1	78890	79850	+	L1PREC2	LINE	L1|180|960
chr1	87220	87295|chr1	87125	87413	-	AluJo	SINE	Alu|75|288
chr1	88220	88360|chr1	88143	88823	+	L2	LINE	L2|140|680
chr1	88700	88814|chr1	88143	88823	+	L2	LINE	L2|114|680
chr1	89780	89959|chr1	89858	90056	+	MLT1H2	LTR	ERVL-MaLR|101|198
chr1	113860	113950|chr1	113691	114101	+	MLT1F2	LTR	ERVL-MaLR|90|410
1930749 repeats_mapped_with_overlapPlusExtra.bed


## Choose the best annotation
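When multiple repeated regions mapped to the same DHS, we chose the region with the largest overlap; if several regions tied, we chose the one with the largest fraction of overlap (overlap size divided by the size of the repeat element).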

In [121]:
%%bash
#Choose, for every DHS, the repeat with the largest overlap;
#ties are broken by the largest fraction of overlap (overlap / element size).

# pick_best_repeat [FILE...]
# Input (FILE or stdin), one record per DHS, fields separated by "|":
#   $1 DHS interval
#   $2 mapped elements, ";"-separated
#   $3 overlap size of each mapped element, ";"-separated
#   $4 size of each mapped element, ";"-separated
# Output: "<DHS>\t<chosen element>" per record.
# When everything ties, the first element listed wins.
pick_best_repeat() {
    awk -F'|' '{
        # Iterate over the number of mapped elements, not NF: NF counts the
        # "|"-separated fields and would cap the search at four candidates.
        n = split($2, elem, ";")
        split($3, ovr, ";")
        split($4, len, ";")
        if (n == 0) next

        best = 1
        for (i = 2; i <= n; i++) {
            if (ovr[i] + 0 > ovr[best] + 0) {
                best = i
            }
            else if (ovr[i] + 0 == ovr[best] + 0 && len[i] + 0 < len[best] + 0) {
                # Equal overlap: the larger fraction overlap/size belongs to
                # the shorter element. Comparing against the current best
                # keeps the tie-break correct for three or more candidates.
                best = i
            }
        }
        print $1 "\t" elem[best]
    }' "$@"
}

pick_best_repeat repeats_mapped_with_overlapPlusExtra.bed > overlap-answer.txt


# Keep the DHS coordinates (columns 1-3) together with columns 9 and 10 of the
# chosen repeat (its class and family), then sort the result with sort-bed.
awk 'BEGIN { OFS = "\t" } { print $1, $2, $3, $9, $10 }' overlap-answer.txt \
  | /net/module/sw/bedops/2.4.37-typical/bin/sort-bed - \
  > dhs_annotated_all-repeats.bed

#Create 7 Groups: the 6 classes listed below keep their original names,
#every other class is relabelled "Others" (class and family columns both).
awk '
    BEGIN {
        OFS = "\t"
        n = split("SINE LINE LTR Simple_repeat DNA Low_complexity", names, " ")
        for (i = 1; i <= n; i++) {
            major[names[i]] = 1
        }
    }
    # Rows of a named class pass through unchanged.
    ($4 in major) { print; next }
    { print $1, $2, $3, "Others", "Others" }
' dhs_annotated_all-repeats.bed > dhs_annotated_7-repeats.bed

#Clean-up: move the intermediate files into extra_files/.
# Make sure the target is a directory first: `mv file extra_files` with no
# such directory would silently rename the file to "extra_files".
mkdir -p extra_files
mv -- \
    repeats_mapped_with_overlapPlusExtra.bed \
    masterlist.bed3 \
    overlap-answer.txt \
    dhs_annotated_all-repeats.bed \
    extra_files/

## DHS Masterlist Annotated with Repeats

In [3]:
%%bash
# Preview the annotated DHS file, tabulate the class column, and print the
# total number of annotated DHSs.
annotated="dhs_annotated_7-repeats.bed"
head "${annotated}"
cut -f4 "${annotated}" | sort - | uniq -c
total=$(wc -l "${annotated}" | cut -d' ' -f1)
echo "Total: $total"


chr1	51868	52040	SINE	Alu
chr1	66370	66482	Simple_repeat	Simple_repeat
chr1	79100	79231	LINE	L1
chr1	79430	79497	LINE	L1
chr1	79580	79760	LINE	L1
chr1	87220	87295	SINE	Alu
chr1	88220	88360	LINE	L2
chr1	88700	88814	LINE	L2
chr1	89780	89959	LTR	ERVL-MaLR
chr1	113860	113950	LTR	ERVL-MaLR
 196686 DNA
 605539 LINE
  23210 Low_complexity
 451872 LTR
  20917 Others
 118922 Simple_repeat
 513603 SINE
Total: 1930749


### Annotate Family Repeats

In [15]:
%%bash

###Annotate family repeats within the four classes that have families
#SINE -> Alu, MIR, and Others
#LTR -> ERVL-MaLR, ERV1, ERVL, Others
#DNA -> hAT-Charlie, TcMar-Tigger, Others
#LINE -> L1, L2, Others

# annotate_families CLASS FAMILY...
# Reads rows of (chrom, start, end, class, family) on stdin, keeps the rows
# whose class equals CLASS, and prints them tab-separated with the family
# replaced by "Others" unless it is one of the FAMILY arguments.
# One helper replaces the four copy-pasted awk programs. Filtering by class
# happens in the same pass, so no tmp.<class>.bed files are written and the
# broad `rm tmp.*` (which would also delete unrelated tmp.* files in the
# working directory) is no longer needed.
annotate_families() {
    local class=$1
    shift
    local IFS=' '   # "$*" below joins the family names with single spaces
    awk -v class="$class" -v keep="$*" '
        BEGIN {
            OFS = "\t"
            n = split(keep, list, " ")
            for (i = 1; i <= n; i++) {
                named[list[i]] = 1
            }
        }
        $4 == class { print $1, $2, $3, $4, (($5 in named) ? $5 : "Others") }
    '
}

annotated=dhs_annotated_7-repeats.bed

annotate_families SINE Alu MIR            < "${annotated}" > SINE.bed
annotate_families LTR  ERVL-MaLR ERV1 ERVL < "${annotated}" > LTR.bed
annotate_families DNA  hAT-Charlie TcMar-Tigger < "${annotated}" > DNA.bed
annotate_families LINE L1 L2              < "${annotated}" > LINE.bed