# Annotating the DHS Masterlist with Repeated Regions

 
1.	Download RepeatMasker and DHS Masterlist Files
2.	Map RepeatMasker to DHS Masterlist and echo the overlap and mapped-element size
3.	For each DHS, choose the element with the largest overlap; if there is a tie, choose the element with the largest fraction of overlap
4.	Rename Class Annotation to SINE, LINE, LTR, Simple_repeat, DNA, Low_complexity, or Others (includes anything not already named)
5.	Annotate DHSs based on Family Repeats

## Download RepeatMasker and DHS Masterlist Files

1. See the GitHub README.md for instructions on how to download and name the RepeatMasker file and the DHS Masterlist
2. Load bedops in order to run bedops, bedmap, and sort-bed


### DHS Masterlist

In [71]:
%%bash
# Build DHS_Index.bed from the DHS Masterlist download.
# The download must be named "DHS_Index_and_Vocabulary_hg38_WM20190703.txt.gz".
gunzip DHS_Index_and_Vocabulary_hg38_WM20190703.txt.gz

# Keep the first three columns and drop the header row.
cut -f1-3 DHS_Index_and_Vocabulary_hg38_WM20190703.txt \
  | tail -n +2 \
  > DHS_Index.bed

head DHS_Index.bed

# Read the file on stdin so wc prints only the count (no filename), and strip
# the padding some wc implementations add; parsing "wc -l FILE" with
# cut -d' ' -f1 yields an empty string when the count is left-padded.
total=$(wc -l < DHS_Index.bed | tr -d '[:space:]')
echo "Total number of rows: $total"

chr1	16140	16200
chr1	51868	52040
chr1	57280	57354
chr1	66370	66482
chr1	79100	79231
chr1	79430	79497
chr1	79580	79760
chr1	87220	87295
chr1	88220	88360
chr1	88700	88814
Total number of rows: 3591898


### Repeats

In [72]:
%%bash
# Unpack the RepeatMasker table and preview it.
# The download must be named "repeats_ucsc.gz".
gunzip repeats_ucsc.gz
head repeats_ucsc

# Count every line of the file (the header row is included in this total).
# wc reads stdin so only the number is printed; tr removes any padding, which
# would otherwise make a cut -d' ' -f1 parse return an empty string.
total=$(wc -l < repeats_ucsc | tr -d '[:space:]')
echo "Total number of rows: $total"

#bin	swScore	milliDiv	milliDel	milliIns	genoName	genoStart	genoEnd	genoLeft	strand	repName	repClass	repFamily	repStart	repEnd	repLeft	id
0	1892	83	59	14	chr1	67108753	67109046	-181847376	+	L1P5	LINE	L1	5301	5607	-544	1
1	2582	27	0	23	chr1	8388315	8388618	-240567804	-	AluY	SINE	Alu	-15	296	1	1
1	4085	171	77	36	chr1	25165803	25166380	-223790042	+	L1MB5	LINE	L1	5567	6174	0	4
1	2285	91	0	13	chr1	33554185	33554483	-215401939	-	AluSc	SINE	Alu	-6	303	10	6
1	2451	64	3	26	chr1	41942894	41943205	-207013217	-	AluY	SINE	Alu	-7	304	1	8
1	1587	272	100	49	chr1	50331336	50332274	-198624148	+	HAL1	LINE	L1	773	1763	-744	9
1	1393	280	82	51	chr1	58719764	58720546	-190235876	+	L2a	LINE	L2	2582	3418	-8	1
2	5372	165	14	27	chr1	75496057	75497775	-173458647	+	L1MA9	LINE	L1	5168	6868	-30	1
2	536	349	146	56	chr1	92274205	92275925	-156680497	+	L2	LINE	L2	406	2306	-1113	1
Total number of rows: 5607739


## Map repeats_ucsc to DHS_Index.bed

In [73]:
%%bash

## Filter repeats_ucsc, map it onto DHS_Index.bed and print overlap stats.
## Steps: drop the header row, keep columns 6-8 and 10-13 (genoName, genoStart,
## genoEnd, strand, repName, repClass, repFamily), sort, then remove every line
## containing the literal strings "LTR?", "DNA?", "RC?" or "SINE?"
## (presumably RepeatMasker's uncertain classifications -- confirm).
##
## The patterns are quoted and matched as fixed strings (-F): left unquoted,
## the shell treats "LTR?" etc. as filename globs and would replace them with
## any matching file name in the working directory.
tail -n +2 repeats_ucsc \
  | cut -f6-8,10-13 \
  | sort-bed - \
  | grep -v -F -e 'LTR?' -e 'DNA?' -e 'RC?' -e 'SINE?' \
  | bedmap --echo --echo-map --echo-overlap-size --echo-map-size --skip-unmapped --ec DHS_Index.bed - \
  > repeats_mapped.bed


# Move the inputs out of the way to keep the working directory clean.
# The masterlist is re-compressed before it is stored.
mkdir -p extra_files
gzip DHS_Index_and_Vocabulary_hg38_WM20190703.txt
mv DHS_Index_and_Vocabulary_hg38_WM20190703.txt.gz extra_files/
mv repeats_ucsc extra_files/
mv DHS_Index.bed extra_files/

### Mapped Repeats File

In [74]:
%%bash
# Preview the bedmap output and report how many DHSs overlap a repeat.
head repeats_mapped.bed

# wc reads stdin so only the count is printed; tr strips the padding that
# breaks a cut -d' ' -f1 parse on wc implementations that left-pad the number.
total=$(wc -l < repeats_mapped.bed | tr -d '[:space:]')
echo "Total number of rows: $total"

chr1	51868	52040|chr1	51584	51880	+	AluYj4	SINE	Alu|12|296
chr1	66370	66482|chr1	66157	66632	+	(AT)n	Simple_repeat	Simple_repeat|112|475
chr1	79100	79231|chr1	78890	79850	+	L1PREC2	LINE	L1|131|960
chr1	79430	79497|chr1	78890	79850	+	L1PREC2	LINE	L1|67|960
chr1	79580	79760|chr1	78890	79850	+	L1PREC2	LINE	L1|180|960
chr1	87220	87295|chr1	87125	87413	-	AluJo	SINE	Alu|75|288
chr1	88220	88360|chr1	88143	88823	+	L2	LINE	L2|140|680
chr1	88700	88814|chr1	88143	88823	+	L2	LINE	L2|114|680
chr1	89780	89959|chr1	89858	90056	+	MLT1H2	LTR	ERVL-MaLR|101|198
chr1	113860	113950|chr1	113691	114101	+	MLT1F2	LTR	ERVL-MaLR|90|410
Total number of rows: 1930749


## Choose the best DHS annotation
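When multiple repeated regions map to the same DHS, choose the region with the largest overlap. If several regions tie on overlap size, choose the one with the largest fraction of overlap (overlap size divided by the size of the repeat element).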

In [76]:
%%bash
# Choose, for every DHS, the single best overlapping repeat element.

#######################################
# Pick one repeat element per bedmap output line.
# Each input line has four "|"-separated parts:
#   DHS | elem1;elem2;... | overlap1;overlap2;... | size1;size2;...
# Selection rule: the element with the largest overlap wins; when overlaps
# are equal, the element with the largest overlap fraction
# (overlap / element size) wins; if that is also equal, the element listed
# first is kept.
# Arguments: bedmap output file(s); stdin is read when none are given
# Outputs:   one "<DHS><TAB><chosen element>" line per input line on stdout
#######################################
pick_best_repeat() {
  awk -F'|' '
    {
      # split() returns the element count; the loop must run over that count.
      # NF is always 4 here because the field separator is "|".
      n = split($3, overlap, ";")
      split($2, element, ";")
      split($4, elemsize, ";")

      # The winner is tracked by index and recomputed from scratch for each
      # line, so no state leaks from one DHS to the next.
      best = 1
      for (i = 2; i <= n; i++) {
        if (overlap[i] + 0 > overlap[best] + 0) {
          best = i
        } else if (overlap[i] + 0 == overlap[best] + 0 &&
                   elemsize[i] + 0 < elemsize[best] + 0) {
          # Equal overlap: the larger fraction overlap/size belongs to the
          # smaller element, so comparing sizes avoids a division.
          best = i
        }
      }
      print $1 "\t" element[best]
    }
  ' "$@"
}

pick_best_repeat repeats_mapped.bed > best_annotations.txt

# After whitespace splitting, fields 1-3 are the DHS coordinates and fields
# 9 and 10 are the repeat class and family of the chosen element.
awk '{print $1"\t"$2"\t"$3"\t"$9"\t"$10}' best_annotations.txt \
  | sort-bed - \
  > dhs_annotated_all-repeats.bed

# Create 7 groups: 6 keep their original class name, every other class is
# relabelled "Others" (in both the class and the family column).
awk '
  BEGIN { OFS = "\t" }
  $4 == "SINE" || $4 == "LINE" || $4 == "LTR" ||
  $4 == "Simple_repeat" || $4 == "DNA" || $4 == "Low_complexity" {
    print
    next
  }
  { print $1, $2, $3, "Others", "Others" }
' dhs_annotated_all-repeats.bed > dhs_annotated_7-classRepeats.bed

# Clean-up. The directory is created first so that mv can never rename a
# file to "extra_files" when the directory is missing.
mkdir -p extra_files
mv best_annotations.txt extra_files/
mv dhs_annotated_all-repeats.bed extra_files/
mv repeats_mapped.bed extra_files/

### DHS Masterlist Annotated with Repeats

In [77]:
%%bash
# Preview the annotated DHSs and tabulate how many fall in each repeat class
# (column 4).
head dhs_annotated_7-classRepeats.bed
cut -f4 dhs_annotated_7-classRepeats.bed | sort | uniq -c

# wc reads stdin so only the count is printed; tr strips the padding that
# breaks a cut -d' ' -f1 parse on wc implementations that left-pad the number.
total=$(wc -l < dhs_annotated_7-classRepeats.bed | tr -d '[:space:]')
echo "Total: $total"


chr1	51868	52040	SINE	Alu
chr1	66370	66482	Simple_repeat	Simple_repeat
chr1	79100	79231	LINE	L1
chr1	79430	79497	LINE	L1
chr1	79580	79760	LINE	L1
chr1	87220	87295	SINE	Alu
chr1	88220	88360	LINE	L2
chr1	88700	88814	LINE	L2
chr1	89780	89959	LTR	ERVL-MaLR
chr1	113860	113950	LTR	ERVL-MaLR
 196686 DNA
 605539 LINE
  23210 Low_complexity
 451872 LTR
  20917 Others
 118922 Simple_repeat
 513603 SINE
Total: 1930749


### Annotate and Clean-up Family Repeats

In [78]:
%%bash

### Annotate family repeats for the four classes that have families.
### Families kept by name per class; every other family becomes "Others":
#SINE -> Alu, MIR, and Others
#LTR -> ERVL-MaLR, ERV1, ERVL, Others
#DNA -> hAT-Charlie, TcMar-Tigger, Others
#LINE -> L1, L2, Others

#######################################
# Print the rows of one repeat class, relabelling families that are not in
# the keep list as "Others".
# Arguments: $1   - repeat class to select (compared with column 4)
#            $2.. - family names (column 5) that keep their own name
# Inputs:    whitespace-separated rows on stdin (first five columns are used)
# Outputs:   tab-separated 5-column rows of the selected class on stdout
#######################################
collapse_families() {
  local class=$1
  shift
  awk -v class="$class" -v keep="$*" '
    BEGIN {
      OFS = "\t"
      n = split(keep, names, " ")
      for (i = 1; i <= n; i++) {
        kept[names[i]] = 1
      }
    }
    $4 == class {
      print $1, $2, $3, $4, (($5 in kept) ? $5 : "Others")
    }
  '
}

# Each class is filtered straight from the 7-class file, so no intermediate
# tmp.<class>.bed files are written and no "rm tmp.*" glob (which could also
# delete unrelated tmp.* files in the working directory) is needed.
collapse_families SINE Alu MIR \
  < dhs_annotated_7-classRepeats.bed > SINE.bed
collapse_families LTR ERVL-MaLR ERV1 ERVL \
  < dhs_annotated_7-classRepeats.bed > LTR.bed
collapse_families DNA hAT-Charlie TcMar-Tigger \
  < dhs_annotated_7-classRepeats.bed > DNA.bed
collapse_families LINE L1 L2 \
  < dhs_annotated_7-classRepeats.bed > LINE.bed

### Sample SINE Family Repeats

In [79]:
%%bash 
# Preview the SINE rows and tabulate how many fall in each family (column 5).
head SINE.bed
cut -f5 SINE.bed | sort | uniq -c

# wc reads stdin so only the count is printed; tr strips the padding that
# breaks a cut -d' ' -f1 parse on wc implementations that left-pad the number.
total=$(wc -l < SINE.bed | tr -d '[:space:]')
echo "Total: $total"

chr1	51868	52040	SINE	Alu
chr1	87220	87295	SINE	Alu
chr1	128619	128757	SINE	Alu
chr1	284375	284489	SINE	MIR
chr1	740730	740844	SINE	Alu
chr1	770440	770540	SINE	Alu
chr1	775269	775340	SINE	Alu
chr1	777886	778041	SINE	Alu
chr1	779727	779830	SINE	Alu
chr1	779794	779972	SINE	Alu
 256390 Alu
 250203 MIR
   7010 Others
Total: 513603
