# Annotating the DHS Masterlist with Gencode

1. Download Gencode Annotation file and DHS Masterlist file
2. Parse Gencode file and create bed files for gene body, exon, cds , promoter, utr, intronic, and intergenic
3. Map promoter, exon, intronic, and intergenic regions to DHS Masterlist
4. Annotate the DHS as promoter, exon, intron, or intergenic depending on which element had the largest overlap. If there is a tie in overlap, the winning annotation is based on the rank
5. For CDS and UTR under exon, pick the element that has the largest overlap. If there was a tie, pick the largest fraction of overlap
6. For Protein-Coding and Non-Protein-Coding under promoter/intronic, pick the element that has the largest overlap. If there was a tie, pick the largest fraction of overlap


In [4]:
#Establish Working Directory
import os
os.chdir("/home/nasi4/proj/encode3/DHS_Annotations/Gencode/results_streamline")

## Parse Gencode and map to DHS Masterlist

In [1]:
%%bash

#Refer back to Github README.md to obtain DHS Masterlist File and Basic Gencode v28 File
#Make sure bedops module is loaded in order to run bedmap and sort-bed.
#Remember to Download chromInfo.hg38.bed from Github Repository 
#Replace specific paths with what is written in the README.md

fdir=/home/nasi4/proj/encode3/DHS_Annotations/Gencode/files
masterlist=${fdir}/masterlist_DHSs_733samples_WM20180608_all_indexIDs.txt 

wdir=/home/nasi4/proj/encode3/DHS_Annotations/Gencode/results_streamline
cd $wdir

cut -f1-3 $masterlist > masterlist.bed3
dhs=masterlist.bed3
gencode=${fdir}/gencode.v28.basic.annotation.gtf

#Remove row if start = end (only ~2K cases)
tail -n +6 $gencode \
| awk -F'\t' '{
    if($4 != $5) {
        print $1"\t"$4"\t"$5"\t"$3"\t"$7
    }
}' \
| /net/module/sw/bedops/2.4.37-typical/bin/sort-bed -  \
> tmp.gencode

#Expand the transcription region to say promoter. +/- 1KB of TSS
awk -F'\t' '{
        if($4 == "transcript") {
                if ($5 == "+") {
                        print $1"\t"$2"\t"$2+1000"\t""promoter";
                }
                else if ($5 == "-") {
                        print $1"\t"$3-1000"\t"$3"\t""promoter";
                }
    }
        else if($4 != "transcript") {
                print $1"\t"$2"\t"$3"\t"$4;
        }
}' tmp.gencode \
| grep -v chrM | grep -v Selenocysteine | grep -v codon \
| /net/module/sw/bedops/2.4.37-typical/bin/sort-bed - \
> tmp2.gencode

#Need to find the INTRONS. Difference between gene and (CDS + PROMOTER + UTR) 
awk '{if($4 == "gene") print}' tmp2.gencode > gene.bed4
awk '{if($4 == "exon") print}' tmp2.gencode > exon.bed4
awk '{if($4 == "CDS") print}' tmp2.gencode > cds.bed4
awk '{if($4 == "promoter") print}' tmp2.gencode > promoter.bed4
awk '{if($4 == "UTR") print}' tmp2.gencode > utr.bed4

/net/module/sw/bedops/2.4.37-typical/bin/bedops --ec -m utr.bed4 exon.bed4 promoter.bed4 cds.bed4 | /net/module/sw/bedops/2.4.37-typical/bin/bedops --ec -d gene.bed4 - > tmp.intron.bed4
awk '{print $1"\t"$2"\t"$3"\t""intron"}' tmp.intron.bed4 > intron.bed4

#Need to find the Intergenic region. Difference between Genome 
cat /home/nasi4/proj/encode3/DHS_Annotations/Gencode/files/chromInfo.hg38.bed  \
| grep -v chrM > chromInfoNoM.bed
/net/module/sw/bedops/2.4.37-typical/bin/bedops --ec -d chromInfoNoM.bed gene.bed4 promoter.bed4 > tmp.intergenic.bed4
awk '{print $1"\t"$2"\t"$3"\t""intergenic"}' tmp.intergenic.bed4 > intergenic.bed4

#Clean Up
rm tmp.intron.bed4
rm tmp.intergenic.bed4

#First Plot
/net/module/sw/bedops/2.4.37-typical/bin/bedops --ec -u promoter.bed4 exon.bed4 intron.bed4 intergenic.bed4 | /net/module/sw/bedops/2.4.37-typical/bin/sort-bed - > gencode-genome.bed4
/net/module/sw/bedops/2.4.37-typical/bin/bedmap --ec --echo --echo-map --skip-unmapped --echo-overlap-size $dhs gencode-genome.bed4 > test.DHS-genome.bed

### Bed file of parsed Gencode
Includes overlaps

In [5]:
%%bash
head gencode-genome.bed4

chr1	0	11869	intergenic
chr1	11869	12227	exon
chr1	11869	12869	promoter
chr1	12010	12057	exon
chr1	12010	13010	promoter
chr1	12179	12227	exon
chr1	12613	12697	exon
chr1	12613	12721	exon
chr1	12975	13052	exon
chr1	13052	13221	intron


### Bed file of DHS masterlist mapped to Gencode
Includes overlap size of Gencode elements

In [6]:
%%bash
head test.DHS-genome.bed

chr1	16140	16200|chr1	15947	16436	intron|60
chr1	51868	52040|chr1	36081	52473	intergenic|172
chr1	57280	57354|chr1	53473	57598	intergenic|74
chr1	66370	66482|chr1	65419	66419	promoter;chr1	66419	69037	intron|49;63
chr1	79100	79231|chr1	71585	89295	intergenic|131
chr1	79430	79497|chr1	71585	89295	intergenic|67
chr1	79580	79760|chr1	71585	89295	intergenic|180
chr1	87220	87295|chr1	71585	89295	intergenic|75
chr1	88220	88360|chr1	71585	89295	intergenic|140
chr1	88700	88814|chr1	71585	89295	intergenic|114


## Choose the Best Annotation
Annotated DHS as promoter, exon, intron, or intergenic depending on which element had the largest overlap. If there was a tie, the winning annotation is based on the rank

Promoter > Exon > Intron > Intergenic

In [7]:
%%bash

#Rank the elements. Promoter > Exon > Intron > Intergenic.
#If two elements have the same overlap count, the winning element is chosen based on the rank

biggest=0
col=0

sed 's/intergenic/1/g' test.DHS-genome.bed \
| sed 's/intron/2/g' \
| sed 's/exon/3/g' \
| sed 's/promoter/4/g' \
> choose_best.bed

awk -F'|' -v b=$biggest -v c=$col '{
        line=$3
        split(line,a,";")

        mapped=$2
        split(mapped,m,";")
        
    if (length(a) == 1) {
        print $1"\t"$2
    }
    else {
        for(i=1;i<=NF;i++) {
            if (a[i] > b) {
                b=a[i];
                c=i;
            }
            else if (a[i] == b) {
                old=m[c];
                split(old,o,"\t");
                new=m[i];
                split(new,n,"\t");
                if (o[4] < n[4]) {
                    b=a[i];
                    c=i;
                }
            }
        }
    print $1"\t"m[c];
    b=0;
    }

}'  choose_best.bed > overlap-answer.txt



awk '{print $1"\t"$2"\t"$3"\t"$7}' overlap-answer.txt \
| /net/module/sw/bedops/2.4.37-typical/bin/sort-bed - \
| awk '{if($4 == 1) print $1"\t"$2"\t"$3"\t""intergenic"; else if($4 == 2) print $1"\t"$2"\t"$3"\t""intron"; else if($4 == 3) print $1"\t"$2"\t"$3"\t""exon"; else if($4 == 4) print $1"\t"$2"\t"$3"\t""promoter"}' - \
> dhs_annotated_gencode28.bed


### Annotated DHS Masterlist as Promoter, Exon, Intron, or Intergenic

In [8]:
%%bash
head dhs_annotated_gencode28.bed
cut -f4 dhs_annotated_gencode28.bed | sort - | uniq -c
total=`wc -l dhs_annotated_gencode28.bed | cut -d' ' -f1`
echo "Total: $total"

chr1	16140	16200	intron
chr1	51868	52040	intergenic
chr1	57280	57354	intergenic
chr1	66370	66482	intron
chr1	79100	79231	intergenic
chr1	79430	79497	intergenic
chr1	79580	79760	intergenic
chr1	87220	87295	intergenic
chr1	88220	88360	intergenic
chr1	88700	88814	intergenic
 158527 exon
1376951 intergenic
1891595 intron
 164825 promoter
Total: 3591898


### Split Exon Annotated DHSs to CDS, UTR, and non-coding

1. Extract Exonic regions from DHS Annotation
2. Extract UTR and CDS labeled regions from Gencode
3. Map UTR/CDS regions to Exonic DHS Annotations
4. Pick Element with the largest overlap. If there is a tie, pick the element with 
the largest fraction of overlap


In [16]:
%%bash
#Path to Gencode (Get from README.md)
fdir=/home/nasi4/proj/encode3/DHS_Annotations/Gencode/files
gencode=${fdir}/gencode.v28.basic.annotation.gtf

#Need
#dhs_annotated_gencode28.bed

tail -n +6 $gencode \
| awk -F'\t' '{
        if($4 != $5) {
                print $1"\t"$4"\t"$5"\t"$3;
        }
}' \
| /net/module/sw/bedops/2.4.37-typical/bin/sort-bed - \
> tmp.gencode


#Map
awk '{if($4 == "exon") print}' dhs_annotated_gencode28.bed > dhs-exon.bed

awk '{if($4 == "UTR") print}' tmp.gencode > utr.bed
awk '{if($4 == "CDS") print}' tmp.gencode > cds.bed
/net/module/sw/bedops/2.4.37-typical/bin/bedops -u utr.bed cds.bed > utr-cds-gencode.bed



/net/module/sw/bedops/2.4.37-typical/bin/bedmap --echo --echo-map --echo-overlap-size --echo-map-size --skip-unmapped --ec dhs-exon.bed  utr-cds-gencode.bed \
> dhs_mapped_with_overlapPlusExtra.bed


#Choose the element with the largest overlap or the largest fraction of overlap

biggest=0
col=0
fraction=0

awk -F'|' -v f=$fraction -v b=$biggest -v c=$col '{
        line=$3
        split(line,a,";")
        mapped=$2
        split(mapped,m,";")
        size=$4
        split(size,s,";")
        
        if (length(a) == 1) {
                c=1;
        }
        else {
                for(i=1;i<=NF;i++) {
                        if (a[i] > b) {
                                b=a[i];
                                c=i;
                                f=a[i]/s[i];
                        }
                        else if (a[i] == b) {
                                if(a[i]/s[i] > f) {
                                        b=a[i];
                                        c=i;
                                }
                        } 
                }      
        }
        print $1"\t"m[c];
        b=0;      
}' dhs_mapped_with_overlapPlusExtra.bed > overlap-answer.txt


awk '{print $1"\t"$2"\t"$3"\t"$4"\t"$8}' overlap-answer.txt \
| /net/module/sw/bedops/2.4.37-typical/bin/sort-bed - \
> dhs_annotated_exon.bed

In [15]:
%%bash
head dhs_annotated_exon.bed
cut -f5 dhs_annotated_exon.bed | sort - | uniq -c
total=`wc -l dhs_annotated_exon.bed | cut -d' ' -f1`
echo "Total: $total"

chr1	935890	936080	exon	CDS
chr1	939198	939366	exon	CDS
chr1	939440	939680	exon	CDS
chr1	941140	941361	exon	CDS
chr1	941140	941620	exon	CDS
chr1	942191	942220	exon	CDS
chr1	942500	942700	exon	CDS
chr1	942660	942780	exon	CDS
chr1	943036	943460	exon	CDS
chr1	943189	943401	exon	CDS
  68648 CDS
  60908 UTR
Total: 129556


### Split Promoter annotated DHSs to coding-gene and non-coding

1. Extract Promoter regions from DHS Annotation
2. Extract protein-coding labeled regions from gencode
3. Map protein-coding/non-coding gencode regions to Promoter DHS Annotations
4. Pick element with the largest overlap. If there is a tie, pick the element with the largest fraction of overlap

Notes:
NPC = non-coding, PC = protein-coding

In [18]:
%%bash

#Path to Gencode (Get from README.md)
fdir=/home/nasi4/proj/encode3/DHS_Annotations/Gencode/files
gencode=${fdir}/gencode.v28.basic.annotation.gtf

#Need
#dhs_annotated_gencode28.bed


tail -n +6 $gencode \
| awk -F'\t' '{
        if($4 != $5) {
                print $1"\t"$4"\t"$5"\t"$3"\t"$9;
        }
}' \
| /net/module/sw/bedops/2.4.37-typical/bin/sort-bed - \
> tmp.gencode


#Map
awk '{if($4 == "promoter") print}' dhs_annotated_gencode28.bed > dhs-promoter.bed

grep protein_coding tmp.gencode \
| awk '{print $1"\t"$2"\t"$3"\t""PC"}' - > PC.bed

grep -v protein_coding tmp.gencode \
| awk '{print $1"\t"$2"\t"$3"\t""NPC"}'> NPC.bed


/net/module/sw/bedops/2.4.37-typical/bin/bedops -u PC.bed NPC.bed > PC-NPC-gencode.bed


/net/module/sw/bedops/2.4.37-typical/bin/bedmap --echo --echo-map --echo-overlap-size --echo-map-size --skip-unmapped --ec dhs-promoter.bed PC-NPC-gencode.bed \
> dhs_mapped_with_overlapPlusExtra.bed

#Pick the element with the largest overlap or the largest fraction of overlap

biggest=0
col=0
fraction=0

awk -F'|' -v f=$fraction -v b=$biggest -v c=$col '{
        line=$3
        split(line,a,";")
        mapped=$2
        split(mapped,m,";")
        size=$4
        split(size,s,";")
        
        if (length(a) == 1) {
                c=1;
        }
        else {
                for(i=1;i<=NF;i++) {
                        if (a[i] > b) {
                                b=a[i];
                                c=i;
                                f=a[i]/s[i];
                        }
                        else if (a[i] == b) {
                                if(a[i]/s[i] > f) {
                                        b=a[i];
                                        c=i;
                                }
                        } 
                }      
        }
        print $1"\t"m[c];
        b=0;      
}' dhs_mapped_with_overlapPlusExtra.bed > overlap-answer.txt

awk '{print $1"\t"$2"\t"$3"\t"$4"\t"$8}' overlap-answer.txt \
| /net/module/sw/bedops/2.4.37-typical/bin/sort-bed - \
> dhs_annotated_promoter.bed

In [19]:
%%bash
head dhs_annotated_promoter.bed
cut -f5 dhs_annotated_promoter.bed | sort - | uniq -c
total=`wc -l dhs_annotated_promoter.bed | cut -d' ' -f1`
echo "Total: $total"

chr1	90140	90209	promoter	NPC
chr1	135100	135144	promoter	NPC
chr1	182681	182819	promoter	NPC
chr1	186960	187129	promoter	NPC
chr1	629100	629280	promoter	NPC
chr1	629160	629310	promoter	NPC
chr1	629512	629580	promoter	NPC
chr1	629520	629596	promoter	NPC
chr1	629870	630020	promoter	NPC
chr1	630075	630240	promoter	NPC
  47219 NPC
 112242 PC
Total: 159461


### Split Intronic annotated DHSs to coding-gene and non-coding

1. Extract Intronic regions from DHS Annotation
2. Extract protein-coding labeled regions from gencode
3. Map protein-coding/non-coding gencode regions to Intronic DHS Annotations
4. Pick element with the largest overlap. If there is a tie, pick the element with the largest fraction of overlap

Notes:
NPC = non-coding, PC = protein-coding

In [22]:
%%bash

#Path to Gencode (Get from README.md)
fdir=/home/nasi4/proj/encode3/DHS_Annotations/Gencode/files
gencode=${fdir}/gencode.v28.basic.annotation.gtf

#Need
#dhs_annotated_gencode28.bed

genePart="intron"

tail -n +6 $gencode \
| awk -F'\t' '{
        if($4 != $5) {
                print $1"\t"$4"\t"$5"\t"$3"\t"$9;
        }
}' \
| /net/module/sw/bedops/2.4.37-typical/bin/sort-bed - \
> tmp.gencode


#Map PC/NPC regions to Intronic DHSs
awk -F'\t' '{if($4 == "intron") print}' dhs_annotated_gencode28.bed > dhs-intron.bed

grep protein_coding tmp.gencode \
| awk '{print $1"\t"$2"\t"$3"\t""PC"}' - > PC.bed

grep -v protein_coding tmp.gencode \
| awk '{print $1"\t"$2"\t"$3"\t""NPC"}'> NPC.bed

/net/module/sw/bedops/2.4.37-typical/bin/bedops -u PC.bed NPC.bed > PC-NPC-gencode.bed

/net/module/sw/bedops/2.4.37-typical/bin/bedmap --echo --echo-map --echo-overlap-size --echo-map-size --skip-unmapped --ec dhs-${genePart}.bed PC-NPC-gencode.bed \
> dhs_mapped_with_overlapPlusExtra.bed

biggest=0
col=0
fraction=0

awk -F'|' -v f=$fraction -v b=$biggest -v c=$col '{
        line=$3
        split(line,a,";")
        mapped=$2
        split(mapped,m,";")
        size=$4
        split(size,s,";")
        
        if (length(a) == 1) {
                c=1;
        }
        else {
                for(i=1;i<=NF;i++) {
                        if (a[i] > b) {
                                b=a[i];
                                c=i;
                                f=a[i]/s[i];
                        }
                        else if (a[i] == b) {
                                if(a[i]/s[i] > f) {
                                        b=a[i];
                                        c=i;
                                }
                        } 
                }      
        }
        print $1"\t"m[c];
        b=0;      
}' dhs_mapped_with_overlapPlusExtra.bed > overlap-answer.txt

awk '{print $1"\t"$2"\t"$3"\t"$4"\t"$8}' overlap-answer.txt \
| /net/module/sw/bedops/2.4.37-typical/bin/sort-bed - \
> dhs_annotated_${genePart}.bed

In [23]:
%%bash
head dhs_annotated_intron.bed
cut -f5 dhs_annotated_intron.bed | sort - | uniq -c
total=`wc -l dhs_annotated_intron.bed | cut -d' ' -f1`
echo "Total: $total"

chr1	16140	16200	intron	NPC
chr1	66370	66482	intron	PC
chr1	99630	99717	intron	NPC
chr1	113860	113950	intron	NPC
chr1	128619	128757	intron	NPC
chr1	186727	186834	intron	NPC
chr1	186817	186996	intron	NPC
chr1	190865	190920	intron	NPC
chr1	190920	191071	intron	NPC
chr1	191260	191340	intron	NPC
 383039 NPC
1508556 PC
Total: 1891595
