In [1]:
!wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_45/gencode.v45.annotation.gtf.gz

--2024-03-28 15:15:35--  https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_45/gencode.v45.annotation.gtf.gz
Resolving ftp.ebi.ac.uk (ftp.ebi.ac.uk)... 193.62.193.165
Connecting to ftp.ebi.ac.uk (ftp.ebi.ac.uk)|193.62.193.165|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 49770653 (47M) [application/x-gzip]
Saving to: ‘gencode.v45.annotation.gtf.gz’


2024-03-28 15:16:16 (1.17 MB/s) - ‘gencode.v45.annotation.gtf.gz’ saved [49770653/49770653]



In [2]:
!gunzip gencode.v45.annotation.gtf.gz

In [3]:
!ls gencode.v45.annotation.gtf

gencode.v45.annotation.gtf


In [4]:
import pandas as pd 

from dataclasses import dataclass, field

@dataclass
class GFFMeta:
    tags: set[str] = field(default_factory=set)
    attrs: dict[str, str] = field(default_factory=dict)

    @classmethod
    def from_str(cls, s: str):
        self = cls() 
        fields = s.split("; ")
        for field in fields:
            if field.startswith("tag "):
                self.tags.add(field[4:]) # remove 'tag '
            else:
                key, value = field.split(" ", maxsplit=1)
                self.attrs[key] = value
        return self
                

In [5]:
df = pd.read_table("gencode.v45.annotation.gtf", 
                   sep="\t", 
                   comment="#",
                   header=None,
                   names=["chrom",
                          "source",
                          "feature",
                          "start",
                          "end",
                          "score",
                          "strand",
                          "fname",
                          "attribute"])

In [6]:
df.head()

Unnamed: 0,chrom,source,feature,start,end,score,strand,fname,attribute
0,chr1,HAVANA,gene,11869,14409,.,+,.,"gene_id ""ENSG00000290825.1""; gene_type ""lncRNA..."
1,chr1,HAVANA,transcript,11869,14409,.,+,.,"gene_id ""ENSG00000290825.1""; transcript_id ""EN..."
2,chr1,HAVANA,exon,11869,12227,.,+,.,"gene_id ""ENSG00000290825.1""; transcript_id ""EN..."
3,chr1,HAVANA,exon,12613,12721,.,+,.,"gene_id ""ENSG00000290825.1""; transcript_id ""EN..."
4,chr1,HAVANA,exon,13221,14409,.,+,.,"gene_id ""ENSG00000290825.1""; transcript_id ""EN..."


In [7]:
exons = df[df['feature'] == 'exon'].copy()

In [8]:
exons['attribute'] = exons['attribute'].apply(GFFMeta.from_str)

In [9]:
exons.head()

Unnamed: 0,chrom,source,feature,start,end,score,strand,fname,attribute
2,chr1,HAVANA,exon,11869,12227,.,+,.,"GFFMeta(tags={'""basic""', '""Ensembl_canonical""'..."
3,chr1,HAVANA,exon,12613,12721,.,+,.,"GFFMeta(tags={'""basic""', '""Ensembl_canonical""'..."
4,chr1,HAVANA,exon,13221,14409,.,+,.,"GFFMeta(tags={'""basic""', '""Ensembl_canonical""'..."
7,chr1,HAVANA,exon,12010,12057,.,+,.,"GFFMeta(tags={'""basic""', '""Ensembl_canonical""'..."
8,chr1,HAVANA,exon,12179,12227,.,+,.,"GFFMeta(tags={'""basic""', '""Ensembl_canonical""'..."


In [10]:
exons['attribute'].apply(lambda x: x.attrs.get('transcript_support_level', '-100')).value_counts()

attribute
-100     470674
"1"      404912
"5"      288018
"2"      235694
"3"      136794
"4"       71572
"NA"      43034
"5";          3
"NA";         3
Name: count, dtype: int64

In [13]:
# to zero-based annotation 
exons['start'] = exons['start'] - 1

In [14]:
exons[['chrom', 'start', 'end']].to_csv("exons.bed", sep="\t", index=False)

In [15]:
!head exons.bed

chrom	start	end
chr1	11868	12227
chr1	12612	12721
chr1	13220	14409
chr1	12009	12057
chr1	12178	12227
chr1	12612	12697
chr1	12974	13052
chr1	13220	13374
chr1	13452	13670
