In [1]:
import numpy as np
import scipy as sp
import Bio
from tqdm import tqdm
import pandas as pd

In [3]:
trna_counter = []
gene_counter = []

with open('./ftp-data/gencode.v44.annotation.gtf') as topo_file:
    # counter = 0
    for line in topo_file:

        if line.startswith("#"):
            continue
        columns = line.split(sep="\t")
        feature_type = columns[2]
        length = int(columns[4])-int(columns[3])

        if feature_type == "tRNA":
            trna_counter.append(length)
        elif feature_type == "gene":
            gene_counter.append(length)

In [6]:
len(gene_counter)

62700

In [8]:
from BCBio.GFF import GFFExaminer

in_file = "./ftp-data/gencode.v44.annotation.gtf"
examiner = GFFExaminer()

In [9]:
examiner.available_limits(in_file)

{'gff_id': {('chr1',): 317470,
  ('chr2',): 259975,
  ('chr3',): 222905,
  ('chr4',): 142001,
  ('chr5',): 154025,
  ('chr6',): 162467,
  ('chr7',): 162555,
  ('chr8',): 124698,
  ('chr9',): 129496,
  ('chr10',): 135061,
  ('chr11',): 202627,
  ('chr12',): 188637,
  ('chr13',): 56972,
  ('chr14',): 122480,
  ('chr15',): 120500,
  ('chr16',): 151172,
  ('chr17',): 204071,
  ('chr18',): 57932,
  ('chr19',): 188662,
  ('chr20',): 80173,
  ('chr21',): 38256,
  ('chr22',): 71439,
  ('chrX',): 120574,
  ('chrY',): 9898,
  ('chrM',): 143},
 'gff_source_type': {('HAVANA', 'gene'): 55120,
  ('HAVANA', 'transcript'): 241005,
  ('HAVANA', 'exon'): 1598650,
  ('ENSEMBL', 'gene'): 7580,
  ('ENSEMBL', 'transcript'): 11830,
  ('ENSEMBL', 'exon'): 50826,
  ('HAVANA', 'CDS'): 847237,
  ('HAVANA', 'start_codon'): 93933,
  ('HAVANA', 'stop_codon'): 87801,
  ('HAVANA', 'UTR'): 371663,
  ('ENSEMBL', 'CDS'): 37872,
  ('ENSEMBL', 'start_codon'): 4029,
  ('ENSEMBL', 'stop_codon'): 4034,
  ('ENSEMBL', 'UTR'): 

In [10]:
from pprint import pprint
from BCBio import GFF

current_gene = None
last_index = None
introns_lst = []
with open(in_file) as handle:
    for rec in GFF.parse(handle, target_lines=100):
        for feature in rec.features:
            if feature.type == "gene":
                print(feature)
                pprint(vars(feature))

                # Access gene attributes using feature qualifiers
                gene_id = feature.qualifiers.get("gene_id", [""])[0]
                gene_name = feature.qualifiers.get("gene_name", [""])[0]
                gene_type = feature.qualifiers.get("gene_type", [""])[0]
                gene_start = feature.location.start.position
                gene_end = feature.location.end.position

                current_gene = gene_id
                # last_index = gene_end
                # Do something with the gene information
                # print(feature.)
                print("Gene ID:", gene_id)
                print("Gene Name:", gene_name)
                print("Gene Type:", gene_type)
                print("Gene Start:", gene_start)
                print("Gene Ends:", gene_end)
                print("-------------------------")
                # print("SUB", GFF.(feature))

            if feature.type == "exon":
                if last_index:
                    intron_dict = {
                        "gene_id": gene_id,
                        "intron_start": last_index+1,
                        "intron_ends": gene_start-1,
                    }
                    introns_lst.append(intron_dict)
                last_index = gene_end
                print("last_index", last_index)


        break

introns_lst

type: gene
location: [11868:14409](+)
qualifiers:
    Key: gene_id, Value: ['ENSG00000290825.1']
    Key: gene_name, Value: ['DDX11L2']
    Key: gene_type, Value: ['lncRNA']
    Key: level, Value: ['2']
    Key: source, Value: ['HAVANA']
    Key: tag, Value: ['overlaps_pseudogene']

{'id': '',
 'location': SimpleLocation(ExactPosition(11868), ExactPosition(14409), strand=1),
 'qualifiers': {'gene_id': ['ENSG00000290825.1'],
                'gene_name': ['DDX11L2'],
                'gene_type': ['lncRNA'],
                'level': ['2'],
                'source': ['HAVANA'],
                'tag': ['overlaps_pseudogene']},
 'sub_features': [],
 'type': 'gene'}
Gene ID: ENSG00000290825.1
Gene Name: DDX11L2
Gene Type: lncRNA
Gene Start: 11868
Gene Ends: 14409
-------------------------
type: gene
location: [12009:13670](+)
qualifiers:
    Key: gene_id, Value: ['ENSG00000223972.6']
    Key: gene_name, Value: ['DDX11L1']
    Key: gene_type, Value: ['transcribed_unprocessed_pseudogene']
    K

[]

In [82]:
from pprint import pprint
from BCBio import GFF

limit_info = {"gff_type": ["transcript", "exon"]}
current_gene = None
last_index = None
introns_lst = []
with open(in_file) as handle:
    for rec in GFF.parse(handle, limit_info=limit_info, target_lines=100):
        for feature in rec.features:
            # pprint(vars(feature))
            loop = feature.sub_features if feature.strand == 1 else feature.sub_features[::-1]
            for sub_features in loop:
                # print("sub_features:::\n", sub_features)
                if sub_features.type == "transcript":
                    intron_id = 0
                    ene_id = feature.qualifiers.get("transcript_id", [""])[0]
                    transcript_name = feature.qualifiers.get("transcript_name", [""])[0]
                    # transcript_type = feature.qualifiers.get("transcript_type", [""])[0]
                    transcript_start = feature.location.start.position
                    transcript_end = feature.location.end.position
                    last_index = transcript_start
                    previous_exon_name = "Start-" + transcript_name

                if sub_features.type == "exon":
                    # pprint(vars(sub_features))
                    if last_index != sub_features.location.start.position:
                        intron_dict = {
                            "gene_id": sub_features.qualifiers["gene_id"][0],
                            "transcript_id": sub_features.qualifiers["transcript_id"][0],
                            "intron_id": intron_id,
                            "intron_start": last_index,
                            "intron_ends": sub_features.location.start.position,
                            "prev_exon_id": previous_exon_name,
                            "next_exon_id": sub_features.qualifiers["exon_id"][0]
                        }
                        introns_lst.append(intron_dict)
                        intron_id += 1

                    last_index = sub_features.location.end.position
                    previous_exon_name = sub_features.qualifiers["exon_id"][0]
introns_lst

[{'gene_id': 'ENSG00000290825.1',
  'transcript_id': 'ENST00000456328.2',
  'intron_id': 0,
  'intron_start': 12227,
  'intron_ends': 12612,
  'prev_exon_id': 'ENSE00002234944.1',
  'next_exon_id': 'ENSE00003582793.1'},
 {'gene_id': 'ENSG00000290825.1',
  'transcript_id': 'ENST00000456328.2',
  'intron_id': 1,
  'intron_start': 12721,
  'intron_ends': 13220,
  'prev_exon_id': 'ENSE00003582793.1',
  'next_exon_id': 'ENSE00002312635.1'},
 {'gene_id': 'ENSG00000223972.6',
  'transcript_id': 'ENST00000450305.2',
  'intron_id': 0,
  'intron_start': 12057,
  'intron_ends': 12178,
  'prev_exon_id': 'ENSE00001948541.1',
  'next_exon_id': 'ENSE00001671638.2'},
 {'gene_id': 'ENSG00000223972.6',
  'transcript_id': 'ENST00000450305.2',
  'intron_id': 1,
  'intron_start': 12227,
  'intron_ends': 12612,
  'prev_exon_id': 'ENSE00001671638.2',
  'next_exon_id': 'ENSE00001758273.2'},
 {'gene_id': 'ENSG00000223972.6',
  'transcript_id': 'ENST00000450305.2',
  'intron_id': 2,
  'intron_start': 12697,
  '

In [76]:
len(introns_lst)

1511324

In [85]:
df_introns = pd.DataFrame(introns_lst.copy())
df_introns.to_csv("gencode_v44_introns.csv", index=False)
df_introns

Unnamed: 0,gene_id,transcript_id,intron_id,intron_start,intron_ends,prev_exon_id,next_exon_id
0,ENSG00000290825.1,ENST00000456328.2,0,12227,12612,ENSE00002234944.1,ENSE00003582793.1
1,ENSG00000290825.1,ENST00000456328.2,1,12721,13220,ENSE00003582793.1,ENSE00002312635.1
2,ENSG00000223972.6,ENST00000450305.2,0,12057,12178,ENSE00001948541.1,ENSE00001671638.2
3,ENSG00000223972.6,ENST00000450305.2,1,12227,12612,ENSE00001671638.2,ENSE00001758273.2
4,ENSG00000223972.6,ENST00000450305.2,2,12697,12974,ENSE00001758273.2,ENSE00001799933.2
...,...,...,...,...,...,...,...
1511319,ENSG00000292371.1,ENST00000711270.1,0,15955,57212183,Start-,ENSE00004015127.1
1511320,ENSG00000292371.1,ENST00000711270.1,1,57213125,57213203,ENSE00004015127.1,ENSE00004015126.1
1511321,ENSG00000292371.1,ENST00000711270.1,2,57213357,57213525,ENSE00004015126.1,ENSE00004015125.1
1511322,ENSG00000292371.1,ENST00000711270.1,3,57213602,57213879,ENSE00004015125.1,ENSE00004015124.1


In [78]:
df_introns_1_based = df_introns.copy()
df_introns_1_based.intron_start = df_introns_1_based.intron_start + 1
df_introns_1_based

Unnamed: 0,gene_id,transcript_id,intron_id,intron_start,intron_ends,prev_exon_id,next_exon_id
0,[ENSG00000290825.1],[ENST00000456328.2],0,12228,12612,ENSE00002234944.1,ENSE00003582793.1
1,[ENSG00000290825.1],[ENST00000456328.2],1,12722,13220,ENSE00003582793.1,ENSE00002312635.1
2,[ENSG00000223972.6],[ENST00000450305.2],0,12058,12178,ENSE00001948541.1,ENSE00001671638.2
3,[ENSG00000223972.6],[ENST00000450305.2],1,12228,12612,ENSE00001671638.2,ENSE00001758273.2
4,[ENSG00000223972.6],[ENST00000450305.2],2,12698,12974,ENSE00001758273.2,ENSE00001799933.2
...,...,...,...,...,...,...,...
1511319,[ENSG00000292371.1],[ENST00000711270.1],0,15956,57212183,Start-,ENSE00004015127.1
1511320,[ENSG00000292371.1],[ENST00000711270.1],1,57213126,57213203,ENSE00004015127.1,ENSE00004015126.1
1511321,[ENSG00000292371.1],[ENST00000711270.1],2,57213358,57213525,ENSE00004015126.1,ENSE00004015125.1
1511322,[ENSG00000292371.1],[ENST00000711270.1],3,57213603,57213879,ENSE00004015125.1,ENSE00004015124.1


In [79]:
df_introns_1_based[df_introns_1_based.intron_start == 962048]

Unnamed: 0,gene_id,transcript_id,intron_id,intron_start,intron_ends,prev_exon_id,next_exon_id
766,[ENSG00000187961.15],[ENST00000338591.8],3,962048,962354,ENSE00003185989.1,ENSE00001375296.1


In [87]:
df_introns_1_based[df_introns_1_based.intron_start == 155357520]

Unnamed: 0,gene_id,transcript_id,intron_id,intron_start,intron_ends,prev_exon_id,next_exon_id
413070,[ENSG00000164114.19],[ENST00000311277.9],7,155357520,155360167,ENSE00001081579.1,ENSE00001081582.1
413084,[ENSG00000164114.19],[ENST00000650955.1],7,155357520,155360167,ENSE00001081579.1,ENSE00001081582.1
413098,[ENSG00000164114.19],[ENST00000515654.5],7,155357520,155360167,ENSE00001081579.1,ENSE00002024875.1
413112,[ENSG00000164114.19],[ENST00000424373.5],5,155357520,155360167,ENSE00001081579.1,ENSE00001081582.1
413123,[ENSG00000164114.19],[ENST00000433024.5],4,155357520,155360167,ENSE00001081579.1,ENSE00001081582.1
