In [1]:
import pandas as pd
from BCBio.GFF import GFFExaminer
from BCBio import GFF

In [2]:
# bcbio-gff examiner instance (not referenced by the cells visible below).
examiner = GFFExaminer()
# Annotation file that the following cells parse for transcripts and exons.
in_file = "./ftp-data/gencode.v44.annotation.gtf"

In [3]:
# Collect one record per intron, i.e. per gap between two consecutive exons of
# the same parent feature, from the annotation file.
#
# Each record holds: chr, gene_id, transcript_id, start, end, strand,
# prev_exon_id, next_exon_id.

# Only "transcript" and "exon" rows are requested from the parser.
limit_info = {"gff_type": ["transcript", "exon"]}
introns_lst = []

with open(in_file) as handle:
    for rec in GFF.parse(handle, limit_info=limit_info, target_lines=100):
        for feature in rec.features:
            # Features with strand == 1 are walked in stored order, everything
            # else in reverse, so that `start` < `end` for the emitted introns.
            # NOTE(review): presumably minus-strand exons are listed in
            # descending coordinate order in the file -- confirm.
            loop = feature.sub_features if feature.strand == 1 else feature.sub_features[::-1]

            # Per-feature state: reset here so nothing leaks from the previous
            # transcript (None means "no exon seen yet in this feature").
            last_index = None
            previous_exon_name = None

            for sub_feature in loop:
                if sub_feature.type != "exon":
                    continue

                exon_name = sub_feature.qualifiers["exon_id"][0]
                if last_index is not None:
                    # The intron spans from the end of the previously visited
                    # exon to the start of the current one.
                    introns_lst.append(
                        {
                            "chr": rec.id,
                            "gene_id": sub_feature.qualifiers["gene_id"][0],
                            "transcript_id": sub_feature.qualifiers["transcript_id"][0],
                            "start": last_index,
                            "end": sub_feature.location.start.position,
                            "strand": "+" if feature.strand == 1 else "-",
                            "prev_exon_id": previous_exon_name,
                            "next_exon_id": exon_name,
                        }
                    )

                last_index = sub_feature.location.end.position
                previous_exon_name = exon_name

# Show a small preview only; the full list is far too large to render.
introns_lst[:5]

[{'chr': 'chr1',
  'gene_id': 'ENSG00000290825.1',
  'transcript_id': 'ENST00000456328.2',
  'start': 12227,
  'end': 12612,
  'strand': '+',
  'prev_exon_id': 'ENSE00002234944.1',
  'next_exon_id': 'ENSE00003582793.1'},
 {'chr': 'chr1',
  'gene_id': 'ENSG00000290825.1',
  'transcript_id': 'ENST00000456328.2',
  'start': 12721,
  'end': 13220,
  'strand': '+',
  'prev_exon_id': 'ENSE00003582793.1',
  'next_exon_id': 'ENSE00002312635.1'},
 {'chr': 'chr1',
  'gene_id': 'ENSG00000223972.6',
  'transcript_id': 'ENST00000450305.2',
  'start': 12057,
  'end': 12178,
  'strand': '+',
  'prev_exon_id': 'ENSE00001948541.1',
  'next_exon_id': 'ENSE00001671638.2'},
 {'chr': 'chr1',
  'gene_id': 'ENSG00000223972.6',
  'transcript_id': 'ENST00000450305.2',
  'start': 12227,
  'end': 12612,
  'strand': '+',
  'prev_exon_id': 'ENSE00001671638.2',
  'next_exon_id': 'ENSE00001758273.2'},
 {'chr': 'chr1',
  'gene_id': 'ENSG00000223972.6',
  'transcript_id': 'ENST00000450305.2',
  'start': 12697,
  'end'

In [4]:
# Build the intron table from a shallow copy of the collected records.
df_introns = pd.DataFrame(data=list(introns_lst))
df_introns

Unnamed: 0,chr,gene_id,transcript_id,start,end,strand,prev_exon_id,next_exon_id
0,chr1,ENSG00000290825.1,ENST00000456328.2,12227,12612,+,ENSE00002234944.1,ENSE00003582793.1
1,chr1,ENSG00000290825.1,ENST00000456328.2,12721,13220,+,ENSE00003582793.1,ENSE00002312635.1
2,chr1,ENSG00000223972.6,ENST00000450305.2,12057,12178,+,ENSE00001948541.1,ENSE00001671638.2
3,chr1,ENSG00000223972.6,ENST00000450305.2,12227,12612,+,ENSE00001671638.2,ENSE00001758273.2
4,chr1,ENSG00000223972.6,ENST00000450305.2,12697,12974,+,ENSE00001758273.2,ENSE00001799933.2
...,...,...,...,...,...,...,...,...
1396636,chrY,ENSG00000292372.1,ENST00000711285.1,57211569,57211760,+,ENSE00004015174.1,ENSE00004015172.1
1396637,chrY,ENSG00000292371.1,ENST00000711270.1,57213125,57213203,-,ENSE00004015127.1,ENSE00004015126.1
1396638,chrY,ENSG00000292371.1,ENST00000711270.1,57213357,57213525,-,ENSE00004015126.1,ENSE00004015125.1
1396639,chrY,ENSG00000292371.1,ENST00000711270.1,57213602,57213879,-,ENSE00004015125.1,ENSE00004015124.1


In [5]:
# Write the complete intron table as a tab-separated file (no index column)
# so that later steps can read it.
df_introns.to_csv(
    path_or_buf="data/All_introns_list.tsv",
    sep="\t",
    index=False,
)

In [6]:
# Keep a single row per unique (chr, start, end, strand) interval; the first
# occurrence is retained (pandas default).
# The result is re-bound rather than modified with inplace=True, so the frame
# object displayed/exported by earlier cells is not mutated behind their back.
#
# Optional: uncomment to record how many rows share each interval before
# de-duplicating.
# df_introns['Duplicate_Count'] = df_introns.groupby(["chr", "start", "end", "strand"]).transform('size')
df_introns = df_introns.drop_duplicates(subset=["chr", "start", "end", "strand"])
df_introns

Unnamed: 0,chr,gene_id,transcript_id,start,end,strand,prev_exon_id,next_exon_id
0,chr1,ENSG00000290825.1,ENST00000456328.2,12227,12612,+,ENSE00002234944.1,ENSE00003582793.1
1,chr1,ENSG00000290825.1,ENST00000456328.2,12721,13220,+,ENSE00003582793.1,ENSE00002312635.1
2,chr1,ENSG00000223972.6,ENST00000450305.2,12057,12178,+,ENSE00001948541.1,ENSE00001671638.2
4,chr1,ENSG00000223972.6,ENST00000450305.2,12697,12974,+,ENSE00001758273.2,ENSE00001799933.2
5,chr1,ENSG00000223972.6,ENST00000450305.2,13052,13220,+,ENSE00001799933.2,ENSE00001746346.2
...,...,...,...,...,...,...,...,...
1396636,chrY,ENSG00000292372.1,ENST00000711285.1,57211569,57211760,+,ENSE00004015174.1,ENSE00004015172.1
1396637,chrY,ENSG00000292371.1,ENST00000711270.1,57213125,57213203,-,ENSE00004015127.1,ENSE00004015126.1
1396638,chrY,ENSG00000292371.1,ENST00000711270.1,57213357,57213525,-,ENSE00004015126.1,ENSE00004015125.1
1396639,chrY,ENSG00000292371.1,ENST00000711270.1,57213602,57213879,-,ENSE00004015125.1,ENSE00004015124.1


In [8]:
# Reduce the table to the bare interval columns by removing the gene,
# transcript and exon identifiers.
# errors="ignore" keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop() would raise KeyError.
df_introns = df_introns.drop(
    columns=["gene_id", "transcript_id", "prev_exon_id", "next_exon_id"],
    errors="ignore",
)
df_introns

Unnamed: 0,chr,start,end,strand
0,chr1,12227,12612,+
1,chr1,12721,13220,+
2,chr1,12057,12178,+
4,chr1,12697,12974,+
5,chr1,13052,13220,+
...,...,...,...,...
1396636,chrY,57211569,57211760,+
1396637,chrY,57213125,57213203,-
1396638,chrY,57213357,57213525,-
1396639,chrY,57213602,57213879,-


In [9]:
# Save the current intron table as a tab-separated file without the index.
df_introns.to_csv(
    path_or_buf="data/1-rmv_dup_introns_gencode_v44.tsv",
    sep="\t",
    index=False,
)

# Experimental code:

In [None]:
# Rebuild the intron table from a shallow copy of the raw record list.
df_introns = pd.DataFrame(data=introns_lst[:])
df_introns

In [None]:
# Uncomment the next line to rebuild the table from the raw records first.
# df_introns = pd.DataFrame(introns_lst.copy())

# Write the table as a comma-separated file without the index, then show it.
df_introns.to_csv(path_or_buf="data/gencode_v44_introns.csv", index=False)
df_introns

In [None]:
# Derive a copy of the intron table whose start coordinate is shifted by +1.
# The source frame has no `intron_start` column (its coordinate columns are
# `start` and `end`), so `start` is renamed to `intron_start` first; the
# original attribute access raised AttributeError.
# NOTE(review): assumes the parser reports 0-based starts, so +1 yields
# 1-based coordinates -- confirm against the GTF.
df_introns_1_based = (
    df_introns
    .rename(columns={"start": "intron_start"})
    .assign(intron_start=lambda frame: frame["intron_start"] + 1)
)
df_introns_1_based

In [None]:
# Rows whose shifted start coordinate equals 962048.
df_introns_1_based.loc[df_introns_1_based.intron_start.eq(962048)]

In [None]:
# Rows whose shifted start coordinate equals 155357520.
df_introns_1_based.loc[df_introns_1_based.intron_start.eq(155357520)]

In [None]:
# Rows whose shifted start coordinate equals 155759144.
df_introns_1_based.loc[df_introns_1_based.intron_start.eq(155759144)]