In [1]:
import pandas as pd
from BCBio.GFF import GFFExaminer
from BCBio import GFF

In [2]:
in_file = "./ftp-data/gencode.v44.annotation.gtf"
examiner = GFFExaminer()

In [None]:
limit_info = {"gff_type": ["transcript", "exon"]}
current_gene = None
last_index = None
introns_lst = []
with open(in_file) as handle:
    for rec in GFF.parse(handle, limit_info=limit_info, target_lines=100):
        # print(rec.id)
        for feature in rec.features:
            # print(feature)
            # pprint(vars(feature))
            loop = feature.sub_features if feature.strand == 1 else feature.sub_features[::-1]
            for sub_features in loop:
                # print("sub_features:::\n", sub_features)
                if sub_features.type == "transcript":
                    intron_id = 0
                    ene_id = feature.qualifiers.get("transcript_id", [""])[0]
                    transcript_name = feature.qualifiers.get("transcript_name", [""])[0]
                    # transcript_type = feature.qualifiers.get("transcript_type", [""])[0]
                    transcript_start = feature.location.start.position
                    transcript_end = feature.location.end.position
                    last_index = transcript_start
                    previous_exon_name = "Start-" + transcript_name

                if sub_features.type == "exon":
                    # pprint(vars(sub_features))
                    if last_index != sub_features.location.start.position:
                        intron_dict = {
                            "chr": rec.id,
                            "gene_id": sub_features.qualifiers["gene_id"][0],
                            "transcript_id": sub_features.qualifiers["transcript_id"][0],
                            "intron_id": intron_id,
                            "start": last_index,
                            "end": sub_features.location.start.position,
                            "strand": "+" if feature.strand == 1 else "-",
                            "prev_exon_id": previous_exon_name,
                            "next_exon_id": sub_features.qualifiers["exon_id"][0]
                        }
                        introns_lst.append(intron_dict)
                        intron_id += 1

                    last_index = sub_features.location.end.position
                    previous_exon_name = sub_features.qualifiers["exon_id"][0]

introns_lst

In [None]:
df_introns = pd.DataFrame(introns_lst.copy())
df_introns

In [9]:
# Export it to tsv for next steps
df_introns.to_csv("data/All_introns_list.tsv", sep="\t")

In [12]:
df_introns['Duplicate_Count'] = df_introns.groupby(["chr", "start", "end", "strand"]).transform('size')
df_introns.drop_duplicates(subset=["chr", "start", "end", "strand"], inplace=True)
df_introns

Unnamed: 0,chr,gene_id,transcript_id,intron_id,start,end,strand,prev_exon_id,next_exon_id,Duplicate_Count
0,chr1,ENSG00000290825.1,ENST00000456328.2,0,12227,12612,+,ENSE00002234944.1,ENSE00003582793.1,2
1,chr1,ENSG00000290825.1,ENST00000456328.2,1,12721,13220,+,ENSE00003582793.1,ENSE00002312635.1,1
2,chr1,ENSG00000223972.6,ENST00000450305.2,0,12057,12178,+,ENSE00001948541.1,ENSE00001671638.2,1
4,chr1,ENSG00000223972.6,ENST00000450305.2,2,12697,12974,+,ENSE00001758273.2,ENSE00001799933.2,1
5,chr1,ENSG00000223972.6,ENST00000450305.2,3,13052,13220,+,ENSE00001799933.2,ENSE00001746346.2,1
...,...,...,...,...,...,...,...,...,...,...
1511319,chrY,ENSG00000292371.1,ENST00000711270.1,0,15955,57212183,-,Start-,ENSE00004015127.1,1
1511320,chrY,ENSG00000292371.1,ENST00000711270.1,1,57213125,57213203,-,ENSE00004015127.1,ENSE00004015126.1,1
1511321,chrY,ENSG00000292371.1,ENST00000711270.1,2,57213357,57213525,-,ENSE00004015126.1,ENSE00004015125.1,1
1511322,chrY,ENSG00000292371.1,ENST00000711270.1,3,57213602,57213879,-,ENSE00004015125.1,ENSE00004015124.1,1


In [13]:
df_introns = df_introns.drop(columns=["gene_id", "transcript_id", "intron_id", "prev_exon_id", "next_exon_id"])
df_introns

Unnamed: 0,chr,start,end,strand,Duplicate_Count
0,chr1,12227,12612,+,2
1,chr1,12721,13220,+,1
2,chr1,12057,12178,+,1
4,chr1,12697,12974,+,1
5,chr1,13052,13220,+,1
...,...,...,...,...,...
1511319,chrY,15955,57212183,-,1
1511320,chrY,57213125,57213203,-,1
1511321,chrY,57213357,57213525,-,1
1511322,chrY,57213602,57213879,-,1


In [14]:
df_introns.to_csv("data/1-rmv_dup_introns_gencode_v44.tsv", sep="\t")

# Experimental code:

In [None]:
df_introns = pd.DataFrame(introns_lst.copy())
df_introns

In [None]:
# df_introns = pd.DataFrame(introns_lst.copy())
df_introns.to_csv("data/gencode_v44_introns.csv", index=False)
df_introns

In [None]:
df_introns_1_based = df_introns.copy()
df_introns_1_based.intron_start = df_introns_1_based.intron_start + 1
df_introns_1_based

In [None]:
df_introns_1_based[df_introns_1_based.intron_start == 962048]

In [None]:
df_introns_1_based[df_introns_1_based.intron_start == 155357520]

In [None]:
df_introns_1_based[df_introns_1_based.intron_start == 155759144]