# Table of contents
1. Load Modules
2. Parse GFF file
3. Extract coding genes
4. Extract CDS and intron of coding genes
5. Extract metadata of CDS and intron of coding genes
6. Merge metadata of coding genes and CDS and intron of coding genes
7. Save metadata of coding genes


# 1. Load Modules

In [6]:
import numpy as np
import pandas as pd
import re
from pybedtools import BedTool

# 2. Parse GFF file

In [23]:
def extract_transcript_id(row):
    attribute = row["Attribute"]
    ID_pattern = re.compile(r"ID=(\S+?)(?:|(:\S+));")
    Parent_pattern = re.compile(r"Parent=(\S+?)$")
    if row["Feature"] in ["mRNA", "tRNA", "rRNA", "snoRNA", "snRNA", "lncRNA"]:
        return re.search(ID_pattern, attribute).group(1)
    elif "Parent" in attribute:
        Parent = re.search(Parent_pattern, attribute).group(1)
        return Parent
    else:
        return np.nan

def parse_gff_file(gff_file):
    GFF = pd.read_csv(
        gff_file,
        sep="\t",
        comment="#",
        names=[
            "Chr",
            "Source",
            "Feature",
            "Start",
            "End",
            "Score",
            "Strand",
            "Frame",
            "Attribute",
        ],
    )

    extract_systematic_ID_pattern = re.compile(
        r"ID=(\S+?)(?:$|(?:|\.\d(?::\S+|));)")
    GFF["Systematic ID"] = GFF["Attribute"].str.extract(
        extract_systematic_ID_pattern, expand=False
    )
    GFF["Transcript"] = GFF.apply(extract_transcript_id, axis=1)

    chr_order = ["chr_II_telomeric_gap", "I", "II", "III", "mating_type_region", "mitochondrial"]
    GFF["Chr"] = pd.Categorical(GFF["Chr"], categories=chr_order, ordered=True)

    return GFF

In [18]:
gff_file = "/data/c/yangyusheng_optimized/DIT_HAP_pipeline/resources/pombase_data/2025-05-01/gff/Schizosaccharomyces_pombe_all_chromosomes.gff3"
gff = parse_gff_file(gff_file)
fai_file = "/data/c/yangyusheng_optimized/DIT_HAP_pipeline/resources/pombase_data/2025-05-01/fasta/Schizosaccharomyces_pombe_all_chromosomes.fa.fai"


In [9]:
PeptideStats = pd.read_csv("/data/c/yangyusheng_optimized/DIT_HAP_pipeline/resources/pombase_data/2025-05-01/Protein_features/PeptideStats.tsv", sep="\t")
primary_peptide_length = dict(zip(PeptideStats["Systematic_ID"], PeptideStats["Residues"]))

In [10]:
gff["Feature"].value_counts()

Feature
gene                                       12685
CDS                                        10321
exon                                        7574
lncRNA                                      7166
intron                                      5337
mRNA                                        5145
three_prime_UTR                             4798
five_prime_UTR                              4635
long_terminal_repeat                         238
tRNA                                         196
snoRNA                                        66
promoter                                      62
low_complexity_region                         59
rRNA                                          49
repeat_region                                 44
sncRNA                                        38
pseudogenic_transcript                        31
region                                        28
nuclear_mt_pseudogene                         17
dh_repeat                                     16
LTR_retrotra

In [11]:
for type in ["mRNA", "tRNA", "rRNA", "snoRNA", "snRNA", "lncRNA"]:
    genes = gff.query(f"Feature == '{type}'")["Systematic ID"].unique().tolist()
    type_counts = gff.query("`Systematic ID` in @genes")["Feature"].value_counts()
    print(type)
    display(type_counts)

mRNA


Feature
CDS                10249
intron              5244
mRNA                5145
gene                5134
three_prime_UTR     4796
five_prime_UTR      4633
Name: count, dtype: int64

tRNA


Feature
exon      240
gene      196
tRNA      196
intron     44
Name: count, dtype: int64

rRNA


Feature
gene    49
rRNA    49
exon    49
Name: count, dtype: int64

snoRNA


Feature
exon      68
gene      66
snoRNA    66
intron     2
Name: count, dtype: int64

snRNA


Feature
exon      8
gene      7
snRNA     7
intron    1
Name: count, dtype: int64

lncRNA


Feature
exon      7171
lncRNA    7166
gene      7164
intron       5
Name: count, dtype: int64

In [12]:
coding_genes = gff.query("Feature == 'mRNA'")["Systematic ID"].unique().tolist()
tRNAs = gff.query("Feature == 'tRNA'")["Systematic ID"].unique().tolist()
rRNAs = gff.query("Feature == 'rRNA'")["Systematic ID"].unique().tolist()
snoRNAs = gff.query("Feature == 'snoRNA'")["Systematic ID"].unique().tolist()
snRNAs = gff.query("Feature == 'snRNA'")["Systematic ID"].unique().tolist()
lncRNAs = gff.query("Feature == 'lncRNA'")["Systematic ID"].unique().tolist()

In [51]:
def cal_accumlated_CDS_bases(sub_df):

    sub_df_sorted = sub_df.sort_values(["Start"]).copy()
    accumulated_CDS_bases = 0
    strand = sub_df_sorted["Strand"].iloc[0]
    if strand == "+":
        index_order = sub_df_sorted.index
    else:
        index_order = sub_df_sorted.index[::-1]
    for idx in index_order:
        if sub_df_sorted.loc[idx, "Feature"] == "CDS":
            sub_df_sorted.loc[idx,
                              "Accumulated_CDS_bases"] = accumulated_CDS_bases
            accumulated_CDS_bases += sub_df_sorted.loc[idx, "Length"]
        else:
            sub_df_sorted.loc[idx,
                              "Accumulated_CDS_bases"] = accumulated_CDS_bases

    return (sub_df_sorted)

In [52]:
def gff_to_bed(sub_df, type, primary_peptide_length=None):

    bed = sub_df.copy()
    # switch 1-based to 0-based
    bed["Start"] = bed["Start"] - 1

    bed["Length"] = (
        bed["End"] - bed["Start"]
    )

    bed_columns = ["Chr", "Start", "End", "Transcript", "Length", "Strand"]
    other_columns = ["Feature", "Systematic ID", "Type"]

    if type == "Coding gene":
        feature_for_boundary = "CDS"
        CDS_length = bed.query("Feature == @feature_for_boundary")["Length"].sum()
        bed = cal_accumlated_CDS_bases(bed)
        if int(CDS_length//3 -1) == primary_peptide_length[bed["Systematic ID"].iloc[0]]:
            bed["Primary_transcript"] = "Yes"
        else:
            bed["Primary_transcript"] = "No"

        other_columns.extend(["Primary_transcript", "Accumulated_CDS_bases"])
    else:
        feature_for_boundary = "exon"

    boundary_left = bed.query("Feature == @feature_for_boundary")["Start"].min()
    boundary_right = bed.query("Feature == @feature_for_boundary")["End"].max()

    filtered_bed = bed.query("Start >= @boundary_left and End <= @boundary_right").copy()

    filtered_bed.insert(4, "Type", type)

    
    
    filtered_bed = filtered_bed[bed_columns + other_columns].rename(
                columns={"Chr": "#Chr"}
            ).sort_values(["#Chr"] + bed_columns[1:])

    return filtered_bed


coding_bed = gff.query("`Systematic ID` in @coding_genes and (Feature == 'CDS' or Feature == 'intron')").groupby(["Systematic ID", "Transcript"], as_index=False).apply(gff_to_bed, type="Coding gene", primary_peptide_length=primary_peptide_length).reset_index(drop=True)

In [53]:
same_CDS_different_UTR_genes = coding_bed.query("Primary_transcript == 'Yes'").groupby("Systematic ID")["Transcript"].apply(lambda x: len(x.unique())).to_frame().query("Transcript > 1")

In [54]:
# only keep the first transcript
coding_bed["Primary_transcript"] = coding_bed.apply(lambda row: row["Primary_transcript"] if row["Systematic ID"] not in same_CDS_different_UTR_genes.index else ("Yes" if row["Transcript"].endswith(".1") else "No"), axis=1)

In [72]:
coding_bed["Primary_transcript"].value_counts()

Primary_transcript
Yes    15254
No        79
Name: count, dtype: int64

In [55]:
primary_transcript_bed = coding_bed.query("Primary_transcript == 'Yes'")

In [56]:
chr_order = ["chr_II_telomeric_gap", "I", "II", "III", "mating_type_region", "mitochondrial"]
primary_transcript_bed["#Chr"] = pd.Categorical(primary_transcript_bed["#Chr"], categories=chr_order, ordered=True)
primary_transcript_bed = primary_transcript_bed.sort_values(["#Chr", "Start", "End", "Transcript", "Length", "Strand"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_transcript_bed["#Chr"] = pd.Categorical(primary_transcript_bed["#Chr"], categories=chr_order, ordered=True)


In [70]:
primary_transcript_bed.to_csv("./test_transcript.csv", index=False)

In [57]:
primary_transcript_bedtool = BedTool.from_dataframe(primary_transcript_bed)

In [58]:
intergenic_bed = primary_transcript_bedtool.complement(g=fai_file).to_dataframe()


In [59]:
intergenic_bed = intergenic_bed.rename(columns={"chrom": "#Chr", "start": "Start", "end": "End"})

In [63]:
intergenic_bed

Unnamed: 0,#Chr,Start,End,Transcript,Systematic ID,Strand,Length,Feature,Type
0,chr_II_telomeric_gap,0,1478,Boundary|SPBC460.01c.1,Boundary|SPBC460.01c,Boundary|-,1478,Intergenic region,Intergenic region
1,chr_II_telomeric_gap,3197,8855,SPBC460.01c.1|SPBC460.02c.1,SPBC460.01c|SPBC460.02c,-|-,5658,Intergenic region,Intergenic region
2,chr_II_telomeric_gap,9803,11640,SPBC460.02c.1|SPBC460.03.1,SPBC460.02c|SPBC460.03,-|+,1837,Intergenic region,Intergenic region
3,chr_II_telomeric_gap,13344,14072,SPBC460.03.1|SPBC460.04c.1,SPBC460.03|SPBC460.04c,+|-,728,Intergenic region,Intergenic region
4,chr_II_telomeric_gap,15248,16469,SPBC460.04c.1|SPBC460.05.1,SPBC460.04c|SPBC460.05,-|+,1221,Intergenic region,Intergenic region
...,...,...,...,...,...,...,...,...,...
5101,mitochondrial,15531,15955,SPMIT.07.1|SPMIT.08.1,SPMIT.07|SPMIT.08,+|+,424,Intergenic region,Intergenic region
5102,mitochondrial,16639,16915,SPMIT.08.1|SPMIT.09.1,SPMIT.08|SPMIT.09,+|+,276,Intergenic region,Intergenic region
5103,mitochondrial,17062,17807,SPMIT.09.1|SPMIT.10.1,SPMIT.09|SPMIT.10,+|+,745,Intergenic region,Intergenic region
5104,mitochondrial,18032,18562,SPMIT.10.1|SPMIT.11.1,SPMIT.10|SPMIT.11,+|+,530,Intergenic region,Intergenic region


In [60]:
def annotate_intergenic_bed(row, primary_transcript_bed):
    chrom = row["#Chr"]
    start = row["Start"]
    end = row["End"]

    left_flank = primary_transcript_bed[(primary_transcript_bed["#Chr"]==chrom) & (primary_transcript_bed["End"]==start)]
    right_flank = primary_transcript_bed[(primary_transcript_bed["#Chr"]==chrom) & (primary_transcript_bed["Start"]==end)]

    if left_flank.empty:
        left_flank_transcript = "Boundary"
        left_flank_systematic_id = "Boundary"
        left_flank_strand = "Boundary"
        print(f"No left flank for {chrom}:{start}-{end}")
    elif left_flank.shape[0] > 1:
        left_flank_transcript = "multiple"
        left_flank_systematic_id = "multiple"
        left_flank_strand = "multiple"
        print(f"Multiple left flank for {chrom}:{start}-{end}")
        display(left_flank)
    else:
        left_flank_transcript = left_flank["Transcript"].iloc[0]
        left_flank_systematic_id = left_flank["Systematic ID"].iloc[0]
        left_flank_strand = left_flank["Strand"].iloc[0]

    if right_flank.empty:
        right_flank_transcript = "Boundary"
        right_flank_systematic_id = "Boundary"
        right_flank_strand = "Boundary"
        print(f"No right flank for {chrom}:{start}-{end}")
    elif right_flank.shape[0] > 1:
        right_flank_transcript = "multiple"
        right_flank_systematic_id = "multiple"
        right_flank_strand = "multiple"
        print(f"Multiple right flank for {chrom}:{start}-{end}")
        display(right_flank)
    else:
        right_flank_transcript = right_flank["Transcript"].iloc[0]
        right_flank_systematic_id = right_flank["Systematic ID"].iloc[0]
        right_flank_strand = right_flank["Strand"].iloc[0]

    flank = {
        "Transcript": left_flank_transcript + "|" + right_flank_transcript,
        "Systematic ID": left_flank_systematic_id + "|" + right_flank_systematic_id,
        "Strand": left_flank_strand + "|" + right_flank_strand
    }

    return pd.Series(flank)

intergenic_bed[["Transcript", "Systematic ID", "Strand"]] = intergenic_bed.apply(lambda row: annotate_intergenic_bed(row, primary_transcript_bed), axis=1)

No left flank for chr_II_telomeric_gap:0-1478
No right flank for chr_II_telomeric_gap:18062-20000
No left flank for II:0-303
No right flank for II:4532644-4539804
No left flank for III:0-28679
No right flank for III:2435662-2452883
No left flank for mating_type_region:0-3353
No right flank for mating_type_region:4368-20128
No left flank for mitochondrial:0-4884
No right flank for mitochondrial:19309-19433


In [61]:
intergenic_bed["Length"] = intergenic_bed["End"] - intergenic_bed["Start"]
intergenic_bed["Feature"] = "Intergenic region"
intergenic_bed["Type"] = "Intergenic region"


In [62]:
intergenic_bed

Unnamed: 0,#Chr,Start,End,Transcript,Systematic ID,Strand,Length,Feature,Type
0,chr_II_telomeric_gap,0,1478,Boundary|SPBC460.01c.1,Boundary|SPBC460.01c,Boundary|-,1478,Intergenic region,Intergenic region
1,chr_II_telomeric_gap,3197,8855,SPBC460.01c.1|SPBC460.02c.1,SPBC460.01c|SPBC460.02c,-|-,5658,Intergenic region,Intergenic region
2,chr_II_telomeric_gap,9803,11640,SPBC460.02c.1|SPBC460.03.1,SPBC460.02c|SPBC460.03,-|+,1837,Intergenic region,Intergenic region
3,chr_II_telomeric_gap,13344,14072,SPBC460.03.1|SPBC460.04c.1,SPBC460.03|SPBC460.04c,+|-,728,Intergenic region,Intergenic region
4,chr_II_telomeric_gap,15248,16469,SPBC460.04c.1|SPBC460.05.1,SPBC460.04c|SPBC460.05,-|+,1221,Intergenic region,Intergenic region
...,...,...,...,...,...,...,...,...,...
5101,mitochondrial,15531,15955,SPMIT.07.1|SPMIT.08.1,SPMIT.07|SPMIT.08,+|+,424,Intergenic region,Intergenic region
5102,mitochondrial,16639,16915,SPMIT.08.1|SPMIT.09.1,SPMIT.08|SPMIT.09,+|+,276,Intergenic region,Intergenic region
5103,mitochondrial,17062,17807,SPMIT.09.1|SPMIT.10.1,SPMIT.09|SPMIT.10,+|+,745,Intergenic region,Intergenic region
5104,mitochondrial,18032,18562,SPMIT.10.1|SPMIT.11.1,SPMIT.10|SPMIT.11,+|+,530,Intergenic region,Intergenic region
