# In[3]:
import gffutils

# Rewrite the GENCODE GTF with "chr"-less sequence names ("chr1" -> "1",
# "chrM" -> "MT"), going through a gffutils database built from the input.
in_file = "../isoseq_pipeline/refs/gencode.v48.annotation.gtf"
db_file = "../isoseq_pipeline/refs/gencode.v48.annotation.db"
out_file = "../isoseq_pipeline/refs/gencode.v48.annotation_nochr.gtf"

# Build the database. NOTE: force=True replaces any existing file at db_file,
# so the database is rebuilt every time this cell runs; skip this call by hand
# if an up-to-date database already exists.
db = gffutils.create_db(
    in_file,
    dbfn=db_file,
    force=True,
    keep_order=True,
    merge_strategy='merge',
    sort_attribute_values=True,
    disable_infer_genes=True,
    disable_infer_transcripts=True,
)

# Open the database file that was just written, again with keep_order=True.
db = gffutils.FeatureDB(db_file, keep_order=True)

with open(in_file) as fin, open(out_file, "w") as fout:
    # Copy the leading "#" header lines; stop at the first non-header line.
    for line in fin:
        if not line.startswith("#"):
            break
        fout.write(line)

    # Order by seqid first and start second: ordering by start alone would
    # interleave features from different chromosomes in the output. The sort
    # uses the seqid stored in the database, i.e. the name before renaming.
    for feature in db.all_features(order_by=('seqid', 'start')):
        seqid = feature.seqid
        if seqid == "chrM":
            new_seqid = "MT"
        elif seqid.startswith("chr"):
            new_seqid = seqid[3:]
        else:
            new_seqid = seqid

        # Each attribute key maps to a list of values. Emit one `key "value"`
        # pair per value so keys that occur several times on a line keep all
        # of their values (taking only the first one silently drops the rest,
        # and fails on an empty list).
        attributes = '; '.join(
            f'{key} "{value}"'
            for key, values in feature.attributes.items()
            for value in values
        ) + ';'

        fields = [
            new_seqid,
            feature.source,
            feature.featuretype,
            str(feature.start),
            str(feature.end),
            feature.score if feature.score is not None else '.',
            feature.strand,
            feature.frame if feature.frame is not None else '.',
            attributes,
        ]
        fout.write('\t'.join(fields) + '\n')

# In[4]:
# Subset the "chr"-less GTF to chromosome 1 only.

in_file = "../isoseq_pipeline/refs/gencode.v48.annotation_nochr.gtf"
db_file = "../isoseq_pipeline/refs/gencode.v48.annotation_nochr.db"
out_file = "../isoseq_pipeline/refs/gencode.v48.annotation_chr1_only.gtf"

# Build the database. NOTE: force=True replaces any existing file at db_file,
# so the database is rebuilt every time this cell runs; skip this call by hand
# if an up-to-date database already exists.
db = gffutils.create_db(
    in_file,
    dbfn=db_file,
    force=True,
    keep_order=True,
    merge_strategy='merge',
    sort_attribute_values=True,
    disable_infer_genes=True,
    disable_infer_transcripts=True,
)

# Open the database file that was just written, again with keep_order=True.
db = gffutils.FeatureDB(db_file, keep_order=True)

# Sequence names accepted as chromosome 1; both spellings are written as "1".
chr1_names = {"1", "chr1"}

with open(in_file) as fin, open(out_file, "w") as fout:
    # Copy the leading "#" header lines; stop at the first non-header line.
    for line in fin:
        if not line.startswith("#"):
            break
        fout.write(line)

    # Only one sequence is written, so ordering by start is sufficient here.
    for feature in db.all_features(order_by='start'):
        # Keep chromosome 1 only, with or without the "chr" prefix.
        if feature.seqid not in chr1_names:
            continue

        # Each attribute key maps to a list of values. Emit one `key "value"`
        # pair per value so keys that occur several times on a line keep all
        # of their values (taking only the first one silently drops the rest,
        # and fails on an empty list).
        attributes = '; '.join(
            f'{key} "{value}"'
            for key, values in feature.attributes.items()
            for value in values
        ) + ';'

        fields = [
            "1",
            feature.source,
            feature.featuretype,
            str(feature.start),
            str(feature.end),
            feature.score if feature.score is not None else '.',
            feature.strand,
            feature.frame if feature.frame is not None else '.',
            attributes,
        ]
        fout.write('\t'.join(fields) + '\n')
