In [1]:
from pathlib import Path
from pyfaidx import Fasta
import os
from gtfparse import read_gtf 
import pandas as pd
import numpy as np

INFO:numexpr.utils:Note: NumExpr detected 48 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 16.
INFO:numexpr.utils:NumExpr defaulting to 16 threads.


In [2]:
# All input/output paths are resolved relative to the current working directory.
cwd = Path.cwd()

# FASTA headers are "|"-delimited; only the first field is kept as the record
# key so that records can be looked up with the GTF `transcript_id` below.
fa = Fasta(
    cwd / "transcriptome/gencode.v45.transcripts.fa",
    key_function=lambda h: h.split("|")[0],
)

gtf_file = cwd / "transcriptome/gencode.v45.annotation.gtf"
gtf = read_gtf(gtf_file).to_pandas()

# One row per annotated transcript. `.loc` + `.copy()` yields an independent
# frame, so adding the `sequence` column below is not a chained assignment on
# a slice of `gtf` (which triggers SettingWithCopyWarning).
tx = gtf.loc[
    gtf["feature"] == "transcript",
    ["transcript_id", "gene_id", "gene_name", "transcript_type", "tag"],
].copy()

# Attach the transcript sequence from the FASTA.
# NOTE(review): IDs missing from the FASTA silently get an empty string here;
# confirm that this is intended, since nothing below removes empty sequences.
tx["sequence"] = tx["transcript_id"].map(lambda t: str(fa[t]) if t in fa else "")

INFO:root:Extracted GTF attributes: ['gene_id', 'gene_type', 'gene_name', 'level', 'tag', 'transcript_id', 'transcript_type', 'transcript_name', 'transcript_support_level', 'havana_transcript', 'exon_number', 'exon_id', 'hgnc_id', 'havana_gene', 'ont', 'protein_id', 'ccdsid', 'artif_dupl']


In [None]:
# Flag every transcript whose sequence contains any character other than
# upper-case A, C, G or T (vectorised regex instead of a per-character loop).
tx["Non_can_bases"] = tx["sequence"].str.contains("[^ACGT]", regex=True)

print("Number of transcripts with non-canonical bases:",
      tx["Non_can_bases"].sum())

# Keep only the clean transcripts, drop the helper column and renumber the
# rows 0..n-1.
tx = (
    tx[~tx["Non_can_bases"]]
    .drop(columns=["Non_can_bases"])
    .reset_index(drop=True)
)

# Add 25 'P' characters of padding on both the left and the right of every
# sequence.
# NOTE: this cell is not idempotent. 'P' is not in "ACGT", so re-running it on
# the already padded frame would flag (and drop) every transcript.
PAD = "P" * 25
tx["sequence"] = PAD + tx["sequence"] + PAD

Number of transcripts with non-canonical bases: 2


In [6]:
# Write the current transcript table to disk as a tab-separated file,
# without the DataFrame index.
tx_file = cwd / "transcriptome/gencode_customized.tsv"
tx.to_csv(path_or_buf=tx_file, index=False, sep="\t")

In [71]:
# Dense 0-based integer index for every row of `tx`, stored as uint32.
tx["tx_idx"] = np.arange(tx.shape[0], dtype=np.uint32)

# Forward lookup: array position i -> transcript_id (ENST...).
id2tx = tx["transcript_id"].to_numpy()

# Reverse lookup: transcript_id -> array position (NumPy integer values).
tx2id = {
    transcript: position
    for transcript, position in zip(id2tx, np.arange(id2tx.shape[0]))
}


In [92]:
# 256-entry byte -> code lookup table. Every byte value starts out as
# 255 ("invalid"); the five characters of "PACGT" are then assigned the
# codes P=0, A=1, C=2, G=3, T=4.
alpha = np.full(256, 255, dtype=np.uint8)
alpha[np.frombuffer("PACGT".encode("ascii"), dtype=np.uint8)] = np.arange(5, dtype=np.uint8)

def encode(seq: str) -> np.ndarray:
    """Translate a sequence string into a uint8 code vector.

    Each character is looked up in ``alpha``: 'P', 'A', 'C', 'G', 'T' become
    0, 1, 2, 3, 4 respectively, and any other ASCII character becomes 255.

    Parameters
    ----------
    seq : str
        Sequence to encode. It is encoded as ASCII first, so a non-ASCII
        character raises ``UnicodeEncodeError``.

    Returns
    -------
    np.ndarray
        uint8 array with one code per character of ``seq``.
    """
    raw_bytes = seq.encode("ascii")
    byte_values = np.frombuffer(raw_bytes, dtype=np.uint8)
    return alpha[byte_values]

def c_positions(enc_seq: np.ndarray) -> np.ndarray:
    """Return the indices at which ``enc_seq`` equals 2, as a uint32 array.

    2 is the code that the ``alpha`` lookup table assigns to 'C'. No explicit
    trimming of the padded ends is performed: 'P' padding is encoded as 0, so
    it can never match.
    """
    (hits,) = np.nonzero(np.ravel(np.equal(enc_seq, 2)))
    return hits.astype(np.uint32)


In [93]:
# Encode every transcript sequence and record its C positions.
# The `sequence` column is iterated directly: `DataFrame.iterrows()` would
# build a full Series for every row just to read this one field.
# Both lists are in `tx` row order, one entry per transcript.
seq_arrs = [encode(seq) for seq in tx["sequence"]]
c_pos_arrs = [c_positions(enc) for enc in seq_arrs]


In [None]:
def _ragged_object_array(arrays):
    """Pack a list of arrays into a 1-D object array, one element per input.

    ``np.array(arrays, dtype=object)`` only does this when the inputs have
    different lengths; if they all share one length (e.g. a single transcript)
    it builds a 2-D object array instead. Filling a pre-allocated array
    element by element gives the same 1-D layout in every case.
    """
    packed = np.empty(len(arrays), dtype=object)
    for i, arr in enumerate(arrays):
        packed[i] = arr
    return packed


# Save the ID lookup plus the per-transcript encoded sequences and C positions.
# Object arrays are stored pickled, so reading this file back requires
# `np.load(..., allow_pickle=True)`.
np.savez_compressed(
    cwd / "transcriptome/tx_data.npz",
    id2tx=id2tx,
    seq_arrs=_ragged_object_array(seq_arrs),
    c_pos_arrs=_ragged_object_array(c_pos_arrs),
)
