In [1]:
from Bio import SeqIO
import pandas as pd
import re

def extract_details(description):
    """Pull transcript, gene and locus fields out of a GRCh38 FASTA header.

    Every field is searched for independently; a field whose pattern is not
    present in ``description`` comes back as ``None`` instead of raising.

    Args:
        description: Header text of one FASTA record, containing tokens such
            as ``ENST00000390473.1``, ``gene:ENSG00000211825.1``,
            ``gene_symbol:TRDJ1`` and
            ``chromosome:GRCh38:14:22450089:22450139:1``.

    Returns:
        Tuple ``(id_, gene, gene_symbol, chromosome, start, end, strand)``:
        the versioned transcript id, the versioned gene id, the gene symbol,
        the chromosome as ``"GRCh38:<name>"``, and the start, end and strand
        (``1`` or ``-1``) as ints.
    """
    def first_group(pattern):
        # Return capture group 1 of the first match, or None when absent.
        found = re.search(pattern, description)
        return found.group(1) if found else None

    # Versioned transcript id (e.g., ENST00000390473.1)
    id_ = first_group(r"(ENST\d+\.\d+)")

    # Versioned gene id (e.g., ENSG00000211825.1)
    gene = first_group(r"gene:(ENSG\d+\.\d+)")

    # Gene symbol (e.g., gene_symbol:TRDJ1)
    gene_symbol = first_group(r"gene_symbol:([^\s]+)")

    # Chromosome (e.g., GRCh38:14). The name is matched as "anything up to the
    # next colon" because chromosome names are not always numeric (X, Y, MT);
    # a digits-only pattern silently drops those records.
    chromosome = first_group(r"chromosome:(GRCh38:[^:\s]+)")

    # Start, end and strand follow the chromosome name, colon-separated.
    position_match = re.search(
        r"chromosome:GRCh38:[^:\s]+:(\d+):(\d+):(-?1)", description
    )
    if position_match:
        start, end, strand = (int(value) for value in position_match.groups())
    else:
        start = end = strand = None

    return id_, gene, gene_symbol, chromosome, start, end, strand


In [2]:

def parse_fasta_to_df(fasta_file):
    """Read a FASTA file into a DataFrame with one row per record.

    Args:
        fasta_file: Path (or handle) passed straight to ``SeqIO.parse`` with
            the ``"fasta"`` format.

    Returns:
        A ``pd.DataFrame`` with the columns ``id``, ``gene``, ``gene_symbol``,
        ``chromosome``, ``start``, ``end`` and ``strand`` taken from
        ``extract_details(record.description)``, plus ``nuc_seq`` holding the
        record's sequence as a string.
    """
    # Column names, in the order extract_details returns its values.
    header_fields = ("id", "gene", "gene_symbol", "chromosome", "start", "end", "strand")

    def to_row(fasta_record):
        # Pair each parsed header value with its column name, then add the sequence.
        row = dict(zip(header_fields, extract_details(fasta_record.description)))
        row["nuc_seq"] = str(fasta_record.seq)
        return row

    # Collect all rows first and build the frame in a single step.
    return pd.DataFrame([to_row(rec) for rec in SeqIO.parse(fasta_file, "fasta")])

# Parse the GRCh38 cDNA FASTA into a table, then derive the gene id without
# its version suffix (the text before the first ".").
df = parse_fasta_to_df("./data/Homo_sapiens.GRCh38.cdna.all.fa")
df["ens_gene_id"] = df["gene"].str.split(".").str.get(0)

In [3]:
# Preview the first rows of the parsed transcript table.
display(df.head())

Unnamed: 0,id,gene,gene_symbol,chromosome,start,end,strand,nuc_seq,ens_gene_id
0,ENST00000390473.1,ENSG00000211825.1,TRDJ1,GRCh38:14,22450089.0,22450139.0,1.0,ACACCGATAAACTCATCTTTGGAAAAGGAACCCGTGTGACTGTGGA...,ENSG00000211825
1,ENST00000390484.1,ENSG00000211836.1,TRAJ54,GRCh38:14,22482287.0,22482346.0,1.0,TAATTCAGGGAGCCCAGAAGCTGGTATTTGGCCAAGGAACCAGGCT...,ENSG00000211836
2,ENST00000390488.1,ENSG00000211840.1,TRAJ49,GRCh38:14,22489488.0,22489543.0,1.0,GAACACCGGTAACCAGTTCTATTTTGGGACAGGGACAAGTTTGACG...,ENSG00000211840
3,ENST00000390476.1,ENSG00000211828.1,TRDJ3,GRCh38:14,22459098.0,22459156.0,1.0,CTCCTGGGACACCCGACAGATGTTTTTCGGAACTGGCATCAAACTC...,ENSG00000211828
4,ENST00000390489.1,ENSG00000211841.1,TRAJ48,GRCh38:14,22490491.0,22490553.0,1.0,TATCTAACTTTGGAAATGAGAAATTAACCTTTGGGACTGGAACAAG...,ENSG00000211841


In [4]:
# Write the parsed table to CSV without the index column, then show three rows.
df.to_csv("./data/nuc_seqs_human.csv", index=False)
display(df.head(3))

Unnamed: 0,id,gene,gene_symbol,chromosome,start,end,strand,nuc_seq,ens_gene_id
0,ENST00000390473.1,ENSG00000211825.1,TRDJ1,GRCh38:14,22450089.0,22450139.0,1.0,ACACCGATAAACTCATCTTTGGAAAAGGAACCCGTGTGACTGTGGA...,ENSG00000211825
1,ENST00000390484.1,ENSG00000211836.1,TRAJ54,GRCh38:14,22482287.0,22482346.0,1.0,TAATTCAGGGAGCCCAGAAGCTGGTATTTGGCCAAGGAACCAGGCT...,ENSG00000211836
2,ENST00000390488.1,ENSG00000211840.1,TRAJ49,GRCh38:14,22489488.0,22489543.0,1.0,GAACACCGGTAACCAGTTCTATTTTGGGACAGGGACAAGTTTGACG...,ENSG00000211840


In [5]:
import gget

# Fetch the translated (amino acid) sequence for the first gene in the table
# and print the raw result to inspect its shape.
first_gene_id = df["ens_gene_id"][0]
out = gget.seq(first_gene_id, translate=True, isoforms=False)
print(out)

18:10:12 - INFO - Requesting amino acid sequence of the canonical transcript ENST00000390473 of gene ENSG00000211825 from UniProt.


['>ENST00000390473 uniprot_id: A0A075B706 ensembl_id: ENST00000390473 gene_name: TRDJ1 organism: Homo sapiens sequence_length: 16', 'TDKLIFGKGTRVTVEP']


In [6]:
import re

def parse_seq_header(header):
    """Extract the Ensembl id, gene name and sequence length from a header line.

    Args:
        header: Header string containing ``ensembl_id: <id>``,
            ``gene_name: <name>`` and ``sequence_length: <n>`` fields, e.g.
            ``">ENST00000390473 ... ensembl_id: ENST00000390473 gene_name: TRDJ1 ... sequence_length: 16"``.

    Returns:
        Dict with keys ``'ensemble_id'`` (str), ``'gene_name'`` (str, the text
        up to the next whitespace) and ``'sequence_length'`` (int).

    Raises:
        ValueError: If any of the three fields cannot be found in ``header``.
    """
    field_patterns = {
        # [TG] rather than [T|G]: inside a character class "|" is a literal
        # pipe, not alternation, so the old class also accepted "ENS|...".
        'ensemble_id': r'ensembl_id:\s*(ENS[TG]\d+)',
        'gene_name': r'gene_name:\s*([^\s]+)',
        'sequence_length': r'sequence_length:\s*(\d+)',
    }

    parsed = {}
    for field, pattern in field_patterns.items():
        found = re.search(pattern, header)
        if found is None:
            # Fail with the field name instead of an opaque
            # "'NoneType' object has no attribute 'group'".
            raise ValueError(f"could not find {field} in sequence header: {header!r}")
        parsed[field] = found.group(1)

    parsed['sequence_length'] = int(parsed['sequence_length'])
    return parsed

In [7]:
# Fetch the amino acid sequence for the first gene and tabulate it.
out = gget.seq(df["ens_gene_id"][0], translate=True, isoforms=False)
if len(out) == 2:
    # Expected shape: [header, sequence].
    header, aa_seq = out
    df_prot = pd.DataFrame([parse_seq_header(header)])
    df_prot["aa_seq"] = aa_seq
else:
    # Any other shape: still define df_prot, so the display below can neither
    # raise NameError on a fresh kernel nor show a stale frame from an earlier run.
    df_prot = pd.DataFrame(
        columns=["ensemble_id", "gene_name", "sequence_length", "aa_seq"]
    )
df_prot

18:10:20 - INFO - Requesting amino acid sequence of the canonical transcript ENST00000390473 of gene ENSG00000211825 from UniProt.


Unnamed: 0,ensemble_id,gene_name,sequence_length,aa_seq
0,ENST00000390473,TRDJ1,16,TDKLIFGKGTRVTVEP
