In [1]:
from Bio import SeqIO
import pandas as pd
import re

# Function to extract details from the FASTA header
def extract_details(description):
    # Extract id (e.g., ENSMUST00000103301.3)
    id_match = re.search(r"(ENSMUST\d+\.\d+)", description)
    id_ = id_match.group(1) if id_match else None

    # Extract gene (e.g., ENSMUSG00000076500.3)
    gene_match = re.search(r"gene:(ENSMUSG\d+\.\d+)", description)
    gene = gene_match.group(1) if gene_match else None

    # Example: gene_symbol:Gm10878
    gene_symbol_match = re.search(r"gene_symbol:([^\s]+)", description)
    gene_symbol = gene_symbol_match.group(1) if gene_symbol_match else None

    # Extract chromosome (e.g., GRCm39:6)
    chromosome_match = re.search(r"chromosome:(GRCm39:\d+)", description)
    chromosome = chromosome_match.group(1) if chromosome_match else None

    # Extract start, end, and strand
    position_match = re.search(r"chromosome:GRCm39:(\d+):(\d+):(\d+):(-?1)", description)
    start = int(position_match.group(2)) if position_match else None
    end = int(position_match.group(3)) if position_match else None
    strand = int(position_match.group(4)) if position_match else None

    return id_, gene, gene_symbol, chromosome, start, end, strand


In [2]:

# Function to parse the FASTA file and store in DataFrame
def parse_fasta_to_df(fasta_file):
    # List to hold all entries
    records = []

    # Parse the FASTA file/
    for record in SeqIO.parse(fasta_file, "fasta"):
        # Extract id, gene, chromosome, start, end, and strand
        id_, gene, gene_symbol, chromosome, start, end, strand = extract_details(record.description)
        
        # Create a dictionary for the current entry
        entry = {
            "id": id_,
            "gene": gene,
            "gene_symbol": gene_symbol,
            "chromosome": chromosome,
            "start": start,
            "end": end,
            "strand": strand,
            #"Description": record.description,
            "nuc_seq": str(record.seq)
        }
        # Append to the list
        records.append(entry)
    
    # Convert the list of dicts to DataFrame
    df = pd.DataFrame(records)
    return df

# Usage example with the sample file
df = parse_fasta_to_df("./data/Mus_musculus.GRCm39.cdna.all.fa")
df["ens_gene_id"] = df["gene"].str.split(".").str[0]

In [3]:
display(df.head())

Unnamed: 0,id,gene,gene_symbol,chromosome,start,end,strand,nuc_seq,ens_gene_id
0,ENSMUST00000103301.3,ENSMUSG00000076500.3,Gm20730,GRCm39:6,43058452.0,43059031.0,-1.0,ATGAGGTGCCTAGCTGAGTTCCTGAGGCTACTTGTGCTCTGGATCC...,ENSMUSG00000076500
1,ENSMUST00000166255.2,ENSMUSG00000090395.2,Gm54608,GRCm39:12,113618587.0,113618864.0,1.0,CATGGCTGTGTACTCAGACCTCAGACTGTTTATTTTCAGGTAAAGT...,ENSMUSG00000090395
2,ENSMUST00000095364.3,ENSMUSG00000090765.2,Gm54637,GRCm39:12,114594300.0,114594593.0,1.0,TCTCTCACAATAATAGTCAGCAGTGTCCTCAGATGTCAGGCTTCTG...,ENSMUSG00000090765
3,ENSMUST00000103304.3,ENSMUSG00000094491.3,Igkv1-133,GRCm39:6,67701886.0,67702645.0,1.0,TATTTCCTCAAAATGATGAGTCCTGCCCAGTTCCTGTTTCTGTTAG...,ENSMUSG00000094491
4,ENSMUST00000103305.2,ENSMUSG00000096580.2,Igkv1-132,GRCm39:6,67736684.0,67737397.0,1.0,ATGATGAGTCCTGTCCAGTTCCTGTTTCTGTTAATGCTCTGGATTC...,ENSMUSG00000096580


In [4]:
df.to_csv("./data/nuc_seqs_mouse.csv", index=False)
display(df.head(3))

Unnamed: 0,id,gene,gene_symbol,chromosome,start,end,strand,nuc_seq,ens_gene_id
0,ENSMUST00000103301.3,ENSMUSG00000076500.3,Gm20730,GRCm39:6,43058452.0,43059031.0,-1.0,ATGAGGTGCCTAGCTGAGTTCCTGAGGCTACTTGTGCTCTGGATCC...,ENSMUSG00000076500
1,ENSMUST00000166255.2,ENSMUSG00000090395.2,Gm54608,GRCm39:12,113618587.0,113618864.0,1.0,CATGGCTGTGTACTCAGACCTCAGACTGTTTATTTTCAGGTAAAGT...,ENSMUSG00000090395
2,ENSMUST00000095364.3,ENSMUSG00000090765.2,Gm54637,GRCm39:12,114594300.0,114594593.0,1.0,TCTCTCACAATAATAGTCAGCAGTGTCCTCAGATGTCAGGCTTCTG...,ENSMUSG00000090765


In [5]:
import gget

out = gget.seq(df["ens_gene_id"][0], translate=True, isoforms=False)
print(out)

17:34:17 - INFO - Requesting amino acid sequence of the canonical transcript ENSMUST00000103301 of gene ENSMUSG00000076500 from UniProt.


['>ENSMUST00000103301 uniprot_id: A0A075B5J6 ensembl_id: ENSMUST00000103301 gene_name: Gm20730 organism: Mus musculus sequence_length: 119', 'MRCLAEFLRLLVLWIPATGDIVMTQAAPSVPANPGESVSISCRSSKSLLHSSGNTYLYWFLQRPGQSPQLLIYYISNLASGVPDRFSGSGSGTDFTLRISRVEAEDVGVYYCMQGLEYP']


In [6]:
import re

def parse_seq_header(header):
    # Extract key information using regex
    ensemble_pattern = r'ensembl_id:\s*(ENSMUS[T|G]\d+)'
    gene_pattern = r'gene_name:\s*([^\s]+)'  # Changed: capture everything until a space
    length_pattern = r'sequence_length:\s*(\d+)'
    
    ensemble_id = re.search(ensemble_pattern, header).group(1)
    gene_name = re.search(gene_pattern, header).group(1)
    sequence_length = int(re.search(length_pattern, header).group(1))
    
    return {
        'ensemble_id': ensemble_id,
        'gene_name': gene_name,
        'sequence_length': sequence_length
    }

In [7]:
out = gget.seq(df["ens_gene_id"][0], translate=True, isoforms=False)
if len(out)==2:
    df_prot = pd.DataFrame([parse_seq_header(out[0])])
    df_prot["aa_seq"] = out[1]
df_prot

17:34:19 - INFO - Requesting amino acid sequence of the canonical transcript ENSMUST00000103301 of gene ENSMUSG00000076500 from UniProt.


Unnamed: 0,ensemble_id,gene_name,sequence_length,aa_seq
0,ENSMUST00000103301,Gm20730,119,MRCLAEFLRLLVLWIPATGDIVMTQAAPSVPANPGESVSISCRSSK...
