In [None]:
import re
from pathlib import Path

import pandas as pd
from Bio import SeqIO
from IPython.display import display

# Convert a UniProt-style FASTA file into a tab-separated table.
#
# Expected header layout (fields after OX= are optional):
#   >db|ACCESSION|ENTRY_NAME Protein name OS=Organism OX=TaxId [GN=Gene] PE=n SV=n

# Splits a FASTA description into: (1) the identifier token, (2) the protein
# name, (3) the organism after "OS=", (4) everything after "OX=".
header_pattern = r'([^ ]*) (.*) OS=(.*) OX=(.*)$'

# Picks the value of an explicit "GN=" key out of the text that follows "OX=".
# The value runs until the next two-letter "XX=" key or the end of the line.
gene_pattern = r'(?:^| )GN=(.*?)(?= [A-Z]{2}=|$)'


def split_identifier(full_id):
    """Split a ``db|accession|entry`` identifier into (database, accession).

    Identifiers without a ``|`` separator are returned as ``('', full_id)``
    instead of raising ``IndexError``.
    """
    parts = full_id.split('|')
    if len(parts) >= 2:
        return parts[0], parts[1]
    return '', full_id


def parse_header(description_line):
    """Extract the annotated fields from one FASTA description line.

    Returns a tuple ``(protein_name, organism, taxonomy_id, gene_name)``, or
    ``None`` when the line does not match ``header_pattern``. ``gene_name`` is
    an empty string when the header carries no ``GN=`` key; the token that
    happens to follow the taxonomy id (e.g. ``PE=1``) is never mistaken for it.
    """
    match = re.match(header_pattern, description_line)
    if match is None:
        return None

    after_ox = match.group(4)
    taxonomy_id = after_ox.split(' ')[0]
    gene_match = re.search(gene_pattern, after_ox)
    gene_name = gene_match.group(1) if gene_match else ''
    return match.group(2), match.group(3), taxonomy_id, gene_name


def tsv_name_for(fasta_name):
    """Return the output TSV file name that belongs to *fasta_name*.

    Only the final extension is swapped for ``.tsv`` (whatever it is: .fasta,
    .fa, .faa, or none). If that would yield the input name itself, ``.tsv``
    is appended instead, so the input file is never overwritten.
    """
    source = Path(fasta_name)
    target = source.with_suffix('.tsv')
    if target == source:
        target = source.with_name(source.name + '.tsv')
    return str(target)


# The interactive part only runs when this is executed as a notebook cell or
# script, so the helpers above can be imported without prompting for input.
if __name__ == "__main__":
    # Ask the user for the input FASTA file; strip stray whitespace that
    # often comes along when a file name is pasted.
    fasta_file = input("Please enter the full name of the FASTA file and press Enter: ").strip()

    # One list per output column; every record appends exactly one value to
    # each list so the columns always stay the same length.
    databases, ids, protein_names, sequences, organisms, gene_names, taxonomy_ids = [], [], [], [], [], [], []
    unparsed_headers = 0

    # Open and parse the FASTA file
    with open(fasta_file, "r") as file:
        for record in SeqIO.parse(file, "fasta"):
            database, protein_id = split_identifier(record.id)
            fields = parse_header(record.description)
            if fields is None:
                # Keep the sequence but leave the annotation columns blank.
                unparsed_headers += 1
                fields = ('', '', '', '')
            protein_name, organism, taxonomy_id, gene_name = fields

            databases.append(database)
            ids.append(protein_id)
            sequences.append(str(record.seq))
            protein_names.append(protein_name)
            organisms.append(organism)
            taxonomy_ids.append(taxonomy_id)
            gene_names.append(gene_name)

    print(f'Read in {len(ids)} sequences from FASTA file {fasta_file}.')
    if unparsed_headers:
        print(f'Warning: {unparsed_headers} header(s) did not match the expected '
              f'"... OS=... OX=..." layout; their annotation fields were left empty.')

    # Create a DataFrame using the collected information
    df_genes = pd.DataFrame({
        'Protein Id': ids,
        'Database': databases,
        'Protein Name': protein_names,
        'Organism': organisms,
        'Taxonomy ID': taxonomy_ids,
        'Gene Name': gene_names,
        'seq': sequences
    })

    # Derive the output name from the input name without ever reusing it
    output_tsv_name = tsv_name_for(fasta_file)

    # Write the DataFrame to a TSV file
    df_genes.to_csv(output_tsv_name, sep='\t', index=False)

    print(f"Data has been written to {output_tsv_name}")