In [1]:
import pandas as pd
import ast
from Bio import SeqIO

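The function below takes an alignment file in FASTA format, an output path, and the start and end coordinates of a protein (1-based, inclusive, counted along the alignment). It extracts that region from every sequence in the alignment and writes the results to the output FASTA file.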

In [7]:
def extract_sequences_by_coordinates(input_fasta, output_fasta, start, end):
    """Write the region ``start..end`` of every FASTA record to a new FASTA file.

    Each record of ``input_fasta`` is sliced with the same coordinates and
    written to ``output_fasta`` as a two-line entry: ``>`` followed by the
    record's ``id``, then the extracted sequence on a single line.

    Positions are counted over the sequence exactly as stored in the file
    (gap characters included), so for an alignment they are alignment columns.

    Args:
        input_fasta: Path of the FASTA file to read.
        output_fasta: Path of the FASTA file to create (overwritten if present).
        start: First position to keep, 1-based.
        end: Last position to keep, 1-based and inclusive.

    Raises:
        ValueError: If ``start`` is smaller than 1 or ``end`` is smaller than
            ``start``. The check runs before any file is opened, so a bad
            range never truncates an existing output file.

    Note:
        Only ``record.id`` is written to the header. With Biopython's FASTA
        parser that is the first whitespace-delimited word of the original
        header line; any description after it is not carried over.
        A range that extends past the end of a record is shortened by the
        slice rather than rejected.
    """
    # Without this guard, start=0 (or a negative start) would become a negative
    # slice index and silently select from the END of the sequence.
    if start < 1:
        raise ValueError(f"start must be >= 1 (1-based coordinate), got {start}")
    if end < start:
        raise ValueError(f"end ({end}) must not be smaller than start ({start})")

    with open(input_fasta, "r") as infile, open(output_fasta, "w") as outfile:
        for record in SeqIO.parse(infile, "fasta"):
            # 1-based inclusive [start, end] -> 0-based half-open [start-1, end)
            extracted_seq = record.seq[start - 1:end]

            outfile.write(f">{record.id}\n")
            outfile.write(f"{extracted_seq}\n")

In [8]:
def load_gapped_coordinates(coords_table, product):
    """Return the parsed ``Gapped_Coordinates`` value for one product.

    Args:
        coords_table: DataFrame with ``Product`` and ``Gapped_Coordinates``
            columns.
        product: Value to match in the ``Product`` column.

    Returns:
        The Python literal stored as text in ``Gapped_Coordinates`` for the
        first row whose ``Product`` equals ``product``.

    Raises:
        KeyError: If no row matches ``product``.
    """
    matches = coords_table.loc[coords_table['Product'] == product, 'Gapped_Coordinates']
    if matches.empty:
        raise KeyError(f"No row with Product == {product!r} in the coordinates table")
    # The cell holds the coordinates as text; literal_eval parses Python
    # literals only and does not execute arbitrary code.
    return ast.literal_eval(matches.iloc[0])


gapped_coords = pd.read_csv('../Proteins/gapped_protein_coords.csv')
VP1_coords = load_gapped_coordinates(gapped_coords, 'VP1')
# The RdRp coordinates are looked up under the product name 'p56a'.
RdRp_coords = load_gapped_coordinates(gapped_coords, 'p56a')

In [9]:
# Alignment to cut from, shared by both extractions.
input_fasta = "../filtered_CDS_alignment.fasta"

# Destination files, one per protein.
# NOTE(review): VP1 goes to ../aln_repo/ while RdRp goes to ../ — confirm this is intended.
output_fasta_VP1 = "../aln_repo/VP1.fasta"
output_fasta_RdRp = "../RdRp.fasta"

# First two entries of each coordinate sequence are used as (start, end).
start_coordinate_VP1, end_coordinate_VP1 = VP1_coords[0], VP1_coords[1]
start_coordinate_RdRp, end_coordinate_RdRp = RdRp_coords[0], RdRp_coords[1]

In [10]:
# Cut the VP1 region out of every sequence in the alignment.
extract_sequences_by_coordinates(
    input_fasta=input_fasta,
    output_fasta=output_fasta_VP1,
    start=start_coordinate_VP1,
    end=end_coordinate_VP1,
)

In [11]:
# Cut the RdRp region out of every sequence in the alignment.
extract_sequences_by_coordinates(
    input_fasta=input_fasta,
    output_fasta=output_fasta_RdRp,
    start=start_coordinate_RdRp,
    end=end_coordinate_RdRp,
)