In [None]:
#2 Remove sequences that contain the character 'X'
from Bio import SeqIO

def filter_sequences(input_file, output_file):
    """
    Copy a FASTA file, dropping every record whose sequence contains 'X'.

    The match is on the upper-case character 'X' in the record's sequence
    text. All retained records are collected first and then written in one
    call, after which a summary line is printed.

    Args:
        input_file (str): Path to the input FASTA file.
        output_file (str): Path to save the filtered FASTA file.

    Returns:
        str: ``output_file``, unchanged.
    """
    kept_records = [
        entry
        for entry in SeqIO.parse(input_file, "fasta")
        if 'X' not in str(entry.seq)
    ]

    SeqIO.write(kept_records, output_file, "fasta")
    retained = len(kept_records)
    print(f"Filtered sequences saved to {output_file}. Total sequences retained: {retained}")
    return output_file


# Run the 'X' filter on the input FASTA. filter_sequences returns the output
# path, so as the last expression of the cell that path is also displayed.
input_file = "sequences.fasta"  # Replace with your actual input FASTA file path
output_file = "filtered_sequences_nox.fasta"  # Replace with your desired output file path
filter_sequences(input_file, output_file)


In [None]:
# To check the number of sequences
def count_sequences_in_fasta(file_path):
    """
    Count the FASTA records in a file by counting its header lines.

    A header line is any line that starts with '>'.

    Args:
        file_path (str): Path to the FASTA file, opened in text mode.

    Returns:
        int: Number of lines beginning with '>'.
    """
    with open(file_path, 'r') as handle:
        return sum(1 for row in handle if row.startswith('>'))


# Count the '>' header lines in the FASTA file and report the total.
# NOTE(review): this path is spelled 'Sequences.fasta' (capital S), while the
# filtering cell reads "sequences.fasta"; on a case-sensitive filesystem these
# are different files -- confirm which one is intended.
fasta_file = 'Sequences.fasta'
sequence_count = count_sequences_in_fasta(fasta_file)
print(f"Number of sequences in the FASTA file: {sequence_count}")

In [None]:
#Checks the first and last non-gap residues in aligned protein sequences to determine the true alignment boundaries.
from Bio import SeqIO

def find_residue_positions(fasta_file):
    """
    Locate the first and last alignment columns that contain no gap.

    A column is gap-free when no sequence has '-' at that position. The number
    of columns scanned is taken from the first record in the file.

    Args:
        fasta_file (str): Path to the aligned FASTA file.

    Returns:
        tuple: ``(start, end)`` as 1-based column numbers of the first and last
        gap-free columns. Either value is None when no such column is found;
        ``(None, None)`` is returned when the file has no records.
    """
    records = list(SeqIO.parse(fasta_file, "fasta"))
    if len(records) == 0:
        return None, None

    column_count = len(records[0].seq)

    # Work on plain strings for easier per-column indexing
    rows = [str(record.seq) for record in records]

    def column_has_gap(column):
        # Rows are checked in file order and the scan stops at the first gap.
        return any(row[column] == '-' for row in rows)

    # Scan left to right for the first gap-free column (reported 1-based)
    first_full = next(
        (column + 1 for column in range(column_count) if not column_has_gap(column)),
        None,
    )

    # Scan right to left for the last gap-free column (reported 1-based)
    last_full = next(
        (column + 1 for column in reversed(range(column_count)) if not column_has_gap(column)),
        None,
    )

    return first_full, last_full

# Example usage: report the 1-based gap-free boundaries of the alignment.
# find_residue_positions returns None for a boundary it cannot find (empty file
# or no gap-free column), in which case "None" is printed below.
fasta_file = "delta_align.fasta"
start, end = find_residue_positions(fasta_file)
print(f"First position with no gaps from the start: {start}")
print(f"First position with no gaps from the end: {end}")



In [None]:
#For Trimming sequences
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

def find_residue_positions_and_trim(fasta_file, output_file):
    """
    Trims aligned sequences to the span between the first and last gap-free columns.

    A column is gap-free when no sequence has '-' at that position. Every record
    is sliced to the inclusive range from the first to the last such column and
    written to ``output_file`` in FASTA format, keeping its id and description.
    The number of columns scanned is taken from the first record in the file.

    Args:
        fasta_file (str): Path to the aligned input FASTA file.
        output_file (str): Path to save the trimmed FASTA file.

    Returns:
        None. The output path and the 1-based trim boundaries are printed. If
        the input has no records, or no column is gap-free across all
        sequences, a message is printed and no output file is written.
    """
    sequences = list(SeqIO.parse(fasta_file, "fasta"))
    if not sequences:
        print("No sequences found in the file.")
        return

    sequence_length = len(sequences[0].seq)

    # Convert sequences to string for easier manipulation
    seq_strings = [str(record.seq) for record in sequences]

    def is_gap_free(column):
        return all(row[column] != '-' for row in seq_strings)

    # 0-based index of the first gap-free column, or None when there is none
    start_position = next(
        (column for column in range(sequence_length) if is_gap_free(column)),
        None,
    )
    if start_position is None:
        # Without this guard the slice below would evaluate None + 1 and raise
        # TypeError; report the condition the same way as the empty-file case.
        print("No gap-free column found; sequences were not trimmed.")
        return

    # At least one gap-free column exists, so the backward scan finds one too
    end_position = next(
        column for column in reversed(range(sequence_length)) if is_gap_free(column)
    )

    # Trim the sequences (end_position is inclusive, hence the + 1)
    trimmed_sequences = [
        SeqRecord(
            Seq(record.seq[start_position:end_position + 1]),
            id=record.id,
            description=record.description,
        )
        for record in sequences
    ]

    # Write the trimmed sequences to a new FASTA file
    SeqIO.write(trimmed_sequences, output_file, "fasta")

    print(f"Sequences trimmed and saved to {output_file}")
    print(f"Start position (1-based): {start_position + 1}")
    print(f"End position (1-based): {end_position + 1}")

# Example usage: trim the alignment to its gap-free boundaries and save it.
# This rebinds the notebook-level names fasta_file and output_file that
# earlier cells also assign.
fasta_file = "aligned_sequence.fasta"
output_file = "trimmed_sequence.fasta"
find_residue_positions_and_trim(fasta_file, output_file)
