In [None]:
from Bio import SeqIO
import pandas as pd
from collections import defaultdict

def calculate_gap_frequencies(reference_file, alignment_file, start_position, output_file="gap_frequencies.csv"):
    """
    Calculate the percentage of gaps and non-gaps in every alignment column and
    record the reference character of each column.

    Columns are taken from the first non-gap character of the reference onwards
    and are numbered consecutively from start_position. Reference gap columns
    (Residue '-') are numbered like any other column. Each aligned sequence is
    compared with the reference column by column; a sequence shorter than the
    reference contributes nothing to the columns beyond its own end, so for
    those columns the two percentages add up to less than 100.

    The resulting table is printed, written to output_file and returned.

    Parameters:
    - reference_file: Path to the reference sequence file (FASTA, exactly one record)
    - alignment_file: Path to the aligned sequences file (FASTA)
    - start_position: The number assigned to the first counted column
    - output_file: Path to the output CSV file

    Returns:
    - pandas DataFrame with the columns Position, Residue, Gap_Frequency and
      Non_Gap_Frequency (percentages of all sequences, rounded to 2 decimals).
      The DataFrame is empty when the alignment contains no sequences.

    Raises:
    - ValueError: if the reference sequence consists of gaps only
    """

    # Load the reference sequence
    reference = SeqIO.read(reference_file, "fasta")
    ref_seq = str(reference.seq).upper()  # Convert reference sequence to string and uppercase

    # Parse aligned sequences
    alignment = list(SeqIO.parse(alignment_file, "fasta"))
    num_sequences = len(alignment)  # Total number of sequences in the alignment

    # The offset of the first reference residue depends on the reference only,
    # so it is located (and validated) once instead of once per aligned sequence.
    first_non_gap_position = next((i for i, res in enumerate(ref_seq) if res != '-'), None)
    if first_non_gap_position is None:
        raise ValueError("No non-gap residue found in the reference sequence.")
    ref_columns = ref_seq[first_non_gap_position:]

    # Dictionary to store counts of gaps and non-gaps at each position
    gap_counts = defaultdict(lambda: {"Gaps": 0, "Non_Gaps": 0, "Residue": ''})

    # Iterate over each sequence in the alignment
    for record in alignment:
        # Uppercase the sequence and drop the columns that precede the first reference residue
        seq_columns = str(record.seq).upper()[first_non_gap_position:]

        # zip stops at the shorter of the two strings, so only shared columns are counted
        for pos, (ref_base, seq_base) in enumerate(zip(ref_columns, seq_columns), start=start_position):
            counts = gap_counts[pos]

            # Store the reference residue at this position (only if not already stored)
            if counts["Residue"] == '':
                counts["Residue"] = ref_base

            # Count gaps and non-gaps
            if seq_base == "-":
                counts["Gaps"] += 1
            else:
                counts["Non_Gaps"] += 1

    # Calculate frequencies and prepare the data for the DataFrame
    gap_data = [
        {
            "Position": pos,
            "Residue": counts["Residue"],
            "Gap_Frequency": round((counts["Gaps"] / num_sequences) * 100, 2),
            "Non_Gap_Frequency": round((counts["Non_Gaps"] / num_sequences) * 100, 2)
        }
        for pos, counts in sorted(gap_counts.items())
    ]

    # Convert the data to a DataFrame
    gap_df = pd.DataFrame(gap_data)

    # Print the frequencies
    print(f"Gap and non-gap frequencies at each position with reference residues:\n{gap_df}")

    # Save the frequencies to a CSV file
    gap_df.to_csv(output_file, index=False)
    print(f"Gap and non-gap frequencies with residues saved to {output_file}")

    return gap_df

# Example usage: run the gap-frequency analysis on this notebook's input files
reference_file, alignment_file = "REF.fasta", "trimmed_sequences.fasta"  # Reference / aligned sequence files
start_position = 306  # Number given to the first counted column
output_file = "gap_frequencies.csv"  # Where the frequency table is written
calculate_gap_frequencies(
    reference_file=reference_file,
    alignment_file=alignment_file,
    start_position=start_position,
    output_file=output_file,
)


In [None]:
#For checking Deletions
import pandas as pd

def filter_gap_frequency(input_file, output_file, *, min_gap_frequency=0, max_gap_frequency=None):
    """
    Keeps the rows whose residue is not '-' and whose gap frequency lies strictly
    above min_gap_frequency (and, when given, strictly below max_gap_frequency).

    With the default arguments the filter is "Gap_Frequency > 0 and Residue != '-'",
    i.e. no upper bound is applied. The selected rows are printed and written to
    output_file.

    Args:
        input_file (str): Path to the input CSV file.
        output_file (str): Path to save the filtered data.
        min_gap_frequency (float): Exclusive lower bound on 'Gap_Frequency'.
        max_gap_frequency (float | None): Exclusive upper bound on 'Gap_Frequency';
            None disables the upper bound.

    Returns:
        None

    Raises:
        ValueError: If the input file lacks the 'Residue' or 'Gap_Frequency' column.
    """
    # Load the CSV file into a DataFrame
    df = pd.read_csv(input_file)

    # Check if required columns exist
    if 'Residue' not in df.columns or 'Gap_Frequency' not in df.columns:
        raise ValueError("The input file must contain 'Residue' and 'Gap_Frequency' columns.")

    # Select rows whose residue is not '-' and whose gap frequency exceeds the lower bound
    mask = (df['Gap_Frequency'] > min_gap_frequency) & (df['Residue'] != '-')

    # The upper bound is optional so that the default call keeps every row above the lower bound
    if max_gap_frequency is not None:
        mask = mask & (df['Gap_Frequency'] < max_gap_frequency)

    filtered_df = df[mask]

    # Print the filtered DataFrame to the console
    print("Filtered Data:")
    print(filtered_df)

    # Save the filtered data to a new CSV file
    filtered_df.to_csv(output_file, index=False)
    print(f"Filtered data saved to {output_file}")

# Example usage
# The frequency table is saved as "gap_frequencies.csv" (all lowercase) by the
# gap-frequency step, so the same spelling is used here; a different
# capitalisation is not found on case-sensitive filesystems.
input_file = "gap_frequencies.csv"  # Input CSV file path
output_file = "deletions.csv"  # Output CSV file path

# Run the function
filter_gap_frequency(input_file, output_file)



In [None]:
# For checking insertions
import pandas as pd

def filter_non_gap_frequency_more_than_1(input_file, output_file, *, min_non_gap_frequency=0.0):
    """
    Keeps the rows whose residue is '-' and whose non-gap frequency lies strictly
    above min_non_gap_frequency.

    Despite the function name, the default threshold is 0: with the default
    argument the filter is "Non_Gap_Frequency > 0 and Residue == '-'". Pass
    min_non_gap_frequency=1 to keep only rows above 1. The selected rows are
    printed and written to output_file.

    Args:
        input_file (str): Path to the input CSV file.
        output_file (str): Path to save the filtered data.
        min_non_gap_frequency (float): Exclusive lower bound on 'Non_Gap_Frequency'.

    Returns:
        None

    Raises:
        ValueError: If the input file lacks the 'Residue' or 'Non_Gap_Frequency' column.
    """
    # Load the CSV file into a DataFrame
    df = pd.read_csv(input_file)

    # Check if required columns exist
    if 'Residue' not in df.columns or 'Non_Gap_Frequency' not in df.columns:
        raise ValueError("The input file must contain 'Residue' and 'Non_Gap_Frequency' columns.")

    # Filter rows: residue is '-' and non-gap frequency is above the threshold
    is_gap_residue = df['Residue'] == '-'
    above_threshold = df['Non_Gap_Frequency'] > min_non_gap_frequency
    filtered_df = df[above_threshold & is_gap_residue]

    # Print the filtered DataFrame to the console
    print("Filtered Data:")
    print(filtered_df)

    # Save the filtered data to a new CSV file
    filtered_df.to_csv(output_file, index=False)
    print(f"Filtered data saved to {output_file}")

# Example usage
# The frequency table is saved as "gap_frequencies.csv" (all lowercase) by the
# gap-frequency step, so the same spelling is used here; a different
# capitalisation is not found on case-sensitive filesystems.
input_file = "gap_frequencies.csv"  # Input CSV file path
output_file = "insertions.csv"  # Output CSV file path


# Run the function
filter_non_gap_frequency_more_than_1(input_file, output_file)
