In [24]:
import argparse
from Bio import AlignIO
from Bio.Align import AlignInfo
import numpy as np
from Bio import SeqIO
import pandas as pd

# Conserved Sequence python function

In [2]:
#Get conserved sequences function:
def get_conserved_sites(msa_file, threshold):
    
    # Read the MSA file
    alignment = AlignIO.read(msa_file, "fasta")

    # Create an AlignInfo summary object
    summary = AlignInfo.SummaryInfo(alignment) # obtains results from the aligment object created before

    # Calculate the conservation of each position in the aln
    conservation = summary.pos_specific_score_matrix()

    # Get the length of the Alignment(Positions length)
    aln_len = alignment.get_alignment_length()

    # Get number of sequences for threshold
    sequence_number = len(alignment)

    # Calculate conservation threshold
    thres_val = threshold * sequence_number # number of ocurrences needed to reach the threshold ex: 0.7*100 = 70 ocurrences threshold

    # Extract conserved sites
    conserved_pos = [] #positions
    conserved_keys = [] #AA letter
    conserved_prob = [] # %of conservation Ex: 0.7 = 7/10
    '''
    for position in conservation:
        if conservation[position] >= thres_val:
            conserved_sites.append(position + 1) # Convert to 1-based indexing
    
    #return conserved_sites
    ''' 
    for i in range(aln_len):
        max_conservation = max(conservation[i].values()) # Most present amino acid in each position number 
        max_key = max(conservation[i], key = conservation[i].get) # Gets letter from the most present aminoacid in position
        if max_conservation >= thres_val:
            conserved_pos.append(i+1) # Convert to 1-based indexing
            conserved_keys.append(max_key)
            conserved_prob.append(max_conservation/sequence_number) # returning it as a %

    results = np.array([conserved_keys, conserved_pos, conserved_prob]).T
    return results

In [3]:
def map_conserved_sites(original_fasta_file, conserved_sites, msa_file):
    # Read the original FASTA file
    sequences = SeqIO.to_dict(SeqIO.parse(original_fasta_file, "fasta"))

    # Find the first sequence dynamically
    first_sequence_name = next(iter(sequences))

    # Convert first sequence to string
    seq1 = str(sequences[first_sequence_name].seq)

    # Read the MSA file to count gaps in the first sequence
    with open(msa_file, 'r') as f:
        msa_record = next(SeqIO.parse(f, 'fasta'))
        msa_seq1 = str(msa_record.seq)

    # Initialize list to store mapped conserved sites
    mapped_conserved_sites = []

    # Map conserved sites to the original sequence
    for conserved_site in conserved_sites:
        conserved_aa, conserved_pos, conserved_prob = conserved_site
        conserved_pos = int(conserved_pos)

        # Initialize gap count
        gap_count = 0

        # Adjust the position based on the number of gaps
        for i in range(conserved_pos):
            if msa_seq1[i] == '-':
                gap_count += 1
        
        # Calculate mapped position
        mapped_pos = conserved_pos - gap_count

        # Append mapped conserved site to the list
        mapped_conserved_sites.append([conserved_aa, mapped_pos, conserved_prob])

    return mapped_conserved_sites

In [28]:
def map_conserved_sites(original_fasta_file, conserved_sites, msa_file):
    # Read the original FASTA file
    sequences = SeqIO.to_dict(SeqIO.parse(original_fasta_file, "fasta"))

    # Find the first sequence dynamically
    first_sequence_name = next(iter(sequences))

    # Convert first sequence to string
    original_seq = str(sequences[first_sequence_name].seq)

    # Read the MSA file
    alignment = AlignIO.read(msa_file, "fasta")
    msa_seq1 = str(alignment[0].seq)

    # Initialize list to store mapped conserved sites
    mapped_conserved_sites = []

    # Map conserved sites to the original sequence
    for conserved_site in conserved_sites:
        conserved_aa, conserved_pos, conserved_prob = conserved_site
        conserved_pos = int(conserved_pos)

        # Initialize gap count
        gap_count = msa_seq1[:conserved_pos].count('-')

        # Calculate mapped position
        
        mapped_pos = conserved_pos - gap_count
        print("Conserved Position:", conserved_pos)
        print("Gap Count:", gap_count)
        print("Mapped Position:", mapped_pos)

        # Append mapped conserved site to the list
        mapped_conserved_sites.append([conserved_aa, mapped_pos, conserved_prob])

    return mapped_conserved_sites


In [29]:
x = get_conserved_sites("fdh/results/fdh_MSA_cluster0.fasta", 0.5) 

In [37]:
a = map_conserved_sites("fdh/data/fdh_seq.fasta", x, "fdh/results/fdh_MSA_cluster0.fasta")

In [39]:
# Create DataFrame
df_conserved = pd.DataFrame(a, columns=['Residue', 'Position', 'conservation'])

    # Write DataFrame to a text file
#df_conserved.to_csv(args.output, sep='\t', index=False)  # Change separator and file format as needed

In [41]:
print(df_conserved)

   Residue  Position conservation
0        M         0          0.9
1        G         0          0.6
2        T         0          0.6
3        V         0          0.6
4        V         0          0.7
5        S         0          0.7
6        G         0          0.7
7        G         0          0.6
8        S         0          0.6
9        M         0          0.6
10       N         0          0.6
11       Y         0          0.6
12       T         0          0.5
13       I         0          0.5
14       Q         0          0.5
15       L         0          0.6
16       A         0          0.5
17       V         0          0.5
