In [6]:
import argparse
from Bio import AlignIO
from Bio.Align import AlignInfo
import numpy as np
from Bio import SeqIO

In [4]:
#Get conserved sequences function:
def get_conserved_sites(msa_file, threshold):
    """Find the alignment columns whose most frequent residue is conserved.

    Parameters
    ----------
    msa_file : path or handle
        Multiple sequence alignment in FASTA format, passed to ``AlignIO.read``.
    threshold : float
        Minimum fraction of sequences that must share the most frequent
        residue of a column (e.g. 0.7 with 100 sequences means 70 occurrences).

    Returns
    -------
    numpy.ndarray
        One row per conserved column: ``[residue, position, fraction]`` where
        ``position`` is 1-based and ``fraction`` is the residue count divided
        by the number of sequences. Because letters and numbers are packed
        into a single array, numpy stores every entry as a string.
    """
    alignment = AlignIO.read(msa_file, "fasta")
    n_sequences = len(alignment)
    n_columns = alignment.get_alignment_length()

    # Per-column residue counts, indexable by column number.
    pssm = AlignInfo.SummaryInfo(alignment).pos_specific_score_matrix()

    # Number of occurrences a residue needs in order to count as conserved.
    min_count = threshold * n_sequences

    residues, positions, fractions = [], [], []
    for column in range(n_columns):
        counts = pssm[column]
        # Most frequent residue of this column (first one wins on ties).
        top_residue = max(counts, key=counts.get)
        top_count = counts[top_residue]
        if top_count >= min_count:
            residues.append(top_residue)
            positions.append(column + 1)  # report 1-based positions
            fractions.append(top_count / n_sequences)

    return np.array([residues, positions, fractions]).T

In [2]:
def map_conserved_sites(original_fasta_file, conserved_sites):
    """Map alignment positions onto residue positions of the first sequence.

    Each conserved position is shifted left by the number of gap characters
    ('-') that occur in the first record of ``original_fasta_file`` up to and
    including that position. The file therefore has to contain the gapped
    (aligned) form of the sequence for any adjustment to take place.

    Parameters
    ----------
    original_fasta_file : str
        Path to a FASTA file; only its first record is read.
    conserved_sites : iterable of (residue, position, fraction)
        Rows as produced by ``get_conserved_sites``; ``position`` is 1-based
        and may be a string or an integer.

    Returns
    -------
    list of [residue, mapped_position, fraction]
        ``mapped_position`` is an ``int``; the other two fields are passed
        through unchanged. If the first sequence has a gap at the conserved
        column itself, the mapped position is that of the last residue before
        the gap (0 when there is none).

    Raises
    ------
    ValueError
        If the file contains no FASTA record.
    """
    # SeqIO provides ``parse`` rather than ``open``; the parser is lazy, so
    # asking for a single item reads only the first record.
    with open(original_fasta_file) as handle:
        first_record = next(SeqIO.parse(handle, "fasta"), None)
    if first_record is None:
        raise ValueError("No FASTA records found in %r" % (original_fasta_file,))

    seq1 = str(first_record.seq)

    mapped_conserved_sites = []
    for conserved_aa, conserved_pos, conserved_prob in conserved_sites:
        conserved_pos = int(conserved_pos)

        # Gaps before (or at) this column do not consume a residue index.
        num_gaps = seq1[:conserved_pos].count('-')

        mapped_conserved_sites.append(
            [conserved_aa, conserved_pos - num_gaps, conserved_prob]
        )

    return mapped_conserved_sites

In [23]:
# Inline equivalent of map_conserved_sites("data/MSA_dummy_data.fasta", x):
# shift each conserved alignment position left by the gaps that precede it in
# the first record of the file. `x` (the conserved-site table) must already
# exist in the kernel when this cell runs.
sequences = SeqIO.to_dict(SeqIO.parse("data/MSA_dummy_data.fasta", "fasta"))

# The first key of the dictionary identifies the first record that was parsed.
first_sequence_name = next(iter(sequences))
seq1 = str(sequences[first_sequence_name].seq)

mapped_conserved_sites = []
for conserved_aa, conserved_pos, conserved_prob in x:
    conserved_pos = int(conserved_pos)
    # Gaps ('-') up to this position do not consume a residue index.
    mapped_pos = conserved_pos - seq1[:conserved_pos].count('-')
    mapped_conserved_sites.append([conserved_aa, mapped_pos, conserved_prob])

0


In [26]:
def map_conserved_sites(original_fasta_file, conserved_sites, msa_file):
    """Map alignment positions onto residue positions of the first MSA sequence.

    For every conserved site, the number of gap characters ('-') that the
    first sequence of the alignment contains up to and including that column
    is subtracted from the column number. The count is taken per site: gaps
    that lie after a site must not shift it.

    Parameters
    ----------
    original_fasta_file : str
        Kept for backward compatibility with existing calls. The ungapped
        sequences are not needed for the mapping, so the file is not read.
    conserved_sites : iterable of (residue, position, fraction)
        Rows as produced by ``get_conserved_sites``; ``position`` is a 1-based
        alignment column and may be a string or an integer. May be empty.
    msa_file : str
        Path to the alignment in FASTA format; only its first record is read.

    Returns
    -------
    list of [residue, mapped_position, fraction]
        ``mapped_position`` is an ``int``; the other two fields are passed
        through unchanged. If the first sequence has a gap at the conserved
        column itself, the mapped position is that of the last residue before
        the gap (0 when there is none).

    Raises
    ------
    ValueError
        If ``msa_file`` contains no FASTA record.
    """
    # Only the first aligned sequence is needed; the parser is lazy, so
    # requesting one item avoids loading the rest of the alignment.
    with open(msa_file, 'r') as handle:
        msa_record = next(SeqIO.parse(handle, 'fasta'), None)
    if msa_record is None:
        raise ValueError("No FASTA records found in %r" % (msa_file,))
    msa_seq1 = str(msa_record.seq)

    mapped_conserved_sites = []
    for conserved_aa, conserved_pos, conserved_prob in conserved_sites:
        conserved_pos = int(conserved_pos)

        # Count gaps up to THIS column only. A single count taken at the last
        # conserved site would over-correct every earlier site.
        gap_count = msa_seq1[:conserved_pos].count('-')

        mapped_conserved_sites.append(
            [conserved_aa, conserved_pos - gap_count, conserved_prob]
        )

    return mapped_conserved_sites



In [30]:
# Conserved-site table: one row per site, conserved in at least 60% of the sequences.
x = get_conserved_sites(msa_file="data/test_MSA.fasta", threshold=0.6)

In [28]:
# MSA from Clustal Omega; `a` is indexed by the inspection cells further down,
# so it has to be defined for the notebook to run from a fresh kernel.
a = AlignIO.read("data/msa_dummy.fasta", "fasta")
print(x)
# map_conserved_sites has required arguments; calling it without any raises
# TypeError. NOTE(review): paths taken from the call in a later cell - confirm.
r = map_conserved_sites("data/MSA_dummy_data.fasta", x, "data/msa_dummy.fasta")

[['M' '1' '0.9']
 ['G' '4' '0.6']
 ['T' '5' '0.6']
 ['V' '6' '0.6']
 ['V' '7' '0.7']
 ['S' '8' '0.7']
 ['G' '9' '0.7']
 ['G' '10' '0.6']
 ['S' '11' '0.6']
 ['M' '12' '0.6']
 ['N' '13' '0.6']
 ['Y' '26' '0.6']
 ['L' '32' '0.6']]


In [27]:
# Map the conserved alignment positions onto the first sequence and display the result.
map_conserved_sites(
    original_fasta_file="data/MSA_dummy_data.fasta",
    conserved_sites=x,
    msa_file="data/msa_dummy.fasta",
)

[['M', 1, '0.9'],
 ['G', 4, '0.6'],
 ['T', 5, '0.6'],
 ['V', 6, '0.6'],
 ['V', 7, '0.7'],
 ['S', 8, '0.7'],
 ['G', 9, '0.7'],
 ['G', 10, '0.6'],
 ['S', 11, '0.6'],
 ['M', 12, '0.6'],
 ['N', 13, '0.6'],
 ['Y', 26, '0.6'],
 ['L', 32, '0.6']]

In [53]:
# Does the residue of the first conserved site equal the first character of
# the first sequence in `a`? (`a` is presumably the alignment read with
# AlignIO; its assignment is commented out in an earlier cell - confirm.)
x[0,0] == a[0,0]

True

In [70]:
# x[k] is one conserved site: x[k, 0] is the conserved amino acid, x[k, 1] its
# 1-based alignment position and x[k, 2] the fraction of sequences carrying it.
# The entries are strings, hence the int() conversion before doing arithmetic.
int(x[0,1]) + int(x[0,1])

2

In [66]:
# Character of the first sequence in `a` at 0-based column 85.
a[0,85]

'-'

In [86]:
length = a.get_alignment_length() # Length (number of columns) of the alignment held in `a`

In [101]:

# Print every conserved site whose residue is also what the first sequence of
# `a` carries at that column. Only column `position - 1` can satisfy the
# position check, so it is looked up directly instead of scanning all columns.
for k in range(len(x)):  # one row of x per conserved site
    i = int(x[k, 1]) - 1  # 1-based position -> 0-based alignment column
    if 0 <= i < length and x[k, 0] == a[0, i]:
        print(x[k])

            
         

['M' '1' '0.9']
['G' '4' '0.6']
['T' '5' '0.6']
['V' '6' '0.6']
['V' '7' '0.7']
['S' '8' '0.7']
['G' '9' '0.7']
['G' '10' '0.6']
['S' '11' '0.6']
['M' '12' '0.6']
['N' '13' '0.6']
