In [1]:
import argparse
from Bio import AlignIO
from Bio.Align import AlignInfo
import numpy as np
from Bio import SeqIO
import pandas as pd

# Conserved Sequence python function

In [2]:
#Get conserved sequences function:
def get_conserved_sites(msa_file, threshold):
    
    # Read the MSA file
    alignment = AlignIO.read(msa_file, "fasta")

    # Create an AlignInfo summary object
    summary = AlignInfo.SummaryInfo(alignment) # obtains results from the aligment object created before

    # Calculate the conservation of each position in the aln
    conservation = summary.pos_specific_score_matrix()

    # Get the length of the Alignment(Positions length)
    aln_len = alignment.get_alignment_length()

    # Get number of sequences for threshold
    sequence_number = len(alignment)

    # Calculate conservation threshold
    thres_val = threshold * sequence_number # number of ocurrences needed to reach the threshold ex: 0.7*100 = 70 ocurrences threshold

    # Extract conserved sites
    conserved_pos = [] #positions
    conserved_keys = [] #AA letter
    conserved_prob = [] # %of conservation Ex: 0.7 = 7/10
    '''
    for position in conservation:
        if conservation[position] >= thres_val:
            conserved_sites.append(position + 1) # Convert to 1-based indexing
    
    #return conserved_sites
    ''' 
    for i in range(aln_len):
        max_conservation = max(conservation[i].values()) # Most present amino acid in each position number 
        max_key = max(conservation[i], key = conservation[i].get) # Gets letter from the most present aminoacid in position
        if max_conservation >= thres_val:
            conserved_pos.append(i+1) # Convert to 1-based indexing
            conserved_keys.append(max_key)
            conserved_prob.append(max_conservation/sequence_number) # returning it as a %

    results = np.array([conserved_keys, conserved_pos, conserved_prob]).T
    return results

In [3]:
def map_conserved_sites(original_fasta_file, conserved_sites, msa_file):
    # Read the original FASTA file
    sequences = SeqIO.to_dict(SeqIO.parse(original_fasta_file, "fasta"))

    # Find the first sequence dynamically
    first_sequence_name = next(iter(sequences))

    # Convert first sequence to string
    seq1 = str(sequences[first_sequence_name].seq)

    # Read the MSA file to count gaps in the first sequence
    with open(msa_file, 'r') as f:
        msa_record = next(SeqIO.parse(f, 'fasta'))
        msa_seq1 = str(msa_record.seq)

    # Initialize list to store mapped conserved sites
    mapped_conserved_sites = []

    # Map conserved sites to the original sequence
    for conserved_site in conserved_sites:
        conserved_aa, conserved_pos, conserved_prob = conserved_site
        conserved_pos = int(conserved_pos)

        # Initialize gap count
        gap_count = 0

        # Adjust the position based on the number of gaps
        for i in range(conserved_pos):
            if msa_seq1[i] == '-':
                gap_count += 1
        
        # Calculate mapped position
        mapped_pos = conserved_pos - gap_count

        # Append mapped conserved site to the list
        mapped_conserved_sites.append([conserved_aa, mapped_pos, conserved_prob])

    return mapped_conserved_sites

In [42]:
def map_conserved_sites(original_fasta_file, conserved_sites, msa_file):
    # Read the original FASTA file
    sequences = SeqIO.to_dict(SeqIO.parse(original_fasta_file, "fasta"))

    # Find the first sequence dynamically
    first_sequence_name = next(iter(sequences))

    # Convert first sequence to string
    original_seq = str(sequences[first_sequence_name].seq)

    # Read the MSA file
    alignment = AlignIO.read(msa_file, "fasta")
    msa_seq1 = str(alignment[0].seq)

    # Initialize list to store mapped conserved sites
    mapped_conserved_sites = []

    # Map conserved sites to the original sequence
    for conserved_site in conserved_sites:
        conserved_aa, conserved_pos, conserved_prob = conserved_site
        conserved_pos = int(conserved_pos)

        # Initialize gap count
        gap_count = msa_seq1[:conserved_pos].count('-')

        # Calculate mapped position
        
        mapped_pos = conserved_pos - gap_count
        print("Conserved Position:", conserved_pos)
        print("Gap Count:", gap_count)
        print("Mapped Position:", mapped_pos)

        # Append mapped conserved site to the list
        mapped_conserved_sites.append([conserved_aa, mapped_pos, conserved_prob])

    return mapped_conserved_sites


Look for header 
if header present:
    work with index 
if not add the sequence

then run the MSA
then COnseverd on Wild type


In [6]:
x = get_conserved_sites("data/msa_dummy.fasta", 0.0) 

In [7]:
print(x)

[['M' '1' '0.9']
 ['G' '2' '0.2']
 ['G' '3' '0.1']
 ['G' '4' '0.6']
 ['T' '5' '0.6']
 ['V' '6' '0.6']
 ['V' '7' '0.7']
 ['S' '8' '0.7']
 ['G' '9' '0.7']
 ['G' '10' '0.6']
 ['S' '11' '0.6']
 ['M' '12' '0.6']
 ['N' '13' '0.6']
 ['R' '14' '0.2']
 ['D' '15' '0.1']
 ['E' '16' '0.2']
 ['A' '17' '0.1']
 ['L' '18' '0.4']
 ['L' '19' '0.3']
 ['L' '20' '0.2']
 ['R' '21' '0.2']
 ['L' '22' '0.4']
 ['L' '23' '0.3']
 ['A' '24' '0.4']
 ['L' '25' '0.4']
 ['Y' '26' '0.6']
 ['T' '27' '0.5']
 ['I' '28' '0.5']
 ['T' '29' '0.4']
 ['Q' '30' '0.5']
 ['H' '31' '0.4']
 ['L' '32' '0.6']
 ['A' '33' '0.5']
 ['L' '34' '0.3']
 ['I' '35' '0.2']
 ['L' '36' '0.4']
 ['N' '37' '0.2']
 ['S' '38' '0.2']
 ['V' '39' '0.2']
 ['P' '40' '0.1']
 ['L' '41' '0.1']
 ['Q' '42' '0.1']
 ['L' '43' '0.1']
 ['L' '44' '0.1']
 ['L' '45' '0.2']
 ['H' '46' '0.1']
 ['A' '47' '0.2']
 ['G' '48' '0.3']
 ['I' '49' '0.2']
 ['A' '50' '0.2']
 ['P' '51' '0.1']
 ['I' '52' '0.2']
 ['G' '53' '0.3']
 ['H' '54' '0.2']
 ['K' '55' '0.1']
 ['K' '56' '0.1']
 

In [44]:
a = map_conserved_sites("fdh/data/fdh_seq.fasta", x, "fdh/results/fdh_MSA_cluster0.fasta")

Conserved Position: 215
Gap Count: 212
Mapped Position: 3
Conserved Position: 216
Gap Count: 212
Mapped Position: 4
Conserved Position: 217
Gap Count: 212
Mapped Position: 5
Conserved Position: 219
Gap Count: 212
Mapped Position: 7
Conserved Position: 220
Gap Count: 212
Mapped Position: 8
Conserved Position: 221
Gap Count: 212
Mapped Position: 9
Conserved Position: 222
Gap Count: 212
Mapped Position: 10
Conserved Position: 228
Gap Count: 217
Mapped Position: 11
Conserved Position: 286
Gap Count: 267
Mapped Position: 19
Conserved Position: 288
Gap Count: 267
Mapped Position: 21
Conserved Position: 289
Gap Count: 267
Mapped Position: 22
Conserved Position: 290
Gap Count: 267
Mapped Position: 23
Conserved Position: 309
Gap Count: 284
Mapped Position: 25
Conserved Position: 310
Gap Count: 284
Mapped Position: 26
Conserved Position: 312
Gap Count: 284
Mapped Position: 28
Conserved Position: 313
Gap Count: 284
Mapped Position: 29
Conserved Position: 314
Gap Count: 284
Mapped Position: 30
Con

In [45]:
# Create DataFrame
df_conserved = pd.DataFrame(a, columns=['Residue', 'Position', 'conservation'])

    # Write DataFrame to a text file
#df_conserved.to_csv(args.output, sep='\t', index=False)  # Change separator and file format as needed

In [46]:
print(df_conserved)

    Residue  Position        conservation
0         K         3  0.9132636910414689
1         V         4  0.5701171256726812
2         L         5  0.6771130104463438
3         V         7  0.8376068376068376
4         L         8  0.8217790440012662
..      ...       ...                 ...
243       I       400  0.9810066476733144
244       V       401  0.9167458056346945
245       G       404    0.98005698005698
246       A       407  0.8315922760367205
247       Y       411  0.9860715416270972

[248 rows x 3 columns]


# Adding OG seq to each cluster file


In [10]:
import argparse
from Bio import SeqIO
from Bio.Align.Applications import ClustalOmegaCommandline

def add_header(input_file, output_file, header_file):
    """
    Function to add a header sequence to a fasta file if it's missing.
    
    Parameters:
        input_file (str): Path to the input fasta file.
        output_file (str): Path to the output fasta file.
        header_file (str): Path to the fasta file containing the header sequence.
    """
    # Read the header sequence from the header file
    header_record = SeqIO.read(header_file, "fasta")

    # Check if the header exists in the input file
    header_exists = False
    for record in SeqIO.parse(input_file, "fasta"):
        if record.description == header_record.description:
            header_exists = True
            # Check if the sequence matches as well
            if str(record.seq) == str(header_record.seq):
                print("Header and sequence already present in the input file.")
            else:
                print("Header found in the input file, but with a different sequence.")
            break
    
    # If header doesn't exist, add it to the beginning of the file
    if not header_exists:
        SeqIO.write(header_record, output_file, "fasta")
        with open(input_file, "r") as f:
            with open(output_file, "a") as f_out:
                for line in f:
                    f_out.write(line)

In [12]:
input_file = "data/dummy_data.fasta"
output_file = "data/seq_added.fasta"
header_file = "data/dummy_seq.fasta"

add_header(input_file, output_file, header_file)