In [7]:
import argparse
import pandas as pd
import numpy as np

from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import squareform
import matplotlib.pyplot as plt

#LocalAlignment Implemented Using Smith-Waterman
def LocalAlignment(match_reward: int, mismatch_penalty: int, indel_penalty: int,
                    s: str, t: str):
    rows, cols = len(s) + 1, len(t) + 1
    score_matrix = [[0] * cols for _ in range(rows)]

    maxScore = 0
    maxI, maxJ = 0, 0

    # Fill in the matrix based on match, mismatch, and gap penalties
    for i in range(1, rows):
        for j in range(1, cols):
            match_mismatch_score = match_reward if s[i - 1] == t[j - 1] else (-mismatch_penalty)

            diagonal = score_matrix[i - 1][j - 1] + match_mismatch_score
            left = score_matrix[i][j - 1] - indel_penalty
            up = score_matrix[i - 1][j] - indel_penalty

            score_matrix[i][j] = max(diagonal, left, up, 0)
            if score_matrix[i][j] > maxScore:
                maxScore = score_matrix[i][j]
                maxI, maxJ = i, j
        
    
    alignment1, alignment2 = '', ''
    i, j = maxI, maxJ

    while i > 0 and j > 0 and score_matrix[i][j] > 0:
        if score_matrix[i][j] == score_matrix[i - 1][j - 1] + (match_reward if s[i - 1] == t[j - 1] else (-mismatch_penalty)):
            alignment1 = s[i - 1] + alignment1
            alignment2 = t[j - 1] + alignment2
            i -= 1
            j -= 1
        elif score_matrix[i][j] == score_matrix[i - 1][j] - indel_penalty:
            alignment1 = s[i - 1] + alignment1
            alignment2 = '-' + alignment2
            i -= 1
        else:
            alignment1 = '-' + alignment1
            alignment2 = t[j - 1] + alignment2
            j -= 1
    
    return [maxScore,alignment1,alignment2]

def FastaParse(filename):
    seqDict = {}
    with open(filename, 'r') as file:
        line = file.readline().strip()
        while len(line) != 0:
            if line[0 == '>']:
                seqDict[line] = file.readline().strip()
            line = file.readline().strip()
    file.close()
    return seqDict



In [3]:

filename = 'test_data/pone.0192851.s009.faa'
seqs = FastaParse(filename)

m,s,d = 1, -10, -2

alnScores = {}
for seq in seqs:
    alnScores[seq] = {}
    toPopulate = {other for other in seqs if other != seq}
    for i in toPopulate:
        alnScores[seq][i] = {'Score' : None, 'Alignment' : None}

for seq in alnScores:
    for i in alnScores[seq]:
        if alnScores[seq][i]['Score'] == None and alnScores[seq][i]['Alignment'] == None:
            print('aligning ' +seq +' with ' + i)
            
            score, alignment1, alignment2 = LocalAlignment(m,s,d,seqs[seq],seqs[i])
            alnScores[seq][i] = {'Score' : score, 'Alignment' : alignment1}
            alnScores[i][seq] = {'Score' : score, 'Alignment' : alignment2}

#print(seqs)
#print(alnScores)
print('DONE')
scores = {}
for outer_key, inner_dict in alnScores.items():
    scores[outer_key] = {inner_key: values['Score'] for inner_key, values in inner_dict.items()}
df = pd.DataFrame(scores)


aligning >1.A.17.1.1 with >1.A.17.3.11
aligning >1.A.17.1.1 with >1.A.17.4.4
aligning >1.A.17.1.1 with >1.A.17.5.2
aligning >1.A.17.1.1 with >1.A.17.4.2
aligning >1.A.17.1.1 with >1.A.17.4.1
aligning >1.A.17.1.1 with >1.A.17.1.3
aligning >1.A.17.1.1 with >1.A.17.6.4
aligning >1.A.17.1.1 with >1.A.17.3.9
aligning >1.A.17.1.1 with >1.A.17.5.1
aligning >1.A.17.1.1 with >1.A.17.5.7
aligning >1.A.17.1.1 with >1.A.17.4.7
aligning >1.A.17.1.1 with >1.A.17.5.4
aligning >1.A.17.1.1 with >1.A.17.1.4
aligning >1.A.17.1.1 with >1.A.17.6.1
aligning >1.A.17.1.1 with >1.A.17.2.3
aligning >1.A.17.1.1 with >1.A.17.3.10
aligning >1.A.17.1.1 with >1.A.17.3.4
aligning >1.A.17.1.1 with >1.A.17.5.11
aligning >1.A.17.1.1 with >1.A.17.5.6
aligning >1.A.17.1.1 with >1.A.17.5.9
aligning >1.A.17.1.1 with >1.A.17.6.3
aligning >1.A.17.1.1 with >1.A.17.6.7
aligning >1.A.17.1.1 with >1.A.17.1.9
aligning >1.A.17.1.1 with >1.A.17.5.5
aligning >1.A.17.1.1 with >1.A.17.7.5
aligning >1.A.17.1.1 with >1.A.17.1.6
aligning 

Unnamed: 0,>1.A.17.1.1,>1.A.17.1.10,>1.A.17.1.11,>1.A.17.1.12,>1.A.17.1.13,>1.A.17.1.14,>1.A.17.1.15,>1.A.17.1.16,>1.A.17.1.17,>1.A.17.1.18,...,>1.A.17.6.3,>1.A.17.6.4,>1.A.17.6.5,>1.A.17.6.6,>1.A.17.6.7,>1.A.17.7.1,>1.A.17.7.2,>1.A.17.7.3,>1.A.17.7.4,>1.A.17.7.5
>1.A.17.3.11,10416.0,9224.0,7848.0,13338.0,12860.0,9800.0,7712.0,12520.0,8408.0,8408.0,...,9200.0,9824.0,7488.0,9448.0,9648.0,9216.0,7624.0,8272.0,6544.0,9112.0
>1.A.17.4.4,8778.0,8342.0,7020.0,10026.0,9548.0,8618.0,6884.0,9340.0,7580.0,7580.0,...,8312.0,8624.0,6660.0,8464.0,8568.0,8334.0,6796.0,7444.0,5716.0,8248.0
>1.A.17.5.2,8500.0,8166.0,6952.0,9748.0,9270.0,8346.0,6816.0,9062.0,7512.0,7512.0,...,8148.0,8352.0,6592.0,8246.0,8308.0,8158.0,6728.0,7376.0,5648.0,8114.0
>1.A.17.4.2,7746.0,7448.0,6762.0,8994.0,8516.0,7592.0,6626.0,8308.0,7184.0,7196.0,...,7442.0,7598.0,6402.0,7504.0,7554.0,7446.0,6538.0,7108.0,5458.0,7420.0
>1.A.17.4.1,9074.0,8458.0,7094.0,10322.0,9844.0,8878.0,6958.0,9636.0,7654.0,7654.0,...,8440.0,8872.0,6734.0,8640.0,8768.0,8450.0,6870.0,7518.0,5790.0,8358.0


In [14]:
# Step 1: Force Symmetry by averaging the upper and lower triangles
def make_symmetric(df):
    return (df + df.T) / 2

df_symmetric = make_symmetric(df)

# Verify symmetry (just for confirmation)
assert np.allclose(df_symmetric, df_symmetric.T), "DataFrame is not symmetric"

AssertionError: DataFrame is not symmetric

In [10]:
max_score = np.max(df.values)
distance_matrix = max_score - df.values

In [11]:
# Step 2: Perform Hierarchical Clustering
# Convert the DataFrame to a condensed distance matrix, required by linkage
condensed_distance_matrix = squareform(df.values)
Z = linkage(condensed_distance_matrix, method='average')  # You can change the method as needed

# Step 3: Generate and Save the Tree
plt.figure(figsize=(10, 7))
dendrogram(Z, labels=df.index.tolist())
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Sequences')
plt.ylabel('Distance')
plt.show()

# Save the plot as an image
plt.savefig('hierarchical_clustering_tree.png')

ValueError: Distance matrix 'X' must be symmetric.