In [None]:
import numpy as np
import pandas as pd

# Hard-coded example alignment: one string per sequence, one character per
# alignment column.
alignment = [
    "MVLSPADKTNVKGKVGAHAGEYGAAAW",
    "MKRLPADPPCVKGKVKAKAGDYGATTW",
    "MALSAADKTNVKSKVGGHAGEYGAATS",
    "MVLSAADKTNVKSKAGGNAGEWWAAAW",
    "MVLSAADKTNVKSKVLANAGEFGAAAW",
    "ALLPIRTTYHKKCASGHIPEEKDLNNV",
    "DEASSLKGHHIKKLEADALLIPLSASS",
]

# One-letter residue codes; their order fixes the row order of the final table.
residues = list("GAVLIPFYWSTCMNQDEKRH")
num_sequences = len(alignment)
alignment_length = len(alignment[0])  # column count is taken from the first sequence

# frequency_matrix[residue][column] = number of sequences that carry `residue`
# in that (0-based) column.
frequency_matrix = {symbol: [0] * alignment_length for symbol in residues}
for sequence in alignment:
    for column, symbol in enumerate(sequence):
        tally = frequency_matrix.get(symbol)
        # Characters that are not listed in `residues` are skipped.
        if tally is not None:
            tally[column] += 1

# `p` is the reciprocal of the number of residues. It is used both as the
# pseudocount added to every raw count and as the factor in the denominator,
# so each weight is log((count + p) / (p * (num_sequences + 1))), rounded to
# two decimals.
p = 1 / len(residues)
denominator = p * (num_sequences + 1)  # identical for every cell, so computed once
weight_matrix = {}
for symbol, column_counts in frequency_matrix.items():
    weight_matrix[symbol] = [
        round(np.log((count + p) / denominator), 2) for count in column_counts
    ]

# Tabulate with residues as rows and 1-based alignment positions as columns.
df = pd.DataFrame(weight_matrix).transpose()
df.columns = range(1, alignment_length + 1)

# Show the resulting table.
print(df)


     1     2     3     4     5     6     7     8     9     10  ...    18  \
G -2.08 -2.08 -2.08 -2.08 -2.08 -2.08 -2.08  0.97 -2.08 -2.08  ... -2.08   
A  0.97  0.97  0.97 -2.08  2.03  2.54 -2.08 -2.08 -2.08 -2.08  ...  0.97   
V -2.08  2.03 -2.08 -2.08 -2.08 -2.08 -2.08 -2.08 -2.08 -2.08  ... -2.08   
L -2.08  0.97  2.54  0.97 -2.08  0.97 -2.08 -2.08 -2.08 -2.08  ... -2.08   
I -2.08 -2.08 -2.08 -2.08  0.97 -2.08 -2.08 -2.08 -2.08 -2.08  ...  0.97   
P -2.08 -2.08 -2.08  0.97  1.63 -2.08 -2.08  0.97  0.97 -2.08  ... -2.08   
F -2.08 -2.08 -2.08 -2.08 -2.08 -2.08 -2.08 -2.08 -2.08 -2.08  ... -2.08   
Y -2.08 -2.08 -2.08 -2.08 -2.08 -2.08 -2.08 -2.08  0.97 -2.08  ... -2.08   
W -2.08 -2.08 -2.08 -2.08 -2.08 -2.08 -2.08 -2.08 -2.08 -2.08  ... -2.08   
S -2.08 -2.08 -2.08  2.54  0.97 -2.08 -2.08 -2.08 -2.08 -2.08  ... -2.08   
T -2.08 -2.08 -2.08 -2.08 -2.08 -2.08  0.97  0.97  2.32 -2.08  ... -2.08   
C -2.08 -2.08 -2.08 -2.08 -2.08 -2.08 -2.08 -2.08 -2.08  0.97  ... -2.08   
M  2.54 -2.0