### Homework solutions


## Needleman-Wunsch



In [2]:
def needleman_wunsch_with_matrix(seq1, seq2, score_matrix, gap_penalty=-1):
    """Globally align two sequences with the Needleman-Wunsch algorithm.

    Args:
        seq1: First sequence; its symbols index the rows of the DP table.
        seq2: Second sequence; its symbols index the columns of the DP table.
        score_matrix: Nested mapping where ``score_matrix[a][b]`` is the score
            for aligning symbol ``a`` of ``seq1`` with symbol ``b`` of ``seq2``.
        gap_penalty: Score added for every gap position (linear gap cost).

    Returns:
        A tuple ``(aligned_seq1, aligned_seq2, score)``; gaps are written as
        ``'-'`` and ``score`` is the value in the bottom-right DP cell.

    Raises:
        KeyError: If a pair of symbols that has to be scored is missing from
            ``score_matrix``.
    """
    # DP table with one extra row/column for the empty prefix.
    m, n = len(seq1) + 1, len(seq2) + 1
    alignment_matrix = [[0] * n for _ in range(m)]

    # Aligning a prefix against the empty string costs one gap per symbol.
    for i in range(m):
        alignment_matrix[i][0] = i * gap_penalty
    for j in range(n):
        alignment_matrix[0][j] = j * gap_penalty

    # Fill in the table: best of match/mismatch, gap in seq2, gap in seq1.
    for i in range(1, m):
        for j in range(1, n):
            match_mismatch_score = score_matrix[seq1[i - 1]][seq2[j - 1]]
            diagonal_score = alignment_matrix[i - 1][j - 1] + match_mismatch_score
            up_score = alignment_matrix[i - 1][j] + gap_penalty
            left_score = alignment_matrix[i][j - 1] + gap_penalty
            alignment_matrix[i][j] = max(diagonal_score, up_score, left_score)

    # Traceback from the bottom-right corner. Columns are collected in reverse
    # and joined once at the end instead of prepending to strings.
    # Ties are resolved in the order: diagonal, up, left.
    rev_seq1, rev_seq2 = [], []
    i, j = m - 1, n - 1
    while i > 0 or j > 0:
        current = alignment_matrix[i][j]
        if (i > 0 and j > 0
                and current == alignment_matrix[i - 1][j - 1]
                + score_matrix[seq1[i - 1]][seq2[j - 1]]):
            rev_seq1.append(seq1[i - 1])
            rev_seq2.append(seq2[j - 1])
            i -= 1
            j -= 1
        elif i > 0 and (j == 0 or current == alignment_matrix[i - 1][j] + gap_penalty):
            # On the first column the only legal move is up. Taking it
            # unconditionally avoids relying on exact float equality
            # (i * gap may differ from (i - 1) * gap + gap by rounding),
            # which previously sent the traceback to a negative column.
            rev_seq1.append(seq1[i - 1])
            rev_seq2.append('-')
            i -= 1
        else:
            # j > 0 is guaranteed here: j == 0 implies i > 0 and the branch above.
            rev_seq1.append('-')
            rev_seq2.append(seq2[j - 1])
            j -= 1

    align_seq1 = ''.join(reversed(rev_seq1))
    align_seq2 = ''.join(reversed(rev_seq2))
    return align_seq1, align_seq2, alignment_matrix[-1][-1]

# Example usage: align two short DNA sequences with the default gap penalty.
seq1, seq2 = "AGTACGCA", "TATGC"

# Nucleotide substitution scores, looked up as match_matrix[a][b].
match_matrix = dict(
    A=dict(A=1, C=-1, G=0, T=0),
    C=dict(A=-1, C=1, G=0, T=0),
    G=dict(A=0, C=0, G=1, T=-1),
    T=dict(A=0, C=0, G=-1, T=1),
)

alignment1, alignment2, score = needleman_wunsch_with_matrix(
    seq1, seq2, score_matrix=match_matrix
)

print("Sequence 1:", alignment1)
print("Sequence 2:", alignment2)
print("Alignment Score:", score)


Sequence 1: AGTACGCA
Sequence 2: --TATGC-
Alignment Score: 1


## Affine gap algorithm

In [5]:
# Substitution scores stored as whitespace-separated text: a header row of
# column letters, then one row per amino acid (row letter followed by integer
# scores). The trailing split/slice drops the empty first and last lines that
# the triple-quoted literal produces, leaving a list of text rows.
blosum62matrix = """
   A  C  D  E  F  G  H  I  K  L  M  N  P  Q  R  S  T  V  W  Y
A  4  0 -2 -1 -2  0 -2 -1 -1 -1 -1 -2 -1 -1 -1  1  0  0 -3 -2
C  0  9 -3 -4 -2 -3 -3 -1 -3 -1 -1 -3 -3 -3 -3 -1 -1 -1 -2 -2
D -2 -3  6  2 -3 -1 -1 -3 -1 -4 -3  1 -1  0 -2  0 -1 -3 -4 -3
E -1 -4  2  5 -3 -2  0 -3  1 -3 -2  0 -1  2  0  0 -1 -2 -3 -2
F -2 -2 -3 -3  6 -3 -1  0 -3  0  0 -3 -4 -3 -3 -2 -2 -1  1  3
G  0 -3 -1 -2 -3  6 -2 -4 -2 -4 -3  0 -2 -2 -2  0 -2 -3 -2 -3
H -2 -3 -1  0 -1 -2  8 -3 -1 -3 -2  1 -2  0  0 -1 -2 -3 -2  2
I -1 -1 -3 -3  0 -4 -3  4 -3  2  1 -3 -3 -3 -3 -2 -1  3 -3 -1
K -1 -3 -1  1 -3 -2 -1 -3  5 -2 -1  0 -1  1  2  0 -1 -2 -3 -2
L -1 -1 -4 -3  0 -4 -3  2 -2  4  2 -3 -3 -2 -2 -2 -1  1 -2 -1
M -1 -1 -3 -2  0 -3 -2  1 -1  2  5 -2 -2  0 -1 -1 -1  1 -1 -1
N -2 -3  1  0 -3  0  1 -3  0 -3 -2  6 -2  0  0  1  0 -3 -4 -2
P -1 -3 -1 -1 -4 -2 -2 -3 -1 -3 -2 -2  7 -1 -2 -1 -1 -2 -4 -3
Q -1 -3  0  2 -3 -2  0 -3  1 -2  0  0 -1  5  1  0 -1 -2 -2 -1
R -1 -3 -2  0 -3 -2  0 -3  2 -2 -1  0 -2  1  5 -1 -1 -3 -3 -2
S  1 -1  0  0 -2  0 -1 -2  0 -2 -1  1 -1  0 -1  4  1 -2 -3 -2
T  0 -1 -1 -1 -2 -2 -2 -1 -1 -1 -1  0 -1 -1 -1  1  5  0 -2 -2
V  0 -1 -3 -2 -1 -3 -3  3 -2  1  1 -3 -2 -2 -3 -2  0  4 -3 -1
W -3 -2 -4 -3  1 -2 -2 -3 -3 -2 -1 -4 -4 -2 -3 -3 -2 -3 11  2
Y -2 -2 -3 -2  3 -3  2 -1 -2 -1 -1 -2 -3 -1 -2 -2 -2 -1  2  7
""".split('\n')[1:-1]

# Score lookup table, empty until initialize_blosum62() fills it; keys are
# (row_letter, column_letter) tuples and values are integer scores.
blosum62 = dict()


def initialize_blosum62():
    """Populate the module-level ``blosum62`` dict from ``blosum62matrix``.

    The first line of ``blosum62matrix`` supplies the column letters. Every
    following line is a one-character row label followed by integer scores.
    Entries are stored as ``blosum62[(row_letter, column_letter)] = score``.
    The row letter is taken by position from the header; the label character
    at the start of each row is skipped rather than read.
    """
    letters = blosum62matrix[0].split()
    for row_idx, line in enumerate(blosum62matrix[1:]):
        # Drop the leading row label, then parse the whole row before storing.
        row_scores = list(map(int, line[1:].split()))
        for col_idx, score in enumerate(row_scores):
            blosum62[letters[row_idx], letters[col_idx]] = score


def global_alignment_affine(s, t, gap_open=11, gap_extend=1):
    """Print an optimal global alignment of ``s`` and ``t`` with affine gaps.

    Substitution scores come from the module-level ``blosum62`` dict, which is
    filled via ``initialize_blosum62()`` on first use if it is still empty.
    A gap of length k costs ``gap_open + (k - 1) * gap_extend``.

    Args:
        s: First sequence.
        t: Second sequence.
        gap_open: Cost of the first position of a gap.
        gap_extend: Cost of every further position of the same gap.

    Returns:
        None. The score and the two aligned strings (gaps shown as ``'-'``)
        are printed on three separate lines.

    Raises:
        KeyError: If a residue pair is missing from ``blosum62``.
    """
    if not blosum62:
        initialize_blosum62()

    n_rows, n_cols = len(s) + 1, len(t) + 1
    neg_inf = -float('inf')

    # Three DP layers: `down` ends with a gap in t, `right` ends with a gap
    # in s, and `m` holds the best score over all three endings.
    down = [[0] * n_cols for _ in range(n_rows)]
    right = [[0] * n_cols for _ in range(n_rows)]
    m = [[0] * n_cols for _ in range(n_rows)]

    # Borders: a prefix aligned against nothing is a single growing gap.
    for r in range(1, n_rows):
        border = -gap_open - (r - 1) * gap_extend
        down[r][0] = border
        right[r][0] = neg_inf
        m[r][0] = border
    for c in range(1, n_cols):
        border = -gap_open - (c - 1) * gap_extend
        down[0][c] = neg_inf
        right[0][c] = border
        m[0][c] = border

    # Interior: extend an existing gap or open a new one from `m`.
    for r in range(1, n_rows):
        for c in range(1, n_cols):
            down[r][c] = max(down[r - 1][c] - gap_extend, m[r - 1][c] - gap_open)
            right[r][c] = max(right[r][c - 1] - gap_extend, m[r][c - 1] - gap_open)
            m[r][c] = max(down[r][c],
                          right[r][c],
                          m[r - 1][c - 1] + blosum62[(s[r - 1], t[c - 1])])

    i, j = len(s), len(t)

    # Layer in which the optimal path ends (ties prefer `down`, then `right`).
    if m[i][j] == down[i][j]:
        state = 'up'
    elif m[i][j] == right[i][j]:
        state = 'left'
    else:
        state = 'diagonal'

    # Alignment columns are collected back-to-front and reversed at the end.
    s_cols, t_cols = [], []

    while i > 0 or j > 0:
        if j == 0:
            # Only s is left: pad t with gaps.
            i -= 1
            s_cols.append(s[i])
            t_cols.append('-')
            continue
        if i == 0:
            # Only t is left: pad s with gaps.
            j -= 1
            s_cols.append('-')
            t_cols.append(t[j])
            continue

        up = max(down[i - 1][j] - gap_extend, m[i - 1][j] - gap_open)
        left = max(right[i][j - 1] - gap_extend, m[i][j - 1] - gap_open)

        if state == 'up':
            # If the gap was not an extension it was opened here: go back to `m`.
            if up != down[i - 1][j] - gap_extend:
                state = 'diagonal'
            i -= 1
            s_cols.append(s[i])
            t_cols.append('-')
        elif state == 'left':
            if left != right[i][j - 1] - gap_extend:
                state = 'diagonal'
            j -= 1
            s_cols.append('-')
            t_cols.append(t[j])
        else:
            diagonal = m[i - 1][j - 1] + blosum62[(s[i - 1], t[j - 1])]
            if up == max(up, diagonal, left):
                # Switch layer only; the move itself happens next iteration.
                state = 'up'
            elif left > diagonal:
                state = 'left'
            else:
                i -= 1
                j -= 1
                s_cols.append(s[i])
                t_cols.append(t[j])

    s_align = ''.join(reversed(s_cols))
    t_align = ''.join(reversed(t_cols))

    print(m[-1][-1])
    print(s_align)
    print(t_align)



In [6]:
# Align two example protein sequences with the default penalties
# (gap_open=11, gap_extend=1); the function prints the score followed by the
# two aligned strings.
global_alignment_affine("PTIMWAYSKPVSNGIMFLILLTWCRCYIRPSNFHMRTQNWTFKVNQLLPLDKYSCMWGIQGMYASTWLSHGCVRVWQYPVM", "PFGKWTHLIVSNGIYNSQWTTHHNFQESLELLTWCRCYIRPSMRTQNWTFKVNQLLPLDKYSCMWKIQGMYASTLSHGCVRVWQYPVM")


302
PTIMWAYSKPVSNGIM------------FLILLTWCRCYIRPSNFHMRTQNWTFKVNQLLPLDKYSCMWGIQGMYASTWLSHGCVRVWQYPVM
PFGKWTHLI-VSNGIYNSQWTTHHNFQESLELLTWCRCYIRPS---MRTQNWTFKVNQLLPLDKYSCMWKIQGMYAST-LSHGCVRVWQYPVM


## Number of alignments

![correct](formula.png)

![stirling](formula2.png)

### Experimenting with BLAST

## Random Sequences and BLAST.
Sequence alignments can be performed online using the programs and data provided by the National Center for Biotechnology Information (NCBI).


## Task 1

Generate a random amino acid sequence and run it against a database of non-redundant sequences using BLAST (http://www.ncbi.nlm.nih.gov/BLAST/); use the standard protein-protein BLAST [blastp]. Create a single FASTA file that contains several amino acid sequences of lengths 10, 50, 100, and 1500 residues. Did you find any “false homologs” in the database?

## Task 2

Generate random amino acid sequences using the amino acid frequencies from the table below and with a PERIODICITY of hydrophobic and non-hydrophobic residues. Run these sequences using BLAST. Interpret your results.

In [21]:
import pandas as pd

# Amino acid information: three-letter code -> hydrophobicity flag and the
# numeric 'value' from the task's table. Hydrophobic residues are listed
# first, then the non-hydrophobic ones.
amino_acids = {
    code: {'hydrophobic': is_hydrophobic, 'value': value}
    for is_hydrophobic, group in (
        (True, (
            ('CYS', 1.660), ('MET', 2.370), ('PHE', 4.100), ('ILE', 5.810),
            ('LEU', 9.430), ('VAL', 6.580), ('TRP', 1.240), ('TYR', 3.190),
            ('ALA', 7.580), ('GLY', 6.840),
        )),
        (False, (
            ('THR', 5.670), ('SER', 7.130), ('GLN', 3.970), ('ASN', 4.440),
            ('GLU', 6.360), ('ASP', 5.270), ('HIS', 2.240), ('ARG', 5.160),
            ('LYS', 5.940), ('PRO', 4.920),
        )),
    )
    for code, value in group
}

# Three-letter code -> one-letter code.
one_letter_mapping = dict(
    CYS='C', MET='M', PHE='F', ILE='I', LEU='L',
    VAL='V', TRP='W', TYR='Y', ALA='A', GLY='G',
    THR='T', SER='S', GLN='Q', ASN='N', GLU='E',
    ASP='D', HIS='H', ARG='R', LYS='K', PRO='P',
)

# One row per amino acid, indexed by three-letter code, plus a
# 'One_Letter' column derived from the index.
df = (
    pd.DataFrame.from_dict(amino_acids, orient='index')
    .rename_axis('Amino Acid')
    .assign(One_Letter=lambda frame: frame.index.map(one_letter_mapping))
)

# Display the DataFrame
print(df)

            hydrophobic  value One_Letter
Amino Acid                               
CYS                True   1.66          C
MET                True   2.37          M
PHE                True   4.10          F
ILE                True   5.81          I
LEU                True   9.43          L
VAL                True   6.58          V
TRP                True   1.24          W
TYR                True   3.19          Y
ALA                True   7.58          A
GLY                True   6.84          G
THR               False   5.67          T
SER               False   7.13          S
GLN               False   3.97          Q
ASN               False   4.44          N
GLU               False   6.36          E
ASP               False   5.27          D
HIS               False   2.24          H
ARG               False   5.16          R
LYS               False   5.94          K
PRO               False   4.92          P


In [20]:
# Show the amino-acid DataFrame `df` as this cell's output.
df

Unnamed: 0_level_0,hydrophobic,value
Amino Acid,Unnamed: 1_level_1,Unnamed: 2_level_1
CYS,True,1.66
MET,True,2.37
PHE,True,4.1
ILE,True,5.81
LEU,True,9.43
VAL,True,6.58
TRP,True,1.24
TYR,True,3.19
ALA,True,7.58
GLY,True,6.84


Random Sequence: QRKEICYIAKNPTKKENKKRIMFWLIYIFGVAIQSKHPQDDNPTFAMGGC


### Task 3 
Take an amino acid sequence (a protein of your choice, or one of the proteins suggested below) and introduce X% random mutations. Run the mutated proteins against the database using BLAST or PSI-BLAST (same web page). Try different mutation frequencies (X = 0, …, 100%). What level of mutation is tolerated by BLAST? By PSI-BLAST? Interpret your results.

Myoglobin: "MAKRRGSVPGRVREYWLPSPCWKCHMLHQGKWWGRRSQGMGGAEGFMEHGSTTLQRKPGASSELGILQVRDLSWLVQPQAQTCCGSFVPLSAGLRASAK"
Histone H2B: "MTDKITKKKRNETYSIYIYKVLRQVHPKIGVSSKAMNIMNSFVNDLFERLVSESYNLSNSSRSKTLTAREIQTSVRLVIPGELAKHSVSEGTKAVAKYRSSI"
SH3 Domain: "MDETGKELVLALYDYQEKSPREVTMKKGDILTLLNSTNKDWWKVEVNDRQGYVPAAYVKKLD"

### Task 4

Introduce X% mutations such that each mutated hydrophobic amino acid is replaced by a random hydrophobic one, and each mutated non-hydrophobic amino acid by a random non-hydrophobic one. Run the mutated sequences using BLAST. Did the threshold level for X change?