In [1]:
import numpy as np
import os
import collections
import copy
import pickle
import random
import time

In [2]:
"""
Read alignment file into numpy array
"""
def func_file2seq(filepath):
    seq_list = []
    with open(filepath,'r') as f1:
        for line in f1:
            if line[0] == '>':
                continue
            else:
                seq = list(line.strip().replace('\n','').upper())
                seq_list.append(seq)
    return np.array(seq_list)

In [None]:
'''
Due to limited computational power and lack of optimization, this code takes forever to compute.
Therefore, randomly sampled 100 sequence from each class and saved also to save memory.

'''

super_directory ="/home/chingyuenliu/TTIC31160_bioinfo/hw2/Super"
twil_directory ="/home/chingyuenliu/TTIC31160_bioinfo/hw2/Twil"

super_seq_list = []
for root, dirs, files in os.walk(super_directory):
    for file in files:
        if file.endswith(".fasta") and file.startswith('d'):
            super_seq_list.append(func_file2seq(os.path.join(root,file)))
            
twil_seq_list = []
for root, dirs, files in os.walk(twil_directory):
    for file in files:
        if file.endswith(".fasta") and file.startswith('d'):
            twil_seq_list.append(func_file2seq(os.path.join(root,file)))

random.shuffle(super_seq_list)
random.shuffle(twil_seq_list)

with open("super_dataset_sampled100.txt", "wb") as fp:   #Pickling
    pickle.dump(super_seq_list[:100], fp)

with open("twil_dataset_sampled100.txt","wb") as fp:
    pickle.dump(twil_seq_list[:100], fp)


In [3]:
def pure_seq(seq_list):
    pure_list=[]
    for i in seq_list:
        seq_pair = []
        for j in i:
            seq_pair.append([aa for aa in j if aa != '-' ])
        pure_list.append(seq_pair)
    return pure_list

In [4]:
def blosum2mat(blosum_filepath):
    seq_list = []
    with open(blosum_filepath,'r') as f1:
        for line in f1:
            if line[0] == '#' or line[3] =='A':
                continue
            else:
                seq = list(map(float,line[1:].strip().replace('\n','').split()))
                seq_list.append(seq)
    return np.array(seq_list)[:20,:20]

In [5]:
def logsum(num_list):
    #already logrithmic entry
    if len(num_list) == 0: return -np.inf
    anker = np.inf
    second_half = np.inf
    if type(num_list) == list:
        num_list = np.array(num_list).reshape(-1)

    else:
        num_list = num_list.reshape(-1)

        
    for i in range(num_list.shape[0]):
        if (not np.isinf(num_list[i])) and (not np.isnan(num_list[i])):
            anker = num_list[i]
            second_half = np.delete(num_list,i)
            break
    
    if anker == np.inf:
        return -np.inf
    second_half = np.clip(second_half - anker, -np.inf,1233.9)
    return anker+np.log(np.exp(second_half).sum()+1)
    

0.0

In [6]:
def index_determ(i,j,s2):
    if s2 == 0:
        return i+1, j+1, s2
    elif s2 == 1:
        return i+1, j, s2
    elif s2 == 2:
        return i, j+1, s2

def super_matrix_update(seq):
    global freq_matrix_sup
    for col_in in range(seq.shape[1]):
        values,counts = np.unique(seq[:,col_in],return_counts=True)

        index = np.where(values == '-')

        values = np.delete(values,index)
        counts = np.delete(counts,index)

        if np.sum(counts)<2:
            continue
        else:
            for i in range(len(values)):
                for j in range(i,len(values)):
                    if j == i:
                        freq_matrix_sup[aa2index[values[i]],aa2index[values[j]]] += counts[i]*(counts[i]-1)*0.01
                    else:
                        freq_matrix_sup[aa2index[values[i]],aa2index[values[j]]] += counts[i]*counts[j]*0.01
                        freq_matrix_sup[aa2index[values[j]],aa2index[values[i]]] += counts[i]*counts[j]*0.01

def twil_matrix_update(seq):
    global freq_matrix_twil
    for col_in in range(seq.shape[1]):
        values,counts = np.unique(seq[:,col_in],return_counts=True)

        index = np.where(values == '-')

        values = np.delete(values,index)
        counts = np.delete(counts,index)

        if np.sum(counts)<2:
            continue
        else:
            for i in range(len(values)):
                for j in range(i,len(values)):
                    if j == i:
                        freq_matrix_twil[aa2index[values[i]],aa2index[values[j]]] += counts[i]*(counts[i]-1)*0.01
                    else:
                        freq_matrix_twil[aa2index[values[i]],aa2index[values[j]]] += counts[i]*counts[j]*0.01
                        freq_matrix_twil[aa2index[values[j]],aa2index[values[i]]] += counts[i]*counts[j]*0.01

In [7]:
one2all ={'A': ('A', 'ALA', 'alanine'),
              'R': ('R', 'ARG', 'arginine'),
              'N': ('N', 'ASN', 'asparagine'),
              'D': ('D', 'ASP', 'aspartic acid'),
              'C': ('C', 'CYS', 'cysteine'),
              'Q': ('Q', 'GLN', 'glutamine'),
              'E': ('E', 'GLU', 'glutamic acid'),
              'G': ('G', 'GLY', 'glycine'),
              'H': ('H', 'HIS', 'histidine'),
              'I': ('I', 'ILE', 'isoleucine'),
              'L': ('L', 'LEU', 'leucine'),
              'K': ('K', 'LYS', 'lysine'),
              'M': ('M', 'MET', 'methionine'),
              'F': ('F', 'PHE', 'phenylalanine'),
              'P': ('P', 'PRO', 'proline'),
              'S': ('S', 'SER', 'serine'),
              'T': ('T', 'THR', 'threonine'),
              'W': ('W', 'TRP', 'tryptophan'),
              'Y': ('Y', 'TYR', 'tyrosine'),
              'V': ('V', 'VAL', 'valine'),
              'X': ('X', 'GLX', 'glutaminx'),
              'Z': ('Z', 'GLI', 'glycine'),
              'J': ('J', 'NLE', 'norleucine'),
                'U': ('U', 'CYC', 'cysteinc')}

In [8]:
aa2index={}
for i, aa in enumerate(list(one2all.keys())):
    if i == 20: break
    aa2index[aa] = i

In [9]:
with open("super_dataset_sampled100.txt", "rb") as fp:   # Unpickling
    super_list_sampled = pickle.load(fp)
    
with open("twil_dataset_sampled100.txt", "rb") as fp:   # Unpickling
    twil_list_sampled = pickle.load(fp)

In [10]:
super_list_train=super_list_sampled[:90]
super_list_test = super_list_sampled[90:]

twil_list_train = twil_list_sampled[:90]
twil_list_test = twil_list_sampled[90:]

In [11]:
blosum_filepath = "/home/chingyuenliu/TTIC31160_bioinfo/hw2/BLOSUM62"

blosum = blosum2mat(blosum_filepath)

blosum_f = np.exp(blosum)

In [12]:
aa_background_f = np.zeros(20)
for i in super_list_train:
    values,counts = np.unique(i,return_counts=True)
    for v in range(values.shape[0]):
        if values[v] in aa2index:
            aa_background_f[aa2index[values[v]]] += counts[v]
aa_background_f=aa_background_f/aa_background_f.sum()  

In [13]:
for i in range(blosum_f.shape[0]):
    for j in range(blosum_f.shape[1]):
        blosum_f[i,j] = blosum_f[i,j] * aa_background_f[i] * aa_background_f[j]
blosum_f = blosum_f/blosum_f.sum()

In [14]:
states = ["Match", "Insert_X", "Insert_Y"]

l_para = {}


emit_M = np.log(blosum_f)
emit_X = np.tile(np.log(aa_background_f).reshape(-1,1),[1,20])
emit_Y = np.tile(np.log(aa_background_f), [20,1])

l_para['lgtau'] = np.log(0.1)
tau = 0.1
de = 0.3
eta = 0.6

l_para['trans_mat'] = np.array([1-2*de,de,de,1-eta-tau,eta,0,1-eta-tau,0,eta]).reshape([3,3])

l_para['trans_mat'] = np.log(l_para['trans_mat'])
#MM,MX,MY,XM,XX,XY,YM,YX,YY

emit_mat = np.zeros([20,20,3])
emit_mat[:,:,0] = emit_M
emit_mat[:,:,1] = emit_X
emit_mat[:,:,2] = emit_Y

l_para['emit_mat'] = emit_mat



In [15]:
def forward_M_compute_l(i,j):
    if i == 0 or j == 0:
        return -np.inf
        
    fm = f_matrix[i-1,j-1,0]
    fx = f_matrix[i-1,j-1,1]
    fy = f_matrix[i-1,j-1,2]
    
    inf_index = [index for index, value in enumerate([fm,fx,fy]) if value != -np.inf]
    if len(inf_index) == 0:
        return -np.inf
    else:
        log_list = []
        for index in inf_index:
            log_list.append(l_para['emit_mat'][aa2index[seq1[i-1]],aa2index[seq2[j-1]]][0] + \
                           l_para['trans_mat'][index,0] + f_matrix[i-1,j-1,index]
                           )
        if len(log_list) == 1:
            return log_list[0]
        else:
            return logsum(log_list)


In [16]:
def backward_M_compute_l(i,j):

    if i >= m and j >= n: return -np.inf
        
    if i == m and j < n:
        bm = -np.inf
        bx = -np.inf    
        by = b_matrix[i,j+1,2]
    elif i < m and j == n:
        bm = -np.inf
        by = -np.inf
        bx = b_matrix[i+1,j,1]
    else:
        bm = b_matrix[i+1,j+1,0]
        bx = b_matrix[i+1,j,1]
        by = b_matrix[i,j+1,2]
    
    inf_index = [index for index, value in enumerate([bm,bx,by]) if value != -np.inf]
    if len(inf_index) == 0:
        return -np.inf
    else:
        log_list = []
        for index in inf_index:
            if index == 0:
                log_list.append(l_para['emit_mat'][aa2index[seq1[i]],aa2index[seq2[j]]][0] + \
                           l_para['trans_mat'][0,index] + b_matrix[i+1,j+1,index]
                           )
            if index == 1:
                log_list.append(l_para['emit_mat'][aa2index[seq1[i]],0][1] + \
                           l_para['trans_mat'][0,index] + b_matrix[i+1,j,index] )
            if index == 2:
                log_list.append(l_para['emit_mat'][0,aa2index[seq2[j]]][2] + \
                           l_para['trans_mat'][0,index] + b_matrix[i,j+1,index] )                           
            
        if len(log_list) == 1:
            return log_list[0]
        else:
            return logsum(log_list)


In [17]:
def forward_X_compute_l(i,j):
    if i == 0:
        return -np.inf
        
    fm = f_matrix[i-1,j,0]
    fx = f_matrix[i-1,j,1]
    
    inf_index = [index for index, value in enumerate([fm,fx,-np.inf]) if value != -np.inf]
    if len(inf_index) == 0:
        return -np.inf
    else:
        log_list = []
        for index in inf_index:
            log_list.append(l_para['emit_mat'][aa2index[seq1[i-1]],0][1] + \
                           l_para['trans_mat'][index,1] + f_matrix[i-1,j,index]
                           )
        if len(log_list) == 1:
            return log_list[0]
        else:
            return logsum(log_list)

In [18]:
def backward_X_compute_l(i,j):

    if i >= m : return -np.inf
    if j > n: return -np.inf
    
    by = -np.inf

    if i < m and j == n:
        bm = -np.inf
        by = -np.inf        
        bx = b_matrix[i+1,j,1]
    else:
        bm = b_matrix[i+1,j+1,0]
        bx = b_matrix[i+1,j,1]

    
    inf_index = [index for index, value in enumerate([bm,bx,-np.inf]) if value != -np.inf]
    if len(inf_index) == 0:
        return -np.inf
    else:
        log_list = []
        for index in inf_index:
            if index == 0:
                log_list.append(l_para['emit_mat'][aa2index[seq1[i]],aa2index[seq2[j]]][0] + \
                           l_para['trans_mat'][1,index] + b_matrix[i+1,j+1,index]
                           )
            if index == 1:
                log_list.append(l_para['emit_mat'][aa2index[seq1[i]],0][1] + \
                           l_para['trans_mat'][1,index] + b_matrix[i+1,j,index] )                    
            
        if len(log_list) == 1:
            return log_list[0]
        else:
            return logsum(log_list)


In [19]:
def forward_Y_compute_l(i,j):
    if j == 0:
        return -np.inf
        
    fm = f_matrix[i,j-1,0]
    fy = f_matrix[i,j-1,2]
    
    inf_index = [index for index, value in enumerate([fm, -np.inf, fy]) if value != -np.inf]
    if len(inf_index) == 0:
        return -np.inf
    else:
        log_list = []
        for index in inf_index:
            log_list.append(l_para['emit_mat'][0,aa2index[seq2[j-1]]][2] + \
                           l_para['trans_mat'][index,2] + f_matrix[i,j-1,index]
                           )
        if len(log_list) == 1:
            return log_list[0]
        else:
            return logsum(log_list)

In [20]:
def backward_Y_compute_l(i,j):

    
    if j >= n: return -np.inf
    if i > m : return -np.inf
    
    bx = -np.inf

    if i == m and j < n:
        bm = -np.inf      
        by = b_matrix[i,j+1,2]
    else:
        bm = b_matrix[i+1,j+1,0]
        by = b_matrix[i,j+1,2]

    
    inf_index = [index for index, value in enumerate([bm,-np.inf,by]) if value != -np.inf]
    if len(inf_index) == 0:
        return -np.inf
    else:
        log_list = []
        for index in inf_index:
            if index == 0:
                log_list.append(l_para['emit_mat'][aa2index[seq1[i]],aa2index[seq2[j]]][0] + \
                           l_para['trans_mat'][2,index] + b_matrix[i+1,j+1,index]
                           )
            if index == 2:
                log_list.append(l_para['emit_mat'][0,aa2index[seq2[j]]][2] + \
                           l_para['trans_mat'][2,index] + b_matrix[i,j+1,index] )                    
            
        if len(log_list) == 1:
            return log_list[0]
        else:
            return logsum(log_list)

2

In [None]:

for seq_pair in super_list_train:

    seq1,seq2 = pure_seq([seq_pair])[0]
    #local
    m = len(seq1)
    n = len(seq2)

    for loop in range(2):
        #order match:0,x:1,y:2
        f_matrix = np.full([m+1,n+1, len(states)], -np.inf)
        b_matrix = np.full([m+1,n+1, len(states)], -np.inf)

        f_matrix[0,0,1] = np.log(0.5)
        f_matrix[0,0,2] = np.log(0.5)
        for i in range(0,m+1):
            for j in range(0,n+1):
                if i==0 and j ==0: continue
                f_matrix[i,j,0] = forward_M_compute_l(i,j)
                f_matrix[i,j,1] = forward_X_compute_l(i,j)
                f_matrix[i,j,2] = forward_Y_compute_l(i,j)            

        b_matrix[m,n,1] = l_para['lgtau']
        b_matrix[m,n,2] = l_para['lgtau']
        for i in range(m,-1,-1):
            for j in range(n,-1,-1):
                if i ==m and j == n: continue
                b_matrix[i,j,0] = backward_M_compute_l(i,j)
                b_matrix[i,j,1] = backward_X_compute_l(i,j)
                b_matrix[i,j,2] = backward_Y_compute_l(i,j)



        gamma_matrix=f_matrix+b_matrix


        trans_mat_new = np.zeros(l_para['trans_mat'].shape)
        for s1 in range(trans_mat_new.shape[0]):
            for s2 in range(trans_mat_new.shape[1]):
                if (s1 == 1 and s2 ==2) or (s1 ==2 and s2 ==1):
                    trans_mat_new[s1,s2] = -np.inf
                    continue
                mat = np.zeros([m,n])
                for i in range(m):
                    for j in range(n):
                        mat[i,j] = f_matrix[i,j,s1]+b_matrix[index_determ(i,j,s2)] + l_para['trans_mat'][s1,s2] + \
                                    l_para['emit_mat'][aa2index[seq1[i]],aa2index[seq2[j]]][s2]
                trans_mat_new[s1,s2] = logsum(mat) - logsum(gamma_matrix[:,:,s1])


        l_para['lgtau'] = np.log(3-np.exp(trans_mat_new).sum())

        emit_mat_new = np.zeros(l_para['emit_mat'].shape)

        for aa1 in aa2index:
            for aa2 in aa2index:
                aa1_index = [index for index,value in enumerate(seq1) if value == aa1]
                aa2_index = [index for index,value in enumerate(seq2) if value == aa2]

                value_list = []
                for aa1_i in aa1_index:
                    for aa2_i in aa2_index:
                        value_list.append(gamma_matrix[aa1_i,aa2_i])

                if len(value_list) == 0:
                    for k in range(gamma_matrix.shape[2]):
                        emit_mat_new[aa2index[aa1],aa2index[aa2],k] = -np.inf
                    continue

                value_np = np.array(value_list)  

                for k in range(value_np.shape[1]):
                    emit_mat_new[aa2index[aa1],aa2index[aa2],k] = logsum(value_np[:,k])

        for it in range(emit_mat_new.shape[2]):
            emit_mat_new[:,:,it] -= logsum(gamma_matrix[:,:,it])

        for it in range(20):
            emit_mat_new[it,0,1] = logsum(emit_mat_new[it,:,1])
            emit_mat_new[0,it,2] = logsum(emit_mat_new[:,it,2])

        emit_mat_new[:,:,1] = np.tile(emit_mat_new[:,0,1].reshape(-1,1),[1,20])
        emit_mat_new[:,:,2] = np.tile(emit_mat_new[0,:,2].reshape(1,-1),[20,1])

        emit_mat_new = np.clip(emit_mat_new,-10,-1e-5)

        l_para['emit_mat'] = emit_mat_new
        l_para['trans_mat'] = trans_mat_new

    




array([[-0.13678829, -2.95254524, -2.79820466],
       [-1.82171815, -1.24798799,        -inf],
       [-1.41440862,        -inf, -0.27848366]])

In [25]:
start = time.time()
for seq_pair in super_list_train:

    seq1,seq2 = pure_seq([seq_pair])[0]
    #local
    m = len(seq1)
    n = len(seq2)

    for loop in range(1):
        #order match:0,x:1,y:2
        f_matrix = np.full([m+1,n+1, len(states)], -np.inf)
        b_matrix = np.full([m+1,n+1, len(states)], -np.inf)

        f_matrix[0,0,1] = np.log(0.5)
        f_matrix[0,0,2] = np.log(0.5)
        for i in range(0,m+1):
            for j in range(0,n+1):
                if i==0 and j ==0: continue
                f_matrix[i,j,0] = forward_M_compute_l(i,j)
                f_matrix[i,j,1] = forward_X_compute_l(i,j)
                f_matrix[i,j,2] = forward_Y_compute_l(i,j)            

        b_matrix[m,n,1] = l_para['lgtau']
        b_matrix[m,n,2] = l_para['lgtau']
        for i in range(m,-1,-1):
            for j in range(n,-1,-1):
                if i ==m and j == n: continue
                b_matrix[i,j,0] = backward_M_compute_l(i,j)
                b_matrix[i,j,1] = backward_X_compute_l(i,j)
                b_matrix[i,j,2] = backward_Y_compute_l(i,j)



        gamma_matrix=f_matrix+b_matrix


        trans_mat_new = np.zeros(l_para['trans_mat'].shape)
        for s1 in range(trans_mat_new.shape[0]):
            for s2 in range(trans_mat_new.shape[1]):
                if (s1 == 1 and s2 ==2) or (s1 ==2 and s2 ==1):
                    trans_mat_new[s1,s2] = -np.inf
                    continue
                mat = np.zeros([m,n])
                for i in range(m):
                    for j in range(n):
                        mat[i,j] = f_matrix[i,j,s1]+b_matrix[index_determ(i,j,s2)] + l_para['trans_mat'][s1,s2] + \
                                    l_para['emit_mat'][aa2index[seq1[i]],aa2index[seq2[j]]][s2]
                trans_mat_new[s1,s2] = logsum(mat) - logsum(gamma_matrix[:,:,s1])


        l_para['lgtau'] = np.log(3-np.exp(trans_mat_new).sum())

        emit_mat_new = np.zeros(l_para['emit_mat'].shape)
        
        break
        
        for aa1 in aa2index:
            for aa2 in aa2index:
                aa1_index = [index for index,value in enumerate(seq1) if value == aa1]
                aa2_index = [index for index,value in enumerate(seq2) if value == aa2]

                value_list = []
                for aa1_i in aa1_index:
                    for aa2_i in aa2_index:
                        value_list.append(gamma_matrix[aa1_i,aa2_i])

                if len(value_list) == 0:
                    for k in range(gamma_matrix.shape[2]):
                        emit_mat_new[aa2index[aa1],aa2index[aa2],k] = -np.inf
                    continue

                value_np = np.array(value_list)  

                for k in range(value_np.shape[1]):
                    emit_mat_new[aa2index[aa1],aa2index[aa2],k] = logsum(value_np[:,k])

        for it in range(emit_mat_new.shape[2]):
            emit_mat_new[:,:,it] -= logsum(gamma_matrix[:,:,it])

        for it in range(20):
            emit_mat_new[it,0,1] = logsum(emit_mat_new[it,:,1])
            emit_mat_new[0,it,2] = logsum(emit_mat_new[:,it,2])

        emit_mat_new[:,:,1] = np.tile(emit_mat_new[:,0,1].reshape(-1,1),[1,20])
        emit_mat_new[:,:,2] = np.tile(emit_mat_new[0,:,2].reshape(1,-1),[20,1])

        emit_mat_new = np.clip(emit_mat_new,-10,-1e-5)

        l_para['emit_mat'] = emit_mat_new
        l_para['trans_mat'] = trans_mat_new

    break
end = time.time()
end-start

24.27428126335144

In [None]:
for aa1 in aa2index:
            for aa2 in aa2index:
                aa1_index = [index for index,value in enumerate(seq1) if value == aa1]
                aa2_index = [index for index,value in enumerate(seq2) if value == aa2]

                value_list = []
                for aa1_i in aa1_index:
                    for aa2_i in aa2_index:
                        value_list.append(gamma_matrix[aa1_i,aa2_i])

                if len(value_list) == 0:
                    for k in range(gamma_matrix.shape[2]):
                        emit_mat_new[aa2index[aa1],aa2index[aa2],k] = -np.inf
                    continue

                value_np = np.array(value_list)  

                for k in range(value_np.shape[1]):
                    emit_mat_new[aa2index[aa1],aa2index[aa2],k] = logsum(value_np[:,k])

        for it in range(emit_mat_new.shape[2]):
            emit_mat_new[:,:,it] -= logsum(gamma_matrix[:,:,it])

        for it in range(20):
            emit_mat_new[it,0,1] = logsum(emit_mat_new[it,:,1])
            emit_mat_new[0,it,2] = logsum(emit_mat_new[:,it,2])

        emit_mat_new[:,:,1] = np.tile(emit_mat_new[:,0,1].reshape(-1,1),[1,20])
        emit_mat_new[:,:,2] = np.tile(emit_mat_new[0,:,2].reshape(1,-1),[20,1])

        emit_mat_new = np.clip(emit_mat_new,-10,-1e-5)

        l_para['emit_mat'] = emit_mat_new
        l_para['trans_mat'] = trans_mat_new


In [None]:
l_para_super = copy.deepcopy(l_para)

In [None]:
states = ["Match", "Insert_X", "Insert_Y"]

g_para = {}


emit_M = np.log(blosum_f)
emit_X = np.tile(np.log(aa_background_f).reshape(-1,1),[1,20])
emit_Y = np.tile(np.log(aa_background_f), [20,1])

g_para['lgtau'] = np.log(0.1)

tau = 0.1
de = 0.3
eta = 0.6

g_para['trans_mat'] = np.array([1-2*de-tau,de,de,1-eta-tau,eta,0,1-eta-tau,0,eta]).reshape([3,3])
g_para['trans_mat'] = np.log(g_para['trans_mat'])
#MM,MX,MY,XM,XX,XY,YM,YX,YY

emit_mat = np.zeros([20,20,3])
emit_mat[:,:,0] = emit_M
emit_mat[:,:,1] = emit_X
emit_mat[:,:,2] = emit_Y

g_para['emit_mat'] = emit_mat

In [None]:
def forward_M_compute_g(i,j):
    if i == 0 or j == 0:
        return -np.inf
        
    fm = f_matrix[i-1,j-1,0]
    fx = f_matrix[i-1,j-1,1]
    fy = f_matrix[i-1,j-1,2]
    
    inf_index = [index for index, value in enumerate([fm,fx,fy]) if value != -np.inf]
    if len(inf_index) == 0:
        return -np.inf
    else:
        log_list = []
        for index in inf_index:
            log_list.append(g_para['emit_mat'][aa2index[seq1[i-1]],aa2index[seq2[j-1]]][0] + \
                           g_para['trans_mat'][index,0] + f_matrix[i-1,j-1,index]
                           )
        if len(log_list) == 1:
            return log_list[0]
        else:
            return logsum(log_list)


In [19]:
def backward_M_compute_g(i,j):

    if i >= m and j >= n: return -np.inf
        
    if i == m and j < n:
        bm = -np.inf
        bx = -np.inf    
        by = b_matrix[i,j+1,2]
    elif i < m and j == n:
        bm = -np.inf
        by = -np.inf
        bx = b_matrix[i+1,j,1]
    else:
        bm = b_matrix[i+1,j+1,0]
        bx = b_matrix[i+1,j,1]
        by = b_matrix[i,j+1,2]
    
    inf_index = [index for index, value in enumerate([bm,bx,by]) if value != -np.inf]
    if len(inf_index) == 0:
        return -np.inf
    else:
        log_list = []
        for index in inf_index:
            if index == 0:
                log_list.append(g_para['emit_mat'][aa2index[seq1[i]],aa2index[seq2[j]]][0] + \
                           g_para['trans_mat'][0,index] + b_matrix[i+1,j+1,index]
                           )
            if index == 1:
                log_list.append(g_para['emit_mat'][aa2index[seq1[i]],0][1] + \
                           g_para['trans_mat'][0,index] + b_matrix[i+1,j,index] )
            if index == 2:
                log_list.append(g_para['emit_mat'][0,aa2index[seq2[j]]][2] + \
                           g_para['trans_mat'][0,index] + b_matrix[i,j+1,index] )                           
            
        if len(log_list) == 1:
            return log_list[0]
        else:
            return logsum(log_list)


In [20]:
def forward_X_compute_g(i,j):
    if i == 0:
        return -np.inf
        
    fm = f_matrix[i-1,j,0]
    fx = f_matrix[i-1,j,1]
    
    inf_index = [index for index, value in enumerate([fm,fx,-np.inf]) if value != -np.inf]
    if len(inf_index) == 0:
        return -np.inf
    else:
        log_list = []
        for index in inf_index:
            log_list.append(g_para['emit_mat'][aa2index[seq1[i-1]],0][1] + \
                           g_para['trans_mat'][index,1] + f_matrix[i-1,j,index]
                           )
        if len(log_list) == 1:
            return log_list[0]
        else:
            return logsum(log_list)

In [21]:
def backward_X_compute_g(i,j):

    if i >= m : return -np.inf
    if j > n: return -np.inf
    
    by = -np.inf

    if i < m and j == n:
        bm = -np.inf
        by = -np.inf        
        bx = b_matrix[i+1,j,1]
    else:
        bm = b_matrix[i+1,j+1,0]
        bx = b_matrix[i+1,j,1]

    
    inf_index = [index for index, value in enumerate([bm,bx,-np.inf]) if value != -np.inf]
    if len(inf_index) == 0:
        return -np.inf
    else:
        log_list = []
        for index in inf_index:
            if index == 0:
                log_list.append(g_para['emit_mat'][aa2index[seq1[i]],aa2index[seq2[j]]][0] + \
                           g_para['trans_mat'][1,index] + b_matrix[i+1,j+1,index]
                           )
            if index == 1:
                log_list.append(g_para['emit_mat'][aa2index[seq1[i]],0][1] + \
                           g_para['trans_mat'][1,index] + b_matrix[i+1,j,index] )                    
            
        if len(log_list) == 1:
            return log_list[0]
        else:
            return logsum(log_list)


In [22]:
def forward_Y_compute_g(i,j):
    if j == 0:
        return -np.inf
        
    fm = f_matrix[i,j-1,0]
    fy = f_matrix[i,j-1,2]
    
    inf_index = [index for index, value in enumerate([fm, -np.inf, fy]) if value != -np.inf]
    if len(inf_index) == 0:
        return -np.inf
    else:
        log_list = []
        for index in inf_index:
            log_list.append(g_para['emit_mat'][0,aa2index[seq2[j-1]]][2] + \
                           g_para['trans_mat'][index,2] + f_matrix[i,j-1,index]
                           )
        if len(log_list) == 1:
            return log_list[0]
        else:
            return logsum(log_list)

In [23]:
def backward_Y_compute_g(i,j):

    
    if j >= n: return -np.inf
    if i > m : return -np.inf
    
    bx = -np.inf

    if i == m and j < n:
        bm = -np.inf      
        by = b_matrix[i,j+1,2]
    else:
        bm = b_matrix[i+1,j+1,0]
        by = b_matrix[i,j+1,2]

    
    inf_index = [index for index, value in enumerate([bm,-np.inf,by]) if value != -np.inf]
    if len(inf_index) == 0:
        return -np.inf
    else:
        log_list = []
        for index in inf_index:
            if index == 0:
                log_list.append(g_para['emit_mat'][aa2index[seq1[i]],aa2index[seq2[j]]][0] + \
                           g_para['trans_mat'][2,index] + b_matrix[i+1,j+1,index]
                           )
            if index == 2:
                log_list.append(g_para['emit_mat'][0,aa2index[seq2[j]]][2] + \
                           g_para['trans_mat'][2,index] + b_matrix[i,j+1,index] )                    
            
        if len(log_list) == 1:
            return log_list[0]
        else:
            return logsum(log_list)

In [None]:
#global

m = len(seq1)
n = len(seq2)
#order match:0,x:1,y:2
f_matrix = np.full([m+1,n+1, len(states)], -np.inf)
b_matrix = np.full([m+1,n+1, len(states)], -np.inf)

f_matrix[0,0,0] = np.log(1/3)
f_matrix[0,0,1] = np.log(1/3)
f_matrix[0,0,2] = np.log(1/3)

for i in range(0,m+1):
    for j in range(0,n+1):
        if i==0 and j ==0: continue
        f_matrix[i,j,0] = forward_M_compute_g(i,j)
        f_matrix[i,j,1] = forward_X_compute_g(i,j)
        f_matrix[i,j,2] = forward_Y_compute_g(i,j)

b_matrix[m,n,0] = g_para['lgtau']
b_matrix[m,n,1] = g_para['lgtau']
b_matrix[m,n,2] = g_para['lgtau']

for i in range(m,-1,-1):
    for j in range(n,-1,-1):
        if i ==m and j == n: continue
        b_matrix[i,j,0] = backward_M_compute_g(i,j)
        b_matrix[i,j,1] = backward_X_compute_g(i,j)
        b_matrix[i,j,2] = backward_Y_compute_g(i,j)



gamma_matrix=f_matrix+b_matrix


trans_mat_new = np.zeros(g_para['trans_mat'].shape)
for s1 in range(trans_mat_new.shape[0]):
    for s2 in range(trans_mat_new.shape[1]):
        if (s1 == 1 and s2 ==2) or (s1 ==2 and s2 ==1):
            trans_mat_new[s1,s2] = -np.inf
            continue
        mat = np.zeros([m,n])
        for i in range(m):
            for j in range(n):
                mat[i,j] = f_matrix[i,j,s1]+b_matrix[index_determ(i,j,s2)] + g_para['trans_mat'][s1,s2] + \
                            g_para['emit_mat'][aa2index[seq1[i]],aa2index[seq2[j]]][s2]
        trans_mat_new[s1,s2] = logsum(mat) - logsum(gamma_matrix[:,:,s1])


g_para['lgtau'] = np.clip(np.log((3-np.exp(trans_mat_new).sum())/3), -10,-1e-5)

emit_mat_new = np.zeros(g_para['emit_mat'].shape)

for aa1 in aa2index:
    for aa2 in aa2index:
        aa1_index = [index for index,value in enumerate(seq1) if value == aa1]
        aa2_index = [index for index,value in enumerate(seq2) if value == aa2]

        value_list = []
        for aa1_i in aa1_index:
            for aa2_i in aa2_index:
                value_list.append(gamma_matrix[aa1_i,aa2_i])

        if len(value_list) == 0:
            for k in range(gamma_matrix.shape[2]):
                emit_mat_new[aa2index[aa1],aa2index[aa2],k] = -np.inf
            continue

        value_np = np.array(value_list)  

        for k in range(value_np.shape[1]):
            emit_mat_new[aa2index[aa1],aa2index[aa2],k] = logsum(value_np[:,k])

for it in range(emit_mat_new.shape[2]):
    emit_mat_new[:,:,it] -= logsum(gamma_matrix[:,:,it])

for it in range(20):
    emit_mat_new[it,0,1] = logsum(emit_mat_new[it,:,1])
    emit_mat_new[0,it,2] = logsum(emit_mat_new[:,it,2])

emit_mat_new[:,:,1] = np.tile(emit_mat_new[:,0,1].reshape(-1,1),[1,20])
emit_mat_new[:,:,2] = np.tile(emit_mat_new[0,:,2].reshape(1,-1),[20,1])

emit_mat_new = np.clip(emit_mat_new,-10,-1e-5)

g_para['emit_mat'] = emit_mat_new
g_para['trans_mat'] = trans_mat_new

In [None]:
#local
m = len(seq1)
n = len(seq2)
#order match:0,x:1,y:2
f_matrix = np.full([m+1,n+1, len(states)], -np.inf)
b_matrix = np.full([m+1,n+1, len(states)], -np.inf)

f_matrix[0,0,1] = np.log(0.5)
f_matrix[0,0,2] = np.log(0.5)

for i in range(0,m+1):
    for j in range(0,n+1):
        if i==0 and j ==0: continue
        f_matrix[i,j,0] = forward_M_compute_l(i,j)
        f_matrix[i,j,1] = forward_X_compute_l(i,j)
        f_matrix[i,j,2] = forward_Y_compute_l(i,j)

In [None]:
f_total = logsum(f_matrix[m,n,1:] + l_para['lgtau'])

In [None]:
f_total

In [None]:
b_matrix[m,n,1] = l_para['lgtau']
b_matrix[m,n,2] = l_para['lgtau']
for i in range(m,-1,-1):
    for j in range(n,-1,-1):
        if i ==m and j == n: continue
        b_matrix[i,j,0] = backward_M_compute_l(i,j)
        b_matrix[i,j,1] = backward_X_compute_l(i,j)
        b_matrix[i,j,2] = backward_Y_compute_l(i,j)
            

In [None]:
b_total = logsum(b_matrix[0,0,1:] + np.log(0.5))

In [None]:
b_total

In [None]:
gamma_matrix=f_matrix+b_matrix

In [None]:
trans_mat_new = np.zeros(l_para['trans_mat'].shape)
# trans_mat_new[1,2] = trans_mat_new[2,1] = -np.inf 

In [None]:
for s1 in range(trans_mat_new.shape[0]):
    for s2 in range(trans_mat_new.shape[1]):
        if (s1 == 1 and s2 ==2) or (s1 ==2 and s2 ==1):
            trans_mat_new[s1,s2] = -np.inf
            continue
        mat = np.zeros([m,n])
        for i in range(m):
            for j in range(n):
                mat[i,j] = f_matrix[i,j,s1]+b_matrix[index_determ(i,j,s2)] + l_para['trans_mat'][s1,s2] + \
                            l_para['emit_mat'][aa2index[seq1[i]],aa2index[seq2[j]]][s2]
        trans_mat_new[s1,s2] = logsum(mat) - logsum(gamma_matrix[:,:,s1])

In [None]:
l_para['lgtau'] = np.log(3-np.exp(trans_mat_new).sum())

In [None]:
emit_mat_new = np.zeros(l_para['emit_mat'].shape)

for aa1 in aa2index:
    for aa2 in aa2index:
        aa1_index = [index for index,value in enumerate(seq1) if value == aa1]
        aa2_index = [index for index,value in enumerate(seq2) if value == aa2]
        
        value_list = []
        for aa1_i in aa1_index:
            for aa2_i in aa2_index:
                value_list.append(gamma_matrix[aa1_i,aa2_i])
        
        if len(value_list) == 0:
            for k in range(gamma_matrix.shape[2]):
                emit_mat_new[aa2index[aa1],aa2index[aa2],k] = -np.inf
            continue
        
        value_np = np.array(value_list)  
        
        for k in range(value_np.shape[1]):
            emit_mat_new[aa2index[aa1],aa2index[aa2],k] = logsum(value_np[:,k])
            
for it in range(emit_mat_new.shape[2]):
    emit_mat_new[:,:,it] -= logsum(gamma_matrix[:,:,it])

for it in range(20):
    emit_mat_new[it,0,1] = logsum(emit_mat_new[it,:,1])
    emit_mat_new[0,it,2] = logsum(emit_mat_new[:,it,2])
    
emit_mat_new[:,:,1] = np.tile(emit_mat_new[:,0,1].reshape(-1,1),[1,20])
emit_mat_new[:,:,2] = np.tile(emit_mat_new[0,:,2].reshape(1,-1),[20,1])
    
emit_mat_new = np.clip(emit_mat_new,-10,-1e-5)


In [None]:
np.exp(test_mat_new[:,:,1]).sum()

In [None]:
np.exp(emit_mat_new[:,0,1]).sum()

In [None]:
np.exp(emit_mat_new).sum()

In [None]:
aa1 = 'C'
aa1_index = [index for index,value in enumerate(seq1) if value == aa1]

In [None]:
aa1_index

In [None]:
logsum(test_list[2])

In [None]:
for i in range(400):
    logsum(test_list[i])

In [None]:
emit_mat_new[aa2index[aa1],aa2index[aa2]] = logsum(value_list)

In [None]:
logsum(test_list)

In [None]:
num_list

In [None]:
if type(num_list) == list:
    num_list = np.array(num_list)
else:
    num_list = num_list.reshape(-1)

In [None]:
type(num_list)

In [None]:
for i in range(num_list.shape[0]):
    if (not np.isinf(num_list[i])) and (not np.isnan(num_list[i])):
        anker = num_list[i]
        second_half = np.delete(num_list,i)
        break

In [None]:
second_half.shape

In [None]:
second_half -= anker


In [None]:
anker+np.log(np.exp(second_half).sum()+1)

In [None]:

    

        else:
            return -np.inf
    second_half -= anker
    return anker+np.log(np.exp(second_half).sum()+1)

In [None]:
test_list

In [None]:
value_list

In [None]:
for aa1 in aa2index:
    print(aa1)

In [None]:
np.array(value_list).shape

In [None]:
seq1_np = np.array(seq1)

In [None]:

xi_mm = np.zeros([m,n])
for i in range(m):
    for j in range(n):
        xi_mm[i,j] = f_matrix[i,j,0] + b_matrix[i+1,j+1,0] + l_para['trans_mat'][0,0]+ \
                                    l_para['emit_M'][aa2index[seq1[i]],aa2index[seq2[j]]]
                    

trans_mat_new[0,0] = logsum(xi_mm) - logsum(gamma_matrix[:,:,0])
                


In [None]:
xi_mx = np.zeros([m,n+1])
for i in range(m):
    for j in range(n+1):
        xi_mx[i,j] = f_matrix[i,j,0] + b_matrix[i+1,j,1] + l_para['trans_mat'][0,1]+ l_para['emit_X'][aa2index[seq1[i]]]
                    

trans_mat_new[0,1] = logsum(xi_mx) - logsum(gamma_matrix[:,:,0])

In [None]:
xi_my = np.zeros([m+1,n])
for i in range(m+1):
    for j in range(n):
        xi_my[i,j] = f_matrix[i,j,0] + b_matrix[i,j+1,2] + l_para['trans_mat'][0,2]+ l_para['emit_Y'][aa2index[seq2[j]]]
                    

trans_mat_new[0,2] = logsum(xi_my) - logsum(gamma_matrix[:,:,0])

In [None]:
xi_xm = np.zeros([m,n])
for i in range(m):
    for j in range(n):
        xi_xm[i,j] = f_matrix[i,j,1] + b_matrix[i+1,j+1,0] + l_para['trans_mat'][1,0]+ \
                                    l_para['emit_M'][aa2index[seq1[i]],aa2index[seq2[j]]]
                    

trans_mat_new[1,0] = logsum(xi_xm) - logsum(gamma_matrix[:,:,1])

In [None]:
xi_xx = np.zeros([m,n])
for i in range(m):
    for j in range(n):
        xi_xx[i,j] = f_matrix[i,j,1] + b_matrix[i+1,j,1] + l_para['trans_mat'][1,1]+ l_para['emit_X'][aa2index[seq1[i]]]
                    

trans_mat_new[1,1] = logsum(xi_xx) - logsum(gamma_matrix[:,:,1])

In [None]:
xi_ym = np.zeros([m,n])
for i in range(m):
    for j in range(n):
        xi_ym[i,j] = f_matrix[i,j,2] + b_matrix[i+1,j+1,0] + l_para['trans_mat'][2,0]+ \
                                    l_para['emit_M'][aa2index[seq1[i]],aa2index[seq2[j]]]
                    

trans_mat_new[2,0] = logsum(xi_ym) - logsum(gamma_matrix[:,:,2])

In [None]:
xi_yy = np.zeros([m,n])
for i in range(m):
    for j in range(n):
        xi_yy[i,j] = f_matrix[i,j,2] + b_matrix[i,j+1,2] + l_para['trans_mat'][2,2]+ l_para['emit_Y'][aa2index[seq2[j]]]
                    

trans_mat_new[2,2] = logsum(xi_yy) - logsum(gamma_matrix[:,:,2])

In [None]:
np.exp(trans_mat_new).sum(axis=1)

In [None]:
np.exp(logsum(xi_mm) - logsum(gamma_matrix[:-1,:-1,0]))

In [None]:
#local
m = len(super_seq_pure_list[0][0])
n = len(super_seq_pure_list[0][1])
seq1 = super_seq_pure_list[0][0]
seq2 = super_seq_pure_list[0][1]
#order match:0,x:1,y:2
f_matrix = np.full([m+1,n+1, len(states)], -np.inf)
b_matrix = np.full([m+1,n+1, len(states)], -np.inf)

f_matrix[0,0,1] = np.log(0.5)
f_matrix[0,0,2] = np.log(0.5)

for i in range(2,m+1):
    f_matrix[i,0,1] = f_matrix[i-1,0,1] * np.logl_para['trans_mat'][1,1] * l_para['emit_X'][aa2index[seq1[i-1]]]
for j in range(2,n+1):
    f_matrix[0,j,2] = f_matrix[0,j-1,2]*l_para['trans_mat'][2,2]*l_para['emit_Y'][aa2index[seq2[j-1]]]

        

In [None]:
for i in range(1,m+1):
    for j in range(1,n+1):
        f_matrix[i,j,0] = forward_m_compute(i,j)
        f_matrix[i,j,1] = l_para['emit_X'][aa2index[seq1[i-1]]]*(l_para['trans_mat'][0,1]*f_matrix[i-1,j,0] + \
                                                                l_para['trans_mat'][1,1]*f_matrix[i-1,j,1])

        f_matrix[i,j,2] = l_para['emit_Y'][aa2index[seq2[j-1]]]*(l_para['trans_mat'][0,2]*f_matrix[i,j-1,0] + \
                                                                l_para['trans_mat'][2,2]*f_matrix[i,j-1,2])

In [None]:
f_matrix[:,:,0]

In [None]:
f_matrix[0,3,1]

freq_matrix_sup= np.zeros([len(one2all.keys()),len(one2all.keys())])
for root, dirs, files in os.walk(super_directory):
    for file in files:
        if file.endswith(".fasta") and file.startswith('d'):
            seq = func_file2seq(os.path.join(root,file))
            super_matrix_update(seq)

freq_matrix_twil= np.zeros([len(one2all.keys()),len(one2all.keys())])
for root, dirs, files in os.walk(twil_directory):
    for file in files:
        if file.endswith(".fasta") and file.startswith('d'):
            seq = func_file2seq(os.path.join(root,file))
            twil_matrix_update(seq)

In [None]:
freq_matrix_all = freq_matrix_sup + freq_matrix_twil

In [None]:
freq_matrix_list =[freq_matrix_sup,freq_matrix_twil,freq_matrix_all]

In [None]:
score_matrix_list = []

In [None]:
for matrix in freq_matrix_list:
    matrix = matrix[:20,:20]
    prob = matrix/np.sum(matrix)
    mar_p = np.sum(matrix,axis=1,keepdims=True)/np.sum(matrix)
    mar_p_pro = mar_p.dot(mar_p.T)+ 1e-9
    score_matrix = prob/mar_p_pro
    score_matrix = 2 * np.log2(score_matrix)
    score_matrix_list.append(score_matrix)

In [None]:
name_list = ["Super","Twilight","Combined"]

In [None]:
import matplotlib.pyplot as plt
for i in range(1,3):
    matrix= score_matrix_list[i]-score_matrix_list[0]
    plt.imshow(matrix)
    plt.colorbar()
    plt.title(f'difference between {name_list[i]} and Super')
    plt.show()

In [None]:
print(f"The difference between Twilight and Super family: {np.abs(score_matrix_list[1]-score_matrix_list[0]).sum()}")
print(f"The difference between Combined and Super family: {np.abs(score_matrix_list[2]-score_matrix_list[0]).sum()}")

From the plot above, it is clearly the combined scoring matrix is in the middle between Superfamily scoring matrix and Twilight scoring matrix.(the colorbar scale is different) On the sum of substraction between two matrix, the sum between Twilight and Superfamily is higher, suggesting the difference between twilight and superfamily is higher. The twilight scoring matrix has lower values on the diagonal while have higher values for non-diagonal values. It makes sense as twilight dataset's sequences are less similar thus the probaility distribution is more diverged, therefore generate higher scores for non-match amino acids and lower scores for match amino acids. 

In [None]:
def blosum2mat(blosum_filepath):
    seq_list = []
    with open(blosum_filepath,'r') as f1:
        for line in f1:
            if line[0] == '#' or line[3] =='A':
                continue
            else:
                seq = list(map(float,line[1:].strip().replace('\n','').split()))
                seq_list.append(seq)
    return np.array(seq_list)[:20,:20]

In [None]:
blosum_name_list=[]
blosum_mat_list=[]
blosum_filepath = "/home/chingyuenliu/TTIC31160_bioinfo/hw2/"
directory = os.fsencode(blosum_filepath)

for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename.startswith("BLOSUM"):
        blosum_name_list.append(filename)
        blosum_mat_list.append(blosum2mat(os.path.join(blosum_filepath,filename)))

In [None]:
# blosum_name_list=[]
# blosum_mat_list=[]
# blosum_filepath = "/home/chingyuenliu/TTIC31160_bioinfo/hw2/"

# with os.scandir(blosum_filepath) as it:
#     for entry in it:
#         if entry.name.startswith('BLOSUM') and entry.is_file():
#             blosum_name_list.append(entry.name)
#             blosum_mat_list.append(blosum2mat(entry.path))

In [None]:
blosum_name_list

In [None]:
for m in range(len(score_matrix_list)):
    for b in range(len(blosum_mat_list)):
        matrix= score_matrix_list[m]-blosum_mat_list[b]
        plt.imshow(matrix)
        plt.colorbar()
        plt.title(f'difference between {name_list[m]} and {blosum_name_list[b]}')
        plt.show()
        print(f"The difference between {name_list[m]} and {blosum_name_list[b]}: {(np.abs(matrix)).sum()}")

In [None]:
difference_mat=np.zeros([3,5])
for m in range(len(score_matrix_list)):
    for b in range(len(blosum_mat_list)):
        matrix= score_matrix_list[m]-blosum_mat_list[b]
        difference_mat[m,b] =(np.abs(matrix)).sum()

In [None]:
scaled_difference_mat=difference_mat-np.min(difference_mat,axis=1,keepdims=True)
scaled_difference_mat

In [None]:
np.sum(freq_matrix_sup)

In [None]:
np.sum(freq_matrix_twil)

For Superfamily scoring matrix, the closest BLOSUM matrix is BLOSUM62 with total difference of 355.4;
For Twilight scoring matrix, the closest BLOSUM matrix is BLOSUM62 with total difference of 564.5;
For combined scoring matrix, the closest BLOSUM matrix is BLOSUM62 with total difference of 414.7;

Given that all three scoring matrix's closest BLOSUM matrix is BLOSUM62, suggesting the three scoring matrix is not that different after all. It was expected once the difference is normalized by its minimum value(see matrix scaled_difference_mat), the Superfamily scoring matrix would have a lower difference on BLOSUM80 and BLOSUM90 than the Twilight scoring matrix have. However, it was the opposite from the data. Given that the difference is less than 3, suggesting there is not much difference between Superfamily scoring matrix and Twilight scoring matrix.

A very interesting trend is that the difference increased from BLOSUM45 to BLOSUM50 but decreased at BLOSUM62 suggesting the scoring matrix is heavily influenced by the alignment file the scoring matrix generated from.

Additionally, as there are more entries in Superfamily(42620) than Twilight(17509), the combined scoring matrix is more similar towards Superfamily scoring matrix than Twilight scoring matrix.

In [None]:
file_path="/home/chingyuenliu/TTIC31160_bioinfo/hw2/Super_test/group1/d3sdha_.ent-d1la6a_.ent.fasta"
seq = func_file2seq(file_path)

In [None]:
# def score_calculate(seq,matrix_list):

#     score = np.zeros([len(matrix_list)])
#     for le in range(seq.shape[1]):
#         if not seq[0,le] in aa2index or not seq[1,le] in aa2index:
#             continue
#         elif aa2index[seq[0,le]] >19 or aa2index[seq[1,le]] >19:
#             continue
#         else:
#             for i in range(len(matrix_list)):
#                 score[i] += matrix_list[i][aa2index[seq[0,le]],aa2index[seq[1,le]]]
#     return score

In [None]:
def score_calculate_2(seq):

    score = np.zeros([len(score_matrix_list)])
    for le in range(seq.shape[1]):
        if not seq[0,le] in aa2index or not seq[1,le] in aa2index:
            continue
        elif aa2index[seq[0,le]] >19 or aa2index[seq[1,le]] >19:
            continue
        else:
            for i in range(len(score_matrix_list)):
                score[i] += score_matrix_list[i][aa2index[seq[0,le]],aa2index[seq[1,le]]]
    return score

In [None]:
super_score=[]
for root, dirs, files in os.walk(super_directory):
    for file in files:
        if file.endswith(".fasta") and file.startswith('d'):
            seq = func_file2seq(os.path.join(root,file))
            super_score.append(score_calculate_2(seq))


In [None]:
twil_score=[]
for root, dirs, files in os.walk(twil_directory):
    for file in files:
        if file.endswith(".fasta") and file.startswith('d'):
            seq = func_file2seq(os.path.join(root,file))
            twil_score.append(score_calculate_2(seq))


In [None]:
twil_score[0]

In [None]:
super_f_score = np.array(super_score)
twil_f_score = np.array(twil_score)

In [None]:
n_bins = 50
fig, axs = plt.subplots(1, 3, figsize=(10,10),sharey=True)
fig.suptitle('Distribution of Score of Superfamily dataset')
# We can set the number of bins with the `bins` kwarg
axs[0].hist(super_f_score[:,0], bins=n_bins)
axs[0].set_title("Super")
axs[1].hist(super_f_score[:,1], bins=n_bins)
axs[1].set_title("Twilight")
axs[2].hist(super_f_score[:,2], bins=n_bins)
axs[2].set_title("Combined")
plt.show()

In [None]:
n_bins = 50
fig, axs = plt.subplots(1, 3, sharey=True)
fig.suptitle('Distribution of Score of Twilight dataset')
# We can set the number of bins with the `bins` kwarg
axs[0].hist(twil_f_score[:,0], bins=n_bins)
axs[0].set_title("Super")
axs[1].hist(twil_f_score[:,1], bins=n_bins)
axs[1].set_title("Twilight")
axs[2].hist(twil_f_score[:,2], bins=n_bins)
axs[2].set_title("Combined")
plt.show()

For both datasets, the score is higher with the scoring matrix generated from the same datasets. Superfamily dataset have higher score with the scoring matrix generated from Superfamily dataset and Twillight dataset have higher score with the scoring matrix generated from Twilight dataset. From the histograms ploted for superfamily dataset, most scores are positive, but still, the scores from twilight scoring matrix have more negative scores. The distributio for Twilight dataset is very obvious, the distribution for superfamily scoring matrix and Combined scoring matrix is more shifted towards the negative entry. 

One difference is that for Superfamily dataset, the span of the score is more different between Superfamily scoring matrix and Twilight scoring matrix. For superfamily scoring matrix, there are numerous scores over 600 but there are no scores over 60 from twilight scoring matrix. Essentially, the superfamily scoring matrix on Superfamily dataset produced more extreme high score and increased score variance. Comparing to the scores Twilight dataset produced from Superfamily scoring matrix and twilight scoring matrix, the span difference is not that different. And twilight scoring matrix have been consistently produce scores with lower variance. 