In [31]:
import numpy as np
import pandas as pd

import Levenshtein

# seq_id of the wild-type row in the test set. Its sequence is the reference
# every other row is compared against, and the row itself is excluded from
# the per-mutant processing.
WTYPE_ID = 32559

In [32]:
# Load the test set indexed by seq_id, then pull out the wild-type sequence
# that serves as the reference for all mutation calls.
test = pd.read_csv(
    'data/test.csv',
    index_col='seq_id',
)
wtype = test['protein_sequence'].loc[WTYPE_ID]

In [33]:
def get_mutation(row, wtype):
    """Annotate ``row`` with the single edit that turns ``wtype`` into its sequence.

    Parameters
    ----------
    row : pandas.Series
        A row exposing a ``protein_sequence`` attribute. It is modified in
        place: ``wt``, ``mt`` and ``pos`` entries are added.
    wtype : str
        The reference (wild-type) sequence.

    Returns
    -------
    pandas.Series
        The same ``row`` with:
        ``wt``  - the residue of ``wtype`` at the edited position,
        ``mt``  - the replacing residue, or ``'*'`` for a deletion,
        ``pos`` - the position in ``wtype`` reported by ``Levenshtein.editops``.

    Raises
    ------
    AssertionError
        If the sequence differs from ``wtype`` by anything other than exactly
        one edit operation, or if that operation is an insertion.
    """
    mutant = row.protein_sequence

    operations = Levenshtein.editops(wtype, mutant)
    assert len(operations) == 1, f'Found sequence with {len(operations)} mutations.'

    # Each edit op is a triple; only the kind and the source position are used.
    kind, site, _ = operations[0]
    assert kind != 'insert', 'Found insertion.'

    if kind in ('replace', 'delete'):
        row['wt'] = wtype[site]
        # A deletion has no replacement residue, so it is marked with '*'.
        row['mt'] = mutant[site] if kind == 'replace' else '*'
        row['pos'] = site

    return row

In [34]:
# Drop the wild-type row (it has no mutation to describe) and annotate every
# remaining row with its wt / mt / pos columns.
test_wo_wtype = (
    test
    .drop(index=WTYPE_ID)
    .apply(get_mutation, axis=1, wtype=wtype)
)

test_wo_wtype.head()

Unnamed: 0_level_0,protein_sequence,pH,data_source,wt,mt,pos
seq_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
31390,VPVNPEPDATSVENVAEKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,L,E,16
31391,VPVNPEPDATSVENVAKKTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,L,K,16
31392,VPVNPEPDATSVENVAKTGSGDSQSDPIKADLEVKGQSALPFDVDC...,8,Novozymes,L,*,16
31393,VPVNPEPDATSVENVALCTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,K,C,17
31394,VPVNPEPDATSVENVALFTGSGDSQSDPIKADLEVKGQSALPFDVD...,8,Novozymes,K,F,17


In [44]:
# Parse the DeepDDG prediction file: the first line is skipped (presumably a
# header), and every following line is split on whitespace into
# chain / wt / pos / mt / ddG.
with open('model.txt', 'r') as fobject:
    file = fobject.readlines()

# Ignore blank lines (e.g. a trailing newline at the end of the file). Without
# this filter a blank line becomes an all-None row and the numeric casts below
# fail with an unhelpful TypeError.
fields = [line.split() for line in file[1:] if line.strip()]

deepddg = pd.DataFrame(fields).rename(columns={0: 'chain', 1: 'wt', 2: 'pos', 3: 'mt', 4: 'ddG'})

# Positions are shifted down by one so they are comparable with the `pos`
# values stored on the test rows. The casts raise if a row is malformed
# (missing or non-numeric field), which is preferable to carrying bad rows on.
deepddg.pos = deepddg.pos.astype('int64') - 1
deepddg.ddG = deepddg.ddG.astype('float64')

deepddg

Unnamed: 0,chain,wt,pos,mt,ddG
0,A,V,0,R,-0.132
1,A,V,0,N,-0.155
2,A,V,0,D,-0.158
3,A,V,0,C,-0.162
4,A,V,0,Q,-0.111
...,...,...,...,...,...
4193,A,K,220,S,-1.523
4194,A,K,220,T,-2.530
4195,A,K,220,W,-1.325
4196,A,K,220,Y,-1.250


In [49]:
# Look up a DeepDDG score for every mutant in the test set.
#
# The predictions are indexed once up front, so each mutant costs a dictionary
# lookup instead of a boolean scan over the whole `deepddg` table, and the
# result frame is built in one go rather than from one tiny DataFrame per row.

# (wt, pos, mt) -> ddG. setdefault keeps the first row if a key is repeated,
# matching a "first match wins" lookup.
ddg_by_substitution = {}
for wt, pos, mt, ddg in zip(deepddg.wt, deepddg.pos, deepddg.mt, deepddg.ddG):
    ddg_by_substitution.setdefault((wt, pos, mt), ddg)

# (wt, pos) -> lowest ddG over all substitutions listed for that site. This is
# the score used for deletions ('*'), which have no row of their own.
min_ddg_by_site = deepddg.groupby(['wt', 'pos'])['ddG'].min().to_dict()

seq_ids, scores, unmatched = [], [], []

for seq_id, wt, pos, mt in test_wo_wtype[['wt', 'pos', 'mt']].itertuples(name=None):

    if mt != '*':
        ddg = ddg_by_substitution.get((wt, pos, mt))
    else:
        ddg = min_ddg_by_site.get((wt, pos))

    if ddg is None:
        unmatched.append((seq_id, wt, pos, mt))
        continue

    seq_ids.append(seq_id)
    scores.append(ddg)

# Fail loudly, naming the offending rows, instead of an opaque IndexError for
# substitutions or a silent NaN for deletions.
if unmatched:
    raise KeyError(
        f'No DeepDDG prediction for {len(unmatched)} mutation(s); '
        f'first few (seq_id, wt, pos, mt): {unmatched[:5]}'
    )

# One 'ddG' column, indexed by seq_id (index left unnamed).
test_ddg = pd.DataFrame({'ddG': scores}, index=seq_ids)

In [50]:
# Reshape the scores for export: index named `seq_id`, single column `tm`.
test_ddg = (
    test_ddg
    .reset_index()
    .rename(columns={'index': 'seq_id', 'ddG': 'tm'})
    .set_index('seq_id')
)

# The wild-type row was left out of the scoring; add it back with a score of 0.
test_ddg.loc[WTYPE_ID] = 0

test_ddg = test_ddg.sort_index()
test_ddg.to_csv('deepddg.csv', index=True)

test_ddg.head()

Unnamed: 0_level_0,tm
seq_id,Unnamed: 1_level_1
31390,-0.226
31391,-0.169
31392,-0.858
31393,-1.277
31394,-1.353
