In [1]:
import re
import torch
import pickle
import argparse
import numpy as np
import pandas as pd

from transformers import BertModel, BertTokenizer

### 1. Load data

In [2]:
def _read_ids_and_seqs(tsv_path):
    """Read a tab-separated table and return (frame, column-0 values, column-3 values)."""
    table = pd.read_csv(tsv_path, sep = "\t")
    return table, table.iloc[:, 0].values, table.iloc[:, 3].values

# Both tables are read the same way: positional column 0 feeds the
# *_uniprot_IDs arrays and positional column 3 feeds the *_uniprot_seqs arrays.
IC50_df, IC50_uniprot_IDs, IC50_uniprot_seqs = _read_ids_and_seqs("../../input_data/BindingDB/IC50_data.tsv")
Ki_df, Ki_uniprot_IDs, Ki_uniprot_seqs = _read_ids_and_seqs("../../input_data/BindingDB/Ki_data.tsv")

### 2. Get protein features

In [3]:
def get_info(dataset, uniprot_ids, uniprot_seqs):
    """Pair IDs with sequences and report how many distinct IDs there are.

    Args:
        dataset: Label shown in the printed summary line.
        uniprot_ids: Iterable of IDs, used as dictionary keys.
        uniprot_seqs: Iterable of sequences, paired positionally with the IDs.

    Returns:
        dict mapping each ID to a sequence. When an ID occurs more than once,
        the sequence paired with its last occurrence is kept.
    """
    id_to_seq = dict(zip(uniprot_ids, uniprot_seqs))
    print(f"[{dataset}] Uniprot_IDs: {len(id_to_seq)}")
    return id_to_seq

In [4]:
# Build one ID -> sequence dictionary per dataset; the generator is consumed
# left to right, so the IC50 summary line is printed before the Ki one.
IC50_protein_seqs_dict, Ki_protein_seqs_dict = (
    get_info(label, ids, seqs)
    for label, ids, seqs in (
        ("IC50", IC50_uniprot_IDs, IC50_uniprot_seqs),
        ("Ki", Ki_uniprot_IDs, Ki_uniprot_seqs),
    )
)

[IC50] Uniprot_IDs: 4347
[Ki] Uniprot_IDs: 2431


In [5]:
# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Tokenizer for the ProtBert checkpoint, with lower-casing switched off.
tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case = False)

# Load the pretrained encoder, move it to the chosen device and put it in
# eval mode; both .to() and .eval() return the module itself, so they chain.
prots_model = BertModel.from_pretrained("Rostlab/prot_bert").to(device).eval()

Some weights of the model checkpoint at Rostlab/prot_bert were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
def get_features(protein_seqs_dict, max_residues = 1500):
    """Compute per-residue embeddings for every protein in a dictionary.

    Uses the notebook-level ``tokenizer``, ``prots_model`` and ``device``.

    Args:
        protein_seqs_dict: Mapping from protein ID to amino-acid sequence string.
        max_residues: Upper bound on the number of residue rows kept per
            protein. The default of 1500 reproduces the cut-off that was
            previously hard-coded as the 1503 / 1502 token thresholds.

    Returns:
        dict mapping each protein ID to a NumPy array with one embedding row
        per kept residue. The row of the leading special token is always
        dropped, and so is the trailing one when the sequence is not truncated.
    """
    protein_features_dict = dict()

    for PID, sequence in protein_seqs_dict.items():
        # Replace the residues U, Z, O and B with X, then separate every
        # residue with a space before handing the string to the tokenizer.
        spaced_seq = " ".join(re.sub(r"[UZOB]", "X", sequence))

        # `padding="longest"` is the supported spelling of the deprecated
        # `pad_to_max_length=True` when no explicit max_length is given; with a
        # single-sequence batch it adds no padding at all.
        encoded = tokenizer.batch_encode_plus([spaced_seq], add_special_tokens = True, padding = "longest")
        input_ids = torch.tensor(encoded['input_ids']).to(device)
        attention_mask = torch.tensor(encoded['attention_mask']).to(device)

        with torch.no_grad():
            embedding = prots_model(input_ids = input_ids, attention_mask = attention_mask)[0]

            # Count attended tokens as a plain Python int (the two special
            # tokens are included in this count, hence the "- 2" below).
            n_tokens = int((attention_mask[0] == 1).sum())
            n_residues = min(n_tokens - 2, max_residues)

            # Slice first, then move to host memory, so rows that are going to
            # be discarded are never copied off the device.
            seq_emd = embedding[0, 1:1 + n_residues].cpu().numpy()

        protein_features_dict[PID] = seq_emd

    return protein_features_dict

In [None]:
def _extract_and_save(dataset, protein_seqs_dict, out_path):
    """Compute features for one dataset, print a summary line and pickle them.

    Args:
        dataset: Label shown in the summary line.
        protein_seqs_dict: Mapping from protein ID to sequence, passed to get_features.
        out_path: Destination of the pickle file.

    Returns:
        The feature dictionary that was written to ``out_path``.
    """
    feature_dict = get_features(protein_seqs_dict)
    # The label names the dataset actually processed; the earlier message said
    # "PDBbind" for both BindingDB subsets.
    print(f"[{dataset}] protein features: {len(feature_dict)}")
    with open(out_path, "wb") as f:
        pickle.dump(feature_dict, f)
    return feature_dict

IC50_feature_dict = _extract_and_save("IC50", IC50_protein_seqs_dict, "../../input_data/BindingDB/IC50_protein_features.pkl")
Ki_feature_dict = _extract_and_save("Ki", Ki_protein_seqs_dict, "../../input_data/BindingDB/Ki_protein_features.pkl")