### 1. Preprocess Data

In [7]:
import numpy as np
import torch
import scipy.sparse
import os
import pandas as pd
import numpy as np
import networkx
import obonet
import json

In [2]:
contact_maps = {}
for file in os.listdir('../data/contact_maps/sparse_matrices')[:10000]:
    if file.endswith('.npz'):
        pdb_code = file.split('.')[0]
        contact_map = scipy.sparse.load_npz('../data/contact_maps/sparse_matrices/' + file)
        contact_map.resize((3000, 3000))
        contact_maps[pdb_code] = contact_map

In [3]:
url = "http://purl.obolibrary.org/obo/go/go-basic.obo"
graph = obonet.read_obo(url)
goid_to_category = {id_: data.get('namespace') for id_, data in graph.nodes(data=True)}

In [5]:
df = pd.read_csv('../data/GO/pdb_chain_go.csv', skiprows = 1, error_bad_lines=False, warn_bad_lines = False)
df = df[['PDB', 'GO_ID']]
pdb_to_go = {}
for key, value in df.values:
    go_list = pdb_to_go.get(key.upper(), set())
    go_list.add(value)
    pdb_to_go[key.upper()] = go_list

In [10]:
with open("../data/contact_maps/pdb_sequences.json","r") as f:
    sequence_dict = json.load(f)
    f.close()

In [23]:
X_mf_data = []
y_mf_data = []
for key, value in contact_maps.items():
    go_list = []
    for go_term in pdb_to_go.get(key, []):
        if goid_to_category.get(go_term, '') == 'molecular_function':
            go_list.append(go_term)
    if len(go_list) > 0:
        X_mf_data.append((sequence_dict[key], value))
        y_mf_data.append(np.array(go_list))
        
X_mf_data = np.array(X_mf_data)
y_mf_data = np.array(y_mf_data)

In [24]:
mf_terms = set()
for labels in y_mf_data:
    for label in labels:
        mf_terms.add(label)

In [25]:
mf_vocab = {mf: idx for idx, mf in enumerate(mf_terms)}

In [26]:
def to_sparse(x):
    """ converts dense tensor x to sparse format """
    x_typename = torch.typename(x).split('.')[-1]
    sparse_tensortype = getattr(torch.sparse, x_typename)

    indices = torch.nonzero(x)
    if len(indices.shape) == 0:  # if all elements are zeros
        return sparse_tensortype(*x.shape)
    indices = indices.t()
    values = x[tuple(indices[i] for i in range(indices.shape[0]))]
    return sparse_tensortype(indices, values, x.size())

In [27]:
def make_one_hot(Xs, vocab, max_len):
    n_vocab = len(vocab)
    for idx, X in enumerate(Xs):
        if idx % 10000 == 0:
            print("Current dealing with data piece no: %s" % (idx))
        tensor = torch.zeros(max_len, n_vocab)
        for chidx, ch in enumerate(X[:max_len]):
            tensor[chidx][vocab[ch]] = 1
        return to_sparse(tensor)

In [28]:
def one_hot(label, num_class):
    ones = torch.sparse.torch.eye(num_class)
    return ones[0, label]

In [29]:
labels = np.array([mf_vocab[y] for y in y_mf_data[0]])
target = np.zeros((len(mf_vocab)))
target[labels] = 1
target = torch.tensor(target)
target.sum()

tensor(6., dtype=torch.float64)

In [33]:
import torch
torch.version.cuda

In [36]:
X_mf_data[0]

array(['KLSPF',
       <3000x3000 sparse matrix of type '<class 'numpy.int64'>'
	with 18 stored elements in COOrdinate format>], dtype=object)

### 2. Train Model

In [32]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_add_pool as gap
from torch_geometric.data import Data
from torch_geometric.utils import from_scipy_sparse_matrix
from torch_geometric.data import DataLoader
from sklearn.metrics import accuracy_score

In [None]:
from sklearn.model_selection import train_test_split
X_train_mf, X_val_mf, y_train_mf, y_val_mf = train_test_split(X_mf_data, y_mf_data, test_size=0.25, random_state=41)
dataset_train = []
for adj_matrix, raw_labels in zip(X_train_mf, y_train_mf):
    labels = np.array([mf_vocab[y] for y in raw_labels])
    targets = np.zeros((len(mf_vocab)))
    targets[labels] = 1
    targets = torch.tensor(targets)
    dataset_train.append(Data(x = torch.eye(adj_matrix.shape[0], 80), edge_index = from_scipy_sparse_matrix(adj_matrix)[0], y = targets))
dataset_val = []
for adj_matrix, raw_labels in zip(X_val_mf, y_val_mf):
    labels = np.array([mf_vocab[y] for y in raw_labels])
    targets = np.zeros((len(mf_vocab)))
    targets[labels] = 1
    targets = torch.tensor(targets)
    dataset_val.append(Data(x = torch.eye(adj_matrix.shape[0], 80), edge_index = from_scipy_sparse_matrix(adj_matrix)[0], y = targets))


In [15]:
class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = GCNConv(80, 256)
        self.conv2 = GCNConv(256, 256)
        self.conv3 = GCNConv(256, 256)
        self.linear1 = torch.nn.Linear(256, 512)
        self.linear2 = torch.nn.Linear(512, len(mf_vocab))

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv3(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = gap(x, batch)
        x = self.linear1(x)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.linear2(x)
        x = torch.sigmoid(x)
        
        return x

In [16]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Net().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=5e-4)
train_loader = DataLoader(dataset_train, shuffle = True, batch_size=32)
model.train()
for epoch in range(2):
    for iteration, data in enumerate(train_loader):
        optimizer.zero_grad()
        out = model(data)
        criterion = torch.nn.BCELoss()
        loss = criterion(out, data.y.float().reshape(-1, len(mf_vocab)))
        loss.backward()
        optimizer.step()
        if iteration % 50 == 0:
            print(f'epoch {epoch + 1}, iteration {iteration}, loss = {loss}')
    

epoch 1, iteration 0, loss = 0.6951300501823425
epoch 2, iteration 0, loss = 0.28696995973587036


In [17]:
#model.eval()
#with torch.no_grad():
#    precisions = []
#    recalls = []
#    for thresh in np.arange(0.1, .31, .025):
#        precision = 0.0
#        recall = 0.0
#        num_batches = 0.0
#        for data in train_loader:
#            data = data.to(device) 
#            pred = (model(data) > .2).float().numpy()
#            label = data.y.float().reshape(-1, len(mf_vocab)).numpy()
#            precision += (label * pred).sum()/label.sum()
#           recall += (label * pred).sum()/pred.sum()
#           num_batches += 1
#        precisions.append(precision/num_batches)
#        recalls.append(recall/num_batches)

In [18]:
#import matplotlib.pyplot as plt
#plt.plot(recall, precision)
#plt.show()

In [21]:
val_loader = DataLoader(dataset_val[:256], batch_size = 256)
model.eval()
with torch.no_grad():
    precisions = []
    recalls = []
    for thresh in np.arange(0.1, .31, .025):
        precision = 0.0
        recall = 0.0
        num_batches = 0.0
        for data in val_loader:
            data = data.to(device) 
            pred = (model(data) > thresh).float().numpy()
            label = data.y.float().reshape(-1, len(mf_vocab)).numpy()
            precision += (label * pred).sum()/label.sum()
            recall += (label * pred).sum()/pred.sum()
            num_batches += 1
        print('precision = ', precision/num_batches)
        print('recall = ', recall/num_batches)
        precisions.append(precision/num_batches)
        recalls.append(recall/num_batches)

precision =  0.07922912389039993
recall =  0.3303571343421936
precision =  0.07922912389039993
recall =  0.3303571343421936


KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt
plt.plot(recall, precision)
plt.show()