# Example: Find a model for Proteins (predicting the Matthews density from unit-cell parameters and solvent content)

In [1]:
import json
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import TensorDataset, DataLoader

import matplotlib.colors as mcolors
import matplotlib.pyplot as plt

# Color-blind-friendly palette, given as hex color strings
hex1 = ['#648FFF', '#785EF0', '#DC267F', '#FE6100', '#FFB000']
# The same palette in reverse order
hex1_inverted = list(reversed(hex1))
# The palette converted to RGB tuples via matplotlib
colors1 = list(map(mcolors.to_rgb, hex1))

## Load protein data from PDB dataset

In [2]:
# Read the dataset file line by line; every line is decoded as one JSON
# document and the documents are collected, in file order, in `jsons`.
# NOTE(review): the saved output of this cell is a JSONDecodeError, so the
# file at this path apparently contains a line that is not valid JSON —
# confirm the expected file format.
jsons = []
with open('../../../supplemental_material/pdb/dataset2.txt') as dataset_file:
    for raw_line in dataset_file:
        document = json.loads(raw_line)
        jsons.append(document)

JSONDecodeError: Expecting value: line 1 column 9736 (char 9735)

In [None]:
# Work with the first decoded JSON document and report how many entries it holds
data = jsons[0]
print(f"{len(data)} entries")

## Generate train and test datasets from entries of the proteins

In [None]:
# Build the feature matrix and the target vector from the protein entries.
#   Features (7 columns): cell angles alpha, beta, gamma; cell lengths a, b, c;
#                         density_percent_sol of the first crystal record.
#   Target   (1 column):  density_matthews of the first crystal record.
# An entry is used only if every value read below is present and not None.
# Checking the exact keys (rather than counting how many keys a record has)
# avoids KeyError on records with unexpected keys, IndexError on an empty
# 'exptl_crystal' list, and NaN rows caused by None values.
CELL_KEYS = ('angle_alpha', 'angle_beta', 'angle_gamma',
             'length_a', 'length_b', 'length_c')
CRYSTAL_KEYS = ('density_matthews', 'density_percent_sol')


def _has_values(record, required_keys):
    """Return True if `record` is a dict with a non-None value for every key in `required_keys`."""
    return isinstance(record, dict) and all(
        record.get(key) is not None for key in required_keys
    )


indices = []        # positions in `data` of the entries that were used
feature_rows = []   # one 7-element row per used entry
target_rows = []    # one 1-element row per used entry
for i, item in enumerate(data):
    entry = item['entry']
    cell = entry.get('cell')
    crystals = entry.get('exptl_crystal')
    # Only the first crystal record is used; a missing or empty list means "no record"
    crystal = crystals[0] if crystals else None
    if not (_has_values(cell, CELL_KEYS) and _has_values(crystal, CRYSTAL_KEYS)):
        continue
    feature_rows.append([cell[key] for key in CELL_KEYS] + [crystal['density_percent_sol']])
    target_rows.append([crystal['density_matthews']])
    indices.append(i)

# reshape keeps the arrays two-dimensional even when no entry qualified
all_inputs = np.array(feature_rows, dtype=float).reshape(-1, 7)
all_targets = np.array(target_rows, dtype=float).reshape(-1, 1)

print('Number of entries: %d' % len(all_inputs))

In [5]:
# Fix the NumPy seed so the train/test split is the same on every run
np.random.seed(42)

# Random ordering of all row positions
indices = np.random.permutation(len(all_inputs))

# The first three quarters of the shuffled positions form the training set,
# the remaining quarter forms the test set
split_at = 3 * len(all_inputs) // 4
train_indices, test_indices = indices[:split_at], indices[split_at:]

# Gather the input rows and targets that belong to each set
inputs_training, targets_training = all_inputs[train_indices], all_targets[train_indices]
inputs_testing, targets_testing = all_inputs[test_indices], all_targets[test_indices]

In [6]:
# Copy the training arrays into float32 tensors for use with PyTorch
input_tensor, target_tensor = (
    torch.tensor(array, dtype=torch.float32)
    for array in (inputs_training, targets_training)
)

## Define Model

In [7]:
# Define the model architecture using a sequential container

# Seed PyTorch's random number generator so that the randomly initialized
# layer weights (and therefore the training results) are the same on every run
torch.manual_seed(42)

# Input size, hidden layer width, output size
D_i, D_k, D_o = 7, 50, 1

# Fully connected network: D_i inputs -> five layers of width D_k, each
# followed by a ReLU activation -> one linear layer producing D_o outputs
model = nn.Sequential(
    nn.Linear(D_i, D_k),
    nn.ReLU(),
    nn.Linear(D_k, D_k),
    nn.ReLU(),
    nn.Linear(D_k, D_k),
    nn.ReLU(),
    nn.Linear(D_k, D_k),
    nn.ReLU(),
    nn.Linear(D_k, D_k),
    nn.ReLU(),
    nn.Linear(D_k, D_o),
)

In [8]:
# Loss function: mean squared error (MSE), averaged over all elements of a batch
criterion = nn.MSELoss(reduction='mean')

In [9]:
# Adam optimizer at its default learning rate, with weight decay
# (L2 regularization) to counter overfitting
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
# Scheduler that multiplies the learning rate by 0.5 after every 50 scheduler steps
scheduler = StepLR(optimizer, gamma=0.5, step_size=50)

In [None]:

# Train the model with mini-batch gradient descent
num_epochs = 1000  # Number of passes over the training set
losses = []  # Loss of every mini-batch, i.e. one value per optimizer step
# Serve the training tensors in mini-batches of 32 samples
data_loader = DataLoader(TensorDataset(input_tensor, target_tensor), batch_size=32)
for epoch in range(num_epochs):
    epoch_loss = 0.0  # Sum of the mini-batch losses of this epoch
    # Unpack each batch directly into inputs and targets. The batch is
    # deliberately not bound to the name `data`, which holds the list of
    # protein entries used by the dataset-generation cell.
    for inputs, targets in data_loader:
        # Clear the gradients accumulated by the previous step
        optimizer.zero_grad()

        # Forward pass: model output and its loss against the targets
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        # Read the scalar loss once and record it per batch and per epoch
        batch_loss = loss.item()
        epoch_loss += batch_loss
        losses.append(batch_loss)
    print(f'Epoch {epoch:5d}, loss {epoch_loss:.3f}')
    # One scheduler step per epoch; the scheduler decides when to lower the learning rate
    scheduler.step()

In [None]:
# Plot every recorded loss value against its position in `losses`,
# using a logarithmic x axis
fig, ax = plt.subplots()
ax.plot(losses)
ax.set_xscale('log')
# (a logarithmic y axis can be enabled with ax.set_yscale('log'))
ax.set_xlabel('Iteration')
ax.set_ylabel('Loss')
plt.show()

In [12]:
# Evaluate the model on the test inputs without tracking gradients and
# return the predictions as a NumPy array
with torch.no_grad():
    predictions_test = model(torch.tensor(inputs_testing, dtype=torch.float32)).numpy()

In [None]:
# Relative deviation of each test prediction from its target, in percent
percent_error = 100 * (predictions_test / targets_testing - 1)

fig, ax = plt.subplots()
ax.plot(percent_error)
ax.set_xlabel('Index')
ax.set_ylabel('Percent Error')
# Restrict the view to +/- 25 percent
ax.set_ylim([-25, 25])
plt.show()

In [None]:
# Mean squared prediction error on the test set, i.e. the variance of the
# prediction errors about zero. The mean (not the sum) is used so that the
# value does not grow with the number of test samples.
var_performance = np.mean((predictions_test - targets_testing) ** 2)
print('Variance of performance: %.3f' % var_performance)

In [None]:
# Compare predictions and targets on the test set, with the samples sorted
# by increasing target value
order = np.argsort(targets_testing.flatten())
xvals = np.arange(len(targets_testing))

fig, ax = plt.subplots()
ax.scatter(xvals, predictions_test[order], s=10, color=colors1[0], label='Predictions')
ax.scatter(xvals, targets_testing[order], s=10, color=colors1[2], label='Targets')
ax.set_xlabel('Index')
ax.set_ylabel('Matthews Coefficient')
ax.legend()
plt.show()