<a href="https://colab.research.google.com/github/DanielOlson/CompBioAsia/blob/main/CompBioAsia_SecondaryStructure.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Download the gzipped CullPDB profile array and give it a short local name
# (data.npy.gz), which is the path the loading cell reads.
!wget http://www.princeton.edu/~jzthree/datasets/ICML2014/cullpdb+profile_5926_filtered.npy.gz
!mv cullpdb+profile_5926_filtered.npy.gz data.npy.gz

In [None]:
# Mount Google Drive at /content/drive (only works inside Google Colab).
# NOTE(review): none of the visible cells read from the mount -- confirm it is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import gzip

import numpy as np
import torch
from torch import nn

In [None]:
# np.load does not decompress gzip when handed a filename, and depending on
# the downloader the file on disk may or may not still be compressed. Sniff
# the two-byte gzip magic number and pick the matching opener.
data_path = 'data.npy.gz'
with open(data_path, 'rb') as raw_file:
  is_gzipped = raw_file.read(2) == b'\x1f\x8b'

with (gzip.open if is_gzipped else open)(data_path, 'rb') as data_file:
  x = np.load(data_file)

# Reshape to (n_examples, 700 positions, 57 features per position).
x = x.reshape(-1, 700, 57)

# Features 0..21 are the model inputs and 22..30 the targets. The last two
# axes are swapped to (batch, channels, length), the layout nn.Conv1d expects.
aminos = torch.transpose(torch.tensor(x[:,:,0:22]), -1, -2).float()
labels = torch.transpose(torch.tensor(x[:,:,22:31]), -1, -2).float()

# The first n_train examples are used for training, the rest for testing.
n_train = 5000

aminos_train = aminos[:n_train]
aminos_test = aminos[n_train:]

labels_train = labels[:n_train]
labels_test = labels[n_train:]

print(aminos.shape, labels.shape)
print(aminos_train.shape, aminos_test.shape)
print(labels_train.shape, labels_test.shape)


# Symbol tables plus their symbol -> index lookups. There are 22 amino-acid
# symbols and 9 secondary-structure symbols, matching the channel counts of
# `aminos` and `labels`; presumably listed in channel order -- verify against
# the dataset description.
alphabet = list('ACEDGFIHKMLNQPSRTWVYX') + ['NoSeq']
alpha_to_num = dict(zip(alphabet, range(len(alphabet))))

secondary_labels = list('LBEGIHST') + ['NoSeq']
label_to_num = dict(zip(secondary_labels, range(len(secondary_labels))))


In [None]:
def train_with_data(x, y,
                    model, batch_size,
                    steps, learning_rate,
                    loss_function, checkin=100):
  """Train `model` in place with Adam on random mini-batches of (x, y).

  Args:
    x: input tensor, indexed by example along the first dimension.
    y: target tensor; `y[i]` is the target for `x[i]`.
    model: torch module to optimise. It is moved to the GPU when one is
      available and moved back to the CPU before returning.
    batch_size: number of examples drawn (with replacement) per step.
    steps: number of optimiser steps to run.
    learning_rate: learning rate passed to `torch.optim.Adam`.
    loss_function: callable invoked as `loss_function(model(batch_x), batch_y)`.
    checkin: print progress every `checkin` steps. `None` or a value <= 0
      disables progress output.

  Returns:
    None. The model's parameters are updated in place.
  """
  dev = 'cuda:0' if torch.cuda.is_available() else 'cpu'

  model.to(dev)
  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

  # A missing or non-positive interval silences reporting instead of crashing
  # on `step % 0`.
  report_progress = checkin is not None and checkin > 0

  for step in range(steps):
    # Draw the indices of this step's mini-batch uniformly at random.
    batch_i = torch.randint(0, len(x), (batch_size,))
    batch_x = x[batch_i].to(dev)
    batch_y = y[batch_i].to(dev)

    # Clear gradients before the backward pass so that nothing accumulated
    # outside this loop leaks into the update.
    optimizer.zero_grad()

    out = model(batch_x)
    loss = loss_function(out, batch_y)
    loss.backward()
    optimizer.step()

    if report_progress and step % checkin == 0:
      print('progress: {:.2%} '.format(step / steps), 'loss:', float(loss))

  model.to('cpu')

# This function just tests our accuracy
def test_accuracy(x, y, model, threshold = 0.5, batch_size=32):
  dev = 'cpu'
  if torch.cuda.is_available():
    dev = 'cuda:0'

  model.to(dev)

  correct_aas = 0
  incorrect_aas = 0

  correct_ss = 0
  incorrect_ss = 0
  with torch.no_grad():
   # print(x.shape)
    batches = int(len(x) / batch_size) - 1# we lose a little bit of data but thats ok
   # print(batches)

    for batch in range(batches):
      start_idx = batch * batch_size

      batch_x = x[start_idx:start_idx+batch_size].to(dev)
      batch_y = y[start_idx:start_idx+batch_size].to(dev)

      out = model(batch_x)
      out = out > threshold

      correct = out == batch_y
      incorrect = out != batch_y

      

      noseq = batch_y[:,-1,:] == 0
      not_interesting = batch_y[:,:-1,:] 
      not_interesting = not_interesting > 0.5
      not_interesting = torch.sum(not_interesting, dim=1).view(-1, 1, 700)
      not_interesting = not_interesting > 0
      not_interesting = not_interesting.view(-1, 1, 700)
  #    print(not_interesting.shape)
      noseq = noseq.unsqueeze(1)
      correct *= noseq
      incorrect *= noseq

      correct_aas += torch.sum(correct)
      incorrect_aas += torch.sum(incorrect)

      correct *= not_interesting
      incorrect *= not_interesting

      correct_ss += torch.sum(correct)
      incorrect_ss += torch.sum(incorrect)

   #   comparison = torch.where(batch_y[:,8,:])

     

  model.to('cpu')
  return float(correct_aas / (correct_aas + incorrect_aas)), float(correct_ss / (correct_ss + incorrect_ss))


In [None]:
# Seed the RNG so weight initialisation and batch sampling are repeatable.
torch.manual_seed(0)

batch_size = 32 # How many protein sequences we train on at every step
steps = 2000 # How many total steps we will train for
learning_rate = 0.0001 # Step size the optimizer uses for each parameter update
loss_function = nn.MSELoss() # Which loss function we're using
# How often we print our loss (smaller = more frequently). Clamped to at
# least 1 so a small `steps` cannot produce an interval of 0.
checkin = max(1, steps // 5)


# Position-wise mix of the 22 input channels into 10 (kernel_size=1), then a
# window of 5 positions mapped to the 9 label channels, squashed into (0, 1).
model = nn.Sequential(nn.Conv1d(22, 10, kernel_size=1, stride=1, padding='same'),
                      nn.ELU(),
                      nn.Conv1d(10, 9, kernel_size=5, stride=1, padding='same'),
                      nn.Sigmoid())

# Baseline accuracy of the untrained model.
print(test_accuracy(aminos_test, labels_test, model))


train_with_data(aminos_train, labels_train, model,
                batch_size, steps, learning_rate,
                loss_function, checkin=checkin)


# Accuracy after training.
print(test_accuracy(aminos_test, labels_test, model))