In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

import numpy as np
import matplotlib.pyplot as plt
import sys
from Bio import SeqIO
from data_generator import data_generator

#from google.colab import drive

In [None]:
# FASTA input file names (relative paths). `small_file` is also read by the
# sequence-loading cell further down.
large_file = "uniref50.fasta"
small_file = "100k_rows.fasta"

# Exercise the project-local data_generator helper on the large file and show
# the first generated item.
# NOTE(review): the meaning of the 2000 and 1000 arguments is defined in
# data_generator.py, which is not visible here — confirm before changing them.
data_gen = data_generator(large_file, 2000)
data_gen.gen_data(1000)
print(data_gen.data[0])

In [None]:
# Colab location of the same FASTA data. This cell does not read it; the loop
# below parses `small_file`, which is defined in an earlier cell.
filename = "/content/drive_data/My Drive/proteinData/100k_rows.fasta"
#aa1_to_index = {'A': 0, 'C': 1, 'D': 2, 'E': 3, 'F': 4, 'G': 5, 'H': 6,
#                    'I': 7, 'K': 8, 'L': 9, 'M': 10, 'N': 11, 'P': 12,
#                    'Q': 13, 'R': 14, 'S': 15, 'T': 16, 'V': 17, 'W': 18,
#                    'Y': 19, 'X':20, 'NOSEQ':21, '-': 22}

# Maps each residue character to a length-26 one-hot vector. Slots are handed
# out in order of first appearance; slot 25 is assigned to the '-' padding
# symbol after the loop.
# NOTE(review): a 26th distinct symbol in the data would also receive slot 25
# (identical to the padding vector) and a 27th would raise IndexError.
acid_dict = {}

print("Reading in data...", end='')
sys.stdout.flush()
seqs = []        # per record: one-hot matrix, one row per residue, 26 columns
char_seqs = []   # per record: array of the upper-cased residue characters
counter = 0      # not used in this cell; kept in case another cell reads it
count = 0        # number of records fully processed before the current one
max_seq_len = 0  # longest record seen so far

num_seq = 5000   # maximum number of records to load

for record in SeqIO.parse(small_file, "fasta"):
    # Upper-case once and reuse for the alphabet scan and both encodings.
    residues = str(record.seq).upper()

    # Grow the alphabet lazily: every unseen symbol takes the next free slot.
    for elem in residues:
        if elem not in acid_dict:
            temp_arr = np.zeros(26)
            temp_arr[len(acid_dict)] = 1
            acid_dict[elem] = temp_arr

    max_seq_len = max(max_seq_len, len(record.seq))
    seqs.append(np.array([acid_dict[aa] for aa in residues]))
    char_seqs.append(np.array(list(residues)))

    # Stop once num_seq records have been appended.
    if count > num_seq - 2:
        break
    count += 1

# Terminate the progress line started above.
print(" done")

# One-hot vector for the padding symbol: only the last slot (index 25) is set.
append_val = np.array([0 if i != 25 else 1 for i in range(26)]).astype('float')

print(acid_dict['A'])
acid_dict.update({'-': append_val})

In [None]:
# Show the one-hot matrix built for the first loaded sequence (one row per residue).
print(seqs[0])

In [None]:
def prepare_seq(seq, dict_val, maxlen):
    """Pad a one-hot encoded sequence to ``maxlen`` rows and return it as a tensor.

    Args:
        seq: 2-D NumPy array with one one-hot row per residue. A zero-length
            array is accepted and yields a result made only of padding rows.
        dict_val: mapping from symbol to one-hot vector; its ``'-'`` entry is
            used as the padding row and fixes the row width.
        maxlen: number of rows in the returned tensor.

    Returns:
        ``torch.long`` tensor of shape ``(maxlen, len(dict_val['-']))`` holding
        the rows of ``seq`` followed by copies of the padding row.

    Raises:
        KeyError: if ``dict_val`` has no ``'-'`` entry.
        ValueError: if ``seq`` has more than ``maxlen`` rows (NumPy rejects the
            negative padding size).
    """
    # Use the mapping that was passed in rather than a module-level global.
    pad_vec = dict_val['-']
    width = len(pad_vec)

    # np.array([]) of an empty sequence is 1-D; give it the right column count
    # so it can be concatenated with the 2-D padding block.
    if seq.size == 0:
        seq = seq.reshape(0, width)

    pad_block = np.full((maxlen - seq.shape[0], width), pad_vec)
    padded = np.concatenate((seq, pad_block), axis=0)
    return torch.tensor(padded, dtype=torch.long)

# Smoke test: pad the first sequence to the longest length seen while loading.
test_seqs = prepare_seq(seqs[0], acid_dict, max_seq_len)
print(type(seqs[0]))
#print(acid_dict['-'])
#print(len(seqs))
#print(len(seqs[0]))
#print(len(seqs[0][0]))

# Disabled: pad every entry of seqs / char_seqs in place instead of on demand.
#for i in range(len(seqs)):
#    temp_mat = np.full((max_seq_len-seqs[i].shape[0], len(acid_dict['-'])),acid_dict['-'])
#    seqs[i] = np.concatenate((seqs[i],temp_mat), axis=0)
#    char_seqs[i] = np.pad(seqs[i],(0,(max_seq_len-seqs[i].shape[0])),'constant', constant_values=('-'))

# Disabled: variant of prepare_seq that starts from an all-padding matrix and
# looks each element of seq up in acid_dict.
#def prepare_seq(seq, dict_val, maxlen):
#    ret_seq = np.full((maxlen, len(acid_dict['-'])),acid_dict['-'])
#    for i, val in enumerate(seq):
#      ret_seq[i] = acid_dict[val]

#    return torch.tensor(ret_seq, dtype=torch.long)

In [None]:
#batch_size = 250
#train_samples = int(len(seqs) * 0.8)
#train_data = torch.utils.data.TensorDataset(seqs[:train_samples])
#train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size,
#                                               shuffle=True)
#test_data = torch.utils.data.TensorDataset(seqs[train_samples:])
#test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size,
#                                              shuffle=False)
#print(dir(train_loader))
#print(dir(train_loader.dataset))
#print(len(train_loader.dataset.tensors[0][0]))


In [None]:
class LSTM_model(nn.Module):
    """Embedding -> LSTM -> linear model producing per-position log-probabilities.

    Subclasses ``nn.Module`` so the embedding, LSTM and linear weights are
    registered and reachable through ``parameters()``, ``to()``,
    ``state_dict()`` and the usual ``model(x)`` call.

    Args:
        input_size: width of each embedding vector, which is also the feature
            size the LSTM consumes.
        hidden_layer_size: number of hidden units in the LSTM.
        nr_hidden_layers: number of stacked LSTM layers.
        feature_size: number of distinct input symbols and of output classes.
    """

    def __init__(self, input_size, hidden_layer_size, nr_hidden_layers, feature_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_layer_size = hidden_layer_size
        self.nr_hidden_layers = nr_hidden_layers

        # Symbol index -> dense vector of width input_size.
        self.embeddings = nn.Embedding(feature_size, input_size)

        self.model = nn.LSTM(input_size, hidden_layer_size, nr_hidden_layers)

        # Hidden state -> one score per symbol.
        self.linear = nn.Linear(hidden_layer_size, feature_size)

    def forward(self, input_data):
        """Return log-probabilities over the ``feature_size`` classes.

        Args:
            input_data: integer tensor of symbol indices in
                ``[0, feature_size)``; the LSTM is not batch-first, so a 2-D
                input is read as ``(seq_len, batch)``.

        Returns:
            Float tensor shaped like ``input_data`` with one extra trailing
            dimension of size ``feature_size``; ``exp`` of it sums to 1 along
            that last dimension.
        """
        embeds = self.embeddings(input_data)
        # The LSTM needs the float embeddings, not the raw integer indices.
        lstm_out, _ = self.model(embeds)
        tag_space = self.linear(lstm_out)
        # Normalise over the class axis, which is last whether or not a batch
        # dimension is present.
        tag_scores = F.log_softmax(tag_space, dim=-1)
        return tag_scores

In [None]:
# Build a model with input_size=max_seq_len, 500 hidden units, 1 LSTM layer
# and 26 features.
# NOTE(review): input_size is set to max_seq_len, while each row produced by
# prepare_seq has width 26 — confirm which of the two the model should see.
test = LSTM_model(max_seq_len,500,1,26)

# Push only the first sequence through the model; the loop exits after one pass.
for seq in seqs:
    seq_input = prepare_seq(seq, acid_dict, max_seq_len)
    # Width of one padded row.
    print(len(seq_input[0]))
    tag_scores = test.forward(seq_input)
    print(dir(tag_scores))
    break


# Disabled: inspect the arg-max score of the first output position.
#output = test.forward(char_seqs)
#output = output.detach().numpy()
#argmax = np.argmax(output[0])
#print(argmax)
#print(output[0][argmax])

In [None]:

#seqs = np.random.choice(full_seqs, 100000, False)

# Mini-batch size used by both loaders (the value from the disabled loader
# cell earlier in the notebook).
batch_size = 250

# The longest sequence determines the padded length.
max_length = max([len(seq) for seq in seqs])

# Each entry of seqs is a (len, 26) one-hot matrix of varying length, so the
# stacked tensor is 3-D: (n_sequences, max_length, 26). Start with every
# position set to the '-' padding vector, then overwrite the leading rows of
# each entry with that sequence's real one-hot rows.
# NOTE(review): memory grows as n_sequences * max_length * 26 floats — reduce
# num_seq or filter very long records if this does not fit.
pad_row = torch.from_numpy(acid_dict['-'])
data = torch.empty((len(seqs), max_length, pad_row.shape[0]))
data[:] = pad_row
for i, seq in enumerate(seqs):
    if len(seq):  # a zero-length record has no rows to copy
        data[i, :len(seq)] = torch.from_numpy(seq)
print("done")

print("Data shape: ", data.shape)

# 80/20 train/test split in loading order; only the training loader shuffles.
train_samples = int(len(data) * 0.8)
train_data = torch.utils.data.TensorDataset(data[:train_samples])
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size,
                                               shuffle=True)
test_data = torch.utils.data.TensorDataset(data[train_samples:])
test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size,
                                              shuffle=False)