In [None]:
# Load the raw sequences, one per line. Only end-of-line characters are removed
# (instead of blindly dropping the last character), so a final line without a
# trailing newline keeps its last character and CRLF files are handled as well.
with open('data/sequences.txt', 'r') as f:
    sequences = [line.rstrip('\r\n') for line in f]

# Split the samples into a labelled (train) and an unlabelled (test) set.
# Each line of graph_labels.txt is comma-separated: the first field is kept as
# the protein identifier, the second is the integer class label. An empty
# second field marks a sample to predict. Line i of this file is paired with
# line i of sequences.txt.
sequences_train = []
sequences_test = []
proteins_test = []
y_train = []
with open('data/graph_labels.txt', 'r') as f:
    for i, line in enumerate(f):
        fields = line.rstrip('\r\n').split(',')
        label = fields[1].strip()
        if not label:
            sequences_test.append(sequences[i])
            proteins_test.append(fields[0])
        else:
            sequences_train.append(sequences[i])
            y_train.append(int(label))

# Sanity check: sizes of every collection built above.
len(sequences), len(sequences_train), len(y_train), len(sequences_test), len(proteins_test)

In [None]:
# Vocabulary: every distinct character seen in any sequence (train and test),
# in sorted order so that token ids are stable between runs.
set_char = sorted({letter for sequence in sequences for letter in sequence})

# Character -> integer id, following the sorted vocabulary order.
tokenizer = dict(zip(set_char, range(len(set_char))))

# Training sequences rewritten as lists of token ids.
tok_seq_train = [[tokenizer[letter] for letter in sequence] for sequence in sequences_train]

import tensorflow as tf
import numpy as np

tfkl = tf.keras.layers

# Features: for every training sequence, how many times each token id occurs.
# The sequences have different lengths, hence the ragged tensor.
rt = tf.ragged.constant(tok_seq_train)
layer = tfkl.CategoryEncoding(num_tokens=len(set_char), output_mode="count")
count_encoding = layer(rt).numpy()

# Targets: one-hot rows over 18 classes.
# NOTE: `y_train` must still be the list of integer labels built when the data
# was loaded; the split cell further down rebinds that name to one-hot rows,
# so this cell cannot be re-run after it without reloading the labels.
label_column = tf.expand_dims(y_train, 1)
one_hot_layer = tfkl.CategoryEncoding(num_tokens=18, output_mode='one_hot')
enclabels = one_hot_layer(label_column).numpy()

In [None]:
from sklearn.model_selection import train_test_split

# Keep a handle on the full labelled set (features and one-hot targets).
X_tot, y_tot = count_encoding, enclabels

# Hold out 20% of the labelled samples to measure generalisation; the seed is
# fixed so the split is the same on every run.
# NOTE: this rebinds `y_train` from the integer label list to one-hot rows.
X_train, X_test, y_train, y_test = train_test_split(
    X_tot,
    y_tot,
    test_size=0.20,
    random_state=42,
)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Convert the train / hold-out splits to float32 torch tensors.
X_train, X_test, y_train, y_test = (
    torch.tensor(array, dtype=torch.float32)
    for array in (X_train, X_test, y_train, y_test)
)

# Same conversion for the full labelled set.
X_tot, y_tot = (
    torch.tensor(array, dtype=torch.float32)
    for array in (X_tot, y_tot)
)

In [None]:
from torch.utils.data import DataLoader, Dataset

# Recompute the vocabulary and the count features; this repeats the recipe of
# the encoding cell earlier in the notebook.
set_char = sorted({letter for sequence in sequences for letter in sequence})
tokenizer = {char: idx for idx, char in enumerate(set_char)}
tok_seq_train = [[tokenizer[letter] for letter in sequence] for sequence in sequences_train]

layer = tf.keras.layers.CategoryEncoding(
    num_tokens=len(set_char), output_mode="count")

rt = tf.ragged.constant(tok_seq_train)

# Convert to NumPy so that `count_encoding` keeps the type it has where it is
# first computed. Leaving a tf.Tensor under that name would change what the
# split cell receives if it is re-run after this cell.
count_encoding = layer(rt).numpy()

class ClassifDataset(Dataset):
    """Map-style dataset pairing each input with its label.

    Args:
        sequences: Indexable collection of model inputs.
        labels: Indexable collection of targets, indexed in step with
            ``sequences``.
    """

    def __init__(self, sequences, labels):
        self.sequences, self.labels = sequences, labels

    def __getitem__(self, index):
        """Return the ``(sequence, label)`` pair stored at ``index``."""
        return self.sequences[index], self.labels[index]

    def __len__(self):
        """Number of samples, taken from the length of ``sequences``."""
        return len(self.sequences)

# X_train / y_train are already float32 torch tensors here (converted in an
# earlier cell). Wrapping a tensor in torch.tensor() emits a copy-construct
# UserWarning, so the independent copies are taken with clone().detach(),
# which is the form PyTorch recommends.
dataset = ClassifDataset(
    X_train.clone().detach(),
    labels=y_train.clone().detach())

# Mini-batches of 32 samples, reshuffled at every epoch.
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [None]:
device = 'cpu'

X_train = X_train.to(device)
y_train = y_train.to(device)

# Read the layer widths from the data instead of hard-coding them (the values
# were previously fixed at 21 input features and 18 classes), so the network
# follows the vocabulary size and the number of one-hot label columns.
n_features = X_train.shape[1]
n_classes = y_train.shape[1]

# Fully connected classifier: n_features -> 1024 -> 128 -> 128 -> n_classes,
# with dropout applied to each hidden layer.
model = nn.Sequential(
    nn.Linear(n_features, 1024),
    nn.Dropout(0.3),
    nn.ReLU(),
    nn.Linear(1024, 128),
    nn.Dropout(0.2),
    nn.ReLU(),
    nn.Linear(128, 128),
    nn.Dropout(0.2),
    nn.ReLU(),
    nn.Linear(128, n_classes),
    # The model emits log-probabilities. nn.CrossEntropyLoss (used for
    # training) applies log-softmax again, which leaves log-probabilities
    # unchanged, so this layer is redundant for the loss but kept so that
    # the model's outputs stay log-probabilities.
    nn.LogSoftmax(dim=1)
    ).to(device)


criterion = nn.CrossEntropyLoss()

# The base learning rate is deliberately 1: LambdaLR multiplies the base rate
# by the value returned from the schedule function, so with a base of 1 that
# value *is* the effective learning rate.
optimizer = optim.Adam(model.parameters(), lr=1, weight_decay=0.01)


def _linear_decay(epoch):
    """Learning rate at ``epoch``: linear ramp from 1e-4 (epoch 0) to 1e-8 (epoch 500)."""
    return 1e-8 + (1e-4 - 1e-8) * (1 - epoch / 500)


scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=_linear_decay)

# Train for 500 epochs (the same horizon the learning-rate schedule decays over).
for epoch in range(500):
    # One pass over the shuffled mini-batches.
    for inputs, targets in dataloader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        loss = criterion(model(inputs), targets)
        loss.backward()
        optimizer.step()

    # The schedule is stepped once per epoch; print the rate it just set.
    scheduler.step()
    print(scheduler.get_last_lr()[0])

    # Measure training accuracy with dropout disabled and without building an
    # autograd graph over the whole training set, then return to train mode
    # so the next epoch keeps using dropout.
    model.eval()
    with torch.no_grad():
        predicted = torch.argmax(model(X_train), dim=1)
    model.train()

    train_acc = (predicted == torch.argmax(y_train, dim=1)).to('cpu').numpy().mean()
    print(f"Epoch {epoch}, {100*train_acc}%")

In [None]:
# Switch to inference mode (dropout disabled). eval() returns the module
# itself, so the flag can be read off directly to confirm the switch.
model.eval().training

In [None]:
# Accuracy on the training split and on the held-out split. The forward passes
# run under no_grad so no autograd graph is built over the full datasets.
# Relies on the model having been put in eval mode by the preceding cell.
with torch.no_grad():
    train_pred = torch.argmax(model(X_train), dim=1)
    test_pred = torch.argmax(model(X_test), dim=1)

# Targets are one-hot rows, so argmax recovers the class index.
train_acc = (train_pred == torch.argmax(y_train, dim=1)).numpy().mean()
test_acc = (test_pred == torch.argmax(y_test, dim=1)).numpy().mean()
train_acc, test_acc

In [None]:
from sklearn.metrics import accuracy_score, log_loss

# The model outputs log-probabilities; a softmax over them gives back the
# class probabilities that log_loss expects.
test_probabilities = torch.softmax(model(X_test), dim=1).detach().numpy()

# Multi-class log loss on the held-out split (targets are one-hot rows).
log_loss(y_true=y_test, y_pred=test_probabilities)

Log loss (test) : 1.550464783515805 avec l'architecture 1024 > 128 > 128 > 18, dropout à 0.3 et 2 × 0.2

1.5220981442226547
