In [19]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

from Bio import SeqIO

# Chargement des données et preprocessing

In [20]:
# Read every record of the training FASTA file and keep only its sequence string.
# NOTE(review): the path below is an absolute, machine-specific location;
# consider moving it to a configurable data directory.
with open("/Users/hugoguillaume/Downloads/train.fa", "r") as fasta_file:
    sequences = [str(entry.seq) for entry in SeqIO.parse(fasta_file, "fasta")]

# Pack the strings into a NumPy array holding one element per sequence.
sequences = np.array(sequences)

In [21]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Map every distinct sequence string to an integer id.
# NOTE(review): the encoder is fitted on whole sequences, so each protein
# becomes a single class label and the one-hot rows built below only identify
# *which* training sequence a row is; they carry no per-residue information.
# Confirm this is intended before relying on the decoder to produce new proteins.
le = LabelEncoder()
sequence_ids = le.fit_transform(sequences).reshape(-1, 1)

# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4;
# fall back to the old keyword only on versions that predate the rename.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)

# Dense array with one row per input sequence and one column per distinct sequence.
sequences = ohe.fit_transform(sequence_ids)

# Définition de l'architecture

In [22]:
class Encoder(nn.Module):
    """Two-layer fully connected network mapping an input vector to a latent code.

    Args:
        input_size: Number of features of each input vector.
        hidden_size: Width of the intermediate layer.
        latent_size: Number of features of the produced latent vector.
    """

    def __init__(self, input_size, hidden_size, latent_size):
        super().__init__()
        # Sizes are kept as attributes so they can be inspected later.
        self.hidden_size, self.latent_size = hidden_size, latent_size
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, latent_size)

    def forward(self, x):
        """Return ``fc2(relu(fc1(x)))``; no activation is applied to the output."""
        hidden = self.fc1(x).relu()
        return self.fc2(hidden)

class Decoder(nn.Module):
    """Two-layer fully connected network mapping a latent vector to an output vector.

    Args:
        input_size: Number of features of each incoming (latent) vector.
        hidden_size: Width of the intermediate layer.
        output_size: Number of features of the produced vector.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Only the hidden width is stored as an attribute.
        self.hidden_size = hidden_size
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return ``fc2(relu(fc1(x)))``; the output is left without activation."""
        hidden = self.fc1(x).relu()
        return self.fc2(hidden)

# Initialisation des paramètres

In [23]:
# Width of one encoded row; the autoencoder reconstructs vectors of the same size.
input_size = len(sequences[0])
output_size = input_size

# Layer widths shared by the encoder and the decoder.
hidden_size, latent_size = 128, 64

# Création des modèles

In [24]:
# Instantiate both halves of the autoencoder: the decoder consumes vectors of
# `latent_size` features and emits vectors of `output_size` features.
encoder = Encoder(
    input_size=input_size,
    hidden_size=hidden_size,
    latent_size=latent_size,
)
decoder = Decoder(
    input_size=latent_size,
    hidden_size=hidden_size,
    output_size=output_size,
)

# Définition de la fonction de coût

In [25]:
# Reconstruction loss: squared error averaged over all elements
# ("mean" is the default reduction, spelled out here for readability).
criterion = nn.MSELoss(reduction="mean")

# Définition de l'optimiseur

In [26]:
from itertools import chain

# A single Adam optimizer (library-default hyperparameters) updates the
# parameters of the encoder and of the decoder together.
optimizer = optim.Adam(
    chain.from_iterable(net.parameters() for net in (encoder, decoder))
)

# Entrainement

In [None]:
num_epochs = 5

# Train the autoencoder with one optimizer step per encoded row.
for epoch in range(num_epochs):
    # Sum of the per-sample losses, used only for the progress report below.
    epoch_loss = 0.0

    for sequence in sequences:
        # Convert the NumPy row to a float32 tensor before feeding the network.
        sequence = torch.from_numpy(sequence).float()

        # Clear the gradients accumulated by the previous step.
        optimizer.zero_grad()

        # Encode, then decode.
        latent_vector = encoder(sequence)
        reconstructed_sequence = decoder(latent_vector)

        # Reconstruction loss between the decoder output and the input row.
        loss = criterion(reconstructed_sequence, sequence)

        # Backpropagate and update the weights of both networks.
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # Report the mean loss so convergence (or the lack of it) is visible.
    mean_loss = epoch_loss / max(len(sequences), 1)
    print(f"Epoch {epoch + 1}/{num_epochs} - mean loss: {mean_loss:.6f}")


# Génération de nouvelles protéines 

In [None]:
# Sample one latent vector from a standard normal distribution.
latent_vector = torch.randn(1, latent_size)

# Inference only: disable autograd so no graph is built and the result is a
# plain tensor that does not require grad.
with torch.no_grad():
    new_sequence = decoder(latent_vector)

# NOTE(review): `new_sequence` is the raw decoder output, of shape
# (1, output_size); nothing here maps it back to a sequence string.