In [1]:
import random

# Load the parallel corpus
with open('/content/drive/MyDrive/fr-en/europarl-v7.fr-en.fr', 'r') as f:
    de_data = f.readlines()
with open('/content/drive/MyDrive/fr-en/europarl-v7.fr-en.en', 'r') as f:
    en_data = f.readlines()

# Combine the German and English data into parallel examples
examples = list(zip(de_data, en_data))

# Shuffle the examples
random.shuffle(examples)

# Select a subset of the examples
num_examples = 10000
selected_examples = examples[:num_examples]

# Split the selected examples into separate German and English files
with open('selected_de_data.txt', 'w') as f:
    for example in selected_examples:
        f.write(example[0])
with open('selected_en_data.txt', 'w') as f:
    for example in selected_examples:
        f.write(example[1])


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

class Seq2Seq(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Seq2Seq, self).__init__()
        self.encoder = nn.Embedding(input_size, hidden_size)
        self.decoder = nn.LSTM(hidden_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, src, trg):
        embedded = self.encoder(src)
        output, (hidden, cell) = self.decoder(embedded)
        prediction = self.fc(hidden.squeeze(0))
        return prediction

import torchtext

src_field = torchtext.data.Field(init_token='<sos>', eos_token='<eos>', lower=True)
tgt_field = torchtext.data.Field(init_token='<sos>', eos_token='<eos>', lower=True)

train_data, val_data, test_data = torchtext.datasets.Multi30k.splits(exts=('.de', '.en'), fields=(src_field, tgt_field))

src_field.build_vocab(train_data.src, min_freq=2)
tgt_field.build_vocab(train_data.tgt, min_freq=2)

src_vocab = src_field.vocab
tgt_vocab = tgt_field.vocab


# Define the model
INPUT_DIM = len(src_vocab)
OUTPUT_DIM = len(trg_vocab)
HIDDEN_DIM = 256
model = Seq2Seq(INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM)

# Define the optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss(ignore_index=trg_vocab['<pad>'])

# Define the training loop
def train(model, iterator, optimizer, criterion):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(iterator):
        src = batch.src
        trg = batch.trg
        optimizer.zero_grad()
        output = model(src, trg[:-1])
        output_dim = output.shape[-1]
        output = output.view(-1, output_dim)
        trg = trg[1:].view(-1)
        loss = criterion(output, trg)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)

# Train the model
BATCH_SIZE = 64
train_iterator = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
for epoch in range(10):
    train_loss = train(model, train_iterator, optimizer, criterion)
    print('Epoch: {}, Train Loss: {}'.format(epoch, train_loss))


AttributeError: ignored