In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import math
import copy

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Check if GPU is available


In [2]:
# Vocabulary class to handle mapping between words and numerical indices
class Vocabulary:
    def __init__(self):
        # Initialize dictionaries for word to index and index to word mappings
        self.word2index = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2}
        self.index2word = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>"}
        self.word_count = {}  # Keep track of word frequencies
        self.n_words = 3  # Start counting from 3 to account for special tokens

    def add_sentence(self, sentence):
        # Add all words in a sentence to the vocabulary
        for word in sentence.split(' '):
            self.add_word(word)

    def add_word(self, word):
        # Add a word to the vocabulary
        if word not in self.word2index:
            # Assign a new index to the word and update mappings
            self.word2index[word] = self.n_words
            self.index2word[self.n_words] = word
            self.word_count[word] = 1
            self.n_words += 1
        else:
            # Increment word count if the word already exists in the vocabulary
            self.word_count[word] += 1

def tokenize_and_pad(sentences, vocab):
    # Calculate the maximum sentence length for padding
    max_length = max(len(sentence.split(' ')) for sentence in sentences) + 2  # +2 for SOS and EOS tokens
    tokenized_sentences = []
    for sentence in sentences:
        # Convert each sentence to a list of indices, adding SOS and EOS tokens
        tokens = [vocab.word2index["<SOS>"]] + [vocab.word2index[word] for word in sentence.split(' ')] + [vocab.word2index["<EOS>"]]
        # Pad sentences to the maximum length
        padded_tokens = tokens + [vocab.word2index["<PAD>"]] * (max_length - len(tokens))
        tokenized_sentences.append(padded_tokens)
    return torch.tensor(tokenized_sentences, dtype=torch.long)

# Custom Dataset class for English to French sentences
class EngFrDataset(Dataset):
    def __init__(self, pairs):
        self.eng_vocab = Vocabulary()
        self.fr_vocab = Vocabulary()
        self.pairs = []

        # Process each English-French pair
        for eng, fr in pairs:
            self.eng_vocab.add_sentence(eng)
            self.fr_vocab.add_sentence(fr)
            self.pairs.append((eng, fr))

        # Separate English and French sentences
        self.eng_sentences = [pair[0] for pair in self.pairs]
        self.fr_sentences = [pair[1] for pair in self.pairs]
        
        # Tokenize and pad sentences
        self.eng_tokens = tokenize_and_pad(self.eng_sentences, self.eng_vocab)
        self.fr_tokens = tokenize_and_pad(self.fr_sentences, self.fr_vocab)

        # Define the embedding layers for English and French
        self.eng_embedding = torch.nn.Embedding(self.eng_vocab.n_words, 100)  # Embedding size = 100
        self.fr_embedding = torch.nn.Embedding(self.fr_vocab.n_words, 100)    # Embedding size = 100

    def __len__(self):
        # Return the number of sentence pairs
        return len(self.pairs)

    def __getitem__(self, idx):
        # Get the tokenized and padded sentences by index
        eng_tokens = self.eng_tokens[idx]
        fr_tokens = self.fr_tokens[idx]
        # Lookup embeddings for the tokenized sentences
        eng_emb = self.eng_embedding(eng_tokens)
        fr_emb = self.fr_embedding(fr_tokens)
        return eng_tokens, fr_tokens, eng_emb, fr_emb

In [3]:

# # Sample dataset of English-French sentence pairs
# english_to_french = [
#     ("I am cold", "J'ai froid"),
#     ("You are tired", "Tu es fatigué"),
#     ("He is hungry", "Il a faim"),
#     ("She is happy", "Elle est heureuse"),
#     ("We are friends", "Nous sommes amis"),
#     ("They are students", "Ils sont étudiants"),
#     ("The cat is sleeping", "Le chat dort"),
#     ("The sun is shining", "Le soleil brille"),
#     ("We love music", "Nous aimons la musique"),
#     ("She speaks French fluently", "Elle parle français couramment"),
#     ("He enjoys reading books", "Il aime lire des livres"),
#     ("They play soccer every weekend", "Ils jouent au football chaque week-end"),
#     ("The movie starts at 7 PM", "Le film commence à 19 heures"),
#     ("She wears a red dress", "Elle porte une robe rouge"),
#     ("We cook dinner together", "Nous cuisinons le dîner ensemble"),
#     ("He drives a blue car", "Il conduit une voiture bleue"),
#     ("They visit museums often", "Ils visitent souvent des musées"),
#     ("The restaurant serves delicious food", "Le restaurant sert une délicieuse cuisine"),
#     ("She studies mathematics at university", "Elle étudie les mathématiques à l'université"),
#     ("We watch movies on Fridays", "Nous regardons des films le vendredi"),
#     ("He listens to music while jogging", "Il écoute de la musique en faisant du jogging"),
#     ("They travel around the world", "Ils voyagent autour du monde"),
#     ("The book is on the table", "Le livre est sur la table"),
#     ("She dances gracefully", "Elle danse avec grâce"),
#     ("We celebrate birthdays with cake", "Nous célébrons les anniversaires avec un gâteau"),
#     ("He works hard every day", "Il travaille dur tous les jours"),
#     ("They speak different languages", "Ils parlent différentes langues"),
#     ("The flowers bloom in spring", "Les fleurs fleurissent au printemps"),
#     ("She writes poetry in her free time", "Elle écrit de la poésie pendant son temps libre"),
#     ("We learn something new every day", "Nous apprenons quelque chose de nouveau chaque jour"),
#     ("The dog barks loudly", "Le chien aboie bruyamment"),
#     ("He sings beautifully", "Il chante magnifiquement"),
#     ("They swim in the pool", "Ils nagent dans la piscine"),
#     ("The birds chirp in the morning", "Les oiseaux gazouillent le matin"),
#     ("She teaches English at school", "Elle enseigne l'anglais à l'école"),
#     ("We eat breakfast together", "Nous prenons le petit déjeuner ensemble"),
#     ("He paints landscapes", "Il peint des paysages"),
#     ("They laugh at the joke", "Ils rient de la blague"),
#     ("The clock ticks loudly", "L'horloge tic-tac bruyamment"),
#     ("She runs in the park", "Elle court dans le parc"),
#     ("We travel by train", "Nous voyageons en train"),
#     ("He writes a letter", "Il écrit une lettre"),
#     ("They read books at the library", "Ils lisent des livres à la bibliothèque"),
#     ("The baby cries", "Le bébé pleure"),
#     ("She studies hard for exams", "Elle étudie dur pour les examens"),
#     ("We plant flowers in the garden", "Nous plantons des fleurs dans le jardin"),
#     ("He fixes the car", "Il répare la voiture"),
#     ("They drink coffee in the morning", "Ils boivent du café le matin"),
#     ("The sun sets in the evening", "Le soleil se couche le soir"),
#     ("She dances at the party", "Elle danse à la fête"),
#     ("We play music at the concert", "Nous jouons de la musique au concert"),
#     ("He cooks dinner for his family", "Il cuisine le dîner pour sa famille"),
#     ("They study French grammar", "Ils étudient la grammaire française"),
#     ("The rain falls gently", "La pluie tombe doucement"),
#     ("She sings a song", "Elle chante une chanson"),
#     ("We watch a movie together", "Nous regardons un film ensemble"),
#     ("He sleeps deeply", "Il dort profondément"),
#     ("They travel to Paris", "Ils voyagent à Paris"),
#     ("The children play in the park", "Les enfants jouent dans le parc"),
#     ("She walks along the beach", "Elle se promène le long de la plage"),
#     ("We talk on the phone", "Nous parlons au téléphone"),
#     ("He waits for the bus", "Il attend le bus"),
#     ("They visit the Eiffel Tower", "Ils visitent la tour Eiffel"),
#     ("The stars twinkle at night", "Les étoiles scintillent la nuit"),
#     ("She dreams of flying", "Elle rêve de voler"),
#     ("We work in the office", "Nous travaillons au bureau"),
#     ("He studies history", "Il étudie l'histoire"),
#     ("They listen to the radio", "Ils écoutent la radio"),
#     ("The wind blows gently", "Le vent souffle doucement"),
#     ("She swims in the ocean", "Elle nage dans l'océan"),
#     ("We dance at the wedding", "Nous dansons au mariage"),
#     ("He climbs the mountain", "Il gravit la montagne"),
#     ("They hike in the forest", "Ils font de la randonnée dans la forêt"),
#     ("The cat meows loudly", "Le chat miaule bruyamment"),
#     ("She paints a picture", "Elle peint un tableau"),
#     ("We build a sandcastle", "Nous construisons un château de sable"),
#     ("He sings in the choir", "Il chante dans le chœur")
# ]

In [4]:

# sample dataset of english-french sentence pairs without capital letters
english_to_french = [
    ("i am cold", "j'ai froid"),
    ("you are tired", "tu es fatigué"),
    ("he is hungry", "il a faim"),
    ("she is happy", "elle est heureuse"),
    ("we are friends", "nous sommes amis"),
    ("they are students", "ils sont étudiants"),
    ("the cat is sleeping", "le chat dort"),
    ("the sun is shining", "le soleil brille"),
    ("we love music", "nous aimons la musique"),
    ("she speaks french fluently", "elle parle français couramment"),
    ("he enjoys reading books", "il aime lire des livres"),
    ("they play soccer every weekend", "ils jouent au football chaque week-end"),
    ("the movie starts at 7 pm", "le film commence à 19 heures"),
    ("she wears a red dress", "elle porte une robe rouge"),
    ("we cook dinner together", "nous cuisinons le dîner ensemble"),
    ("he drives a blue car", "il conduit une voiture bleue"),
    ("they visit museums often", "ils visitent souvent des musées"),
    ("the restaurant serves delicious food", "le restaurant sert une délicieuse cuisine"),
    ("she studies mathematics at university", "elle étudie les mathématiques à l'université"),
    ("we watch movies on fridays", "nous regardons des films le vendredi"),
    ("he listens to music while jogging", "il écoute de la musique en faisant du jogging"),
    ("they travel around the world", "ils voyagent autour du monde"),
    ("the book is on the table", "le livre est sur la table"),
    ("she dances gracefully", "elle danse avec grâce"),
    ("we celebrate birthdays with cake", "nous célébrons les anniversaires avec un gâteau"),
    ("he works hard every day", "il travaille dur tous les jours"),
    ("they speak different languages", "ils parlent différentes langues"),
    ("the flowers bloom in spring", "les fleurs fleurissent au printemps"),
    ("she writes poetry in her free time", "elle écrit de la poésie pendant son temps libre"),
    ("we learn something new every day", "nous apprenons quelque chose de nouveau chaque jour"),
    ("the dog barks loudly", "le chien aboie bruyamment"),
    ("he sings beautifully", "il chante magnifiquement"),
    ("they swim in the pool", "ils nagent dans la piscine"),
    ("the birds chirp in the morning", "les oiseaux gazouillent le matin"),
    ("she teaches english at school", "elle enseigne l'anglais à l'école"),
    ("we eat breakfast together", "nous prenons le petit déjeuner ensemble"),
    ("he paints landscapes", "il peint des paysages"),
    ("they laugh at the joke", "ils rient de la blague"),
    ("the clock ticks loudly", "l'horloge tic-tac bruyamment"),
    ("she runs in the park", "elle court dans le parc"),
    ("we travel by train", "nous voyageons en train"),
    ("he writes a letter", "il écrit une lettre"),
    ("they read books at the library", "ils lisent des livres à la bibliothèque"),
    ("the baby cries", "le bébé pleure"),
    ("she studies hard for exams", "elle étudie dur pour les examens"),
    ("we plant flowers in the garden", "nous plantons des fleurs dans le jardin"),
    ("he fixes the car", "il répare la voiture"),
    ("they drink coffee in the morning", "ils boivent du café le matin"),
    ("the sun sets in the evening", "le soleil se couche le soir"),
    ("she dances at the party", "elle danse à la fête"),
    ("we play music at the concert", "nous jouons de la musique au concert"),
    ("he cooks dinner for his family", "il cuisine le dîner pour sa famille"),
    ("they study french grammar", "ils étudient la grammaire française"),
    ("the rain falls gently", "la pluie tombe doucement"),
    ("she sings a song", "elle chante une chanson"),
    ("we watch a movie together", "nous regardons un film ensemble"),
    ("he sleeps deeply", "il dort profondément"),
    ("they travel to paris", "ils voyagent à paris"),
    ("the children play in the park", "les enfants jouent dans le parc"),
    ("the walks along the beach", "elle se promène le long de la plage"),
    ("we talk on the phone", "nous parlons au téléphone"),
    ("he waits for the bus", "il attend le bus"),
    ("they visit the eiffel tower", "ils visitent la tour eiffel"),
    ("the stars twinkle at night", "les étoiles scintillent la nuit"),
    ("she dreams of flying", "elle rêve de voler"),
    ("we work in the office", "nous travaillons au bureau"),
    ("he studies history", "il étudie l'histoire"),
    ("they listen to the radio", "ils écoutent la radio"),
    ("the wind blows gently", "le vent souffle doucement"),
    ("she swims in the ocean", "elle nage dans l'océan"),
    ("we dance at the wedding", "nous dansons au mariage"),
    ("he climbs the mountain", "il gravit la montagne"),
    ("they hike in the forest", "ils font de la randonnée dans la forêt"),
    ("the cat meows loudly", "le chat miaule bruyamment"),
    ("she paints a picture", "elle peint un tableau"),
    ("we build a sandcastle", "nous construisons un château de sable"),
    ("he sings in the choir", "il chante dans le chœur")
]


In [5]:
# Split dataset into training and test sets
english_to_french_train, english_to_french_test = train_test_split(english_to_french, test_size=0.3, random_state=42)
BATCH_SIZE = 24

# Initialize training dataset and DataLoader
train_dataset = EngFrDataset(english_to_french_train)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Initialize test dataset and DataLoader
test_dataset = EngFrDataset(english_to_french_test)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Initialize full dataset
engFrDataset = EngFrDataset(english_to_french)
train_loader= DataLoader(engFrDataset, batch_size=BATCH_SIZE, shuffle=True)


In [6]:
# Model based on repo codes (predicted only random)
# # Positional Encoding
# class PositionalEncoding(nn.Module):
#     def __init__(self, d_model, max_len=5000):
#         super(PositionalEncoding, self).__init__()
#         self.encoding = torch.zeros(max_len, d_model).to(device)
#         position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1).to(device)
#         div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model)).to(device)
#         self.encoding[:, 0::2] = torch.sin(position * div_term).to(device)
#         self.encoding[:, 1::2] = torch.cos(position * div_term).to(device)
#         self.encoding = self.encoding.unsqueeze(0)

#     def forward(self, x):
#         return x + self.encoding[:, :x.size(1)].detach()

# # Define the Transformer model
# class Transformer(nn.Module):
#     def __init__(self, input_dim, output_dim, d_model=100, nhead=2, num_encoder_layers=4, num_decoder_layers=4):
#         super(Transformer, self).__init__()
        
#         self.pos_encoder = PositionalEncoding(d_model)
#         self.transformer_encoder = nn.TransformerEncoder(
#             nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead),
#             num_layers=num_encoder_layers
#         )
#         self.transformer_decoder = nn.TransformerDecoder(
#             nn.TransformerDecoderLayer(d_model=d_model, nhead=nhead),
#             num_layers=num_decoder_layers
#         )
        
#         self.linear = nn.Linear(d_model, output_dim)

#     def forward(self, src, tgt):
#         src = self.pos_encoder(src.permute(1, 0, 2))  # Change the dimension for Transformer input
#         tgt = self.pos_encoder(tgt.permute(1, 0, 2))  # Change the dimension for Transformer input
#         # print(f"Source: {src}")
#         # print(f"Target: {tgt}")
#         memory = self.transformer_encoder(src)
#         output = self.transformer_decoder(tgt, memory)
#         output = self.linear(output)
#         # print(f"Output: {output}")
#         return output


In [7]:
# Code pulled from https://towardsdatascience.com/build-your-own-transformer-from-scratch-using-pytorch-84c850470dcb 

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)
        
    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
        attn_probs = torch.softmax(attn_scores, dim=-1).to(device)
        output = torch.matmul(attn_probs, V).to(device)
        return output
        
    def split_heads(self, x):
        batch_size, seq_length, d_model = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)
        
    def combine_heads(self, x):
        batch_size, _, seq_length, d_k = x.size()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)
        
    def forward(self, Q, K, V, mask=None):
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))
        
        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
        output = self.W_o(self.combine_heads(attn_output))
        return output
    
class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))
    
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()
        
        pe = torch.zeros(max_seq_length, d_model).to(device)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1).to(device)
        div_term = torch.exp(torch.arange(0, d_model, 2).float().to(device) * -(math.log(10000.0) / d_model)).to(device)
        
        pe[:, 0::2] = torch.sin(position * div_term).to(device)
        pe[:, 1::2] = torch.cos(position * div_term).to(device)
        
        self.register_buffer('pe', pe.unsqueeze(0))
        
    def forward(self, x):
        return x + self.pe[:, :x.size(1)]
    
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, mask):
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x
    
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, enc_output, src_mask, tgt_mask):
        attn_output = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(attn_output))
        attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))
        return x
    
class TransformerProf(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super(TransformerProf, self).__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)
        seq_length = tgt.size(1)
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length).to(device), diagonal=1)).bool().to(device)
        tgt_mask = tgt_mask & nopeak_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        output = self.fc(dec_output)
        return output

In [8]:
src_vocab_size = engFrDataset.eng_vocab.n_words
tgt_vocab_size = engFrDataset.fr_vocab.n_words
d_model = 1024
num_heads = 4
num_layers = 6
d_ff = 2048
max_seq_length = 11
dropout = 0.15
model = TransformerProf(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=0.0001)#, betas=(0.9, 0.98), eps=1e-9)


In [9]:
# Training loop
epochs = 300

for epoch in range(epochs):
    # Training phase
    model.train()  # Set the model to training mode
    total_loss = 0
    total_correct = 0
    total_tokens = 0

    for eng_tokens, fr_tokens, eng_emb, fr_emb in train_dataloader:
        eng_tokens, fr_tokens = eng_tokens.to(device), fr_tokens.to(device)
        eng_emb, fr_emb = eng_emb.to(device), fr_emb.to(device)
        optimizer.zero_grad()
        output = model(eng_tokens, fr_tokens[:, :-1]) 
        output_dim = output.shape[-1]
        output = output.contiguous().view(-1, output_dim)
        fr_tokens_target = fr_tokens[:, 1:].contiguous().view(-1)
        loss = criterion(output, fr_tokens_target)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Calculate correct predictions
        predicted = output.argmax(dim=1)
        correct = (predicted == fr_tokens_target).sum().item()

        # Number of non-padding tokens
        non_pad_tokens = (fr_tokens_target != 0).sum().item()
        
        # Accumulate correct predictions and total tokens (excluding padding tokens)
        total_correct += correct
        total_tokens += non_pad_tokens
    
    # Calculate training accuracy and loss
    train_accuracy = (total_correct / total_tokens)
    train_loss = total_loss / len(train_dataloader)
    
    if ((epoch+1) % 10 == 0):
        # Validation phase
        model.eval()  # Set the model to evaluation mode
        total_val_loss = 0
        total_val_correct = 0
        total_val_tokens = 0

        with torch.no_grad():
            for eng_tokens, fr_tokens, eng_emb, fr_emb in test_dataloader:
                eng_tokens, fr_tokens = eng_tokens.to(device), fr_tokens.to(device)
                eng_emb, fr_emb = eng_emb.to(device), fr_emb.to(device)
                output = model(eng_tokens, fr_tokens[:, :-1])
                output_dim = output.shape[-1]
                output = output.contiguous().view(-1, output_dim)
                fr_tokens_target = fr_tokens[:, 1:].contiguous().view(-1)
                loss = criterion(output, fr_tokens_target)

                total_val_loss += loss.item()

                # Calculate correct predictions
                predicted = output.argmax(dim=1)
                correct = (predicted == fr_tokens_target).sum().item()

                # Number of non-padding tokens
                non_pad_tokens = (fr_tokens_target != 0).sum().item()

                # Accumulate correct predictions and total tokens (excluding padding tokens)
                total_val_correct += correct
                total_val_tokens += non_pad_tokens

        # Calculate validation accuracy and loss
        val_accuracy = (total_val_correct / total_val_tokens)
        val_loss = total_val_loss / len(test_dataloader)

        # Print training and validation metrics
        print(f"Epoch {epoch + 1}, Train Loss: {train_loss}, Val Loss: {val_loss}, Val Accuracy: {val_accuracy}")


Epoch 10, Train Loss: 1.3989180326461792, Val Loss: 4.339879989624023, Val Accuracy: 0.23776223776223776
Epoch 20, Train Loss: 0.10658690830071767, Val Loss: 4.900830268859863, Val Accuracy: 0.3146853146853147
Epoch 30, Train Loss: 0.03141459760566553, Val Loss: 5.535618305206299, Val Accuracy: 0.2937062937062937
Epoch 40, Train Loss: 0.014569288119673729, Val Loss: 5.7225117683410645, Val Accuracy: 0.2867132867132867
Epoch 50, Train Loss: 0.01029915145287911, Val Loss: 5.911131858825684, Val Accuracy: 0.2937062937062937
Epoch 60, Train Loss: 0.008675736685593924, Val Loss: 5.986867427825928, Val Accuracy: 0.2937062937062937
Epoch 70, Train Loss: 0.006598004760841529, Val Loss: 6.089540004730225, Val Accuracy: 0.2937062937062937
Epoch 80, Train Loss: 0.005560614479084809, Val Loss: 6.173830986022949, Val Accuracy: 0.2937062937062937
Epoch 90, Train Loss: 0.00482232046003143, Val Loss: 6.267244815826416, Val Accuracy: 0.2937062937062937
Epoch 100, Train Loss: 0.003974670389046271, Val L

In [17]:
# Training loop (unchanged)

# After the training loop
# Perform inference on a validation sentence
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    # Choose a random validation sentence
    idx = random.randint(0, len(test_dataset) - 1)
    eng_tokens, fr_tokens, eng_emb, fr_emb = test_dataset[idx]
    eng_tokens, fr_tokens = eng_tokens.unsqueeze(0).to(device), fr_tokens.unsqueeze(0).to(device)
    eng_emb, fr_emb = eng_emb.unsqueeze(0).to(device), fr_emb.unsqueeze(0).to(device)

    # Run inference
    output = model(eng_tokens, fr_tokens[:, :-1])
    output_dim = output.shape[-1]
    output = output.view(-1, output_dim)
    _, predicted = torch.max(output, 1)

    # Convert target indices to French words
    input_sentence = [test_dataset.eng_vocab.index2word[idx.item()] for idx in eng_tokens.squeeze(0)]
    # Remove padding and SOS token
    input_sentence = [word for word in input_sentence if word not in ["<PAD>", '<SOS>']]

    # Convert predicted indices to French words
    predicted_sentence = [test_dataset.fr_vocab.index2word[idx.item()] for idx in predicted]
    # Remove padding and EOS token
    predicted_sentence = [word for word in predicted_sentence if word not in ["<PAD>", "<EOS>"]]

    # Convert target indices to French words
    target_sentence = [test_dataset.fr_vocab.index2word[idx.item()] for idx in fr_tokens.squeeze(0)]
    # Remove padding and SOS token
    target_sentence = [word for word in target_sentence if word not in ["<PAD>", '<SOS>']]

    # Print the original English sentence, target French sentence, and predicted French sentence
    print("Input Sentence:", input_sentence)
    print("Target Sentence:", target_sentence)
    print("Predicted French Sentence:", predicted_sentence)



Input Sentence: ['we', 'plant', 'flowers', 'in', 'the', 'garden', '<EOS>']
Target Sentence: ['nous', 'plantons', 'des', 'fleurs', 'dans', 'le', 'jardin', '<EOS>']
Predicted French Sentence: ['nous', 'froid', 'fleurs', 'livres', 'dans']


In [None]:
# Training loop (unchanged)

# After the training loop
# Perform inference on a validation sentence
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    # Choose a random validation sentence
    idx = random.randint(0, len(test_dataset) - 1)
    eng_tokens, fr_tokens, eng_emb, fr_emb = test_dataset[idx]
    eng_tokens, fr_tokens = eng_tokens.unsqueeze(0).to(device), fr_tokens.unsqueeze(0).to(device)
    eng_emb, fr_emb = eng_emb.unsqueeze(0).to(device), fr_emb.unsqueeze(0).to(device)

    # Run inference
    output = model(eng_tokens, fr_tokens[:, :-1])
    output_dim = output.shape[-1]
    output = output.view(-1, output_dim)
    _, predicted = torch.max(output, 1)

    # Convert target indices to French words
    input_sentence = [test_dataset.eng_vocab.index2word[idx.item()] for idx in eng_tokens.squeeze(0)]
    # Remove padding and SOS token
    input_sentence = [word for word in input_sentence if word not in ["<PAD>", '<SOS>']]

    # Convert predicted indices to French words
    predicted_sentence = [test_dataset.fr_vocab.index2word[idx.item()] for idx in predicted]
    # Remove padding and EOS token
    predicted_sentence = [word for word in predicted_sentence if word not in ["<PAD>", "<EOS>"]]

    # Convert target indices to French words
    target_sentence = [test_dataset.fr_vocab.index2word[idx.item()] for idx in fr_tokens.squeeze(0)]
    # Remove padding and SOS token
    target_sentence = [word for word in target_sentence if word not in ["<PAD>", '<SOS>']]

    # Print the original English sentence, target French sentence, and predicted French sentence
    print("Input Sentence:", input_sentence)
    print("Target Sentence:", target_sentence)
    print("Predicted French Sentence:", predicted_sentence)



Input Sentence: ['we', 'plant', 'flowers', 'in', 'the', 'garden', '<EOS>']
Target Sentence: ['nous', 'plantons', 'des', 'fleurs', 'dans', 'le', 'jardin', '<EOS>']
Predicted French Sentence: ['nous', 'froid', 'fleurs', 'livres', 'dans']
