In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy

In [2]:
class MultiHeadAttention(nn.Module):
  def __init__(self, d_model,num_heads):
    super(MultiHeadAttention, self).__init__()
    assert d_model % num_heads == 0

    self.d_model = d_model
    self.num_heads = num_heads
    self.d_k = d_model // num_heads

    self.W_q = nn.Linear(d_model, d_model)
    self.W_k = nn.Linear(d_model, d_model)
    self.W_v = nn.Linear(d_model, d_model)
    self.W_o = nn.Linear(d_model, d_model)

  def scaled_dot_product_attention(self, Q, K, V, mask = None):
    attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
    
    if mask is not None:
      attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
    
    attn_probs = torch.softmax(attn_scores, dim = -1)

    output = torch.matmul(attn_probs, V)
    return output
  
  def split_heads(self, x):
    batch_size, seq_length, embed_dim = x.size()
    x = x.view(batch_size, seq_length, self.num_heads, self.d_k)
    return x.transpose(1, 2)


  def combine_heads(self, x):
    batch_size, _, seq_length, d_k = x.size()
    return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)
        
  def forward(self, Q,K,V, mask = None):
    Q = self.split_heads(self.W_q(Q))
    K = self.split_heads(self.W_q(K))
    V = self.split_heads(self.W_q(V))


    attn_output = self.scaled_dot_product_attention(Q,K,V,mask)
    
    output = self.W_o(self.combine_heads(attn_output))
    return output

class PositionWiseFeedForward(nn.Module):
  def __init__(self, d_model, d_ff):
    super(PositionWiseFeedForward,self).__init__()
    self.fc1 = nn.Linear(d_model, d_ff)
    self.fc2 = nn.Linear(d_ff,d_model)
    self.relu = nn.ReLU()
  def forward(self, x):
    return self.fc2(self.relu(self.fc1(x)))

class PositionalEncoding(nn.Module):
  def __init__(self,d_model,max_seq_length):
    super(PositionalEncoding,self).__init__()
    pe = torch.zeros(max_seq_length, d_model)
    position = torch.arange(0,max_seq_length,dtype = torch.float).unsqueeze(1)
    div_term = torch.exp(torch.arange(0,d_model,2).float()* -(math.log(10000.0)/d_model))
    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)
    self.register_buffer('pe', pe.unsqueeze(0))

  def forward(self, x):
    return x + self.pe[:, :x.size(1)]

class EncoderLayer(nn.Module):
  def __init__(self, d_model, num_heads, d_ff, dropout):
    super(EncoderLayer, self).__init__()
    self.self_attn = MultiHeadAttention(d_model, num_heads)
    self.feed_forward = PositionWiseFeedForward(d_model,d_ff)
    self.norm1 = nn.LayerNorm(d_model)
    self.norm2 = nn.LayerNorm(d_model)
    self.dropout = nn.Dropout(dropout)
  def forward(self, x, mask):
    attn_output = self.self_attn(x, x, x, mask)
    x = self.norm1(x +self.dropout(attn_output))
    ff_output = self.feed_forward(x)
    x = self.norm2(x +self.dropout(ff_output))
    return x

class DecoderLayer(nn.Module):
  def __init__(self,d_model, num_heads, d_ff, dropout):
    super(DecoderLayer, self).__init__()
    self.self_attn = MultiHeadAttention(d_model,num_heads)
    self.cross_attn = MultiHeadAttention(d_model,num_heads)
    self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
    self.norm1 = nn.LayerNorm(d_model)
    self.norm2 = nn.LayerNorm(d_model)
    self.norm3 = nn.LayerNorm(d_model)
    self.dropout = nn.Dropout(dropout)
  
  def forward(self, x, enc_output, src_mask, tgt_mask):
    attn_output = self.self_attn(x,x,x, tgt_mask)
    x = self.norm1(x + self.dropout(attn_output))
    attn_output = self.cross_attn(x,enc_output,enc_output,src_mask)
    x = self.norm2(x + self.dropout(attn_output))
    ff_output = self.feed_forward(x)
    x = self.norm3(x + self.dropout(ff_output))
    return x

class Transformer(nn.Module):
  def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads,num_layers,d_ff,max_seq_length,dropout):
    super(Transformer,self).__init__()
    self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
    self.decoder_embedding = nn.Embedding(tgt_vocab_size,d_model)
    self.positional_encoding = PositionalEncoding(d_model,max_seq_length)
    self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
    self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
    self.fc = nn.Linear(d_model,tgt_vocab_size)
    self.dropout = nn.Dropout(dropout)
  
  def generate_mask(self, src, tgt):
    device = src.device 

    src_mask = (src != 0).unsqueeze(1).unsqueeze(2).to(device)  # ensure device
    tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3).to(device)

    seq_length = tgt.size(1)

    nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length, device=device), diagonal=1)).bool()

    tgt_mask = tgt_mask & nopeak_mask

    return src_mask, tgt_mask


  def forward(self,src,tgt):
    src_mask,tgt_mask = self.generate_mask(src,tgt)
    src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
    tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))
    enc_output = src_embedded
    for enc_layer in self.encoder_layers:
      enc_output = enc_layer(enc_output,src_mask)
    
    dec_output = tgt_embedded
    for dec_layer in self.decoder_layers:
      dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

    output = self.fc(dec_output)
    
    return output

In [4]:
src_vocab_size = 5000
tgt_vocab_size = 5000
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 2048
max_seq_length = 100
dropout = 0.1



In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import spacy
from torch.nn.utils.rnn import pad_sequence
import numpy as np
import pandas as pd
from collections import Counter
from torch.utils.data import random_split

# Load SpaCy tokenizers
spacy_en = spacy.load("en_core_web_sm")
spacy_de = spacy.load("de_core_news_sm")

def tokenize_en(text):
    return [tok.text.lower() for tok in spacy_en.tokenizer(text)]

def tokenize_de(text):
    return [tok.text.lower() for tok in spacy_de.tokenizer(text)]

# Load Multi30k train data
def load_multi30k_from_folder(path):
    with open(f"{path}/en.txt", encoding="utf-8") as f_en, open(f"{path}/de.txt", encoding="utf-8") as f_de:
        en_sentences = f_en.read().strip().split('\n')
        de_sentences = f_de.read().strip().split('\n')
    return list(zip(en_sentences, de_sentences))

def load_tatoeba_parallel(path):
    # Assuming tab-separated file (or whitespace-separated)
    df = pd.read_csv(path, sep='\t', header=None, usecols=[0,1], names=['en', 'de'], encoding='utf-8')
    
    # Convert to list of tuples (en, de)
    pairs = list(zip(df['en'].tolist(), df['de'].tolist()))
    return pairs

data = load_tatoeba_parallel("archive/deu.txt")

# data = load_multi30k_from_folder("data")  # Change path as needed
print(f"Loaded {len(data)} sentence pairs")

# Special tokens
SPECIAL_TOKENS = ["<pad>", "<unk>", "<sos>", "<eos>"]

# Build vocabularies manually
def build_vocab_manual(data, index, tokenizer, specials=SPECIAL_TOKENS, min_freq=1):
    counter = Counter()
    for pair in data:
        tokens = tokenizer(pair[index])
        counter.update(tokens)
    itos = list(specials)
    itos += [tok for tok, freq in counter.items() if freq >= min_freq and tok not in specials]
    stoi = {tok: i for i, tok in enumerate(itos)}
    return stoi, itos

src_stoi, src_itos = build_vocab_manual(data, 0, tokenize_en)
tgt_stoi, tgt_itos = build_vocab_manual(data, 1, tokenize_de)

PAD_IDX = src_stoi["<pad>"]
UNK_IDX = src_stoi["<unk>"]
SOS_IDX = src_stoi["<sos>"]
EOS_IDX = src_stoi["<eos>"]

# Dataset class using manual stoi dicts
class TranslationDataset(Dataset):
    def __init__(self, data, src_stoi, tgt_stoi, src_tokenizer, tgt_tokenizer):
        self.data = data
        self.src_stoi = src_stoi
        self.tgt_stoi = tgt_stoi
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        src_sentence, tgt_sentence = self.data[idx]
        src_tokens = ["<sos>"] + self.src_tokenizer(src_sentence) + ["<eos>"]
        tgt_tokens = ["<sos>"] + self.tgt_tokenizer(tgt_sentence) + ["<eos>"]

        src_ids = torch.tensor([self.src_stoi.get(tok, UNK_IDX) for tok in src_tokens], dtype=torch.long)
        tgt_ids = torch.tensor([self.tgt_stoi.get(tok, UNK_IDX) for tok in tgt_tokens], dtype=torch.long)

        return src_ids, tgt_ids

# Collate function for padding batches
def collate_fn(batch):
    src_batch, tgt_batch = zip(*batch)
    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX, batch_first=True)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX, batch_first=True)
    return src_batch, tgt_batch

SRC_VOCAB_SIZE = len(src_stoi)
TGT_VOCAB_SIZE = len(tgt_stoi)
MAX_LEN = 100
dataset = TranslationDataset(data, src_stoi, tgt_stoi, tokenize_en, tokenize_de)

# Split dataset: 80% train, 20% val (adjust as needed)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)



Loaded 221533 sentence pairs


In [69]:
torch.cuda.empty_cache()
# Model, optimizer, criterion remain the same
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

model = Transformer(
    SRC_VOCAB_SIZE,
    TGT_VOCAB_SIZE,
    d_model=512,
    num_heads=4,
    num_layers=3,
    d_ff=1024,
    max_seq_length=150,
    dropout=0.2,
).to(device)

optimizer = optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

criterion = nn.CrossEntropyLoss(ignore_index=src_stoi["<pad>"])

print(f"Allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
print(f"Reserved : {torch.cuda.memory_reserved() / 1024**3:.2f} GB")
print(f"Max Allocated: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")
print(f"Max Reserved : {torch.cuda.max_memory_reserved() / 1024**3:.2f} GB")

for epoch in range(20):
    # Train
    model.train()
    total_loss = 0
    teacher_forcing_ratio = 0.5  # e.g. 70% teacher forcing

    for src, tgt in train_loader:
        src, tgt = src.to(device), tgt.to(device)
        
        tgt_input = tgt[:, :-1]   # ground-truth input tokens
        tgt_output = tgt[:, 1:]   # expected output tokens
        
        batch_size, seq_len = tgt_input.shape
        
        # Forward pass once with ground-truth for initial prediction
        preds = model(src, tgt_input)  # [batch, seq_len, vocab_size]
        preds_tokens = preds.argmax(dim=-1)  # greedy predictions: [batch, seq_len]
        
        # Decide for each token whether to use ground truth or predicted token
        mask = torch.rand(batch_size, seq_len, device=device) < teacher_forcing_ratio
        
        # Construct mixed tgt_input:
        mixed_tgt_input = torch.where(mask, tgt_input, preds_tokens)
        
        # Forward pass with mixed input
        preds = model(src, mixed_tgt_input)
        preds = preds.reshape(-1, preds.shape[-1])
        tgt_output = tgt_output.reshape(-1)
        
        loss = criterion(preds, tgt_output)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)

    # Validate
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for src, tgt in val_loader:
            src, tgt = src.to(device), tgt.to(device)
            tgt_input = tgt[:, :-1]
            tgt_output = tgt[:, 1:]

            preds = model(src, tgt_input)
            preds = preds.reshape(-1, preds.shape[-1])
            tgt_output = tgt_output.reshape(-1)

            val_loss = criterion(preds, tgt_output)
            total_val_loss += val_loss.item()

    avg_val_loss = total_val_loss / len(val_loader)

    print(f"Epoch {epoch+1} Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")


cuda
Allocated: 1.96 GB
Reserved : 3.17 GB
Max Allocated: 5.00 GB
Max Reserved : 11.92 GB
Epoch 1 Train Loss: 4.0575 | Val Loss: 3.1450
Epoch 2 Train Loss: 3.1308 | Val Loss: 2.6562
Epoch 3 Train Loss: 2.7901 | Val Loss: 2.4084
Epoch 4 Train Loss: 2.5910 | Val Loss: 2.3680
Epoch 5 Train Loss: 2.4510 | Val Loss: 2.1988
Epoch 6 Train Loss: 2.3472 | Val Loss: 2.1360


KeyboardInterrupt: 

In [70]:
def save_checkpoint(model, optimizer, epoch, path="checkpoint.pth"):
    checkpoint = {
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "epoch": epoch
    }
    torch.save(checkpoint, path)
    print(f"Saved checkpoint to {path}")
save_checkpoint(model, optimizer, epoch, path="model_epoch_{}.pth".format(epoch+1))


Saved checkpoint to model_epoch_7.pth


In [7]:
import pickle

save_dict = {
    "src_stoi": src_stoi,
    "tgt_stoi": tgt_stoi,
    "src_itos": src_itos if 'src_itos' in locals() else None,
    "tgt_itos": tgt_itos if 'tgt_itos' in locals() else None,
    "train_dataset": train_dataset,
    "val_dataset": val_dataset
}

with open("save_data.pkl", "wb") as f:
    pickle.dump(save_dict, f)

print("✔️ Saved tokenizer and datasets to save_data.pkl")


✔️ Saved tokenizer and datasets to save_data.pkl


In [53]:
def load_checkpoint(model, optimizer, path="checkpoint.pth", device='cuda'):
    checkpoint = torch.load(path, map_location=device)
    model.load_state_dict(checkpoint["model_state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    epoch = checkpoint.get("epoch", 0)
    print(f"Loaded checkpoint from {path}, starting at epoch {epoch+1}")
    return epoch
start_epoch = load_checkpoint(model, optimizer, path="model_epoch_19+4.pth", device=device)


Loaded checkpoint from model_epoch_19+4.pth, starting at epoch 4


In [None]:
def evaluate_with_beam_search(model, src_sentence, src_stoi, tgt_stoi, tgt_itos,
                              beam_width=8, max_len=50, device='cuda'):
    model.eval()

    sos_token_id = tgt_stoi["<sos>"]
    eos_token_id = tgt_stoi["<eos>"]

    src_tensor = torch.tensor(src_sentence, dtype=torch.long).unsqueeze(0).to(device)

    # Initialize beam with sos token
    beams = [(torch.tensor([[sos_token_id]], device=device), 0.0)]  # (sequence, score)

    with torch.no_grad():
        for _ in range(max_len):
            new_beams = []
            for seq, score in beams:
                if seq[0, -1].item() == eos_token_id:
                    # Already ended beam — keep it as-is
                    new_beams.append((seq, score))
                    continue

                output = model(src_tensor, seq)  # (1, seq_len, vocab_size)
                logits = output[0, -1, :]        # last token logits
                log_probs = torch.log_softmax(logits, dim=-1)

                # Get top beam_width predictions
                topk_log_probs, topk_ids = torch.topk(log_probs, beam_width)

                for log_prob, token_id in zip(topk_log_probs, topk_ids):
                    new_seq = torch.cat([seq, token_id.view(1, 1)], dim=1)
                    new_score = score + log_prob.item()
                    new_beams.append((new_seq, new_score))

            # Keep only top `beam_width` beams
            beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_width]

            # If all beams have ended, stop early
            if all(seq[0, -1].item() == eos_token_id for seq, _ in beams):
                break

    # Choose the best-scoring sequence
    best_seq = beams[0][0][0].tolist()  # shape (seq_len,)
    predicted_tokens = [tgt_itos[i] if i < len(tgt_itos) else "<unk>" for i in best_seq]

    return predicted_tokens


def evaluate(model, src_sentence, src_stoi, tgt_stoi, tgt_itos, max_len=50, device='cuda'):
    model.eval()
    
    # Tokenize and convert source sentence using your tokenizer, here assumed pre-tokenized list of ints
    src_tensor = torch.tensor(src_sentence, dtype=torch.long).unsqueeze(0).to(device)  # (1, src_len)
    
    sos_token_id = tgt_stoi["<sos>"]
    eos_token_id = tgt_stoi["<eos>"]
    
    tgt_tensor = torch.tensor([[sos_token_id]], dtype=torch.long).to(device)  # (1,1)
    
    for _ in range(max_len):
        with torch.no_grad():
            output = model(src_tensor, tgt_tensor)  # (1, tgt_len, vocab_size)
        
        next_token_logits = output[0, -1, :]  # logits for last token
        next_token_id = torch.argmax(next_token_logits).item()
        
        tgt_tensor = torch.cat([tgt_tensor, torch.tensor([[next_token_id]], device=device)], dim=1)
        
        if next_token_id == eos_token_id:
            break
    
    predicted_ids = tgt_tensor[0].tolist()
    predicted_tokens = [tgt_itos[i] if i < len(tgt_itos) else "<unk>" for i in predicted_ids]
    
    return predicted_tokens


def evaluate_top_p(model, src_sentence, src_stoi, tgt_stoi, tgt_itos,max_len=50, device='cuda', temperature=0.5, top_p=0.8):
    model.eval()

    # Convert src_sentence (list of token ids) to tensor and add batch dimension
    src_tensor = torch.tensor(src_sentence, dtype=torch.long).unsqueeze(0).to(device)  # shape: (1, src_len)

    sos_token_id = tgt_stoi["<sos>"]
    eos_token_id = tgt_stoi["<eos>"]

    # Start target sequence with <sos>, shape (1,1)
    tgt_tensor = torch.tensor([[sos_token_id]], dtype=torch.long).to(device)

    for _ in range(max_len):
        with torch.no_grad():
            # Pass src and current target input to the model
            output = model(src_tensor, tgt_tensor)  # shape: (1, tgt_len, vocab_size)

        logits = output[0, -1, :] / temperature  # last token logits
        probs = torch.softmax(logits, dim=-1)

        # Top-p (nucleus) sampling
        sorted_probs, sorted_indices = torch.sort(probs, descending=True)
        cumulative_probs = torch.cumsum(sorted_probs, dim=0)
        sorted_probs[cumulative_probs > top_p] = 0
        total_prob = sorted_probs.sum()

        if total_prob == 0:
            next_token_id = torch.argmax(probs).item()
        else:
            sorted_probs /= total_prob
            sampled_index = torch.multinomial(sorted_probs, 1).item()
            next_token_id = sorted_indices[sampled_index].item()

        tgt_tensor = torch.cat([tgt_tensor, torch.tensor([[next_token_id]], device=device)], dim=1)

        if next_token_id == eos_token_id:
            break

    predicted_ids = tgt_tensor[0].tolist()
    predicted_tokens = [tgt_itos[i] if i < len(tgt_itos) else "<unk>" for i in predicted_ids]

    return predicted_tokens

# Use your actual tokenizer here instead of split()
src_text = "do you know her?"
src_tokens = tokenize_en(src_text)  # <-- your actual tokenizer function
src_ids = [src_stoi.get(tok, src_stoi["<unk>"]) for tok in src_tokens]

result = evaluate_with_beam_search(model, src_ids, src_stoi, tgt_stoi, tgt_itos, device=device)
print("Generated:", " ".join(result))


Generated: <sos> du kennst sie ? ? <eos>


In [72]:
test_sentences = [
    "Hello, how are you?",
    "Despite the rain, they went hiking in the mountains.",
    "Thank you.",
    "The philosopher questioned the fabric of reality.",
    "I really like you."
]

for i, src_text in enumerate(test_sentences, 1):
    src_tokens = tokenize_en(src_text)
    src_ids = [src_stoi.get(tok, src_stoi["<unk>"]) for tok in src_tokens]
    
    result = evaluate(model, src_ids, src_stoi, tgt_stoi, tgt_itos, device=device)
    print(f"[{i}] Input: {src_text}")
    print(f"    Output: {' '.join(result)}\n")


[1] Input: Hello, how are you?
    Output: <sos> hallo , wie wie du du ? <eos>

[2] Input: Despite the rain, they went hiking in the mountains.
    Output: <sos> der regen gingen sie in in den berge . <eos>

[3] Input: Thank you.
    Output: <sos> danke ! ! <eos>

[4] Input: The philosopher questioned the fabric of reality.
    Output: <sos> das ergebnis die die der der der der der der der der der der träume des des problems . . <eos>

[5] Input: I really like you.
    Output: <sos> ich mag dich wirklich . du wirklich wirklich wirklich wirklich wirklich wirklich wirklich wirklich wirklich wirklich wirklich wirklich wirklich wirklich wirklich wirklich wirklich wirklich wirklich wirklich wirklich wirklich wirklich wirklich wirklich wirklich wirklich wirklich wirklich . . . . . <eos>

