# Biomedical Machine Translation on the WMT16-Biomedical Dataset

Dataset taken from: https://www.statmt.org/wmt16/biomedical-translation-task.html

The following experiments fine-tune and evaluate mBART-50 (facebook/mbart-large-50-many-to-many-mmt), OPUS-MT (Helsinki-NLP/opus-mt-es-en), and T5 (google-t5/t5-small) for Spanish-to-English machine translation, followed by two LSTM baselines (PyTorch and TensorFlow).

## Installing libraries and setup

In [None]:
# Install needed libraries
! pip install transformers datasets sacrebleu evaluate

In [None]:
from transformers import AutoTokenizer, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import pipeline
from datasets import load_dataset
import evaluate

import numpy as np
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Location of the parallel-corpus CSV on Kaggle.
# NOTE(review): the OPUS-MT and T5 sections read from
# ".../wmt16-biomedical-mt-en-es/..." instead — confirm which dataset slug is mounted.
data_dir = "/kaggle/input/wmt16-biomed-en-es/wmt16_biomedical_data_en_es.csv"
# All rows land in a single "train" split, which is partitioned in the next cell.
dataset = load_dataset("csv", data_files=data_dir)

In [None]:
from datasets import DatasetDict

# Fixed seed so the train/val/test partition is identical on every run.
# Without it, a checkpoint reloaded in a later session could be evaluated on
# rows that were part of its training split.
SPLIT_SEED = 42

# 80% train, 20% held out for validation + test
temp = dataset["train"].train_test_split(test_size=0.2, seed=SPLIT_SEED)

# Split the held-out 20% evenly: half validation, half test
temp2 = temp["test"].train_test_split(test_size=0.5, seed=SPLIT_SEED)

dataset = DatasetDict({
    "train": temp["train"],
    "test": temp2["test"],
    "val": temp2["train"],
})

dataset

In [None]:
from datasets import load_dataset
from transformers import (
    MBartForConditionalGeneration, MBartTokenizer,
    Seq2SeqTrainingArguments, Seq2SeqTrainer
  )

import torch
from torch.utils.data import random_split
import datasets
import pandas as pd
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import numpy as np
import gc
import torch
from transformers import AutoTokenizer
import datasets

In [None]:
# Hub id of the multilingual mBART-50 checkpoint used in this section.
checkpoint="facebook/mbart-large-50-many-to-many-mmt"
# Tokenizer matching the checkpoint; later cells reference it as the global `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
model = MBartForConditionalGeneration.from_pretrained(checkpoint)

# The many-to-many mBART-50 checkpoint selects its output language from the
# first generated token. Force the English code so every generate() call
# (including the trainer's predict_with_generate evaluation) decodes English.
model.generation_config.forced_bos_token_id = tokenizer.convert_tokens_to_ids("en_XX")

## Data Pre-Processing

In [None]:
max_input_length = 128
max_target_length = 128

source_lang = "es"
target_lang = "en"

# mBART-50 prefixes every sequence with a language-code token. The tokenizer
# defaults to English on both sides, so the Spanish source has to be declared
# explicitly; otherwise Spanish inputs are tagged as English.
tokenizer.src_lang = "es_XX"
tokenizer.tgt_lang = "en_XX"


def preprocess_function(examples):
    """Tokenize a batch of Spanish/English pairs for seq2seq training.

    Args:
        examples: batch dict with parallel lists under the "es" and "en" keys.

    Returns:
        The tokenizer output for the Spanish inputs, with the English token
        ids added under "labels".
    """
    model_inputs = tokenizer(
        list(examples[source_lang]), max_length=max_input_length, truncation=True
    )

    # `text_target` tokenizes in target mode, so the labels carry the English
    # language code instead of the source one.
    labels = tokenizer(
        text_target=list(examples[target_lang]),
        max_length=max_target_length,
        truncation=True,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Apply preprocess_function to every split in batches; the original columns are kept
# and the tokenizer outputs plus "labels" are added alongside them.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

In [None]:
# Batch collator for the trainer, built from the tokenizer and the loaded model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# sacreBLEU scorer; compute_metrics() looks this up as the global `metric`.
metric = evaluate.load("sacrebleu")

In [None]:
def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded texts.

    Each label is additionally wrapped in a one-element list, giving one
    reference list per prediction.

    Returns:
        (stripped predictions, list of single-reference lists)
    """
    cleaned_preds = list(map(str.strip, preds))
    wrapped_labels = [[reference] for reference in map(str.strip, labels)]
    return cleaned_preds, wrapped_labels

def compute_metrics(eval_preds):
    """Compute BLEU and mean generated length for a batch of evaluation outputs.

    Args:
        eval_preds: (predictions, labels) pair of token-id arrays; predictions
            may arrive wrapped in a tuple, in which case the first item is used.

    Returns:
        dict with "bleu" (sacreBLEU score) and "gen_len" (mean number of
        non-padding tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 marks ignored positions and cannot be decoded. It can appear in the
    # generated predictions as well as in the labels (the trainer pads gathered
    # batches with it), so replace it on both sides.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    score = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": score["score"]}

    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

# mBart50

## mBart Training

In [None]:
from accelerate import Accelerator
# Accelerator instance used by the accelerator.prepare(...) calls in the training/evaluation cells.
accelerator = Accelerator()

In [None]:
# Fine-tuning configuration for mBART-50.
# Per-device batch of 8 with 4 accumulation steps -> 32 examples per optimizer step per device.
# num_train_epochs=0.1 trains on roughly a tenth of one pass over the training split.
# NOTE(review): logging_dir="/logs" points at the filesystem root — confirm it is writable/intended.
args = Seq2SeqTrainingArguments(output_dir="mbart_large_50_mmt_biomed_es_en_v1",
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    num_train_epochs=0.1,
    predict_with_generate=True,
    logging_dir="/logs",
    logging_steps=10000,
    save_steps=10000,
    report_to="none",
    push_to_hub=False
)

# Trainer over the tokenized train/val splits; BLEU is computed by compute_metrics.
trainer = Seq2SeqTrainer(model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["val"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# NOTE(review): a DatasetDict and a Trainer are passed to accelerator.prepare here;
# this looks like a no-op for these object types — confirm it is needed.
tokenized_dataset, trainer = accelerator.prepare(
    tokenized_dataset, trainer
)

trainer.train()

## mBart Evaluation

In [None]:
# Load a model checkpoint
# Reads the fine-tuned weights saved at step 288 under the training output directory.
model = MBartForConditionalGeneration.from_pretrained("/kaggle/working/mbart_large_50_mmt_biomed_es_en_v1/checkpoint-288")

In [None]:
# Trainer rebuilt around the reloaded checkpoint; only evaluate() is called in this cell.
# NOTE(review): do_train / num_train_epochs are still set although no training happens here — confirm they can be dropped.
args = Seq2SeqTrainingArguments(output_dir="mbart_large_50_mmt_biomed_es_en_v1",
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    num_train_epochs=0.01,
    predict_with_generate=True,
    logging_dir="/logs",
    logging_steps=10000,
    save_steps=10000,
    report_to="none",
    push_to_hub=False
)

trainer = Seq2SeqTrainer(model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["val"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# NOTE(review): same accelerator.prepare call as in the training cell — looks like a no-op for these objects; confirm.
tokenized_dataset, trainer = accelerator.prepare(
    tokenized_dataset, trainer
)

# Scores the validation split (the trainer's eval_dataset).
trainer.evaluate()

In [None]:
# NOTE(review): this runs prediction on the *validation* split, while the metrics
# printed below carry a "test_" prefix — confirm whether tokenized_dataset["test"] was intended.
predictions = trainer.predict(tokenized_dataset["val"])

In [None]:
predictions.metrics

{'test_loss': 0.7604455947875977,
 'test_bleu': 48.1032,
 'test_gen_len': 21.1874,
 'test_runtime': 1331.2933,
 'test_samples_per_second': 8.648,
 'test_steps_per_second': 1.082}

# OPUS MT

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Rebinds the global `tokenizer` and `model` (previously mBART) to the OPUS-MT es->en checkpoint.
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-es-en")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-es-en")

In [None]:
# Reload the raw CSV so this section starts from an unsplit dataset.
# NOTE(review): this path uses the "wmt16-biomedical-mt-en-es" slug, whereas the first
# section reads from "wmt16-biomed-en-es" — confirm both are mounted.
data_dir = "/kaggle/input/wmt16-biomedical-mt-en-es/wmt16_biomedical_data_en_es.csv"
dataset = load_dataset("csv", data_files=data_dir)

In [None]:
from datasets import DatasetDict

# Fixed seed so the train/val/test partition is identical on every run.
# The inference cells below reload a checkpoint from disk; with an unseeded
# split, a fresh session could score it on rows it was trained on.
SPLIT_SEED = 42

# 80% train, 20% held out for validation + test
temp = dataset["train"].train_test_split(test_size=0.2, seed=SPLIT_SEED)

# Split the held-out 20% evenly: half validation, half test
temp2 = temp["test"].train_test_split(test_size=0.5, seed=SPLIT_SEED)

dataset = DatasetDict({
    "train": temp["train"],
    "test": temp2["test"],
    "val": temp2["train"],
})

dataset

In [None]:
dataset.shape

{'train': (230248, 3), 'test': (28782, 3), 'val': (28781, 3)}

In [None]:
source_lang = "es"
target_lang = "en"
prefix = ""


def preprocess_function(examples):
    """Tokenize a batch of Spanish sources together with their English targets.

    The (empty) task prefix is prepended to every source sentence. Inputs and
    targets are truncated to 256 tokens; the tokenizer output, which includes
    the target ids, is returned as-is.
    """
    spanish = [prefix + sentence for sentence in examples["es"]]
    english = list(examples["en"])
    return tokenizer(spanish, text_target=english, max_length=256, truncation=True)

In [None]:
# Tokenize all three splits in batches with the OPUS-MT tokenizer.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

In [None]:
checkpoint = "Helsinki-NLP/opus-mt-es-en"
# NOTE(review): `model` receives the checkpoint name (a string) here, whereas the
# mBART section passes the loaded model object — confirm this is intended.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [None]:
# sacreBLEU scorer; compute_metrics() and the BLEU cells below look this up as the global `metric`.
metric = evaluate.load("sacrebleu")

In [None]:
def postprocess_text(preds, labels):
    """Trim whitespace around decoded texts.

    Predictions come back as a flat list of strings; every label is placed in
    its own one-element list, i.e. one reference list per prediction.
    """
    trimmed_preds = list(map(str.strip, preds))
    reference_lists = [[reference] for reference in map(str.strip, labels)]
    return trimmed_preds, reference_lists

In [None]:
def compute_metrics(eval_preds):
    """Compute BLEU and mean generated length for a batch of evaluation outputs.

    Args:
        eval_preds: (predictions, labels) pair of token-id arrays; predictions
            may arrive wrapped in a tuple, in which case the first item is used.

    Returns:
        dict with "bleu" (sacreBLEU score) and "gen_len" (mean number of
        non-padding tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 marks ignored positions and cannot be decoded. It can appear in the
    # generated predictions as well as in the labels (the trainer pads gathered
    # batches with it), so replace it on both sides.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    score = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": score["score"]}

    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

## OPUS-MT Training

In [None]:
# Fresh copy of the pretrained OPUS-MT weights (checkpoint name set in the collator cell above).
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

### Evaluating before training

In [None]:
# Translation pipeline over the *pretrained* checkpoint, used as the before-fine-tuning baseline.
# Requires a CUDA device.
translator = pipeline("translation", model=checkpoint, device="cuda")

In [None]:
# Slice the dataset first: only the five needed rows are read, instead of
# materialising every test row and discarding all but five.
predictions = translator(dataset["test"][:5]["es"])

In [None]:
predictions

[{'translation_text': 'Genetic advice.'},
 {'translation_text': 'Graphical representation of the vestibular thermal test: the nystagmogram.'},
 {'translation_text': 'Dental education program. Escuelas normales de Santiago, Viña del mar and Curicó. 1965.'},
 {'translation_text': 'Evaluation of prostatic antigen specific post rectal touch: experience in 36 patients.'},
 {'translation_text': 'The use of antimuscarinics in male patients with symptoms of the lower urinary tract due to benign prostate hyperplasia and overactive bladder symptoms.'}]

In [None]:
# First five English references (slice first, then pick the column)
dataset["test"][:5]["en"]

['Genetic counseling.',
 'Graphic representation of the thermic vestibular test: the nystagmogram.',
 'Dental educational program. Normal schools of Santiago, Viña del mar and Curicó. 1965.',
 'Evaluation of prostatic specific antigen after a digital rectal examination: experience with 36 patients.',
 'Use of antimuscarinics in patients with lower urinary tract symptoms for BPH and overactive bladder.']

In [None]:
# First five Spanish source sentences (slice first, then pick the column)
dataset["test"][:5]["es"]

['Consejo genético.',
 'Representación gráfica de la prueba termica vestibular: el nistagmograma.',
 'Programa educative odontológico. Escuelas normales de Santiago, Viña del mar y Curicó. 1965.',
 'Evaluación del antigeno prostático especifico post tacto rectal: experiencia en 36 pacientes.',
 'El uso de antimuscarínicos en pacientes varones con síntomas del tracto urinario inferior por hiperplasia benigna de próstata y sintomas de vejiga hiperactiva.']

In [None]:
# BLEU of the pretrained model on the same five test sentences translated above.
sample_references = dataset["test"][:5]["en"]
result = metric.compute(
    predictions=[pred['translation_text'] for pred in predictions],
    # one reference list per prediction
    references=[[reference] for reference in sample_references],
)
print(result)

{'score': 34.769941968239465, 'counts': [48, 28, 17, 9], 'totals': [69, 64, 59, 54], 'precisions': [69.56521739130434, 43.75, 28.8135593220339, 16.666666666666668], 'bp': 1.0, 'sys_len': 69, 'ref_len': 63}


In [None]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'en', 'es', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 230248
    })
    test: Dataset({
        features: ['id', 'en', 'es', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 28782
    })
    val: Dataset({
        features: ['id', 'en', 'es', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 28781
    })
})

### Actual OPUS-MT Training

In [None]:
tokenized_dataset["val"]

Dataset({
    features: ['id', 'en', 'es', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 28781
})

In [None]:
# Fine-tuning configuration for OPUS-MT: evaluation once per epoch, mixed precision (fp16),
# at most three checkpoints kept on disk.
# num_train_epochs=0.02 trains on roughly 2% of one pass over the training split.
training_args = Seq2SeqTrainingArguments(
    output_dir="opus_mt_biomed_es_en_v1",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=0.02,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
    report_to="none"
)

# Train on the tokenized train split; BLEU on the val split via compute_metrics.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["val"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
# Score the fine-tuned model on the validation split (loss, BLEU and gen_len from compute_metrics).
trainer.evaluate(tokenized_dataset["val"])

In [None]:
from evaluate import TranslationEvaluator

# Keep the evaluator under its own name. Binding it to `metric` would replace
# the sacreBLEU metric that compute_metrics() and the BLEU cells look up
# globally, and the evaluator's compute() takes different arguments.
translation_evaluator = TranslationEvaluator(task="translation", default_metric_name="bleu")
evaluation_results = translation_evaluator.compute(
    model_or_pipeline=model,
    tokenizer=tokenizer,
    metric="bleu",
    data=dataset["test"],
    device="cuda",
    input_column="es",
    label_column="en",
)

## OPUS-MT Inference

In [None]:
# Fine-tuned OPUS-MT checkpoint saved at step 3598 under the training output directory.
model_path = "/kaggle/working/opus_mt_biomed_es_en_v1/checkpoint-3598"
# Rebinds the global `tokenizer` and `model` to the fine-tuned checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

In [None]:
dataset["test"][0]

{'id': 3662268, 'en': 'Genetic counseling.', 'es': 'Consejo genético.'}

In [None]:
# Translation pipeline over the fine-tuned checkpoint; requires a CUDA device.
translator = pipeline("translation", model=model_path, device="cuda")

In [None]:
# Translate the first five test sentences; slicing the dataset first avoids
# reading every test row just to keep five.
predictions = translator(dataset["test"][:5]["es"])

In [None]:
predictions

[{'translation_text': 'Genetic counseling.'},
 {'translation_text': 'Graphic representation of vestibular thermal test: the nystagmogram.'},
 {'translation_text': 'Dental education program. Normal schools of Santiago, Viña del mar and Curicó. 1965.'},
 {'translation_text': 'Evaluation of the specific prostatic antigen after rectal touch: experience in 36 patients.'},
 {'translation_text': 'Use of antimuscarinics in male patients with symptoms of the lower urinary tract caused by benign prostatic hyperplasia and symptoms of overactive bladder.'}]

In [None]:
# Translate the whole test split. Reading the "es" column directly avoids
# building a dict for every row.
predictions = translator(list(dataset["test"]["es"]))

In [None]:
# `predictions` now holds one translation per test row, so the references must
# cover the whole test split too (previously only the first five were passed,
# which does not line up with the predictions).
# sacreBLEU is loaded here so the cell does not depend on whatever `metric`
# was last bound to.
metric = evaluate.load("sacrebleu")
result = metric.compute(
    predictions=[pred['translation_text'] for pred in predictions],
    # one reference list per prediction
    references=[[reference] for reference in dataset["test"]["en"]],
)
print(result)

{'score': 54.82234750454774, 'counts': [36, 25, 16, 10], 'totals': [41, 36, 31, 26], 'precisions': [87.8048780487805, 69.44444444444444, 51.61290322580645, 38.46153846153846], 'bp': 0.9294421312368021, 'sys_len': 41, 'ref_len': 44}


# T5 (t5-small)

In [None]:
# Hub id of the T5 checkpoint; rebinds the global `checkpoint` and `tokenizer` for this section.
checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Reload the raw CSV so this section starts from an unsplit dataset (single "train" split).
data_dir = "/kaggle/input/wmt16-biomedical-mt-en-es/wmt16_biomedical_data_en_es.csv"
dataset = load_dataset("csv", data_files=data_dir)

In [None]:
# 80/20 train/test split. The seed keeps the partition stable across sessions,
# so the checkpoint reloaded in the inference cells is never scored on rows it
# was trained on.
dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
dataset["train"][0]

In [None]:
dataset.shape

In [None]:
source_lang = "es"
target_lang = "en"
prefix = "translate Spanish to English: "


def preprocess_function(examples):
    """Tokenize a batch of prefixed Spanish sources with their English targets.

    Every source sentence gets the task prefix prepended. Inputs and targets
    are truncated to 256 tokens; the tokenizer output, which includes the
    target ids, is returned as-is.
    """
    prompts = [prefix + sentence for sentence in examples["es"]]
    english = list(examples["en"])
    return tokenizer(prompts, text_target=english, max_length=256, truncation=True)

In [None]:
# Tokenize both splits in batches with the T5 tokenizer.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

In [None]:
# NOTE(review): `model` receives the checkpoint name (a string), not a model object;
# the T5 model itself is only loaded in the Training cell below — confirm this is intended.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [None]:
# sacreBLEU scorer; compute_metrics() and the BLEU cell below look this up as the global `metric`.
metric = evaluate.load("sacrebleu")

In [None]:
def postprocess_text(preds, labels):
    """Remove leading/trailing whitespace from decoded texts.

    Returns the predictions as a flat list and the labels as a list of
    one-element lists (one reference list per prediction).
    """
    tidy_preds = list(map(str.strip, preds))
    tidy_labels = [[reference] for reference in map(str.strip, labels)]
    return tidy_preds, tidy_labels

In [None]:
def compute_metrics(eval_preds):
    """Compute BLEU and mean generated length for a batch of evaluation outputs.

    Args:
        eval_preds: (predictions, labels) pair of token-id arrays; predictions
            may arrive wrapped in a tuple, in which case the first item is used.

    Returns:
        dict with "bleu" (sacreBLEU score) and "gen_len" (mean number of
        non-padding tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 marks ignored positions and cannot be decoded. It can appear in the
    # generated predictions as well as in the labels (the trainer pads gathered
    # batches with it), so replace it on both sides.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    score = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": score["score"]}

    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

## Training

In [None]:
# Pretrained T5 weights for the checkpoint named at the top of this section.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [None]:
# Fine-tuning configuration for T5: one full epoch, evaluation at the end of the epoch,
# mixed precision (fp16), at most three checkpoints kept on disk.
training_args = Seq2SeqTrainingArguments(
    output_dir="t5_small_biomed_es_en_v1",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
    report_to="none"
)

# NOTE(review): this section has no validation split, so the "test" split doubles as
# the evaluation set during training — confirm that is acceptable for the reported scores.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
trainer.evaluate()

## Inference

In [None]:
# Fine-tuned T5 checkpoint saved at step 3598 under the training output directory.
model_path = "/kaggle/working/t5_small_biomed_es_en_v1/checkpoint-3598"
# Rebinds the global `tokenizer` and `model` to the fine-tuned checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

In [None]:
# Name the direction explicitly.
# NOTE(review): with the bare "translation" task the pipeline may fall back to
# one of T5's built-in translation presets (and that preset's prompt) — confirm.
translator = pipeline("translation_es_to_en", model=model_path, device="cuda")

# `src` was never defined. Use the first test sentence, formatted exactly like
# the training inputs (task prefix + Spanish text).
src = prefix + dataset["test"][0]["es"]
pred = translator([src, src])

In [None]:
# Translate the first five test sentences. The model was fine-tuned on inputs
# that start with the task prefix, so inference inputs must carry it too.
predictions = translator([prefix + text for text in dataset["test"][:5]["es"]])

In [None]:
# BLEU on the five sentences translated above; slice the dataset first so only
# those five rows are read.
sample_references = dataset["test"][:5]["en"]
result = metric.compute(
    predictions=[pred['translation_text'] for pred in predictions],
    # one reference list per prediction
    references=[[reference] for reference in sample_references],
)
print(result)

In [None]:
# Archive the fine-tuned OPUS-MT checkpoint (not the T5 one from this section).
!zip -r opus_mt_biomed_es_en_v1.zip /kaggle/working/opus_mt_biomed_es_en_v1/checkpoint-3598

## Baseline: LSTM (using PyTorch)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence

class TranslationDataset(Dataset):
    """Character-level Spanish/English parallel corpus.

    Each text is encoded as a 1-D long tensor of character ids, truncated to
    ``max_length``. Id 0 is ``<PAD>`` and id 1 is ``<UNK>``.

    Args:
        es_texts: list of Spanish strings.
        en_texts: list of English strings, parallel to ``es_texts``.
        max_length: maximum number of characters kept per text.
        es_vocab: optional existing char->id mapping for Spanish. Pass the
            training vocabulary when building validation/test sets so ids
            agree with the trained model. Built from ``es_texts`` when omitted.
        en_vocab: same as ``es_vocab`` for English.
    """

    def __init__(self, es_texts, en_texts, max_length=50, es_vocab=None, en_vocab=None):
        self.es_texts = es_texts
        self.en_texts = en_texts

        # Reuse a supplied vocabulary, otherwise derive one from the texts
        self.es_vocab = self.build_vocab(es_texts) if es_vocab is None else es_vocab
        self.en_vocab = self.build_vocab(en_texts) if en_vocab is None else en_vocab

        # Reverse mappings for decoding
        self.es_idx_to_char = {idx: char for char, idx in self.es_vocab.items()}
        self.en_idx_to_char = {idx: char for char, idx in self.en_vocab.items()}

        # Encode texts
        self.es_encoded = [self.encode_text(text, self.es_vocab)[:max_length] for text in es_texts]
        self.en_encoded = [self.encode_text(text, self.en_vocab)[:max_length] for text in en_texts]

    def build_vocab(self, texts):
        """Return a char->id mapping over all characters in ``texts``.

        Characters are sorted before ids are assigned so the mapping is the
        same on every run (plain set iteration order is not stable).
        """
        vocab = {'<PAD>': 0, '<UNK>': 1}
        for offset, char in enumerate(sorted(set(''.join(texts))), start=2):
            vocab[char] = offset
        return vocab

    def encode_text(self, text, vocab):
        """Map ``text`` to a long tensor of ids; unseen characters become ``<UNK>``."""
        unknown = vocab['<UNK>']
        # Explicit dtype: an empty text would otherwise yield a float tensor,
        # which cannot be fed to an embedding layer.
        return torch.tensor([vocab.get(char, unknown) for char in text], dtype=torch.long)

    def decode_text(self, encoded_text, idx_to_char):
        """Turn a sequence of ids back into a string.

        Padding (id 0) and ids missing from ``idx_to_char`` are dropped, so
        the literal "<PAD>" marker never leaks into decoded text.
        """
        chars = []
        for idx in encoded_text:
            idx = int(idx)
            if idx != 0:
                chars.append(idx_to_char.get(idx, ''))
        return ''.join(chars)

    def __len__(self):
        return len(self.es_texts)

    def __getitem__(self, idx):
        return self.es_encoded[idx], self.en_encoded[idx]

def collate_fn(batch):
    """Collate (source, target) tensor pairs into two right-padded batch tensors.

    Every sequence is padded with 0 up to the longest sequence on its side;
    both results are batch-first.
    """
    sources = [pair[0] for pair in batch]
    targets = [pair[1] for pair in batch]
    return (
        pad_sequence(sources, batch_first=True, padding_value=0),
        pad_sequence(targets, batch_first=True, padding_value=0),
    )

class TranslationModel(nn.Module):
    """Single-layer LSTM encoder-decoder over integer token ids.

    The encoder's final (hidden, cell) state initialises the decoder; a linear
    layer maps each decoder state to scores over the output vocabulary.
    """

    def __init__(self, input_vocab_size, output_vocab_size, hidden_size=64):
        super().__init__()
        self.input_vocab_size = input_vocab_size
        self.output_vocab_size = output_vocab_size

        # Learned embeddings for source and target ids
        self.es_embedding = nn.Embedding(input_vocab_size, hidden_size)
        self.en_embedding = nn.Embedding(output_vocab_size, hidden_size)

        self.encoder = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.decoder = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.output_layer = nn.Linear(hidden_size, output_vocab_size)

    def forward(self, input_seq, target_seq):
        """Return vocabulary scores for every position of ``target_seq``.

        The decoder consumes the whole ``target_seq`` in one pass, starting
        from the encoder's final state.
        """
        _, encoder_state = self.encoder(self.es_embedding(input_seq))
        decoder_states, _ = self.decoder(self.en_embedding(target_seq), encoder_state)
        return self.output_layer(decoder_states)

    def translate(self, input_seq, max_length=50):
        """Greedily generate ``max_length`` ids for each sequence in ``input_seq``.

        Decoding starts from id 0 and feeds each step's arg-max id back in as
        the next input. It always runs the full ``max_length`` steps.

        Returns:
            Long tensor with one row per input sequence and ``max_length`` columns.
        """
        _, state = self.encoder(self.es_embedding(input_seq))

        batch_size = input_seq.size(0)
        current = torch.zeros((batch_size, 1), dtype=torch.long, device=input_seq.device)

        generated = []
        for _ in range(max_length):
            step_states, state = self.decoder(self.en_embedding(current), state)
            # Most probable id at this step becomes the next decoder input
            current = self.output_layer(step_states).argmax(dim=-1)
            generated.append(current)

        return torch.cat(generated, dim=1)

from tqdm import tqdm
def train_model(model, dataloader, criterion, optimizer, epochs=10):
    """Train ``model`` with teacher forcing and print the mean batch loss per epoch.

    For each batch the decoder receives the target without its last position
    and is scored against the target without its first position.
    """
    model.train()
    for epoch in tqdm(range(epochs)):
        total_loss = 0
        for source_ids, target_ids in dataloader:
            decoder_input = target_ids[:, :-1]
            expected = target_ids[:, 1:]

            optimizer.zero_grad()
            logits = model(source_ids, decoder_input)

            # Flatten (batch, time) so each position is one classification example
            vocab_size = logits.size(-1)
            loss = criterion(logits.reshape(-1, vocab_size), expected.reshape(-1))

            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f'Epoch {epoch+1}, Loss: {total_loss/len(dataloader)}')

# Load the parallel corpus
df = pd.read_csv('/kaggle/input/wmt16-biomed-en-es/wmt16_biomedical_data_en_es.csv')

# Drop rows with a missing side: pandas represents them as float NaN, which
# would break the character vocabulary construction (str.join over the texts).
df = df.dropna(subset=['es', 'en'])

# Split into train and test sets
train_es, test_es, train_en, test_en = train_test_split(
    df['es'], df['en'], test_size=0.2, random_state=42
)

In [None]:
# Character limit per text, passed explicitly so the re-encoding below uses the same value
MAX_CHARS = 50

# Create training dataset and dataloader
train_dataset = TranslationDataset(train_es.tolist(), train_en.tolist(), max_length=MAX_CHARS)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

# Create test dataset
test_dataset = TranslationDataset(test_es.tolist(), test_en.tolist(), max_length=MAX_CHARS)

# The test set must use the *training* vocabulary. A vocabulary built from the
# test texts assigns different ids to the same characters (and may produce ids
# outside the model's embedding table), so re-encode with the training mappings.
test_dataset.es_vocab = train_dataset.es_vocab
test_dataset.en_vocab = train_dataset.en_vocab
test_dataset.es_idx_to_char = train_dataset.es_idx_to_char
test_dataset.en_idx_to_char = train_dataset.en_idx_to_char
test_dataset.es_encoded = [
    test_dataset.encode_text(text, train_dataset.es_vocab)[:MAX_CHARS]
    for text in test_dataset.es_texts
]
test_dataset.en_encoded = [
    test_dataset.encode_text(text, train_dataset.en_vocab)[:MAX_CHARS]
    for text in test_dataset.en_texts
]

test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

In [None]:
# Initialize model
# Embedding/output sizes come from the training vocabularies (characters plus <PAD>/<UNK>).
model = TranslationModel(
    input_vocab_size=len(train_dataset.es_vocab),
    output_vocab_size=len(train_dataset.en_vocab)
)

# Training setup
# ignore_index=0 excludes padded target positions (id 0 is <PAD>) from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(model.parameters())

In [None]:
# Train
train_model(model, train_dataloader, criterion, optimizer)

In [None]:
def evaluate_model(model, test_dataset):
    """Translate every test example and compute a character-level corpus BLEU.

    Args:
        model: trained model exposing ``translate(batch_of_ids)``.
        test_dataset: dataset yielding (source_ids, target_ids) pairs and
            providing ``decode_text`` / ``en_idx_to_char``.

    Returns:
        (bleu_score, references, candidates) where references is a list of
        single-reference lists of characters and candidates is a list of
        character lists.
    """
    # Imported here because this cell runs before the notebook's top-level
    # nltk import (in the TensorFlow section); otherwise corpus_bleu is undefined.
    from nltk.translate.bleu_score import corpus_bleu

    model.eval()
    references = []
    candidates = []

    with torch.no_grad():
        for i in range(len(test_dataset)):
            es_seq, en_seq = test_dataset[i]

            if es_seq.numel() == 0:
                # An empty source cannot be run through the LSTM; score it as
                # an empty translation instead of crashing.
                trans_text = ''
            else:
                # Add a batch dimension, translate, then drop it again
                translation = model.translate(es_seq.unsqueeze(0))
                trans_text = test_dataset.decode_text(
                    translation.squeeze(0), test_dataset.en_idx_to_char
                )

            ref_text = test_dataset.decode_text(en_seq, test_dataset.en_idx_to_char)

            # BLEU is computed over characters: each text becomes a list of chars
            references.append([list(ref_text)])
            candidates.append(list(trans_text))

    bleu_score = corpus_bleu(references, candidates)

    return bleu_score, references, candidates

In [None]:
# Evaluate
print("\nEvaluating Model...")
bleu_score, references, candidates = evaluate_model(model, test_dataset)

In [None]:
# Print evaluation results
print(f"\nBLEU Score: {bleu_score}")

In [None]:
# Print some example translations
print("\nExample Translations:")
for i in range(min(5, len(references))):
    # references[i] is a one-element list holding the reference's characters
    ref = ''.join(references[i][0])
    cand = ''.join(candidates[i])
    # test_es is positionally aligned with the evaluated examples
    print(f"Spanish Input:  {test_es.iloc[i]}")
    print(f"Reference:      {ref}")
    print(f"Candidate:      {cand}")
    print("---")

## Baseline: LSTM (using TensorFlow)

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.models import Model
from nltk.translate.bleu_score import corpus_bleu
import nltk

# Download necessary NLTK data
nltk.download('punkt')

# Configuration
MAX_VOCAB_SIZE = 10000      # `num_words` limit passed to each Keras Tokenizer
MAX_SEQUENCE_LENGTH = 50    # tokens per sentence after padding/truncation
EMBEDDING_DIM = 256         # embedding size for encoder and decoder
LSTM_UNITS = 512            # hidden units of the encoder and decoder LSTMs
BATCH_SIZE = 64
EPOCHS = 10

In [None]:
# Load and preprocess data
def load_and_preprocess_data(file_path):
    """Read the parallel CSV and return lower-cased (Spanish, English) text lists.

    Rows with a missing 'es' or 'en' value are discarded before lower-casing.
    """
    frame = pd.read_csv(file_path).dropna(subset=['es', 'en'])
    spanish, english = (frame[column].str.lower().tolist() for column in ('es', 'en'))
    return spanish, english

# Tokenize and prepare sequences
def prepare_sequences(spanish_texts, english_texts):
    """Fit one tokenizer per language and encode the texts as padded id arrays.

    Each tokenizer is fitted on the texts passed in. Sequences are padded and
    truncated at the end to MAX_SEQUENCE_LENGTH.

    Returns:
        (spanish_padded, spanish_tokenizer, english_padded, english_tokenizer)
    """

    def _fit_and_encode(texts):
        tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token='<OOV>')
        tokenizer.fit_on_texts(texts)
        padded = pad_sequences(
            tokenizer.texts_to_sequences(texts),
            maxlen=MAX_SEQUENCE_LENGTH,
            padding='post',
            truncating='post',
        )
        return padded, tokenizer

    spanish_padded, spanish_tokenizer = _fit_and_encode(spanish_texts)
    english_padded, english_tokenizer = _fit_and_encode(english_texts)

    return (spanish_padded, spanish_tokenizer,
            english_padded, english_tokenizer)

In [None]:
# Create seq2seq LSTM model
def create_translation_model(input_vocab_size, output_vocab_size):
    """Build an (uncompiled) LSTM encoder-decoder Keras model.

    Inputs are two id sequences of length MAX_SEQUENCE_LENGTH (source ids and
    decoder ids). The output is a softmax over ``output_vocab_size`` for every
    decoder position.
    """
    # Encoder: embed the source and keep only the final LSTM state
    source_ids = Input(shape=(MAX_SEQUENCE_LENGTH,))
    source_vectors = Embedding(input_vocab_size, EMBEDDING_DIM)(source_ids)
    _, final_h, final_c = LSTM(LSTM_UNITS, return_state=True)(source_vectors)

    # Decoder: embed the decoder ids and start from the encoder's final state
    target_ids = Input(shape=(MAX_SEQUENCE_LENGTH,))
    target_vectors = Embedding(output_vocab_size, EMBEDDING_DIM)(target_ids)
    decoder_layer = LSTM(LSTM_UNITS, return_sequences=True, return_state=True)
    decoder_states, _, _ = decoder_layer(target_vectors, initial_state=[final_h, final_c])

    # Per-position distribution over the output vocabulary
    token_probs = Dense(output_vocab_size, activation='softmax')(decoder_states)

    return Model([source_ids, target_ids], token_probs)

In [None]:
# Compute BLEU score
def compute_bleu_score(true_translations, predicted_translations):
    """Return the corpus-level BLEU of the predictions against single references.

    Both sides are split into words with nltk.word_tokenize; every reference
    is wrapped in a one-element list.
    """
    tokenized_references = [[nltk.word_tokenize(text)] for text in true_translations]
    tokenized_hypotheses = list(map(nltk.word_tokenize, predicted_translations))
    return corpus_bleu(tokenized_references, tokenized_hypotheses)


In [None]:
# Main training and evaluation function
def _encode_with(tokenizer, texts):
    """Encode texts as padded id arrays using an already-fitted tokenizer."""
    return pad_sequences(
        tokenizer.texts_to_sequences(texts),
        maxlen=MAX_SEQUENCE_LENGTH,
        padding='post',
        truncating='post',
    )


def _shift_right(target_ids):
    """Build teacher-forcing decoder inputs: id 0 first, then the targets delayed one step."""
    shifted = np.zeros_like(target_ids)
    shifted[:, 1:] = target_ids[:, :-1]
    return shifted


def _greedy_decode(model, encoder_ids):
    """Generate target ids batch by batch, feeding each predicted token back in."""
    generated = np.zeros_like(encoder_ids)
    for start in range(0, len(encoder_ids), BATCH_SIZE):
        rows = slice(start, start + BATCH_SIZE)
        enc = encoder_ids[rows]
        dec = np.zeros_like(enc)
        for step in range(MAX_SEQUENCE_LENGTH):
            probs = model([enc, dec], training=False)
            # Reduce to ids before leaving TensorFlow so the full probability
            # tensor is never copied to NumPy.
            next_ids = tf.argmax(probs[:, step, :], axis=-1).numpy()
            generated[rows, step] = next_ids
            if step + 1 < MAX_SEQUENCE_LENGTH:
                dec[:, step + 1] = next_ids
    return generated


def _ids_to_text(tokenizer, token_ids):
    """Convert ids to words, stopping at the first padding id (0)."""
    words = []
    for token in token_ids:
        if token == 0:
            break
        word = tokenizer.index_word.get(int(token))
        if word:
            words.append(word)
    return ' '.join(words)


def train_translation_model(data_path):
    """Train the LSTM translation baseline and report test-set BLEU.

    Args:
        data_path: CSV file with 'es' and 'en' columns.

    Returns:
        (model, spanish_tokenizer, english_tokenizer)
    """
    # Load data
    spanish_texts, english_texts = load_and_preprocess_data(data_path)

    # Split data
    train_es, test_es, train_en, test_en = train_test_split(
        spanish_texts, english_texts, test_size=0.2, random_state=42
    )

    # Fit the tokenizers on training data only, then reuse them for the test
    # set. Refitting on the test texts would assign different ids to the same words.
    X_train, spanish_tokenizer, y_train, english_tokenizer = prepare_sequences(train_es, train_en)
    X_test = _encode_with(spanish_tokenizer, test_es)

    # The tokenizers only emit ids below MAX_VOCAB_SIZE (rarer words map to the
    # OOV id), so the layers never need to be larger than that.
    model = create_translation_model(
        min(len(spanish_tokenizer.word_index) + 1, MAX_VOCAB_SIZE),
        min(len(english_tokenizer.word_index) + 1, MAX_VOCAB_SIZE)
    )

    # Compile model
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

    # Targets need a trailing axis for sparse categorical crossentropy
    y_train_sparse = y_train.reshape(y_train.shape[0], y_train.shape[1], 1)

    # Teacher forcing: the decoder sees the target delayed by one step, so it
    # has to predict token t from tokens < t rather than copy its own input.
    model.fit(
        [X_train, _shift_right(y_train)],
        y_train_sparse,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        validation_split=0.2
    )

    # Decode without access to the references: each step is fed only the
    # tokens the model produced so far.
    predicted_ids = _greedy_decode(model, X_test)
    predicted_translations = [
        _ids_to_text(english_tokenizer, row) for row in predicted_ids
    ]

    # Compute BLEU score
    bleu_score = compute_bleu_score(test_en, predicted_translations)

    print(f"BLEU Score: {bleu_score}")

    return model, spanish_tokenizer, english_tokenizer

In [None]:
# Example translation function
def translate(spanish_text):
    """Translate one Spanish sentence with the trained Keras model.

    Uses the notebook-level ``model``, ``spanish_tokenizer`` and
    ``english_tokenizer`` (the names bound by the training cells).

    Decoding is greedy: position t of the decoder input receives the word
    predicted at position t-1 (id 0 at the first position), which matches a
    model trained on decoder inputs shifted one step to the right. Decoding
    stops at the first predicted padding id.

    Returns:
        The predicted English words joined by single spaces.
    """
    # Tokenize and pad input
    input_sequence = spanish_tokenizer.texts_to_sequences([spanish_text])
    encoder_ids = pad_sequences(
        input_sequence, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post'
    )

    # The decoder must be driven by English ids it generated itself, not by
    # the Spanish input ids.
    decoder_ids = np.zeros_like(encoder_ids)
    words = []
    for step in range(MAX_SEQUENCE_LENGTH):
        probs = model.predict([encoder_ids, decoder_ids], verbose=0)
        token = int(np.argmax(probs[0, step]))
        if token == 0:
            break
        word = english_tokenizer.index_word.get(token)
        if word:
            words.append(word)
        if step + 1 < MAX_SEQUENCE_LENGTH:
            decoder_ids[0, step + 1] = token

    return ' '.join(words)

In [None]:
# Data loading and preparation
data_path = "/kaggle/input/wmt16-biomed-en-es/wmt16_biomedical_data_en_es.csv"
# Load data
spanish_texts, english_texts = load_and_preprocess_data(data_path)

# Split data
train_es, test_es, train_en, test_en = train_test_split(
    spanish_texts, english_texts, test_size=0.2, random_state=42
)

# Fit the tokenizers on the training texts only
X_train, spanish_tokenizer, y_train, english_tokenizer = prepare_sequences(train_es, train_en)

# Encode the test texts with those same tokenizers. Fitting fresh tokenizers on
# the test set would map the same words to different ids than the model saw.
X_test = pad_sequences(
    spanish_tokenizer.texts_to_sequences(test_es),
    maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post'
)
y_test = pad_sequences(
    english_tokenizer.texts_to_sequences(test_en),
    maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post'
)

In [None]:
# Create model
# The tokenizers were built with num_words=MAX_VOCAB_SIZE, so they only emit
# ids below that limit (rarer words map to the OOV id). Sizing the embedding
# and output layers by the full word_index would allocate rows that are never used.
model = create_translation_model(
    min(len(spanish_tokenizer.word_index) + 1, MAX_VOCAB_SIZE),
    min(len(english_tokenizer.word_index) + 1, MAX_VOCAB_SIZE)
)

In [None]:
# Compile model
# Sparse categorical crossentropy takes integer ids as targets rather than one-hot vectors.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

In [None]:
# Prepare target data for sparse categorical crossentropy
y_train_sparse = y_train.reshape(y_train.shape[0], y_train.shape[1], 1)

# Train model
model.fit(
    [X_train, y_train],
    y_train_sparse,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.2
)

In [None]:
# Predict translations
predictions = model.predict([X_test, y_test])

# Convert predictions to text (simplified)
predicted_translations = []
for pred in predictions:
    # Get the most likely token for each position
    predicted_tokens = np.argmax(pred, axis=-1)

    # Convert back to text
    predicted_text = ' '.join([
        list(english_tokenizer.word_index.keys())[list(english_tokenizer.word_index.values()).index(token)]
        for token in predicted_tokens if token != 0
    ])
    predicted_translations.append(predicted_text)

# Compute BLEU score
bleu_score = compute_bleu_score(test_en, predicted_translations)

print(f"BLEU Score: {bleu_score}")

return model, spanish_tokenizer, english_tokenizer