In [None]:
!pip install datasets
!pip install transformers[sentencepiece] datasets
!pip install sacremoses
import os
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import re
import json
from transformers import MarianMTModel, MarianTokenizer, AutoModelForSeq2SeqLM, AutoTokenizer


Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m48.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [None]:
# Mount Google Drive so the dataset CSV and the saved checkpoint are reachable.
from google.colab import drive
drive.mount('/content/drive')

# Parallel corpus; later cells read its 'English' and 'Viet' columns.
# TODO: Update the path below to the actual location of your dataset CSV on Drive.
dataset_path = '/content/drive/My Drive/Abdul FYP/Fixed MedEV data.csv'
df = pd.read_csv(dataset_path)

# Load the tokenizer and model weights from the checkpoint folder on Drive.
model_path = "/content/drive/MyDrive/Abdul FYP/MarianMT_en_vi_medical_finetuned"
tokenizer = MarianTokenizer.from_pretrained(model_path)
model = MarianMTModel.from_pretrained(model_path)

# Quick sanity check: corpus size plus a preview of the first rows
# (the bare last expression is rendered by the notebook).
print(f"Total sentence pairs: {len(df)}")
df.head(3)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Total sentence pairs: 340897


Unnamed: 0,English,Viet
0,"To evaluate clinical, subclinical symptoms of ...","Nghiên cứu đặc điểm lâm sàng, cận lâm sàng bện..."
1,"Evaluate clinical, subclinical symptoms of pat...","Đánh giá đặc điểm lâm sàng, cận lâm sàng bệnh ..."
2,There was a relation between vasodilatation an...,Có sự liên quan giữa độ quá phát V.a với mức đ...


In [None]:
# Clean the parallel corpus: remove missing, empty and duplicate sentence pairs.
initial_count = len(df)

# Drop rows with a missing sentence FIRST. Casting NaN to str produces the
# literal text "nan", which would pass the empty-string filter below and make
# a later dropna() a no-op, leaving bogus "nan" sentences in the training data.
df = df.dropna(subset=['English', 'Viet'])

# Trim whitespace in both columns. assign() builds a new frame, so we never
# write into a filtered view of the original DataFrame.
df = df.assign(
    English=df['English'].astype(str).str.strip(),
    Viet=df['Viet'].astype(str).str.strip(),
)

# Drop rows where either side is empty after trimming, then exact duplicate pairs.
df = df[(df['English'] != "") & (df['Viet'] != "")]
df = df.drop_duplicates(subset=['English', 'Viet'])
cleaned_count = len(df)

print(f"Removed {initial_count - cleaned_count} empty or duplicate pairs. Remaining pairs: {cleaned_count}")


Removed 460 empty or duplicate pairs. Remaining pairs: 340437


The code above reports how many sentence pairs were removed during cleaning, so the data should be fairly clean now.

In [None]:
# Training on the full corpus (~340k pairs) is too heavy for Google Colab,
# so work on a reproducible random subset instead.
# Started at 50k pairs; raised to 100k to see how Colab copes.
sample_size = 100000

# Only sample when the corpus is larger than the requested subset;
# otherwise keep every pair (as an independent copy).
needs_sampling = len(df) > sample_size
df_sampled = df.sample(n=sample_size, random_state=42) if needs_sampling else df.copy()
df_sampled = df_sampled.reset_index(drop=True)

print(f"Sampled sentence pairs: {len(df_sampled)}")


Sampled sentence pairs: 100000


In [None]:
# One seed for both splits so the partition is reproducible.
split_seed = 42

# Step 1: hold out 20% of the sampled pairs; the remaining 80% is the training set.
train_df, temp_df = train_test_split(df_sampled, test_size=0.2, random_state=split_seed)

# Step 2: cut the held-out 20% in half -> 10% validation, 10% test of the original sample.
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=split_seed)

n_train, n_val, n_test = len(train_df), len(val_df), len(test_df)
print(f"Train size: {n_train}, Validation size: {n_val}, Test size: {n_test}")


Train size: 80000, Validation size: 10000, Test size: 10000


Splitting the data into training/validation/test sets, based on the split ratios I learned in the data mining module.

In [None]:
# Token-length caps for the English inputs and the Vietnamese targets;
# the tokenization step truncates anything longer.
max_input_length = max_target_length = 128

print("Tokenizer and model loaded successfully!")


Tokenizer and model loaded successfully!


In [None]:
from datasets import Dataset, DatasetDict

# Wrap each pandas split in a Hugging Face Dataset (index reset so the old
# row labels are not carried along) and collect them in one DatasetDict.
data_dict = DatasetDict({
    split_name: Dataset.from_pandas(split_frame.reset_index(drop=True))
    for split_name, split_frame in (
        ("train", train_df),
        ("validation", val_df),
        ("test", test_df),
    )
})

# Keep the per-split handles available under their original names.
train_dataset = data_dict["train"]
val_dataset = data_dict["validation"]
test_dataset = data_dict["test"]

# Define the tokenization function
def preprocess_function(examples):
    """Tokenize a batch of English/Vietnamese sentence pairs for seq2seq training.

    Args:
        examples: A batch from ``Dataset.map(batched=True)``; a mapping with
            an ``"English"`` and a ``"Viet"`` entry, each a list of strings.

    Returns:
        The tokenizer output for the English sentences, with an extra
        ``"labels"`` entry holding the token ids of the Vietnamese sentences.
    """
    inputs = [text.strip() for text in examples["English"]]
    targets = [text.strip() for text in examples["Viet"]]

    # Source side: English, truncated to max_input_length tokens.
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Target side: Vietnamese. `text_target=` replaces the deprecated
    # `as_target_tokenizer()` context manager and selects the target-language
    # tokenization in the same way.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize every split (train/val/test) in batches and drop the raw columns,
# leaving only what preprocess_function returns.
raw_columns = data_dict["train"].column_names
tokenized_datasets = data_dict.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_columns,
)

# Spot-check the first training example (first 10 token ids of each side).
first_example = tokenized_datasets['train'][0]
print("Example tokenized input:", first_example['input_ids'][:10])
print("Example tokenized label:", first_example['labels'][:10])


Map:   0%|          | 0/80000 [00:00<?, ? examples/s]



Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Example tokenized input: [338, 3678, 8552, 53, 24356, 1057, 5, 9487, 32, 5]
Example tokenized label: [16173, 9075, 53, 4852, 630, 563, 483, 1057, 33, 11]


My dataset should now be tokenized. Each entry in tokenized_datasets has input_ids (tokenized English sentence), attention_mask, and labels (tokenized Vietnamese sentence). The original text columns have been removed.

In [None]:
# Collator that assembles tokenized examples into batches for the seq2seq trainer.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)

In [None]:
import torch
from transformers import Seq2SeqTrainingArguments

# Use fp16 mixed precision only when a CUDA GPU is available.
use_fp16 = torch.cuda.is_available()

training_args = Seq2SeqTrainingArguments(
    output_dir="./en-vi-mt-medical-checkpoints",
    num_train_epochs=4,
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`
    # argument; evaluate (and checkpoint) once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=100,
    predict_with_generate=True,
    fp16=use_fp16,
    # Other helpful args:
    save_total_limit=2,       # limit the total saved checkpoints
    report_to="none"          # no default WandB logging
)




Setting test training parameters/arguments.

In [None]:
from transformers import Seq2SeqTrainer

# Wire the model, hyperparameters, tokenized splits and collator into the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    # `processing_class` replaces the deprecated `tokenizer` argument
    # (passing `tokenizer=` triggers a deprecation warning on construction).
    processing_class=tokenizer,
)

# Start training; the returned object carries the final training statistics.
train_output = trainer.train()


  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss
1,1.6967,1.590799
2,1.5085,1.506871
3,1.3865,1.466713
4,1.3421,1.455137


In [None]:
# Destination folder on Drive for the fine-tuned model and its tokenizer.
output_dir = "/content/drive/MyDrive/Abdul FYP/MarianMT_en_vi_medical_finetuned"
import os
os.makedirs(output_dir, exist_ok=True)  # create the folder if it is not there yet

# Persist the model weights/config first, then the tokenizer files, side by side.
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)

print(f"Model and tokenizer saved to {output_dir}")


Model and tokenizer saved to /content/drive/MyDrive/Abdul FYP/MarianMT_en_vi_medical_finetuned


In [None]:
import json
from datetime import datetime

# Category information is optional: it only exists if an earlier step added a
# 'Category' column to the sampled frame.
has_categories = 'Category' in df_sampled.columns
category_counts = df_sampled['Category'].value_counts().to_dict() if has_categories else None

# Summary of this training run, stored next to the saved model.
metadata = {
    "sample_count": len(df_sampled),
    "categories": list(category_counts.keys()) if has_categories else None,
    "category_counts": category_counts,
    "train_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
}

# Write the summary as pretty-printed JSON inside the model output folder.
metadata_path = os.path.join(output_dir, "training_metadata.json")
with open(metadata_path, "w") as metadata_file:
    json.dump(metadata, metadata_file, indent=2)

print(f"Saved training metadata to {metadata_path}")
print("Metadata content:", metadata)


Saved training metadata to /content/drive/MyDrive/Abdul FYP/MarianMT_en_vi_medical_finetuned/training_metadata.json
Metadata content: {'sample_count': 100000, 'categories': None, 'category_counts': None, 'train_date': '2025-03-28 18:41:53'}


In [None]:
# Step 14: Evaluate BLEU on test set
!pip install -q sacrebleu evaluate

import evaluate
import numpy as np
from tqdm import tqdm

# Load BLEU metric
bleu = evaluate.load("sacrebleu")

# Prepare model for generation
model.eval()

# Number of test sentences translated per generate() call. Translating one
# sentence at a time (without an attention mask) took over an hour for the
# test split; lower this value if the GPU runs out of memory.
eval_batch_size = 32

test_split = tokenized_datasets["test"]

# Generate translations and collect them next to their references
preds = []
refs = []

for start in tqdm(range(0, len(test_split), eval_batch_size)):
    # Slicing a Dataset yields a dict of columns, each a list over the batch.
    batch = test_split[start:start + eval_batch_size]

    # Pad the variable-length inputs to a rectangular tensor and pass the
    # attention mask so padding positions are ignored by the encoder.
    encoded = tokenizer.pad(
        {"input_ids": batch["input_ids"], "attention_mask": batch["attention_mask"]},
        return_tensors="pt",
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(**encoded, max_length=max_target_length)

    preds.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))
    # Note: sacrebleu expects a list of references per prediction
    refs.extend(
        [ref] for ref in tokenizer.batch_decode(batch["labels"], skip_special_tokens=True)
    )

# Compute BLEU
results = bleu.compute(predictions=preds, references=refs)
print(f"\nTest BLEU Score: {results['score']:.2f}")


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25h

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

100%|██████████| 10000/10000 [1:18:57<00:00,  2.11it/s]



Test BLEU Score: 44.17


Evaluation metrics follow the paper: just the BLEU score for now, and I'm going to fine-tune my model further later. The BLEU score was initially 38.27.

In [None]:
# Score the same predictions/references pairs with METEOR.
meteor = evaluate.load("meteor")

results_meteor = meteor.compute(predictions=preds, references=refs)
meteor_score = results_meteor['meteor']
print(f"METEOR Score: {meteor_score:.2f}")