## Task 6 Machine Translation  

### This notebook follows the tutorial provided to us

In [2]:
import os
import sys
import transformers
import pandas as pd
import tensorflow as tf
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import AdamWeightDecay
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM
from ftfy import fix_text

In [None]:
model_checkpoint = "Helsinki-NLP/opus-mt-es-en"  # Hugging Face Hub ID of the pretrained es-en translation checkpoint used for the tokenizer and the model below

In [None]:
# Load the "en-es" configuration of the Helsinki-NLP/opus_books dataset; the
# preprocessing further down reads the Spanish side as input and the English
# side as target.
raw_datasets = load_dataset("Helsinki-NLP/opus_books", "en-es")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 93470/93470 [00:00<00:00, 395030.79 examples/s]


In [None]:
# Hold out 1% of the corpus as the test split. The fixed seed makes the three
# splits identical after every kernel restart, so results are reproducible.
train_test_split = raw_datasets['train'].train_test_split(test_size=0.01, seed=42)

# Hold out a further 1% of the remaining 99% (about 0.99% of the original) for validation.
validation_test_split = train_test_split['train'].train_test_split(test_size=0.01, seed=42)

# Organize the splits into a new DatasetDict.
# NOTE: this rebinds `raw_datasets`; re-running only this cell would split the
# already reduced train split again, so re-run the load cell first.
raw_datasets = DatasetDict({
    'train': validation_test_split['train'],
    'validation': validation_test_split['test'],
    'test': train_test_split['test']
})

In [8]:
# Load the tokenizer that belongs to the pretrained checkpoint selected above.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [None]:
# Truncation limit, in tokens, applied to both the source and the target side.
max_input_length = max_target_length = 128

# Translation direction: Spanish source sentences, English target sentences.
source_lang, target_lang = "es", "en"


def preprocess_function(examples):
    """Tokenize a batch of translation pairs for seq2seq training.

    Args:
        examples: Batch whose "translation" entry is a list of dicts keyed by
            language code; the `source_lang` and `target_lang` keys are read.

    Returns:
        The tokenizer output for the source sentences, with an added "labels"
        entry holding the token ids of the target sentences.
    """
    # Source-language sentences are the model inputs, target-language sentences the labels.
    inputs = [ex[source_lang] for ex in examples["translation"]]
    targets = [ex[target_lang] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # `text_target=` tokenizes with the target-side settings. It replaces the
    # deprecated `tokenizer.as_target_tokenizer()` context manager. A separate
    # call keeps the target truncation limit independent of the input limit.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [10]:
# Sanity check: run the preprocessing on the first two training examples and display the result.
preprocess_function(raw_datasets["train"][:2])



{'input_ids': [[711, 11673, 15, 74, 17496, 152, 4, 28, 650, 8, 526, 2, 11, 194, 3296, 12, 155, 10594, 26, 165, 565, 6, 8, 17163, 15, 74, 21972, 326, 2, 461, 2, 4, 25, 9, 2406, 33738, 2, 74, 4, 9, 27503, 28, 9563, 28851, 3, 0], [32, 14718, 1486, 15, 8924, 37066, 74, 4602, 4712, 15, 37, 15, 74, 39462, 6, 1974, 51, 37, 6, 49535, 660, 575, 1508, 2, 1760, 668, 37, 236, 6, 6202, 19, 1508, 15, 155, 3398, 2847, 484, 2, 488, 43, 212, 15, 12, 575, 10257, 187, 3135, 17368, 660, 14, 163, 4, 8275, 12, 20490, 3, 42368, 15, 1533, 1711, 212, 37, 26, 9558, 575, 1508, 3, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[33, 9567, 174, 95, 2201, 27, 14424, 16, 125, 1883, 51, 23

In [None]:
# Apply the preprocessing to every split in batches. Each example gains
# "input_ids", "attention_mask" and "labels" (lists of token ids, not tensors;
# conversion to tensors happens later in the data collator).
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

Map: 100%|██████████| 92536/92536 [00:41<00:00, 2236.44 examples/s]
Map: 100%|██████████| 466/466 [00:00<00:00, 2519.69 examples/s]
Map: 100%|██████████| 468/468 [00:00<00:00, 1934.89 examples/s]


In [None]:
# Load the pretrained seq2seq model; the Auto class picks the matching architecture for the checkpoint.
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at Helsinki-NLP/opus-mt-es-en.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [None]:
# Training hyperparameters; every value here is a starting point and may be tuned.
num_train_epochs = 1
batch_size = 16
learning_rate = 2e-5
weight_decay = 0.01

In [None]:
# Collator that pads a list of tokenized examples into one batch and returns TensorFlow tensors.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

In [15]:
# Collator variant for the generation dataset below: same as `data_collator`,
# but pads sequence lengths up to a multiple of 128.
generation_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf", pad_to_multiple_of=128)

In [None]:
# Training on the full train split takes far too long here, so train on a
# subset of it that is as large as the test split. The test split itself was
# used before, which leaks held-out data into training and makes any later
# evaluation on it meaningless.
# The train split comes out of train_test_split, which shuffles by default, so
# taking the first rows is assumed to give a random sample.
train_subset_size = min(len(tokenized_datasets["train"]), len(tokenized_datasets["test"]))

train_dataset = model.prepare_tf_dataset(  # batched tf.data pipeline used for training
    tokenized_datasets["train"].select(range(train_subset_size)),
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)

# Validation data evaluated during training; order is kept (shuffle=False).
validation_dataset = model.prepare_tf_dataset(
    tokenized_datasets["validation"],
    batch_size=batch_size,
    shuffle=False,
    collate_fn=data_collator,
)

# The validation split again, this time intended for inference: smaller
# batches (8) and padding handled by `generation_data_collator`.
generation_dataset = model.prepare_tf_dataset(
    tokenized_datasets["validation"],
    batch_size=8,
    shuffle=False,
    collate_fn=generation_data_collator,
)

In [17]:
# Adam with decoupled weight decay, configured from the hyperparameters above.
optimizer = AdamWeightDecay(learning_rate=learning_rate, weight_decay_rate=weight_decay)
# No loss argument is passed; the model is presumably expected to fall back on
# its built-in loss computed from the "labels" in each batch. TODO confirm.
model.compile(optimizer=optimizer)

In [None]:
# Fine-tune the model. The epoch count comes from the hyperparameter cell
# instead of a hard-coded literal, so changing `num_train_epochs` takes effect.
model.fit(train_dataset, validation_data=validation_dataset, epochs=num_train_epochs)



<keras.src.callbacks.History at 0x1d4bfec18e0>

In [23]:
# Persist the fine-tuned model and its tokenizer side by side in one local
# directory so both can be reloaded together with from_pretrained().
model.save_pretrained("my_local_model")         # saves config + weights
tokenizer.save_pretrained("my_local_model")     # saves tokenizer files


('my_local_model\\tokenizer_config.json',
 'my_local_model\\special_tokens_map.json',
 'my_local_model\\vocab.json',
 'my_local_model\\source.spm',
 'my_local_model\\target.spm',
 'my_local_model\\added_tokens.json')

In [None]:
# Reload the fine-tuned model and tokenizer from the local directory written above.
model = TFAutoModelForSeq2SeqLM.from_pretrained("my_local_model")
tokenizer = AutoTokenizer.from_pretrained("my_local_model")

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at my_local_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


### Code to translate Spanish sentences from a CSV file

In [None]:
# Load the fine-tuned model and tokenizer from the local directory.
model_path = "my_local_model"  
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_path)

# The input is read as CSV; the translation step below reads its 'Sentence' column.
input_path = r"C:\Users\emilp\Documents\GitHub\2024-25c-fai2-adsai-EmilFox231007\datalab_tasks\Task_11\extracted_sentences.csv"    # <-- Replace with the path to your CSV file
df = pd.read_csv(input_path)

def translate(text):
    """Translate a single sentence with the loaded model.

    Args:
        text: Sentence to translate. Missing values (None / NaN) and blank
            strings yield an empty string instead of raising; any other
            non-string value is converted with ``str`` first.

    Returns:
        The decoded translation as a string, with special tokens removed.
    """
    # Empty CSV cells arrive as NaN (a float), which the tokenizer cannot
    # encode, so missing and blank inputs are short-circuited here.
    if not isinstance(text, str):
        if text is None or pd.isna(text):
            return ""
        text = str(text)
    if not text.strip():
        return ""

    # Tokenize input text
    inputs = tokenizer.encode(text, return_tensors="tf", padding=True, truncation=True, max_length=256)
    # Generate translation
    outputs = model.generate(inputs, max_length=256)
    # Decode the output
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Translate the 'Sentence' column row by row (one model.generate call per row)
# and store the results in a new 'translated_sentence' column.
df['translated_sentence'] = df['Sentence'].apply(translate)

# Save the frame, including the new translation column, as CSV (not Excel).
# NOTE(review): this path is identical to input_path, so the original input
# file is overwritten. Point it at a different file to keep the input.
output_path = r"C:\Users\emilp\Documents\GitHub\2024-25c-fai2-adsai-EmilFox231007\datalab_tasks\Task_11\extracted_sentences.csv"   # <-- Replace with desired output path
df.to_csv(output_path, index=False)


All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at my_local_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.
