# Loading Lib's & DataSet

In [31]:
!pip install datasets



In [32]:
import pandas as pd

from sklearn.model_selection import train_test_split
from datasets import Dataset
data = pd.read_csv('/content/spa.txt', sep='\t')
data

Unnamed: 0,Go.,Ve.,CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #4986655 (cueyayotl)
0,Go.,Vete.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
1,Go.,Vaya.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
2,Go.,Váyase.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
3,Hi.,Hola.,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
4,Hi.,Hola,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
...,...,...,...
139007,A carbon footprint is the amount of carbon dio...,Una huella de carbono es la cantidad de contam...,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
139008,Since there are usually multiple websites on a...,Como suele haber varias páginas web sobre cual...,CC-BY 2.0 (France) Attribution: tatoeba.org #9...
139009,"If you want to sound like a native speaker, yo...","Si quieres sonar como un hablante nativo, debe...",CC-BY 2.0 (France) Attribution: tatoeba.org #9...
139010,It may be impossible to get a completely error...,Puede que sea imposible obtener un corpus comp...,CC-BY 2.0 (France) Attribution: tatoeba.org #2...


# EDA

In [55]:
data = data.sample(n= 6000, random_state= 42).reset_index(drop= True)
data

Unnamed: 0,source_text,target_text
0,Don't do two things at a time.,No hagas dos cosas al mismo tiempo.
1,She is willing to do anything for me.,Ella está dispuesta de hacer lo que sea por mí.
2,Do you know the girl standing by the window?,¿Conoces a la niña que está parada junto a la ...
3,I asked Tom a question.,Le hice una pregunta a Tom.
4,Balls are round.,Las bolas son redondas.
...,...,...
5995,I don't know your real name.,Yo no sé su verdadero nombre.
5996,I think I'll sleep well tonight.,Creo que hoy voy a dormir bien.
5997,Tom is picky.,Tomás es mañoso.
5998,That's my line!,¡Esa es mi frase!


In [34]:
# Dropping col
data.drop(columns= ['CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #4986655 (cueyayotl)'], inplace= True)

In [35]:
data.rename(columns={'Go.': 'source_text', 'Ve.': 'target_text'}, inplace=True)
data.isnull().sum()

Unnamed: 0,0
source_text,0
target_text,0


In [36]:
data['source_text'] = data['source_text'].astype(str)
data['target_text'] = data['target_text'].astype(str)

# Train Test Split

In [37]:
from numpy import test
train_data, val_data = train_test_split(data, test_size= 0.2, random_state= 42)
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)

train_dataset.shape, val_dataset.shape

((4800, 3), (1200, 3))

In [38]:
train_dataset

Dataset({
    features: ['source_text', 'target_text', '__index_level_0__'],
    num_rows: 4800
})

# Loading Model & Tokenizer

In [39]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-es")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-es")



In [40]:
input_max_len = max([len(tokenizer.encode(text)) for text in data['source_text']])
output_max_len = max([len(tokenizer.encode(text)) for text in data['target_text']])
input_max_len, output_max_len

(51, 95)

# Preprocessing Text

In [41]:
def preprocess_text(text):
    inputs = tokenizer(
        text['source_text'],
        truncation= True,
        max_length= 128,
        padding= 'max_length',
    )
    labels = tokenizer(
        text['target_text'],
        truncation= True,
        max_length= 128,
        padding= 'max_length',
    )

    inputs['labels'] = labels['input_ids']
    return inputs

# DataSet Prepration

In [42]:
train_dataset = train_dataset.map(preprocess_text, batched= True)
val_dataset = val_dataset.map(preprocess_text, batched= True)

train_dataset.set_format(type= 'torch', columns= ['input_ids', 'attention_mask', 'labels'])
val_dataset.set_format(type= 'torch', columns= ['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/4800 [00:00<?, ? examples/s]

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

In [43]:
train_dataset[0]

{'input_ids': tensor([  670,    40,   161,   370,    13,   113,    58,    21,     0, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65

# Loading Eng to Spanish translation model

In [44]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, TrainingArguments, Trainer
import torch

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-es")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-es")

# TraningArgs & Other Para's

In [45]:
from transformers import DataCollatorForSeq2Seq

# 1. Initialize the specific collator for translation
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

traning_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',
    save_strategy='epoch',
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=7,
    weight_decay=0.01,
    warmup_steps=500,
    load_best_model_at_end=True,
    logging_steps=10,
    logging_dir='./logs',
    # predict_with_generate=True, # Added: Important for seeing translation metrics during eval
)

trainer = Trainer(
    model=model,
    args=traning_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator, # <--- THIS IS THE CRITICAL FIX
)

trainer.train()

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Epoch,Training Loss,Validation Loss
1,0.1308,0.136115
2,0.0931,0.108462
3,0.0615,0.103734
4,0.0364,0.1025
5,0.0226,0.104197
6,0.0184,0.104628
7,0.0148,0.105266


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].


TrainOutput(global_step=4200, training_loss=0.1135110128227444, metrics={'train_runtime': 820.9044, 'train_samples_per_second': 40.93, 'train_steps_per_second': 5.116, 'total_flos': 1138985061580800.0, 'train_loss': 0.1135110128227444, 'epoch': 7.0})

# Saving Model

In [46]:
import os

model_dir = './model_dir'
if not os.path.exists(model_dir):
    os.mkdir(model_dir)
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

loaded_model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
loaded_tokenizer = AutoTokenizer.from_pretrained(model_dir)



# Translation Func

In [47]:
def machine_translation(text):
    inputs = loaded_tokenizer(text, return_tensors= 'pt', padding= True, truncation= True, max_length= 128)
    outputs = loaded_model.generate(inputs['input_ids'], max_length= 128, num_beams= 4, early_stopping= True)
    translation = loaded_tokenizer.decode(outputs[0], skip_special_tokens= True)
    return translation

In [53]:
random_row = data.sample(n=1, random_state=42)
display(random_row)

Unnamed: 0,source_text,target_text
1782,Don't do two things at a time.,No hagas dos cosas al mismo tiempo.


# Example Testing

In [48]:
examp1 = 'You are too tall'
translated_text = machine_translation(examp1)
print(f'Machine Translation: {translated_text}')

Machine Translation: Eres demasiado alto.


In [52]:
examp2 = 'The boy admitted having broken the vase'
translated_text = machine_translation(examp2)
print(f'Machine Translation: {translated_text}')

Machine Translation: El niño reconoció haber roto el jarrón.


In [54]:
examp3 = 'Do not do two things at a time'
translated_text = machine_translation(examp3)
print(f'Machine Translation: {translated_text}')

Machine Translation: No hagas dos cosas a la vez
