<a href="https://colab.research.google.com/github/10Dennisw/translation-from-english-to-dutch/blob/main/transfer_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Installing libraries
!pip install transformers torch sentencepiece datasets

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m49.6 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m73.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m44.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading sa

In [2]:
# Importing modules
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [3]:
# Accessing drive to import data later
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/My Drive/Corpus/

Mounted at /content/drive
/content/drive/My Drive/Corpus


In [4]:
# Opening English file and reading the sentences
text_file = "europarl-v7.nl-en(english).txt"
with open(text_file) as f:
    lines_english = f.read().split("\n")[:-1]

# creating a text lines for English
text_eng = []
for line in lines_english:
    english = line.split("\t")
    text_eng.append(english)

# Opening Dutch file and reading the sentences
text_file = "europarl-v7.nl-en(nl).txt"
with open(text_file) as f:
    lines_dutch = f.read().split("\n")[:-1]

# creating a list for lines in Dutch
text_nl = []
for line in lines_dutch:
    dutch = line.split("\t")
    text_nl.append(dutch)

# Cutting the list due to resource constraints
new_text_eng = []
new_text_nl = []

# selecting sentences which are 15 words or less
# when 10,000 sentences are extracted, the loop broken
for i in range(len(text_eng)):
  for j in text_eng[i]:
    eng = j
  eng = eng.split(" ")
  if len(eng) >= 1 and len(eng) <= 15:
    new_text_eng.append(text_eng[i])
    new_text_nl.append(text_nl[i])
  if len(new_text_eng) == 100000:
    break

# Pairing the sentences together
text_pairs = []
for i in range(100000):
  english_sent = str(new_text_eng[i])
  dutch_sent = str(new_text_nl[i])
  text_pairs.append((english_sent, dutch_sent))

In [5]:
import random

# Creating the Training, validation and testing pairs
random.shuffle(text_pairs)

num_val_samples = int(0.15 * len(text_pairs)) #15%
num_train_samples = len(text_pairs) - 2 * num_val_samples #70%

train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples:num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples:]

In [6]:
def generate_tokens(tokenizer, text_pairs):
  '''
  A function to tokenise the text_pairs and return the tokenised data
  tokenizer: is the tokenizer imported that belongs to the model
  text pairs: is the text pairs. This will be two sentences. One in English, one in dutch.
  tokenised_data: the function will return the tokenised_data
  '''

  # Extractng the sentences from the text pairs
  english_texts = [pair[0][2:-2] for pair in text_pairs]  # Removing square brackets
  dutch_texts = [pair[1][2:-2] for pair in text_pairs]   # Removing square brackets

  # Tokenizing the text pairs
  inputs = tokenizer(english_texts, return_tensors="pt", padding=True, truncation=True, max_length=30) #inputs are tokenized english sentences and attention mask
  labels = tokenizer(dutch_texts, return_tensors="pt", padding=True, truncation=True, max_length=30).input_ids #labels are tokenized dutch sentences

  # Generating the tokenised data, including labels (tokenised dutch sentence)
  tokenized_data = {
      "input_ids": inputs["input_ids"],
      "attention_mask": inputs["attention_mask"],
      "labels": labels
  }

  return tokenized_data  # returning the tokenised data, including input, atttention mask and labels

In [7]:
from transformers import AutoTokenizer

# Load the tokenizer associated with your pretrained model
model_name = "Helsinki-NLP/opus-mt-en-nl"
tokenizer = AutoTokenizer.from_pretrained(model_name)


training_tokenized_data = generate_tokens(tokenizer, train_pairs)
eval_tokenized_data = generate_tokens(tokenizer, val_pairs)

Downloading (…)okenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/790k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/814k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.66M [00:00<?, ?B/s]



In [8]:
# Defining the model
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

Downloading pytorch_model.bin:   0%|          | 0.00/316M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [9]:
from torch.utils.data import Dataset

# Defining function to create dictionary
class TranslationDataset(Dataset):
    # initisialisation of input_ids, attention mask and labels
    def __init__(self, tokenized_data):
        self.input_ids = tokenized_data['input_ids']
        self.attention_mask = tokenized_data['attention_mask']
        self.labels = tokenized_data['labels']

    # Getting the number of steps
    # This needs to be known for training (learning rate scheduler)
    def __len__(self):
        return len(self.input_ids)

    # returns a a dictionairy of the input_ids, attention mask and labels
    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx],
        }


In [10]:
import torch

# Using fucntion to generating Train and Evaluation Datasets
train_dataset = TranslationDataset(training_tokenized_data)
eval_dataset = TranslationDataset(eval_tokenized_data)

In [11]:
!pip install --upgrade accelerate



In [12]:
# Defining training arguments
batch_size = 128
training_args_dict = {
    "output_dir": f"{model_name}-finetuned-english-to-nl",
    "evaluation_strategy": "epoch",
    "learning_rate": 5e-5,
    "per_device_train_batch_size": batch_size,
    "per_device_eval_batch_size": batch_size,
    "weight_decay": 0.01,
    "save_total_limit": 3,
    "num_train_epochs": 10,
    "predict_with_generate": True,
    "push_to_hub": False,
}

# Defining training arguments for the Seq2Seq problem
args = Seq2SeqTrainingArguments(**training_args_dict)

In [13]:
# Creating a Seq2SeqTrainer instance and specifying training parameter
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    )

# Setting model to training mode
model.train()

# Training the model on the corpus
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.6547,1.094242
2,1.0816,0.966734
3,0.944,0.918374
4,0.8607,0.892297
5,0.8067,0.88196
6,0.7597,0.872128
7,0.729,0.868407
8,0.7041,0.867407
9,0.6833,0.86626
10,0.6676,0.866799


TrainOutput(global_step=5470, training_loss=0.8691419193470064, metrics={'train_runtime': 3928.7599, 'train_samples_per_second': 178.173, 'train_steps_per_second': 1.392, 'total_flos': 5561450496000000.0, 'train_loss': 0.8691419193470064, 'epoch': 10.0})

In [14]:
# Setting device to Cuda to make sure that tensor input is move to GPU
# Neccesary to run on GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [15]:
def translate(text, model, tokenizer):
  '''
  inputs: text: source texts
  model: the fine-tuned model
  tokenizer: the tokenizer used to tokenize the source texts, corresponding to the model
  output: the decoded translated text
  '''
  inputs = tokenizer(text, return_tensors="pt", max_length=30, truncation=True) # tokenizing the input sentence
  inputs = {key: tensor.to(device) for key, tensor in inputs.items()}  # Move inputs to the cuda/ GPU device

  # Setting the model in evaluation mode
  model.eval()
  # Generating translation in the form of tokens using fine-tuned model
  outputs = model.generate(inputs['input_ids'], max_length=30, num_beams=4, early_stopping=True)
  # Decoding the outputs to generate a sentence
  translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
  return translated_text # Returning the decoded translation


In [23]:
# Creating an index list using the random library in the test pair
indx_lst = []
for i in range(10):
  indx_lst.append(random.randint(0, len(test_pairs)-1))

# Printing the initial English Sentence, Translated English Sentence and Actual Dutch Sentence according to corpus
for i in indx_lst:
  source_text = test_pairs[i][0][2:-2] # Extracting English sentence
  translated_text = translate(source_text, model, tokenizer) # Getting Dutch Tranlsation using function translate defined earlier
  actual_translation = test_pairs[i][1][2:-2] # Extracting Dutch translation according to corpus

  # Printing the source, translated and actual translation, which are defined above
  print("--> Source Text:", source_text)
  print("--> Translation:", translated_text)
  print("--> Translation (according to corpus):", actual_translation)

  # Printing a space after to make it easy to identify source, translated and actual translation
  if i == 9:
    break
  else:
    print()

--> Source Text: In the common position, minimum harmonisation was all of a sudden opted for.
--> Translation: In het gemeenschappelijk standpunt werd opeens voor een minimumharmonisatie gekozen.
--> Translation (according to corpus): In het gemeenschappelijk standpunt gaat men ineens kiezen voor een minimale harmonisatie.

--> Source Text: All the signs indicate that we are still to see a further stage added.
--> Translation: Alle signalen geven aan dat er nog een nieuwe fase aan toegevoegd mo
--> Translation (according to corpus): Het ziet ernaar uit dat het afronden van dit proces nog geruime tijd in beslag zal nemen.

--> Source Text: This is clearly the path we need to take.
--> Translation: Dat is duidelijk de weg die we moeten bewandelen.
--> Translation (according to corpus): Wij moeten streven naar een Europa waarin volledige werkgelegenheid, concurrentievermogen, verhoogde levenskwaliteit en betere toekomstmogelijkheden troef zijn.

--> Source Text: There is clearly a more am