## Loading the libraries

In [1]:
#Installation
import datasets
import transformers
import pandas as pd
from datasets import Dataset

#Tokenizer
from transformers import RobertaTokenizerFast

#Encoder-Decoder Model
from transformers import EncoderDecoderModel

#Training
# When using previous version of the library you need the following two lines
#from seq2seq_trainer import Seq2SeqTrainer
#from transformers import TrainingArguments

from transformers import Seq2SeqTrainer
from transformers import Seq2SeqTrainingArguments
from dataclasses import dataclass, field
from typing import Optional

import os

Define parameters for data location and model folders

Load the data file with the chess move sequences (the `input` and `target` columns):

In [2]:
# Read the training CSV, keeping only the two columns the model consumes
df = pd.read_csv(
    'chess_data/1.csv',
    usecols=['input', 'target'],
)
# Sanity checks: total number of rows and number of missing values per column
print('Num Examples: ', len(df))
print('Null Values\n', df.isna().sum())

Num Examples:  1000000
Null Values
 input     0
target    0
dtype: int64


In [3]:
# Preview the first rows of the loaded dataframe
df.head()

Unnamed: 0,input,target
0,e4 : Nf3 c5 e4 Nc6 d4 cxd4 Nxd4 Nf6 Nc3 d6 Bg5...,Nxd3+ cxd3 Bd7 Qh4 Qg8 Nh2 Nd4 Ng4 Nf5 Qh3 Rc8...
1,e4 : g3 e5 d4 exd4 Nf3 c5 c3 d5 cxd4 c4 Bg2 Nf...,Bd6 Qxa4 O-O Nxd7 Qxd7 Qxd7 Nxd7 c5 Be7 Nc3 Nf...
2,e4 : e4 e5 Nf3 Nc6 Bb5 a6 Ba4 Nf6 O-O Nxe4 d4 ...,Bf5 Rab1 Rd7 Bc2 Rad8 Rfd1 Bg6 Bxd3 Rxd3 Rxd3 ...
3,e4 : e4 c5 Nc3 d6 Nf3 g6 d4 cxd4 Nxd4 Nf6 Bc4 ...,f5 Nf3# 0-1
4,e4 : c4 f5 Nc3 Nf6 g3 g6 Bg2 Bg7 Rb1 a5 d3 O-O...,c6 Qc2 Nd8 Nd2 Nf7 Ra1 e5 Rfb1 Qe7 Qb3 Kh8 Qc2...


## Split the data into train and validation dataset

We split the dataset into a training dataset (99%) and a validation dataset (1%). The training examples are drawn by random sampling; the rows that are not sampled form the validation dataset.

In [4]:
# Splitting the data into training and validation
# Defining the train size. So 99% of the data will be used for training and the remaining 1% will be used for validation.
train_size = 0.99

# Randomly sampling 99% of the rows from the dataset (fixed random_state so the split is reproducible)
train_dataset=df.sample(frac=train_size,random_state = 42)

# The rows that were not sampled become the validation set; reset the indexes of both frames
val_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)
print('Length Train dataset: ', len(train_dataset))
print('Length Val dataset: ', len(val_dataset))

Length Train dataset:  990000
Length Val dataset:  10000


In the next section, we try to limit the number of examples to train on in order to reduce the cost and time for training during the experiments. When the model is ready to be trained, we must train on the whole training dataset.

In [5]:
# Bounds for optionally limiting the training set during experiments.
# They are only referenced by the commented-out slice below, so the full training set is currently used.
start = 0
stop = 1

# Create a Dataset from a pandas dataframe for training and validation
train_data=Dataset.from_pandas(train_dataset) #[start*200_000:stop*200_000])
val_data=Dataset.from_pandas(val_dataset)

# Create the encoder-decoder model from a pretrained RoBERTa model

## Setting the model and training parameters

Now it is time to set the model and training parameters. They will be passed to the dataset preprocessing step and to the Trainer object in a later section.

In [6]:
# Hyperparameters shared by the preprocessing, training and generation cells.
TRAIN_BATCH_SIZE = 64  # batch size used for the Dataset.map calls (preprocessing and generation)
VALID_BATCH_SIZE = 2   # evaluation batch size (not referenced elsewhere in the visible notebook)
TRAIN_EPOCHS = 1    # number of training epochs, passed to the training arguments
VAL_EPOCHS = 1  # not referenced elsewhere in the visible notebook
LEARNING_RATE = 1e-4    # NOTE(review): not passed to the training arguments in the visible notebook - confirm whether it should be
SEED = 42               # random seed (not referenced elsewhere in the visible notebook)
MAX_LEN = 512      # max token length of the input move sequence
SUMMARY_LEN = 512      # max token length of the target move sequence

## Load the trained tokenizer on our specific language
As we mentioned previously, we have trained a tokenizer and a RoBERTa model from scratch using the Masked Language Modelling technique trying to focus our model on our specific task. Now we can configure our encoder-decoder using this pretrained model.

The first step is loading the tokenizer we need to apply to generate our input and target tokens and transform them into a vector representation of the text data.

In [7]:
# Loading the RoBERTa Tokenizer from the 'MixedTokens' folder, with MAX_LEN as its maximum length
tokenizer = RobertaTokenizerFast.from_pretrained('MixedTokens',  max_len=MAX_LEN)
# Setting the BOS and EOS token: the CLS token is reused as BOS and the SEP token as EOS
tokenizer.bos_token = tokenizer.cls_token
tokenizer.eos_token = tokenizer.sep_token

In [8]:
# Number of examples handed to each call of the preprocessing function by Dataset.map
batch_size=TRAIN_BATCH_SIZE  # change to 16 for full training
# Maximum token lengths used when padding/truncating the inputs and the targets
encoder_max_length=MAX_LEN
decoder_max_length=SUMMARY_LEN

def process_data_to_model_inputs(batch):
  """Tokenize a batch of examples and attach the columns the model trains on.

  Reads the "input" and "target" entries of `batch`, pads/truncates them to
  `encoder_max_length` and `decoder_max_length` respectively, and writes
  "input_ids", "attention_mask", "decoder_input_ids",
  "decoder_attention_mask" and "labels" back into `batch`, which is returned.
  """
  encoded_inputs = tokenizer(batch["input"], padding="max_length", truncation=True, max_length=encoder_max_length)
  encoded_targets = tokenizer(batch["target"], padding="max_length", truncation=True, max_length=decoder_max_length)
  pad_id = tokenizer.pad_token_id

  batch["input_ids"] = encoded_inputs.input_ids
  batch["attention_mask"] = encoded_inputs.attention_mask
  # NOTE(review): the decoder inputs are the unshifted target ids; depending on the
  # installed transformers version the model may already derive them from `labels` -
  # confirm this is intended.
  batch["decoder_input_ids"] = encoded_targets.input_ids
  batch["decoder_attention_mask"] = encoded_targets.attention_mask
  # Labels are the target ids with every padding position replaced by -100
  batch["labels"] = [
      [-100 if token_id == pad_id else token_id for token_id in sequence]
      for sequence in encoded_targets.input_ids
  ]

  return batch

# Options shared by the training and validation preprocessing passes
map_options = dict(
    batched=True,
    batch_size=batch_size,
    remove_columns=["input", "target"],
)
torch_columns = ["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"]

# Tokenize the training split and expose the model columns as torch tensors
train_data = train_data.map(process_data_to_model_inputs, **map_options)
train_data.set_format(type="torch", columns=torch_columns)

# Same treatment for the validation split
val_data = val_data.map(process_data_to_model_inputs, **map_options)
val_data.set_format(type="torch", columns=torch_columns)


  0%|          | 0/15469 [00:00<?, ?ba/s]

  0%|          | 0/157 [00:00<?, ?ba/s]

## Define the RoBERTa Encoder-Decoder model

In [9]:
# Folder holding the pretrained RoBERTa checkpoint used for both halves of the model
pretrainedmodel_folder = 'MixedTokens'

# Build the encoder-decoder from that checkpoint, with encoder/decoder tying enabled
roberta_shared = EncoderDecoderModel.from_encoder_decoder_pretrained(
    pretrainedmodel_folder,
    pretrainedmodel_folder,
    tie_encoder_decoder=True,
)

# Print the encoder vocabulary size as a check that the checkpoint was loaded
print('Vocab Size: ', roberta_shared.config.encoder.vocab_size)

Some weights of the model checkpoint at MixedTokens were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForCausalLM were not initialized from the model checkpoint at MixedTokens and are newly initialized: ['roberta.encoder.layer.9.crossattention.self.key.bias', 'roberta.encoder.layer.10.crossattention.self.value.bias', 'roberta.encoder.layer.3.crossatte

Vocab Size:  50000


In [10]:
model_config = roberta_shared.config

# Special token ids, taken from the tokenizer
model_config.decoder_start_token_id = tokenizer.bos_token_id
model_config.eos_token_id = tokenizer.eos_token_id
model_config.pad_token_id = tokenizer.pad_token_id

# Default decoding parameters used by generate()
model_config.max_length = SUMMARY_LEN
model_config.early_stopping = True
# NOTE(review): per the transformers generation docs an n-gram size of 1 presumably
# forbids any token from appearing twice in an output, while target sequences can
# repeat a move - confirm this value is intended.
model_config.no_repeat_ngram_size = 1
model_config.length_penalty = 2.0
model_config.repetition_penalty = 3.0
model_config.num_beams = 10

# Expose the encoder vocabulary size on the top-level config
model_config.vocab_size = model_config.encoder.vocab_size

# Training the encoder-decoder

In [11]:
# ROUGE metric used to score the generated sequences during validation
rouge = datasets.load_metric("rouge")

def compute_metrics(pred):
    """Compute ROUGE-2 precision, recall and F-measure for a prediction object.

    `pred.predictions` and `pred.label_ids` are decoded with the tokenizer
    (special tokens skipped) and compared with the ROUGE metric. Positions in
    `pred.label_ids` equal to -100 are overwritten in place with the pad
    token id before decoding.

    Returns a dict with the keys "rouge2_precision", "rouge2_recall" and
    "rouge2_fmeasure", each rounded to 4 decimals.
    """
    predicted_text = tokenizer.batch_decode(pred.predictions, skip_special_tokens=True)

    reference_ids = pred.label_ids
    # -100 is not a valid token id, so swap it for the pad id before decoding
    reference_ids[reference_ids == -100] = tokenizer.pad_token_id
    reference_text = tokenizer.batch_decode(reference_ids, skip_special_tokens=True)

    scores = rouge.compute(predictions=predicted_text, references=reference_text, rouge_types=["rouge2"])
    rouge2 = scores["rouge2"].mid

    metrics = {}
    metrics["rouge2_precision"] = round(rouge2.precision, 4)
    metrics["rouge2_recall"] = round(rouge2.recall, 4)
    metrics["rouge2_fmeasure"] = round(rouge2.fmeasure, 4)
    return metrics

## Create the Trainer

Now it is time to set the training arguments: batch size, training epochs, checkpoint saving, etc. Then we can instantiate a `Seq2SeqTrainer`, a subclass of the `Trainer` object we mentioned, passing it the model to train, the training arguments, the metrics computation, and the training and evaluation datasets.


In [12]:
# Training configuration.
# NOTE(review): output_dir is the same 'MixedTokens' folder the pretrained model and
# tokenizer are loaded from - confirm that writing checkpoints there is intended.
training_args = Seq2SeqTrainingArguments(
    output_dir='MixedTokens',
    overwrite_output_dir=True,
    # run both training and evaluation
    do_train=True,
    do_eval=True,
    # per-device batch sizes
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    # schedule
    num_train_epochs=TRAIN_EPOCHS,
    warmup_steps=10,
    # evaluate once per epoch, generating sequences for the metric computation
    evaluation_strategy="epoch",
    predict_with_generate=True,
    # log and checkpoint every 1000 steps, limiting the number of kept checkpoints to one
    logging_steps=1_000,
    save_steps=1_000,
    save_total_limit=1,
    # mixed-precision training
    fp16=True,
)

# Trainer wiring together the model, the arguments, the datasets and the metric
trainer = Seq2SeqTrainer(
    model=roberta_shared,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Using amp fp16 backend


Now, we start training the model:

In [None]:
# Fine-tune the model on the training dataset; the trainer was given the validation dataset for evaluation
trainer.train()

***** Running training *****
  Num examples = 990000
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 247500


Epoch,Training Loss,Validation Loss


Saving model checkpoint to MixedTokens/checkpoint-1000
Configuration saved in MixedTokens/checkpoint-1000/config.json
Model weights saved in MixedTokens/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in MixedTokens/checkpoint-1000/tokenizer_config.json
Special tokens file saved in MixedTokens/checkpoint-1000/special_tokens_map.json
Deleting older checkpoint [MixedTokens/checkpoint-16000] due to args.save_total_limit
Deleting older checkpoint [MixedTokens/checkpoint-17000] due to args.save_total_limit
Saving model checkpoint to MixedTokens/checkpoint-2000
Configuration saved in MixedTokens/checkpoint-2000/config.json
Model weights saved in MixedTokens/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in MixedTokens/checkpoint-2000/tokenizer_config.json
Special tokens file saved in MixedTokens/checkpoint-2000/special_tokens_map.json
Deleting older checkpoint [MixedTokens/checkpoint-1000] due to args.save_total_limit
Saving model checkpoint to MixedTokens/checkp

Save the encoder-decoder model just trained:

In [None]:
# Save the encoder-decoder model just trained to the 'FinetunedModel' folder
trainer.save_model('FinetunedModel')

# Evaluate the model on the test dataset

Once we have our model trained, we can use it to generate move sequences and check the result of our fine-tuning process on our target task.

We load a test dataset, report its size and null-value counts, and keep only its first 30 rows as test examples.

In [None]:
# Load the test dataset from a CSV file (this rebinds `df`, replacing the training dataframe loaded earlier)
df=pd.read_csv('chess_data/23.csv')
print('Num Examples: ',len(df))
print('Null Values\n', df.isna().sum())
print(df.head(5))

# Only the first 30 rows are used as test examples
test_data=Dataset.from_pandas(df[:30])
print(test_data)

If you need to **restore the trained model from a checkpoint** run the next cell, selecting the folder where the checkpoint was saved.

# Build the absolute path of a saved checkpoint. The training log shows checkpoints
# being written to `<output_dir>/checkpoint-<step>`, so the folder is taken from the
# training arguments (the previously used `model_folder` was never defined in this
# notebook and raised a NameError).
# Adjust the step number to a checkpoint that actually exists on disk.
checkpoint_path = os.path.abspath(os.path.join(training_args.output_dir, 'checkpoint-3072'))
print(checkpoint_path)

Then we load the Tokenizer and the fine-tuned model from a saved version.

In [None]:
#Load the Tokenizer and the fine-tuned model from the 'FinetunedModel' folder
tokenizer = RobertaTokenizerFast.from_pretrained('FinetunedModel')
model = EncoderDecoderModel.from_pretrained('FinetunedModel')

# Move the loaded model to the GPU
model.to("cuda")

In order to improve the results, we will define two methods to generate the text, using the Beam search decoding strategy and random sampling, and we will apply them and compare the results.


In [None]:
# Generate the text without setting a decoding strategy
def generate_summary(batch):
    """Generate predictions for a batch using the model's configured generation defaults.

    Args:
        batch: Mapping whose "input" entry holds the texts to feed to the model.

    Returns:
        The same `batch` with a new "pred" entry holding the decoded
        predictions (special tokens removed).
    """
    # Inputs are padded/truncated to MAX_LEN tokens and returned as PyTorch tensors
    inputs = tokenizer(batch["input"], padding="max_length", truncation=True, max_length=MAX_LEN, return_tensors="pt")
    # Keep the tensors on the same device as the model instead of hard-coding "cuda"
    input_ids = inputs.input_ids.to(model.device)
    attention_mask = inputs.attention_mask.to(model.device)

    # Generate with the fine-tuned `model` reloaded from 'FinetunedModel'; the in-memory
    # `roberta_shared` only exists (and is only trained) when the training cells
    # were run in the same kernel session.
    outputs = model.generate(input_ids, attention_mask=attention_mask)

    # Decode the generated ids, dropping all special tokens
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    batch["pred"] = output_str

    return batch


In [None]:
# Generate a text using beam search
def generate_summary_beam_search(batch):
    """Generate predictions for a batch using beam search with 15 beams.

    Args:
        batch: Mapping whose "input" entry holds the texts to feed to the model.

    Returns:
        The same `batch` with a new "pred" entry holding one decoded
        prediction per input (special tokens removed).
    """
    # Inputs are padded/truncated to MAX_LEN tokens and returned as PyTorch tensors
    inputs = tokenizer(batch["input"], padding="max_length", truncation=True, max_length=MAX_LEN, return_tensors="pt")
    # Keep the tensors on the same device as the model instead of hard-coding "cuda"
    input_ids = inputs.input_ids.to(model.device)
    attention_mask = inputs.attention_mask.to(model.device)

    # Generate with the fine-tuned `model` reloaded from 'FinetunedModel'; the in-memory
    # `roberta_shared` only exists (and is only trained) when the training cells
    # were run in the same kernel session.
    outputs = model.generate(input_ids, attention_mask=attention_mask,
                             num_beams=15,
                             repetition_penalty=3.0,
                             length_penalty=2.0,
                             num_return_sequences = 1
    )

    # Decode the generated ids, dropping all special tokens
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    batch["pred"] = output_str

    return batch

# Generate a text using top-k / top-p sampling
def generate_summary_topk(batch):
    """Generate predictions for a batch by sampling (top_k=50, top_p=0.95).

    Args:
        batch: Mapping whose "input" entry holds the texts to feed to the model.

    Returns:
        The same `batch` with a new "pred" entry holding one decoded
        prediction per input (special tokens removed).
    """
    # Inputs are padded/truncated to MAX_LEN tokens and returned as PyTorch tensors
    inputs = tokenizer(batch["input"], padding="max_length", truncation=True, max_length=MAX_LEN, return_tensors="pt")
    # Keep the tensors on the same device as the model instead of hard-coding "cuda"
    input_ids = inputs.input_ids.to(model.device)
    attention_mask = inputs.attention_mask.to(model.device)

    # Generate with the fine-tuned `model` reloaded from 'FinetunedModel'; the in-memory
    # `roberta_shared` only exists (and is only trained) when the training cells
    # were run in the same kernel session.
    # NOTE(review): num_beams is not passed here, so the value stored in the model
    # config presumably still applies and sampling is combined with beam search -
    # pass num_beams=1 if plain sampling is wanted.
    outputs = model.generate(input_ids, attention_mask=attention_mask,
                             repetition_penalty=3.0,
                             length_penalty=2.0,
                             num_return_sequences = 1,
                             do_sample=True,
                             top_k=50,
                             top_p=0.95,
    )

    # Decode the generated ids, dropping all special tokens
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    batch["pred"] = output_str

    return batch


Now, we can make predictions for the test dataset using Beam search strategy and top-k sampling technique.

In [None]:
# Number of test examples passed to each call of the generation functions
batch_size = TRAIN_BATCH_SIZE

# The default-strategy variant (generate_summary) is not run here.
# Generate predictions using beam search
results = test_data.map(generate_summary_beam_search, batched=True, batch_size=batch_size, remove_columns=["input"])
pred_str_bs = results["pred"]
# Generate predictions using top-k sampling (`results` is reused, so only the extracted lists are kept)
results = test_data.map(generate_summary_topk, batched=True, batch_size=batch_size, remove_columns=["input"])
pred_str_topk = results["pred"]

#label_str = results["Summary"]


Now, we can see some results from our trained model to check its performance on the task.

In [None]:
# Compare both decoding strategies on one test example
example_idx = 1
print("Moves: ", df['input'][example_idx])
print("Predicted using BS: ", pred_str_bs[example_idx])
print("Predicted using Top-K Sampling: ", pred_str_topk[example_idx])


In [None]:
# Compare both decoding strategies on another test example
example_idx = 10
print("Moves: ", df['input'][example_idx])
print("Predicted using BS: ", pred_str_bs[example_idx])
print("Predicted using Top-K Sampling: ", pred_str_topk[example_idx])

When more than one output is generated per input, we need to join them into a single list entry per example.

In [None]:
import numpy as np

# Number of sequences generated per input example. It must match the
# `num_return_sequences` value passed to `generate` (1 in the generation functions);
# the previously hard-coded 3 would have merged the predictions of different examples.
num_return_sequences = 1
# `pred_str` was never defined in this notebook; group the beam-search predictions.
pred_str = pred_str_bs

# One row per input example, one column per returned sequence
preds=np.reshape(pred_str, (-1, num_return_sequences))
print('Predictions Shape: ',preds.shape)
# Join the sequences generated for the same example into one comma-separated string
predictions = [','.join(p) for p in preds]
print('Num predictions: ', len(predictions),predictions)

In [None]:
# Print the joined predictions
print(predictions)

Save the predictions to a file:

In [None]:
# `pred_str` and `outputfile_path` were never defined in this notebook, so this cell
# raised a NameError. The beam-search predictions are written out instead.
# NOTE(review): placeholder location - point this at the desired output file.
outputfile_path = 'predictions.csv'

final_df = pd.DataFrame({'name':pred_str_bs})
final_df.to_csv(outputfile_path, index=False)
print('Output Files generated for review')