In [3]:
import pandas as pd
from datasets import Dataset

#Tokenizer
from transformers import RobertaTokenizerFast

#Encoder-Decoder Model
from transformers import EncoderDecoderModel

#Training
# from seq2seq_trainer import Seq2SeqTrainer
from transformers import TrainingArguments, Seq2SeqTrainer, Seq2SeqTrainingArguments
from dataclasses import dataclass, field
from typing import Optional
from sklearn.model_selection import train_test_split
import evaluate

# Data processing

In [None]:
# Read the raw reviews export from the working directory and preview the first rows.
data_df = pd.read_csv(filepath_or_buffer="Reviews.csv")
data_df.head(n=3)

In [None]:
# Removing the unnecessary columns (metadata that is not used for summarization)
unused_columns = [
    'Id',
    'ProductId',
    'UserId',
    'ProfileName',
    'HelpfulnessNumerator',
    'HelpfulnessDenominator',
    'Score',
    'Time',
]
data_df = data_df.drop(columns=unused_columns)
data_df.head(3)

In [None]:
# Number of rows currently held in the frame.
print(f'size of the dataset: {data_df.shape[0]}')

In [None]:
# Boolean mask marking every missing cell
nan_values = data_df.isna()

# Count the missing cells per column
nan_counts = nan_values.sum(axis=0)
print(nan_counts)

In [6]:
# Discard every row that has at least one missing value.
data_df = data_df.dropna(axis=0, how="any")

In [8]:
summary_values = data_df['Summary'].values
text_values = data_df['Text'].values

# Character length of the longest entry in each column (0 when there are no rows).
longest_summary = max(map(len, summary_values), default=0)
longest_text = max(map(len, text_values), default=0)

print(f'longest_summary: {longest_summary}')
print(f'longest_text: {longest_text}')

longest_summary: 128
longest_text: 21409


In [9]:
# Hold out 10% of the rows as the test set.
train_val_df, test_df = train_test_split(data_df, test_size=0.1, random_state=42)
# Carve the validation set out of the remaining (non-test) rows -- NOT out of test_df --
# so that no test rows end up in the training or validation data.
# Overall proportions: ~81% train / ~9% validation / 10% test.
train_df, val_df = train_test_split(train_val_df, test_size=0.1, random_state=42)

In [10]:
# Wrap each pandas split in a Hugging Face Dataset.
train_data, val_data, test_data = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df, test_df)
)

## Tokenization

In [11]:
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
# Reuse the CLS / SEP tokens as the sequence start / end markers.
tokenizer.bos_token, tokenizer.eos_token = tokenizer.cls_token, tokenizer.sep_token

In [12]:
# parameter setting
batch_size = 256           # used for both the dataset map() batches and the per-device train/eval batches
encoder_max_length = 40    # review text is padded/truncated to this many tokens
decoder_max_length = 8     # summary is padded/truncated to this many tokens

In [13]:
def process_data_to_model_inputs(batch):
  """Tokenize a batch of reviews and summaries into encoder/decoder model inputs.

  Args:
    batch: mapping with a "Text" list (encoder source) and a "Summary" list
      (decoder target), as passed by `Dataset.map(..., batched=True)`.

  Returns:
    The same mapping with "input_ids", "attention_mask", "decoder_input_ids",
    "decoder_attention_mask" and "labels" added.
  """
  # tokenize the inputs and labels
  inputs = tokenizer(batch["Text"], padding="max_length", truncation=True, max_length=encoder_max_length)
  outputs = tokenizer(batch["Summary"], padding="max_length", truncation=True, max_length=decoder_max_length)

  batch["input_ids"] = inputs.input_ids
  batch["attention_mask"] = inputs.attention_mask

  # Teacher forcing: the decoder input at step t must be the target token at step t-1.
  # EncoderDecoderModel (transformers >= 4.12) compares logits[t] with labels[t] WITHOUT
  # shifting, and uses explicitly supplied decoder_input_ids as-is. Feeding the unshifted
  # target ids would let the decoder see the very token it has to predict, so the inputs
  # are shifted right by one here and start with the decoder start token.
  start_token_id = tokenizer.bos_token_id
  batch["decoder_input_ids"] = [[start_token_id] + ids[:-1] for ids in outputs.input_ids]
  batch["decoder_attention_mask"] = outputs.attention_mask

  # We have to make sure that the PAD token is ignored by the loss (-100 is the ignore index).
  batch["labels"] = [
      [-100 if token == tokenizer.pad_token_id else token for token in ids]
      for ids in outputs.input_ids
  ]

  return batch


# Columns handed to the model as torch tensors.
model_columns = ["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"]


def _tokenize_split(split):
    """Tokenize one dataset split in batches and expose the model columns as torch tensors."""
    tokenized = split.map(
        process_data_to_model_inputs,
        batched=True,
        batch_size=batch_size,
        remove_columns=["Text", "Summary"],
    )
    tokenized.set_format(type="torch", columns=model_columns)
    return tokenized


#processing training data
train_data = _tokenize_split(train_data)

#processing validation data
val_data = _tokenize_split(val_data)

  0%|          | 0/445 [00:00<?, ?ba/s]

  0%|          | 0/112 [00:00<?, ?ba/s]

In [14]:
# Build a RoBERTa-to-RoBERTa encoder-decoder with tied encoder/decoder weights,
# then move it onto the GPU.
roberta_shared = EncoderDecoderModel.from_encoder_decoder_pretrained(
    "roberta-base",
    "roberta-base",
    tie_encoder_decoder=True,
)
roberta_shared = roberta_shared.to("cuda")

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForCausalLM were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.encoder.layer.2.crossattention.output.dense.bias', 'roberta.encoder.layer.0.crossattention.self.value.bias', 'roberta.encoder.layer.3.crossattention.self.value.we

In [15]:
# set special tokens
roberta_shared.config.decoder_start_token_id = tokenizer.bos_token_id
roberta_shared.config.eos_token_id = tokenizer.eos_token_id
# The combined encoder-decoder config has no pad token by default; it is needed to
# pad finished sequences during generation and to pad predictions in Seq2SeqTrainer.
roberta_shared.config.pad_token_id = tokenizer.pad_token_id

# sensible parameters for beam search
# set decoding params
roberta_shared.config.max_length = 40
roberta_shared.config.early_stopping = True
roberta_shared.config.no_repeat_ngram_size = 3
roberta_shared.config.length_penalty = 2.0
roberta_shared.config.num_beams = 4
# Keep the top-level vocab size in sync with the encoder's vocabulary.
roberta_shared.config.vocab_size = roberta_shared.config.encoder.vocab_size

In [16]:
# load rouge for validation (via `evaluate`, which is imported at the top of the notebook;
# the `datasets` module itself is never imported and `datasets.load_metric` has been removed
# from recent `datasets` releases)
rouge = evaluate.load("rouge")

def compute_metrics(pred):
    """Compute ROUGE scores between generated and reference summaries.

    Args:
        pred: prediction object exposing `predictions` (generated token ids) and
            `label_ids` (reference token ids, with -100 marking ignored positions).

    Returns:
        The result of `rouge.compute(...)` for the decoded predictions and references.
    """
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    # -100 is not a valid token id and cannot be decoded; map it back to the pad token.
    # Labels always carry -100 on padded positions; predictions may carry it as well
    # when batches are padded while being gathered.
    pred_ids[pred_ids == -100] = tokenizer.pad_token_id
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id

    # all unnecessary tokens are removed
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    return rouge.compute(predictions=pred_str, references=label_str)

In [None]:
!pip uninstall -y transformers accelerate
!pip install transformers accelerate

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="model",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # compute_metrics decodes token ids, so evaluation has to run generate()
    # instead of returning raw logits.
    predict_with_generate=True,
    do_train=True,
    do_eval=True,
    logging_steps=2, # Number of update steps between two logs
    save_steps=16, # Number of update steps between two checkpoint saves
    # Number of update steps between two evaluations.
    # NOTE(review): this only takes effect with a step-based evaluation strategy, which is
    # not configured here -- confirm whether periodic evaluation during training is wanted.
    eval_steps=500,
    warmup_steps=500, # number of steps used for a linear warmup from 0 to `learning_rate`. Overrides any effect of `warmup_ratio`
    overwrite_output_dir=True,
    save_total_limit=1, # Deletes the older checkpoints in `output_dir`
    fp16=True, # Whether to use fp16 16-bit (mixed) precision training instead of 32-bit training.
)

# instantiate trainer
trainer = Seq2SeqTrainer(
    model=roberta_shared,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=val_data,
)
trainer.train()