## Installing the Transformers library

In [1]:
#Installing transformer package 
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Rating Estimation by ChatGPT

In [None]:
!pip install openai

In [None]:
import os
from getpass import getpass

import openai

# Never store the secret in the notebook. Read it from the environment
# (standard variable name first, then the name this notebook used before) and
# prompt interactively only when neither variable is set. The previous version
# overwrote the environment variable with an empty string, so the key was
# always empty.
api_key = os.environ.get("OPENAI_API_KEY") or os.environ.get("OpenAI_API_Key")
if not api_key:
  api_key = getpass("OpenAI API key: ")
openai.api_key = api_key

In [None]:
# Rating-based term that is added to the cross-entropy in the custom loss
import re

def _parse_rating(reply_text):
  """Return the first integer found in `reply_text`, clamped to the 0..10 range."""
  match = re.search(r"-?\d+", reply_text)
  if match is None:
    raise ValueError(f"No numeric rating found in ChatGPT reply: {reply_text!r}")
  return min(max(int(match.group()), 0), 10)

def get_chatgpt_rating(prompt, sample):
  """Ask ChatGPT to rate `sample` and convert the rating into a loss value.

  Args:
    prompt: Instruction text sent before the sample.
    sample: Text to be rated (a generated story summary).

  Returns:
    int: `10 - rating`, where `rating` is the first integer in the reply,
    clamped to 0..10. A higher rating therefore gives a lower loss.
    NOTE(review): this assumes the prompt asks for "higher is better"; confirm
    against the prompt that is actually passed in.

  Raises:
    ValueError: If the reply contains no integer.
  """
  completion = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": f"{prompt} {sample}"}],
  )
  # The reply text lives in `message.content`; `message` itself is an object
  # and cannot be passed to int().
  reply_text = str(completion.choices[0].message.content or "")
  return 10 - _parse_rating(reply_text)

In [None]:
# Candidate instruction prompts that were tried for eliciting a numeric rating.
# Each one is meant to be followed by the summary text to be rated.
# NOTE(review): prompt1-prompt3 state that 0 is the best and 10 the worst,
# while prompt4 and prompt5 do not say which end of the scale is better. Any
# code that turns the rating into a loss should be checked against the prompt
# it is used with.
prompt1 = "Provide me rating between 0 and 10 (without any explanation), where 0 is the best and 10 is the worst, for the following story summary: " 
prompt2 = "Assign a rating between 0 (best) and 10 (worst) to the given artificial story summary (only give rating as the response):"
prompt3 = "Assign a rating between 0 (best) and 10 (worst) to the given artificial story summary (only give rating as the response). The rating should be based on writing style, coherence and capture strength. Summary:"
prompt4 = "Assign a rating between 0 and 10 to the given artificial story summary. Only give rating as the response (no reasoning). The rating should be based on writing style, coherence, and capture strength. Summary:" # Marked "Best" by the author
prompt5 = "Provide me rating between 0 and 10 (without any explanation),  for the following story summary:" 

## Data Tokenization/Encoding

In [2]:
# Load the pretrained T5 tokenizer and seq2seq model from one checkpoint name.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Other T5 checkpoints (T5-large, T5-3B, T5-11B) can be tried by changing this.
T5_CHECKPOINT = "t5-base"
base_tokenizer = AutoTokenizer.from_pretrained(T5_CHECKPOINT)
base_model = AutoModelForSeq2SeqLM.from_pretrained(T5_CHECKPOINT)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [3]:
# Encoding the sequences
def encode_sequences(x, base_tokenizer = base_tokenizer):
  """Tokenize one dataset row into fixed-length model inputs and labels.

  Args:
    x: Row-like mapping (e.g. a DataFrame row) with an 'Input' entry (the
      story aspects the output is conditioned on) and a 'Summary' entry (the
      target text).
    base_tokenizer: Tokenizer applied to both texts; defaults to the
      notebook-level tokenizer.

  Returns:
    The tokenizer output for the input text, with an added 'labels' entry
    holding the target token ids. Both sequences are truncated/padded to 512
    tokens, and padding positions in 'labels' are replaced by -100.
  """
  # Max length of the input sequence in T5 is 512 tokens
  # (BART could be used for longer sequences - 1024 max length limit)
  max_tokens = 512
  # -100 is the default ignore_index of torch's CrossEntropyLoss, so padded
  # label positions do not contribute to the loss.
  ignore_index = -100

  def _as_text(value):
    # Missing values (None / float NaN) become an empty string instead of the
    # literal text "None" / "nan".
    if value is None or (isinstance(value, float) and value != value):
      return ''
    return str(value)

  try:
    source_text = _as_text(x['Input'])
    target_text = _as_text(x['Summary'])
  except (KeyError, IndexError, TypeError):
    # Rows without the expected fields are encoded as empty input/label so the
    # model also sees empty inputs (a type of adversarial input).
    source_text, target_text = '', ''

  model_input = base_tokenizer(source_text, max_length=max_tokens, truncation=True, padding='max_length')
  label_ids = base_tokenizer(target_text, max_length=max_tokens, truncation=True, padding='max_length')['input_ids']
  pad_id = base_tokenizer.pad_token_id
  model_input['labels'] = [ignore_index if token_id == pad_id else token_id for token_id in label_ids]
  return model_input

In [4]:
# Read the prepared story dataset from Google Drive into a DataFrame.
import pandas as pd

DATASET_PATH = '/content/drive/MyDrive/Visual Story Telling/Dataset - Story Generation/Movies/M_Dataset_0_2K'
df = pd.read_csv(DATASET_PATH)

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for evaluation. The fixed seed makes the split
# identical on every Restart & Run All, so results are comparable across runs.
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

In [6]:
# Drop the original row labels of both splits and renumber the rows from 0.
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [7]:
# Tokenizing the dataset: every row of each split is passed through
# encode_sequences (training split first, then the test split).
train_df, test_df = (
    split.apply(encode_sequences, axis=1) for split in (train_df, test_df)
)

## T5 Training Setup - Custom Loss Function

In [8]:
# Build the collator that assembles encoded examples into PyTorch batches.
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=base_tokenizer, return_tensors="pt")

In [10]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# Path where model training loss and intermediate weights will be stored
model_path = '/content/drive/MyDrive/Visual Story Telling/Story_Gen_Model'

# Specifying the training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir=model_path,
    per_device_train_batch_size=4,
    overwrite_output_dir=True,
    evaluation_strategy="no",
    # 4 examples per step x 8 accumulated steps = 32 examples per optimizer
    # update on each device.
    gradient_accumulation_steps=8,
    num_train_epochs=15,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,  # the missing comma here used to be a SyntaxError
    fp16=True,
)

In [18]:
# Overwrite the Trainer API for utilizing custom loss function 
import torch.nn as nn
class CustomSeq2SeqTrainer(Seq2SeqTrainer):
    """Seq2SeqTrainer whose loss is computed by `custom_loss_function`."""

    def compute_loss(self, model, inputs, tokenizer=None, return_outputs=False, **kwargs):
        """Run the model on a batch and return the custom loss.

        Args:
            model: Model being trained.
            inputs: Batch produced by the data collator; must contain "labels".
            tokenizer: Tokenizer used to decode predictions. The Trainer never
                passes it, so when omitted it falls back to the tokenizer
                attached to the trainer, then to the notebook-level
                `base_tokenizer`.
            return_outputs: When true, return `(loss, outputs)` instead of
                just `loss`.
            **kwargs: Extra keyword arguments some Trainer versions pass
                (e.g. `num_items_in_batch`); accepted and ignored.

        Returns:
            The loss, or `(loss, outputs)` when `return_outputs` is true.
        """
        if tokenizer is None:
            tokenizer = getattr(self, "processing_class", None)
        if tokenizer is None:
            tokenizer = getattr(self, "tokenizer", None)
        if tokenizer is None:
            tokenizer = base_tokenizer

        # Keep the labels in the batch: the seq2seq model derives its decoder
        # inputs from them. Popping them left the decoder without any input.
        labels = inputs["labels"]
        outputs = model(**inputs)
        loss = custom_loss_function(outputs.logits, labels, tokenizer)
        return (loss, outputs) if return_outputs else loss

# Cross-entropy plus the ChatGPT-rating term
def custom_loss_function(logits, labels, tokenizer):
    """Combine token-level cross-entropy with the ChatGPT rating loss.

    Args:
        logits: Model scores whose last dimension is the vocabulary; either
            (batch, seq_len, vocab) or (seq_len, vocab).
        labels: Target token ids matching the leading dimensions of `logits`.
            Positions equal to -100 are ignored by the cross-entropy.
        tokenizer: Tokenizer providing `batch_decode` for the predicted ids.

    Returns:
        Scalar tensor: cross-entropy plus the mean ChatGPT loss over the batch.

    Note:
        The rating term is a plain Python number, so it shifts the loss value
        but contributes no gradient. It also costs one ChatGPT request per
        sequence in the batch on every call.
    """
    # CrossEntropyLoss expects (N, classes) scores and (N,) targets, so merge
    # the batch and sequence dimensions.
    vocab_size = logits.size(-1)
    loss_fct = nn.CrossEntropyLoss()
    loss_cross_entropy = loss_fct(logits.reshape(-1, vocab_size), labels.reshape(-1))

    # Decode the most likely token at each position; the raw logits are float
    # scores and cannot be decoded directly.
    predicted_ids = logits.detach().argmax(dim=-1)
    if predicted_ids.dim() == 1:
        predicted_ids = predicted_ids.unsqueeze(0)
    generated_summaries = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)

    ratings = [get_chatgpt_rating(prompt4, summary) for summary in generated_summaries]
    loss_GPT = sum(ratings) / len(ratings) if ratings else 0.0
    return loss_cross_entropy + loss_GPT


In [20]:
# Build the trainer from the model, the training arguments, the batch collator
# and the encoded train / evaluation splits.
trainer = CustomSeq2SeqTrainer(
    model=base_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_df,
    eval_dataset=test_df,
)

## Model Training 

In [None]:
# Run the fine-tuning loop on the trainer configured above.
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# Save the final model.
# NOTE(review): no path is passed, so this presumably writes to the trainer's
# configured output directory - confirm.
trainer.save_model()

## Model Evaluation

In [None]:
# Evaluate on the eval_dataset given to the trainer. The Trainer method is
# `evaluate()`; `eval()` does not exist on Trainer and raised AttributeError.
trainer.evaluate()

In [None]:
# TODO: compute the BLEU score of the model on the test dataset.
# Not implemented yet; the string below only lists reference material.

"""
Refer:

1) https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt
2) https://huggingface.co/docs/evaluate/choosing_a_metric

"""