## Installing Transformers library

In [None]:
#Installing transformer package 
!pip install transformers

## Rating Estimation by ChatGPT

In [None]:
# Rating-based term to be added in the custom loss
def get_chatgpt_rating(prompt, sample):
  """Ask gpt-3.5-turbo to rate `sample` and return ``10 - rating``.

  Args:
    prompt: Instruction text placed before the sample in the user message.
    sample: Text to be rated; it is appended to `prompt` after a single space.

  Returns:
    int: ``10 - r`` where ``r`` is the first integer found in the model's reply.

  Raises:
    ValueError: If the reply does not contain any integer.

  NOTE(review): `openai` is not imported in the visible cells -- confirm it is
  imported (and an API key configured) before this function is called.
  """
  import re  # local import: only needed to parse the reply text

  completion = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
      {"role": "user",
       "content": f"{prompt} {sample}"}
    ])
  # The reply text lives in `message.content`; the message object itself
  # cannot be converted with int().
  reply = completion.choices[0].message.content
  # Tolerate whitespace or short wrappers such as "Rating: 8" or "8/10".
  match = re.search(r"\d+", reply)
  if match is None:
    raise ValueError(f"No numeric rating found in model reply: {reply!r}")
  return 10 - int(match.group())

In [None]:
# Candidate prompts tried while looking for one that yields a sensible rating.
# Each literal is split across lines with implicit concatenation; the resulting
# strings are unchanged.
prompt1 = (
    "Provide me rating between 0 and 10 (without any explanation), "
    "where 0 is the best and 10 is the worst, "
    "for the following story summary: "
)
prompt2 = (
    "Assign a rating between 0 (best) and 10 (worst) "
    "to the given artificial story summary "
    "(only give rating as the response):"
)
prompt3 = (
    "Assign a rating between 0 (best) and 10 (worst) "
    "to the given artificial story summary "
    "(only give rating as the response). "
    "The rating should be based on writing style, coherence and capture strength. "
    "Summary:"
)
# Best
prompt4 = (
    "Assign a rating between 0 and 10 to the given artificial story summary. "
    "Only give rating as the response (no reasoning). "
    "The rating should be based on writing style, coherence, and capture strength. "
    "Summary:"
)
prompt5 = (
    "Provide me rating between 0 and 10 (without any explanation),  "
    "for the following story summary:"
)

## Data Tokenization/Encoding

In [None]:
# Loading the pretrained T5 model and its tokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Single place to switch checkpoints, so tokenizer and model always match.
# Other T5 sizes can be tried here, e.g. t5-small, t5-large, t5-3b, t5-11b.
MODEL_CHECKPOINT = 't5-base'
base_tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
base_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

In [None]:
# Encoding the sequences
def encode_sequences(x, base_tokenizer = base_tokenizer):
  """Tokenize one record's 'Input' and 'Summary' fields for seq2seq training.

  Args:
    x: Mapping-like record (e.g. a DataFrame row) with the keys 'Input'
      (the story aspects the output is conditioned on) and 'Summary'
      (the target text).
    base_tokenizer: Tokenizer used for both fields; defaults to the
      module-level tokenizer.

  Returns:
    The tokenizer output for the input text, with an extra 'labels' entry
    holding the target token ids. Both are padded/truncated to 512 tokens.
    Padding positions in 'labels' are replaced by -100.

  If the record cannot be read or encoded, a pair of empty strings is encoded
  instead, so an empty/malformed record never aborts the whole encoding pass.
  """
  def _as_text(value):
    # None / NaN would otherwise be encoded as the literal text 'None' / 'nan'.
    if value is None or (isinstance(value, float) and value != value):
      return ''
    return str(value)

  def _encode(source_text, target_text):
    # Max length of the input sequence in T5 is 512 tokens
    # (BART could be used for longer sequences - 1024 max length limit)
    model_input = base_tokenizer(source_text, max_length = 512, truncation=True, padding='max_length')
    label_ids = base_tokenizer(target_text, max_length = 512, truncation=True, padding='max_length')['input_ids']
    # Mask label padding with -100 so padded positions are ignored by the loss.
    pad_id = getattr(base_tokenizer, 'pad_token_id', None)
    if pad_id is not None:
      label_ids = [-100 if token_id == pad_id else token_id for token_id in label_ids]
    model_input['labels'] = label_ids
    return model_input

  try:
    return _encode(_as_text(x['Input']), _as_text(x['Summary']))
  except Exception:
    # Deliberate best-effort fallback: this also makes the pipeline robust to
    # empty inputs (a type of adversarial input). `Exception` rather than a
    # bare except, so KeyboardInterrupt/SystemExit still propagate.
    return _encode('', '')

In [None]:
# Loading the final dataset
import pandas as pd
# TODO: set this to the dataset CSV before running; an empty path makes read_csv fail.
# The encoding step reads the 'Input' and 'Summary' columns of this file.
DATASET_CSV_PATH = ''
df = pd.read_csv(DATASET_CSV_PATH)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the rows for evaluation. A fixed seed keeps the split
# identical across kernel restarts, so results are reproducible.
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

In [None]:
# Tokenizing the dataset
# encode_sequences expects a single record, so it is called once per row.
# The results are kept in plain lists so that examples can be fetched by
# position (0 .. len-1) regardless of the shuffled DataFrame index.
train_df = [encode_sequences(row) for _, row in train_df.iterrows()]
test_df = [encode_sequences(row) for _, row in test_df.iterrows()]

## T5 Training Setup (PyTorch) - Custom Loss Function

In [None]:
# Edit code
import torch

class IMDbDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with per-example labels.

    `encodings` maps each field name to a sequence indexed by example;
    `labels` is a sequence with one entry per example.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # The number of examples is taken from the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Pull example `idx` out of every encoding field and turn it into a tensor.
        example = {}
        for field, values in self.encodings.items():
            example[field] = torch.tensor(values[idx])
        example['labels'] = torch.tensor(self.labels[idx])
        return example

# Wrap each split's encodings and labels in a dataset object.
# NOTE(review): the *_encodings / *_labels names are not defined in the visible
# cells -- confirm they are created before this cell runs.
train_dataset = IMDbDataset(encodings=train_encodings, labels=train_labels)
val_dataset = IMDbDataset(encodings=val_encodings, labels=val_labels)
test_dataset = IMDbDataset(encodings=test_encodings, labels=test_labels)

In [None]:
# Edit code
# NOTE(review): this is still a template loop -- it fine-tunes DistilBERT for
# sequence classification, not the T5 model loaded earlier in the notebook.
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification

# Train on the GPU when one is available, otherwise on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# torch.optim.AdamW replaces the deprecated transformers.AdamW.
# weight_decay=0.0 is passed explicitly to keep the previous default
# (torch's own default is 0.01).
optim = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.0)

for epoch in range(3):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        # With labels supplied, the first element of the output is the loss
        loss = outputs[0]
        loss.backward()
        optim.step()

# Switch to inference mode once training is finished
model.eval()

## T5 Training Setup (Trainer) - Custom Loss Function

In [None]:
# Data collator used to batch the encoded examples; batches are returned as PyTorch tensors
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer=base_tokenizer, return_tensors="pt")

In [None]:
import torch
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# Path where model training loss and intermediate weights will be stored
# (plain string: there is nothing to interpolate, so no f-prefix is needed)
model_path = '/content/drive/MyDrive/Visual Story Telling/Story_Gen_Model'

# Specifying the training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir=model_path,
    per_device_train_batch_size=4,
    overwrite_output_dir = True,
    evaluation_strategy="no",
    # 4 examples per step x 8 accumulated steps = 32 examples per optimizer update (per device)
    gradient_accumulation_steps=8,
    num_train_epochs=15,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    # Mixed precision needs a CUDA device; enable it only when one is present
    # so the cell also runs on a CPU-only runtime.
    fp16=torch.cuda.is_available()
)

In [None]:
# Build the Seq2SeqTrainer from the model, its arguments, the collator and both data splits
trainer = Seq2SeqTrainer(
    model=base_model,             # Transformers model instance that will be trained
    args=training_args,           # training arguments configured earlier
    data_collator=data_collator,  # batches the encoded examples
    train_dataset=train_df,       # training split
    eval_dataset=test_df,         # held-out split
)

In [None]:
# Overwrite the Trainer API for utilizing custom loss function
# Edit Trainer Functionality
from transformers import Trainer

class MyTrainer(Trainer):
  """Trainer whose loss is computed by `my_custom_loss(logits, labels)`.

  The constructor is inherited from `Trainer` unchanged (the previous empty
  `__init__` stub was a syntax error and accepted no arguments).

  NOTE(review): `my_custom_loss` is not defined in the visible cells -- it
  must exist before training starts.
  """

  def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
      """Compute the custom loss for one batch.

      Args:
        model: Model being trained.
        inputs: Batch dictionary; its "labels" entry is removed here and
          passed to the custom loss instead of to the model.
        return_outputs: When True, return ``(loss, outputs)`` instead of the
          loss alone.
        **kwargs: Extra keyword arguments accepted for compatibility with the
          caller and otherwise ignored.

      Returns:
        The loss, or ``(loss, outputs)`` when `return_outputs` is True.
      """
      labels = inputs.pop("labels")
      # With the labels removed, a seq2seq model has nothing to derive its
      # decoder inputs from; build them from the labels when the model
      # offers the helper and the batch does not already carry them.
      if "decoder_input_ids" not in inputs and hasattr(model, "prepare_decoder_input_ids_from_labels"):
          inputs["decoder_input_ids"] = model.prepare_decoder_input_ids_from_labels(labels=labels)
      outputs = model(**inputs)
      # No labels were passed to the model, so the first output is taken as the logits
      logits = outputs[0]
      loss = my_custom_loss(logits, labels)
      return (loss, outputs) if return_outputs else loss

## Model Training 

In [None]:
# Starting the training
# NOTE(review): `trainer` is the Seq2SeqTrainer instance created earlier; the
# MyTrainer subclass is never instantiated in the visible cells, so its custom
# loss is not used by this call.
trainer.train()

In [None]:
# Saving the final model
# Called without arguments, so no explicit output directory is given here
# (presumably it falls back to the output_dir set in training_args -- confirm).
trainer.save_model()