## Importing Necessary Libraries

In [1]:
import pandas as pd
import numpy as np 
import random
import torch
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, AutoModelForCausalLM, TrainingArguments, Trainer
from tqdm import tqdm, trange
import math

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os

# Alternative allocator tweak, kept for reference but disabled:
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

# Export the flag PyTorch documents for turning off its CUDA caching
# allocator; it is set early in the notebook, ahead of the training cells.
os.environ.update({"PYTORCH_NO_CUDA_MEMORY_CACHING": "1"})

## Model and Tokenizer Initialization

In [2]:
# Fetch the pretrained DistilGPT2 tokenizer and causal-LM weights by name.
base_tokenizer = AutoTokenizer.from_pretrained('distilgpt2')
base_model = AutoModelForCausalLM.from_pretrained("distilgpt2")

## Data Preprocessing

In [3]:
def combine(x, base_tokenizer=base_tokenizer):
    """Build one training string from a dataset row.

    Args:
        x: A row-like object exposing string values under the keys
            ``"Input"`` and ``"Summary"``.
        base_tokenizer: Accepted for signature compatibility; the body
            does not use it.

    Returns:
        The ``"Input"`` text, a space, the literal ``"Summary: "`` and the
        ``"Summary"`` text, concatenated in that order.
    """
    prompt_text = x["Input"]
    summary_text = x['Summary']
    return "".join((prompt_text, " ", "Summary: ", summary_text))

In [27]:
import pandas as pd

# Raw dataset; the following cells read its "Input" and "Summary" columns.
df = pd.read_csv(filepath_or_buffer="Plot_Summary_Dataset")

In [28]:
# Replace "Input" with the combined prompt + summary text, keep only that
# column, and persist it (header "Input", no index column).
df = df.assign(Input=df.apply(combine, axis=1))['Input']
df.to_csv('Plot_Summary_DistilGPT2', index=False)

## Data Split

In [13]:
from sklearn.model_selection import train_test_split

# Reload the combined examples and hold out 10% of them as a test split;
# the fixed random_state keeps the split the same across runs.
df = pd.read_csv(filepath_or_buffer="Plot_Summary_DistilGPT2")
train_df, test_df = train_test_split(df, random_state=1, test_size=0.1)

In [14]:
# Write each split to its own CSV file, without the index column.
for split_frame, split_path in ((train_df, 'P_S_D_Train'), (test_df, 'P_S_D_Test')):
    split_frame.to_csv(split_path, index=False)

## Data Tokenization

In [3]:
def encode_sequences(x, base_tokenizer = base_tokenizer):
    """Tokenize ``x`` and return only its ``input_ids``.

    Args:
        x: Text passed straight to the tokenizer.
        base_tokenizer: Tokenizer callable; defaults to the notebook-level
            ``base_tokenizer``.

    Returns:
        The ``'input_ids'`` entry of the tokenizer output, truncated to at
        most 1024 tokens.
    """
    # Max length of the input sequence in DistilGPT2 is 1024 tokens
    encoding = base_tokenizer(
        x,
        truncation=True,
        max_length=1024,
        add_special_tokens=True,
    )
    return encoding['input_ids']

In [4]:
import pandas as pd

# Read back the train / test CSV files written by the split step.
train_df = pd.read_csv(filepath_or_buffer="P_S_D_Train")
test_df = pd.read_csv(filepath_or_buffer="P_S_D_Test")

In [5]:
# Tokenize every training example; train_df becomes a Series of token-id lists.
train_df = train_df.loc[:, 'Input'].apply(encode_sequences)

In [6]:
# Tokenize every held-out example; test_df becomes a Series of token-id lists.
test_df = test_df.loc[:, 'Input'].apply(encode_sequences)

In [7]:
# Drop missing entries from both tokenized splits, modifying them in place.
for tokenized_split in (train_df, test_df):
    tokenized_split.dropna(inplace=True)

In [8]:
# Renumber both splits from 0 in place, discarding the old index labels.
for tokenized_split in (train_df, test_df):
    tokenized_split.reset_index(inplace=True, drop=True)

## Model Training

In [20]:
# Reuse the end-of-sequence token as the padding token.
base_tokenizer.pad_token = base_tokenizer.eos_token

# The collator assembles batches for training and evaluation as PyTorch
# tensors; mlm=False turns masked-language-model masking off.
data_collator = DataCollatorForLanguageModeling(
    mlm=False,
    return_tensors="pt",
    tokenizer=base_tokenizer,
)

In [21]:
# Directory that receives checkpoints and the final model.
model_path = 'Story_Gen_Model/DistilGPT2'

# Training configuration, grouped by concern.
training_args = TrainingArguments(
    # Output handling
    output_dir=model_path,
    overwrite_output_dir=True,
    # Schedule
    num_train_epochs=30,
    gradient_accumulation_steps=8,
    # Optimisation
    learning_rate=5e-4,
    lr_scheduler_type="cosine",
    weight_decay=0.01,
    # Mixed precision
    fp16=True,
    # No evaluation during training; it is run explicitly afterwards.
    evaluation_strategy="no",
)

In [22]:
# Wire the model, configuration, collator and tokenized splits into a Trainer.
trainer = Trainer(
    args=training_args,            # training arguments, defined above
    model=base_model,              # the instantiated Transformers model to be trained
    train_dataset=train_df,        # training dataset
    eval_dataset=test_df,          # held-out dataset used by trainer.evaluate()
    data_collator=data_collator,
)

In [10]:
import gc
# Run a garbage-collection pass first so unreachable objects are freed ...
gc.collect()
# ... then ask PyTorch to empty its CUDA cache.
torch.cuda.empty_cache()

In [11]:
# Starting the training
import os

# Resume from the saved checkpoint when it exists on disk. On a clean machine
# the directory is missing and passing its path would make train() fail, so
# fall back to a fresh run and say so.
resume_checkpoint = "Story_Gen_Model/DistilGPT2/checkpoint-8000"
if not os.path.isdir(resume_checkpoint):
    print(f"Checkpoint {resume_checkpoint!r} not found; training from scratch.")
    resume_checkpoint = None

trainer.train(resume_from_checkpoint=resume_checkpoint)

You are resuming training from a checkpoint trained with 4.27.2 of Transformers but your current version is 4.29.2. This is not recommended and could yield to errors or unwanted behaviors.


Step,Training Loss
8500,2.2158
9000,2.2068
9500,2.171
10000,2.1411
10500,2.1094
11000,2.0812
11500,2.0546
12000,2.0342
12500,2.0144
13000,1.9923


TrainOutput(global_step=16290, training_loss=1.0405259387049666, metrics={'train_runtime': 56691.3036, 'train_samples_per_second': 18.392, 'train_steps_per_second': 0.287, 'total_flos': 1.5087687350530867e+17, 'train_loss': 1.0405259387049666, 'epoch': 30.0})

In [12]:
# Persist the final model into the trainer's configured output directory.
trainer.save_model(trainer.args.output_dir)

## Model Evaluation (Perplexity)

In [23]:
# Evaluate on the trainer's eval_dataset; metric keys carry the "eval" prefix.
eval_results = trainer.evaluate(metric_key_prefix="eval")

In [24]:
# Display the metrics dictionary returned by trainer.evaluate().
eval_results

{'eval_loss': 3.0107593536376953,
 'eval_runtime': 18.2229,
 'eval_samples_per_second': 211.931,
 'eval_steps_per_second': 26.505}

In [25]:
# Perplexity is the exponential of the evaluation loss.
perplexity = math.exp(eval_results['eval_loss'])
print(f"Perplexity: {perplexity:.2f}")

Perplexity: 20.30


## Loading and Testing 

In [26]:
# Reload the fine-tuned weights from the training output directory; the
# tokenizer is loaded from the stock 'distilgpt2' name.
base_tokenizer = AutoTokenizer.from_pretrained('distilgpt2')
base_model = AutoModelForCausalLM.from_pretrained("Story_Gen_Model/DistilGPT2")

In [27]:
# Use the end-of-sequence token as the padding token on the reloaded tokenizer.
base_tokenizer.pad_token = base_tokenizer.eos_token

In [28]:
"""
Format of the input
Title: ADD_TITLE. Genre: ADD_GENRES. Characters: ADD_CHARS. Relations: Neutral: <>. Positive: <>. Negative: <>.  Plot: ADD_PLOT
"""

from transformers import pipeline

# Named `prompt` (not `input`) so the built-in input() is not shadowed for the
# rest of the kernel session.
prompt = "Title: Saiyan Warrior. Genre: Action, Adventure, Drama. Characters: Goku, Vegeta, Beerus. Relations: Neutral: <Beerus, Vegeta>. Positive: <Goku, Vegeta>. Negative: <Goku, Beerus>. Plot: Beerus is a god of destruction, who came to Earth to destroy it. Goku and Vegeta in order to protect their family fight and defeat him."
generator = pipeline("text-generation", model=base_model, tokenizer=base_tokenizer)

# Sampling configuration: nucleus sampling, no beam search.
do_sample = True
num_beams = 1
top_p = 0.9

# return_full_text=False makes the pipeline hand back only the continuation.
# That replaces slicing by len(prompt) + 1, which silently dropped a real
# character whenever the continuation did not begin with a single space.
generator(
    prompt,
    num_beams=num_beams,
    top_p=top_p,
    do_sample=do_sample,
    min_length=150,
    max_length=250,
    return_full_text=False,
)[0]['generated_text'].lstrip()

Xformers is not installed correctly. If you want to use memorry_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


"Summary: Goku is a god of darkness, destroying everything in his way. He's been raised by an alien called a Gowan. When an alien called Gowan crashes into Goku's village, Goku and his new family are forced to escape from the Gowan and into the desert, but Goku survives by transforming into a powerful being. However, it is gone for a time and it's time to warn the world of a Bulma God of the approaching disaster. Goku and Vegeta are now in the service of Bulma King, who has ordered the destruction of the entire planet. In order to protect their family Goku and Vegeta are forced to protect their village from Bulma King's evil sorceress. However, unbeknownst to them, Gowan plans to use the Gowan's gravity to transform Goku into Uranus. When they"

## BLEU Score Evaluation

*TODO: this section is empty — BLEU evaluation has not been added yet.*

## Loss Curve

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Placeholder path: point this at the JSON file holding the training log
# (the later cells expect a 'log_history' column).
df = pd.read_json(path_or_buf='Add path to the json file')

In [None]:
def get_epoch(x):
  """Return the value stored under the ``'epoch'`` key of ``x``.

  Applied element-wise to the ``log_history`` column, so ``x`` is one entry
  of that column. A dict entry without the key raises ``KeyError``.
  """
  epoch_value = x['epoch']
  return epoch_value

def get_loss(x):
  """Return the training loss recorded in one ``log_history`` entry.

  Not every entry has a ``'loss'`` key: the end-of-training summary, for
  example, reports ``train_loss`` instead. Such entries yield NaN rather
  than raising ``KeyError``, so the resulting column stays numeric and the
  plot simply skips those points.

  Args:
    x: One log entry, subscriptable by key (a dict in practice).

  Returns:
    The value under ``'loss'``, or ``float('nan')`` when the key is absent.
  """
  try:
    return x['loss']
  except KeyError:
    return float('nan')

In [None]:
# Pull the epoch and loss out of every log entry into their own columns.
log_entries = df['log_history']
df['epoch_number'] = log_entries.apply(get_epoch)
df['loss'] = log_entries.apply(get_loss)

In [None]:
# Training loss against epoch. The notebook fine-tunes DistilGPT2, so the
# title names that model (it previously said "T5").
plt.plot(df['epoch_number'].values, df['loss'].values)
plt.xlabel('#Epochs')
plt.ylabel('Loss value')
plt.title('Training curve DistilGPT2');  # trailing ';' hides the Text repr