## Importing Necessary Libraries

In [None]:
import pandas as pd
import numpy as np 
import random
import torch
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, AutoModelForCausalLM, TrainingArguments, Trainer
from tqdm import tqdm, trange
import math

## Model and Tokenizer Initialization

In [None]:
# Load the tokenizer and the pretrained causal-LM weights for the same
# `distilgpt2` checkpoint; both names are reused by the cells below.
base_tokenizer = AutoTokenizer.from_pretrained('distilgpt2')
base_model = AutoModelForCausalLM.from_pretrained("distilgpt2")

## Data Preprocessing

In [None]:
def combine(x, base_tokenizer=base_tokenizer):
    """Build one training string from a dataset row.

    Args:
        x: Row-like object indexable by the keys ``"Input"`` and ``'Summary'``.
        base_tokenizer: Not used by this function; kept so the signature
            stays the same for existing callers.

    Returns:
        The row's input text, a space, the literal marker ``Summary: `` and
        the row's summary text, concatenated in that order.
    """
    prompt_text = x["Input"]
    summary_text = x['Summary']
    return prompt_text + " " + "Summary: " + summary_text

In [None]:
def encode_sequences(x, base_tokenizer = base_tokenizer):
    """Tokenize one text and return only its token ids.

    Args:
        x: Text to tokenize.
        base_tokenizer: Tokenizer callable; defaults to the notebook-level
            ``base_tokenizer``.

    Returns:
        The ``'input_ids'`` entry of the tokenizer output.
    """
    # Sequences are truncated to 1024 tokens, the max input length of DistilGPT2.
    encoded = base_tokenizer(
        x,
        truncation=True,
        max_length=1024,
        add_special_tokens=True,
    )
    return encoded['input_ids']

In [None]:
import pandas as pd
# Load the raw dataset from a path relative to the working directory.
# The following cells read its 'Input' and 'Summary' columns.
df = pd.read_csv('Plot_Summary_Dataset')

In [None]:
# Overwrite the 'Input' column with the combined "<input> Summary: <summary>" text
# produced row by row by `combine`.
df['Input'] = df.apply(combine, axis=1)
# Keep only that column. NOTE: from here on `df` is a pandas Series, not a DataFrame.
df = df['Input']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the examples for evaluation; the fixed random_state keeps
# the split reproducible across runs.
train_df, test_df = train_test_split(df, test_size=0.1, random_state=1)

In [None]:
# Tokenize every example of both splits into a list of token ids.
# `train_df` and `test_df` are already Series of combined text (the 'Input'
# column was selected before the split), so `encode_sequences` is applied to
# them directly; indexing them with ['Input'] would be a label lookup on the
# Series index and raise a KeyError.
train_df = train_df.apply(encode_sequences)
test_df = test_df.apply(encode_sequences)

In [None]:
# Remove missing entries from both splits, modifying each Series in place.
for split_series in (train_df, test_df):
    split_series.dropna(inplace=True)

In [None]:
# Renumber both splits 0..n-1 in place, discarding the old index labels, so
# the examples can be fetched by integer position.
for split_series in (train_df, test_df):
    split_series.reset_index(drop=True, inplace=True)

## Model Training

In [None]:
# The tokenizer needs a pad token before batches can be padded; reuse the
# end-of-sequence token for that purpose.
base_tokenizer.pad_token = base_tokenizer.eos_token

# The collator assembles batches as PyTorch tensors for training and evaluation.
# mlm=False selects causal (next-token) language modeling instead of masked
# language modeling.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=base_tokenizer,
    mlm=False,
    return_tensors="pt",
)

In [None]:
# Directory that receives checkpoints and the final model.
model_path = 'Story_Gen_Model/DistilGPT2'

# Hyperparameters for the fine-tuning run.
training_args = TrainingArguments(
    # Output location; existing contents may be overwritten.
    output_dir=model_path,
    overwrite_output_dir=True,
    # No evaluation during training; it is triggered manually afterwards.
    evaluation_strategy="no",
    # 8 sequences per device x 8 accumulation steps = 64 sequences per
    # optimizer step on each device.
    per_device_train_batch_size=8,
    gradient_accumulation_steps=8,
    num_train_epochs=15,
    # Optimizer and learning-rate schedule.
    learning_rate=5e-4,
    lr_scheduler_type="cosine",
    weight_decay=0.01,
    # Mixed-precision training.
    fp16=True,
)

In [None]:
# Initializing the trainer.
# The plain `Trainer` (imported at the top of the notebook) is the right class
# for a causal language model; `Seq2SeqTrainer` is never imported here and
# targets encoder-decoder models.
trainer = Trainer(
    model=base_model,               # the instantiated Transformers model to be trained
    args=training_args,             # training arguments, defined above
    data_collator=data_collator,    # pads token-id lists and builds the labels
    train_dataset=train_df,         # training dataset (Series of token-id lists)
    eval_dataset=test_df,           # held-out dataset used by trainer.evaluate()
)

In [None]:
# Starting the training: runs the fine-tuning loop configured by `training_args`
# on `train_df`.
trainer.train()

In [None]:
# Saving the final model. No path is given, so the trainer's configured
# output directory (`model_path`) is used.
trainer.save_model()

## Model Evaluation (Perplexity)

In [None]:
# Evaluate on the held-out split and report perplexity alongside the loss.
eval_results = trainer.evaluate()
# Perplexity is the exponential of the mean cross-entropy loss.
eval_results["perplexity"] = math.exp(eval_results["eval_loss"])
eval_results

## Loading and Testing 

In [None]:
# Reload the fine-tuned weights. The string below is a placeholder: replace it
# with the directory the model was saved to (e.g. the `model_path` used above).
base_model = AutoModelForCausalLM.from_pretrained("add path to trained model")
# The tokenizer is reloaded from the original `distilgpt2` checkpoint.
base_tokenizer = AutoTokenizer.from_pretrained('distilgpt2')

In [None]:
# The freshly loaded tokenizer has no pad token; reuse the end-of-sequence
# token. The tokenizer in this notebook is `base_tokenizer`
# (`baseline_tokenizer` is not defined anywhere).
base_tokenizer.pad_token = base_tokenizer.eos_token

In [None]:
from transformers import pipeline

# Text the model should continue; fill in before running. Named `prompt`
# rather than `input` so the builtin `input()` is not shadowed.
prompt = ""
generator = pipeline("text-generation", model=base_model, tokenizer=base_tokenizer)

# Decoding settings: do_sample=False with num_beams=1 is greedy decoding.
do_sample = False
num_beams = 1
# Nucleus-sampling threshold; only takes effect when do_sample is True.
top_p = 0.9

# Generate up to 150 tokens (prompt included) and show the first candidate's text.
generator(
    prompt,
    num_beams=num_beams,
    top_p=top_p,
    do_sample=do_sample,
    max_length=150,
)[0]['generated_text']

## BLEU Score Evaluation

## Loss Curve

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the training log. The string below is a placeholder: replace it with the
# path to a JSON file that has a 'log_history' field (presumably the
# trainer_state.json written by the Trainer — confirm).
df = pd.read_json('Add path to the json file')

In [None]:
def get_epoch(x):
    """Return the ``'epoch'`` value of one log entry.

    Args:
        x: Mapping that contains an ``'epoch'`` key.

    Returns:
        The value stored under ``'epoch'``.
    """
    epoch = x['epoch']
    return epoch

def get_loss(x):
    """Return the training loss of one log entry, or NaN if it has none.

    Not every log entry carries a ``'loss'`` key (for example evaluation
    entries or the final training summary). Those entries yield NaN instead
    of raising ``KeyError``, so the column stays numeric and the plot simply
    skips them.

    Args:
        x: Mapping describing one log entry.

    Returns:
        The value stored under ``'loss'``, or ``float('nan')`` when absent.
    """
    return x.get('loss', float('nan'))

In [None]:
# Pull the epoch and the loss out of every log entry into their own columns.
log_entries = df['log_history']
df['epoch_number'] = log_entries.apply(get_epoch)
df['loss'] = log_entries.apply(get_loss)

In [None]:
# Plot the logged training loss against the epoch it was logged at.
plt.plot(df['epoch_number'].values, df['loss'].values)
plt.xlabel('#Epochs')
plt.ylabel('Loss value')
# The model fine-tuned in this notebook is DistilGPT2, not T5.
plt.title('Training curve DistilGPT2')