### Analysing the Dataset

In [None]:
import pandas as pd

In [None]:
# Load the Shakespeare dataset from a CSV file located in the working directory.
train_file = pd.read_csv('Shakespeare_data.csv')

In [None]:
# Preview the first five rows of the raw frame.
train_file.head(5)

In [None]:
# List the distinct values of the `Play` column.
train_file['Play'].unique()

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Map every distinct `Play` value to an integer label.
encoder = LabelEncoder()
encoded_col = encoder.fit_transform(train_file['Play'])

In [None]:
# Overwrite `Play` with its integer labels.
# NOTE(review): assigning a Series aligns on the index, so this relies on
# `train_file` still carrying the default 0..n-1 index from `read_csv`.
# Re-running the encoder cell after this one would fit on the integers, not the titles.
train_file['Play'] = pd.Series(encoded_col)

In [None]:
# `PlayerLine` values for the rows whose `ActSceneLine` is present.
train_file1 = train_file.loc[train_file['ActSceneLine'].notna(), 'PlayerLine']

In [None]:
# Preview the filtered `PlayerLine` values.
train_file1.head()

We need to combine the rows that share the same `PlayerLinenumber` and belong to the same act and scene, i.e. whose `ActSceneLine` has the same leading components (the `1.1` in `1.1.x`). After combining these rows, we split the text in the `PlayerLine` column and give each sentence its own row.

In [None]:
# Keep only rows without missing values and renumber them from 0.
# `reset_index` returns a new frame, so its result has to be assigned;
# `drop=True` avoids adding the old index as an extra column.
train_file1 = train_file.dropna(axis=0).reset_index(drop=True)

In [None]:
# Remove the columns that are not used downstream.
# Rebinding (instead of `inplace=True`) together with `errors='ignore'` keeps
# this cell re-runnable: a second execution no longer raises KeyError.
train_file1 = train_file1.drop(columns=['Dataline', 'Player'], errors='ignore')

In [None]:
# Renumber the rows from 0 and discard the previous index values.
train_file1 = train_file1.reset_index(drop=True)

In [None]:
# Preview the cleaned frame.
train_file1.head()

In [None]:
# Split each `ActSceneLine` string on '.' into one column per component and
# store the first three components as `Act`, `Scene` and `Line`.
# The components are kept as strings (no numeric conversion happens here).
split_cols = train_file1['ActSceneLine'].str.split('.', expand=True)
train_file1['Act'] = split_cols[0]
train_file1['Scene'] = split_cols[1]
train_file1['Line'] = split_cols[2]

In [None]:
# The combined column is redundant once it has been split into Act/Scene/Line.
# Rebinding with `errors='ignore'` makes re-running this cell a no-op instead of a KeyError.
train_file1 = train_file1.drop(columns='ActSceneLine', errors='ignore')

In [None]:
# Preview the frame with the new Act/Scene/Line columns.
train_file1.head()

In [None]:
other_cols = ['PlayerLine']
group_cols = ['PlayerLinenumber', 'Act', 'Scene', 'Play']


def combine_rows(group):
    """Merge the `other_cols` values of one group into a single string.

    Args:
        group: DataFrame holding the rows of one group; only the columns
            listed in `other_cols` are read.

    Returns:
        str: one line per row (rows joined with a newline); within a row the
        `other_cols` values are converted to `str` and joined with a space.
    """
    return '\n'.join(
        ' '.join(map(str, row))
        for row in group[other_cols].values
    )


# Group on `group_cols` and collapse every group into one text sample.
# Selecting `other_cols` before `apply` means the function never touches the
# grouping columns (pandas deprecates passing them to `apply` and newer
# releases drop them from the group). The keys are restored from the result
# index, giving the columns `group_cols + ['combined_data']`.
result = (
    train_file1
    .groupby(group_cols)[other_cols]
    .apply(combine_rows)
    .reset_index(name='combined_data')
)

In [None]:
# Inspect one combined sample (position 80) as a sanity check.
result['combined_data'].iloc[80]

In [None]:
# Series of combined texts that is used as the training corpus below.
text_col = result['combined_data']

### Generating text

In [None]:
import torch
import numpy as np
from torch import nn
from transformers import GPT2Tokenizer, GPT2Config, GPT2Model, GPT2PreTrainedModel
from torch.optim import AdamW
from tqdm import tqdm
from torch.nn import functional as F
import matplotlib.pyplot as plt

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Terminate every sample with GPT-2's end-of-text marker.
# The Series is rebuilt from `result` rather than from `text_col` itself so
# that re-running this cell does not append the marker a second time.
text_col = result['combined_data'] + ' ' + "<|endoftext|>"

In [None]:
# Load the pretrained 'gpt2' tokenizer and register '<|pad|>' as its padding token.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', pad_token='<|pad|>')

In [None]:
class GPT2_Model(GPT2PreTrainedModel):
    """Pretrained GPT-2 backbone with a language-modelling head.

    The backbone weights come from the 'gpt2' checkpoint. Its token embedding
    is resized to the length of the 'gpt2' tokenizer extended with the
    '<|pad|>' token, and the output projection shares its weight matrix with
    that embedding so that the head starts from pretrained values instead of a
    random initialisation.

    Args:
        config: GPT-2 configuration; `config.n_embd` is the input width of the
            head and must match the hidden size of the 'gpt2' checkpoint.
    """

    def __init__(self, config):
        super().__init__(config)

        self.transformer = GPT2Model.from_pretrained('gpt2')
        # Only used to learn the vocabulary size including the added pad token.
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2', pad_token='<|pad|>')
        vocab_size = len(tokenizer)

        self.transformer.resize_token_embeddings(vocab_size)

        self.lm_head = nn.Linear(config.n_embd, vocab_size, bias=False)
        # Weight tying: the embedding matrix is (vocab_size, hidden) which is
        # exactly the layout nn.Linear expects for (out_features, in_features).
        self.lm_head.weight = self.transformer.get_input_embeddings().weight

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """Return next-token logits with a trailing vocabulary dimension."""
        x = self.transformer(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)[0]
        x = self.lm_head(x)

        return x

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Dataset of texts tokenized up front to a fixed length.

    Args:
        input_data: iterable of strings, one training sample each.
        tokenizer: callable used as
            `tokenizer(text, truncation=True, max_length=..., padding="max_length", return_tensors="pt")`
            that returns a mapping of tensors.
        gpt2_type: unused; kept so existing calls keep working.
        max_length: every sample is truncated or padded to this many tokens.

    Each item is a dict of 1-D tensors. Tokenizing a single string with
    `return_tensors="pt"` yields a leading batch axis of size 1; it is removed
    here so that a DataLoader produces `(batch, max_length)` tensors rather
    than `(batch, 1, max_length)`.
    """

    def __init__(self, input_data, tokenizer, gpt2_type="gpt2", max_length=280):
        self.texts = []
        for data in input_data:
            encoded = tokenizer(data, truncation=True, max_length=max_length,
                                padding="max_length", return_tensors="pt")
            self.texts.append({key: tensor.squeeze(0) for key, tensor in encoded.items()})

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx]

In [None]:
class CrossEntropyLossFunction(nn.Module):
    """Next-token cross-entropy for language-model logits.

    The prediction at position t is scored against the label at position t+1:
    the final time step of the logits and the first label are dropped, and the
    remaining pairs are flattened before `nn.CrossEntropyLoss` is applied.
    """

    def __init__(self):
        super().__init__()
        self.loss_fct = nn.CrossEntropyLoss()

    def forward(self, lm_logits, labels):
        """Return the scalar loss for `lm_logits` (..., T, vocab) and `labels` (..., T)."""
        vocab_size = lm_logits.size(-1)
        predictions = lm_logits[..., :-1, :].contiguous().view(-1, vocab_size)
        targets = labels[..., 1:].contiguous().view(-1)
        return self.loss_fct(predictions, targets)

In [None]:
def train(model, tokenizer, train_data, epochs, learning_rate, epsilon=1e-8, batch_size=16):
    """Fine-tune `model` on `train_data` with a next-token objective.

    Args:
        model: module called as `model(input_ids, attention_mask=..., token_type_ids=None)`
            that returns logits.
        tokenizer: tokenizer handed to `CustomDataset`.
        train_data: iterable of training texts.
        epochs: number of passes over the data.
        learning_rate: AdamW learning rate.
        epsilon: AdamW `eps`.
        batch_size: samples per optimisation step (default 16).

    Prints the average training loss after every epoch; returns nothing.
    """
    dataset = CustomDataset(train_data, tokenizer)
    train_dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

    optimizer = AdamW(model.parameters(), lr=learning_rate, eps=epsilon)
    criterion = CrossEntropyLossFunction().to(device)
    model = model.to(device)
    # Modules loaded with `from_pretrained` come back in eval mode, so switch
    # to training mode explicitly to make dropout active while fine-tuning.
    model.train()

    for epoch_i in range(epochs):
        total_train_loss = 0.0

        for train_input in tqdm(train_dataloader):
            mask = train_input['attention_mask'].to(device)
            input_id = train_input['input_ids'].to(device)

            optimizer.zero_grad()
            outputs = model(input_id, attention_mask=mask, token_type_ids=None)

            # Samples are padded to a fixed length; without masking, most of
            # the loss would come from predicting the pad token. -100 is the
            # default `ignore_index` of nn.CrossEntropyLoss.
            labels = input_id.masked_fill(mask == 0, -100)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()

        avg_train_loss = total_train_loss / len(train_dataloader)

        print(f"Epoch: {epoch_i}, Avg train loss: {np.round(avg_train_loss, 2)}")


# Hyper-parameters for the fine-tuning run.
epochs = 35
learning_rate = 1e-5
# Default GPT-2 configuration (no overrides are passed).
configuration = GPT2Config()
gpt_model = GPT2_Model(configuration).to(device)

# Fine-tune on the combined texts; progress and per-epoch loss are printed by `train`.
train(gpt_model, tokenizer, text_col, epochs, learning_rate)

In [None]:
def generate(idx, max_new_tokens, context_size, tokenizer, model, top_k=10, top_p=0.95):
    """Autoregressively extend `idx` using combined top-p / top-k sampling.

    Args:
        idx: LongTensor of shape (1, T) holding the prompt token ids.
        max_new_tokens: upper bound on the number of tokens appended.
        context_size: only the last `context_size` tokens are fed to the model.
        tokenizer: object exposing `eos_token_id`; generation stops once the
            last token equals it.
        model: callable mapping (1, T) token ids to (1, T, vocab) logits.
        top_k: keep at most this many of the most probable tokens.
        top_p: nucleus threshold; the kept set is the smallest prefix of the
            sorted distribution whose cumulative probability reaches `top_p`.

    Returns:
        LongTensor of shape (1, T + n) with n <= max_new_tokens.

    Raises:
        ValueError: if `idx` is not a single 2-D sequence (batch size 1).
    """
    if idx.dim() != 2 or idx.size(0) != 1:
        raise ValueError("generate() expects idx of shape (1, T)")

    eos_id = tokenizer.eos_token_id

    # Sampling needs no gradients; skipping them avoids building a graph per step.
    with torch.no_grad():
        for _ in range(max_new_tokens):
            if idx[0, -1].item() == eos_id:
                break

            # crop to the most recent tokens and keep the last time step only
            idx_cond = idx[:, -context_size:]
            logits = model(idx_cond)[:, -1, :]

            probs = F.softmax(logits, dim=-1)
            sorted_probs, indices = torch.sort(probs, descending=True)
            cumulative = torch.cumsum(sorted_probs, dim=-1)

            # nucleus cut: every token before the threshold is crossed, plus
            # the one that crosses it; then cap the candidates at top_k
            keep = int((cumulative < top_p).sum().item()) + 1
            sorted_probs = sorted_probs[:, :keep][:, :top_k]
            indices = indices[:, :keep][:, :top_k]

            # The kept values are already probabilities: renormalise them.
            # A second softmax would flatten the distribution towards uniform.
            sorted_probs = sorted_probs / sorted_probs.sum(dim=-1, keepdim=True)

            choice = torch.multinomial(sorted_probs, num_samples=1)
            idx_next = torch.gather(indices, 1, choice)
            idx = torch.cat((idx, idx_next), dim=1)

    return idx

In [None]:
# Inference: put the model in eval mode before sampling.
gpt_model.eval()

prompt = "Generate a funny tweet."
# Encode the prompt as a single-row batch on the model's device.
generated = torch.tensor([tokenizer.encode(prompt)]).to(device)

sample_outputs = generate(
    generated,
    max_new_tokens=200,
    context_size=256,
    tokenizer=tokenizer,
    model=gpt_model,
    top_k=10,
    top_p=0.95,
)

# Decode every returned sequence, hiding special tokens.
for i, sample_output in enumerate(sample_outputs):
    decoded_text = tokenizer.decode(sample_output, skip_special_tokens=True)
    print(decoded_text)