In [None]:
import pandas as pd
import re
import numpy as np
from collections import Counter, defaultdict
from nltk.util import ngrams
import os
import tensorflow as tf
import torch
from torch.nn.utils.rnn import pad_sequence
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset


In [None]:
import copy

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount = True)

Mounted at /content/drive


In [None]:
cd drive/MyDrive/

/content/drive/MyDrive


In [None]:
#Import GPT2's tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
tokenizer.pad_token = tokenizer.eos_token

GPT2model = GPT2LMHeadModel.from_pretrained('distilgpt2')


In [None]:
#Test out model and tokenizer with a prompting example
sequence = "write a movie review:"

In [None]:
inputs = tokenizer.encode(sequence, return_tensors='pt')

In [None]:
attention_mask = torch.ones(inputs.shape, dtype=torch.long)

In [None]:
outputs = GPT2model.generate(
    inputs,
    attention_mask=attention_mask,  # Add the attention mask here
    max_length=50,
    do_sample=True,
    top_k=100,
    pad_token_id=tokenizer.pad_token_id  # Explicitly set the pad token ID
)

In [None]:
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
generated_text

#Non-sensical due to no-training.

'write a movie review:'

In [None]:
#Acquire the dataset from AllReviews csv file and clean it up to a usable form
AllReviews = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AllReviews.csv')
AllReviews = AllReviews.loc[(AllReviews["titleType"] == "tvMovie")|(AllReviews["titleType"] == "movie")]

# Filter relevant columns and remove duplicates
Reviews_Genres_Title = AllReviews[["Review", "genres"]].drop_duplicates()
# Remove rows with missing reviews
Reviews_Genres_Title = Reviews_Genres_Title.dropna(subset=["Review"])
# Remove rows with undefined genres
Reviews_Genres_Title = Reviews_Genres_Title[Reviews_Genres_Title["genres"] != '\\N']

# Copy for further processing
Complete_Reviews = Reviews_Genres_Title.copy()
# Convert reviews to lowercase
Complete_Reviews['Review'] = Complete_Reviews['Review'].str.lower()
# Remove HTML tags
Complete_Reviews['Review'] = Complete_Reviews['Review'].str.replace('<[^>]+>', '', regex=True)
# Remove non-alphanumeric characters
Complete_Reviews['Review'] = Complete_Reviews['Review'].str.replace('[^A-Za-z0-9 ]+', '', regex=True)


In [None]:
#Breaking up review genres
Complete_Reviews['genres']=Complete_Reviews['genres'].str.replace(',',' ')
Complete_Reviews['genres']=Complete_Reviews['genres'].str.split()

In [None]:
#Currently genre is not used in the review generation or training
Complete_Reviews['genres'][0:10]

29    [Action, Adventure, Biography]
30    [Action, Adventure, Biography]
31    [Action, Adventure, Biography]
32    [Action, Adventure, Biography]
63                           [Drama]
64                           [Drama]
65                           [Drama]
66                           [Drama]
67                           [Drama]
71          [Drama, Fantasy, Horror]
Name: genres, dtype: object

In [None]:
torch.cuda.is_available()
torch.cuda.memory_allocated()

0

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
GPT2model.to(device)
torch.cuda.memory_allocated()

334744576

In [None]:
#Get data into the form of a list to feed to the encoder
text = Complete_Reviews['Review'].to_numpy().tolist()

In [None]:
text[1]

'the story of the kelly gang is believed to be the worlds first feature length film running at between 65 and 70 minutes it was billed at the time as the longest film ever made it toured australia for nine years and was an enormous successtoday only fragments survive and it is hard to judge the films artistic merits about nine minutes of footage exists  some found on a garbage dump in melbourne some of this footage may be outtakes the footage is held by screensound australia the national screen and sound archive in canberrathe sequences show some enthusiastic acting although the camerawork is static like most films of the period the most remarkable shot is probably when a priest carrying a wounded man over his shoulder walks toward and just past the camera creating a strong sense of drama and movement the final shootout scene is also well filmed  with ned kelly moving and shooting toward the camera as troopers flee to the sidesa remarkable film of great historical importance that all f

In [None]:
len(text)

79709

In [None]:
test_text = text[0:100]

In [None]:
len(test_text)

100

In [None]:
# Setting up training with truncated reviews
input_ids = []
max_length = 256  # GPT-2's maximum sequence length

for review in text:
    encoded_review = tokenizer.encode(review, max_length=max_length, truncation=True, return_tensors='pt', padding = 'max_length')
    input_ids.append(encoded_review)

In [None]:
#Setting up training with 79709 reviews
#max_length = 256
#input_ids = []
#masks = []
##Iterate the encoder over the entire length of the text
#for i in range(len(text)):
#    enc = tokenizer(text[i], return_tensors='pt', max_length=max_length, truncation=True, padding = 'max_length')
#
#    input_ids.append(enc['input_ids'])
#    masks.append(enc['attention_mask'])

In [None]:
#input_ids = tokenizer.encode(text, return_tensors='pt', max_length=512, truncation=True).to(device)

In [None]:
input_ids[0]

tensor([[ 1169,  1621,   286,   262,   885, 12810,  7706, 40538,  6194,  4340,
          1111,   262,  4082,   286,   262, 38132,  1373,   666,  2646,  2831,
           290,   262, 22106,   286,   281, 38132,  1373,   666,  5369,   772,
           517,  5566,   340, 28783,    82,   262, 22106,   286,   262,  3895,
          2646,  1296,  1078,   258, 11621,   717,  3895, 13664,  3807,   373,
          7924,   416,  1149,   829,   256,  4548,   290, 18976,   379,   262,
           256,  4548,  1641,    82,  1149,  1010,  4244,  7964,   287,   262,
          7758, 12544, 23200,   286,   339,  5943,  3900,  6198,   612,   547,
           645,   987,    83, 30540, 45127,   373,  6157,   416,   281, 38500,
         40228,   508,   635,  2810,  2128,  3048,  1390, 29276,   290,  8169,
          1659,  1350,  1381,   340,  1575,  8576,   284,   787,   475,   326,
          1637,   290,   517,   373, 11911,  1626,   663,   717,  1285,   286,
         14135,   340, 44119,   287,  7758, 12544,  

In [None]:
#Visual of how to decode first element of encoded input ids
test_output = tokenizer.decode(input_ids[0][0], skip_special_tokens=True)
test_output

'the story of the kelly gang 1906 symbolizes both the birth of the australian film industry and the emergence of an australian identity even more significantly it heralds the emergence of the feature film formatthe worlds first featurelength movie was directed by charles tait and filmed at the tait familys chartersville estate in the melbourne suburb of heidelberg originally there were no intertitles narration was performed by an onstage lecturer who also provided sound effects including gunfire and hoofbeats it cost 1000 to make but that money and more was recovered within its first week of screening it premiered in melbourne on boxing day 1906 and was later shown across australia in new zealand and in britainonly fragments of the original production of more than one hour are known to exist and are preserved at the national film and sound archive canberra while some of the footage is almost pristine other segments are severely distorted the sensitive nitrate stock on which the film wa

In [None]:
torch.cuda.memory_allocated()

334744576

In [None]:
# set training parameters
train_batch_size = 20
num_train_epochs = 5
learning_rate = 5e-5

In [None]:
# Ensure all tensors are 1D
input_ids_1d = [t.flatten() for t in input_ids]
#masks_1d = [t.flatten() for t in masks]

# Now pad the sequence
#padded_input_ids = pad_sequence(input_ids_1d, batch_first=True)
#padded_masks = pad_sequence(masks_1d, batch_first=True)
padded_input_ids = torch.stack(input_ids_1d)

# Create a dataset and dataloader
#dataset = TensorDataset(padded_input_ids, padded_masks)
#data_loader = DataLoader(dataset, batch_size=train_batch_size, num_workers=4)
torch.cuda.memory_allocated()

334744576

In [None]:
# initialize optimizer and scheduler
optimizer = torch.optim.AdamW(GPT2model.parameters(), lr=learning_rate)
total_steps = len(input_ids) * num_train_epochs // train_batch_size
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

torch.cuda.empty_cache()
torch.cuda.memory_allocated()

334744576

In [None]:
# train the model
GPT2model.train()
for epoch in range(num_train_epochs):
    epoch_loss = 0.0
    for i in range(0, len(input_ids)-1, train_batch_size):
    #for input, mask in data_loader:
        # slice the input ids tensor to get the current batch
        batch_input_ids = padded_input_ids[i:i+train_batch_size]
        #batch_masks = padded_masks[i:i+train_batch_size].cuda()
        batch_labels = batch_input_ids.clone()
        batch_labels[:, :-1] = batch_labels[:, 1:].clone()
        # set label ids to -100 for padded tokens
        batch_labels[batch_labels == tokenizer.pad_token_id] = -100
        ## create shifted labels for each input in the batch
        #batch_labels = batch_input_ids.clone()
        #batch_labels[:, :-1] = batch_labels[:, 1:].clone()
        # set label ids to -100 for padded tokens
        #batch_labels[batch_labels == tokenizer.pad_token_id] = -100
        # clear gradients
        optimizer.zero_grad()
        # forward pass
        #outputs = GPT2model(input_ids=batch_input_ids, attention_mask = batch_masks, labels=batch_labels)
        outputs = GPT2model(input_ids=batch_input_ids.cuda(), labels=batch_labels.cuda())
        loss = outputs[0]
        # backward pass
        loss.backward()
        epoch_loss += loss.item()
        # clip gradients to prevent exploding gradients problem
        torch.nn.utils.clip_grad_norm_(GPT2model.parameters(), 1.0)
        # update parameters
        optimizer.step()
        scheduler.step()
        torch.cuda.empty_cache()
        if i % 5000 == 0:
            print(i, '/', len(input_ids))
    print('Epoch: {}, Loss: {:.4f}'.format(epoch+1, epoch_loss/len(input_ids)))


0 / 79709
5000 / 79709
10000 / 79709
15000 / 79709
20000 / 79709
25000 / 79709
30000 / 79709
35000 / 79709
40000 / 79709
45000 / 79709
50000 / 79709
55000 / 79709
60000 / 79709
65000 / 79709
70000 / 79709
75000 / 79709
Epoch: 1, Loss: 0.3107
0 / 79709
5000 / 79709
10000 / 79709
15000 / 79709
20000 / 79709
25000 / 79709
30000 / 79709
35000 / 79709
40000 / 79709
45000 / 79709
50000 / 79709
55000 / 79709
60000 / 79709
65000 / 79709
70000 / 79709
75000 / 79709
Epoch: 2, Loss: 0.2998
0 / 79709
5000 / 79709
10000 / 79709
15000 / 79709
20000 / 79709
25000 / 79709
30000 / 79709
35000 / 79709
40000 / 79709
45000 / 79709
50000 / 79709
55000 / 79709
60000 / 79709
65000 / 79709
70000 / 79709
75000 / 79709
Epoch: 3, Loss: 0.2956
0 / 79709
5000 / 79709
10000 / 79709
15000 / 79709
20000 / 79709
25000 / 79709
30000 / 79709
35000 / 79709
40000 / 79709
45000 / 79709
50000 / 79709
55000 / 79709
60000 / 79709
65000 / 79709
70000 / 79709
75000 / 79709
Epoch: 4, Loss: 0.2932
0 / 79709
5000 / 79709
10000 / 7

In [None]:
# save the trained model
output_dir = './results/'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
GPT2model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('./results/tokenizer_config.json',
 './results/special_tokens_map.json',
 './results/vocab.json',
 './results/merges.txt',
 './results/added_tokens.json')

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained('./results/')
tokenizer.pad_token = tokenizer.eos_token

GPT2model = GPT2LMHeadModel.from_pretrained('./results/')
max_length = 256

In [None]:
torch.cuda.empty_cache()
torch.cuda.memory_allocated()

1930968576

In [None]:
#Try the same prompt with the newly fine tuned model
sequence = ["write a movie review:"]

In [None]:
#Iterate the encoder over the entire length of the text
seq_ids = tokenizer.encode(sequence, return_tensors='pt')

attention_mask = torch.ones(seq_ids.shape, dtype=torch.long)

In [None]:
#outputs = GPT2model.generate(seq_ids, max_length=max_length, do_sample=True, top_k=50, max_new_tokens = max_length)
outputs = outputs = GPT2model.generate(
    seq_ids,
    attention_mask=attention_mask,  # Add the attention mask here
    max_length=50,
    do_sample=True,
    top_k=100,
    pad_token_id=tokenizer.pad_token_id  # Explicitly set the pad token ID
)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


In [None]:
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
generated_text

' a story an story i say so you tell and it be you the is to seen see if are characters that make movie well a which to who the is are a matter fact the is of in one in to world is and to the part which'