In [2]:
import pandas as pd
import time
from tqdm import tqdm
import re

In [3]:

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Define the file paths
lines_file = "/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/Cornell_Movie_Dialogue_Corpus/movie_lines.txt"
conversation_file = "/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/Cornell_Movie_Dialogue_Corpus/movie_conversations.txt"

# Read the lines file with specified encoding
lines = pd.read_csv(lines_file, sep=r'\s*\+\+\+\$\+\+\+\s*', header=None, engine='python', encoding='ISO-8859-1')
lines.columns = ['line_id', 'character_id', 'movie_id', 'character', 'text']

# Read the conversations file with specified encoding
conversations = pd.read_csv(conversation_file, sep=r'\s*\+\+\+\$\+\+\+\s*', header=None, engine='python', encoding='ISO-8859-1')
conversations.columns = ['character1_id', 'character2_id', 'movie_id', 'utterance_ids']

# Display the first few rows of each DataFrame
print(lines.head())
print(conversations.head())


  line_id character_id movie_id character          text
0   L1045           u0       m0    BIANCA  They do not!
1   L1044           u2       m0   CAMERON   They do to!
2    L985           u0       m0    BIANCA    I hope so.
3    L984           u2       m0   CAMERON     She okay?
4    L925           u0       m0    BIANCA     Let's go.
  character1_id character2_id movie_id                     utterance_ids
0            u0            u2       m0  ['L194', 'L195', 'L196', 'L197']
1            u0            u2       m0                  ['L198', 'L199']
2            u0            u2       m0  ['L200', 'L201', 'L202', 'L203']
3            u0            u2       m0          ['L204', 'L205', 'L206']
4            u0            u2       m0                  ['L207', 'L208']


In [5]:
# Function to clean the text
def clean_text(text):
    # Retain specific punctuation and remove others
    text = re.sub(r"[^\w\s.!?'’]", '', text)  # Keep punctuation and apostrophes
    text = text.strip()  # Strip whitespace
    text = ' '.join(text.split())  # Normalize whitespace
    return text

In [6]:
# Create a function to extract the conversation history and responses
def create_dataset(conversations, lines):
    # Create a dictionary for fast line access
    line_dict = dict(zip(lines['line_id'], lines['text']))

    dataset = []

    # Use tqdm to show progress
    for _, row in tqdm(conversations.iterrows(), total=conversations.shape[0], desc="Processing conversations"):
        utterance_ids = row['utterance_ids'].strip('[]').replace("'", "").split(', ')

        # Initialize conversation history
        conversation_history = []

        for utterance_id in utterance_ids:
            # Access the text from the dictionary
            text = line_dict.get(utterance_id)
            if isinstance(text, str):  # Ensure text is a string
                # Add the current line to the history
                conversation_history.append(text)

                # Create a record with the current history and response
                if len(conversation_history) > 1:
                    # The last entry is the response
                    response = conversation_history[-1]
                    history = conversation_history[:-1]

                    # Ensure that the history is made of strings
                    dataset.append({
                        'conversation_history': ' '.join(h for h in history if isinstance(h, str)),
                        'response': response
                    })

    return pd.DataFrame(dataset)




In [7]:
# Create the dataset
start_time = time.time()
chatbot_dataset = create_dataset(conversations, lines)
end_time = time.time()

# Clean the conversation history and responses, replacing the original columns
chatbot_dataset['conversation_history'] = chatbot_dataset['conversation_history'].apply(clean_text)
chatbot_dataset['response'] = chatbot_dataset['response'].apply(clean_text)

# Display the first few rows of the cleaned dataset
print(chatbot_dataset[['conversation_history', 'response']].head())
print(f"Execution Time: {end_time - start_time} seconds")



Processing conversations: 100%|██████████| 83097/83097 [00:04<00:00, 18372.11it/s]


                                conversation_history  \
0  Can we make this quick? Roxanne Korrine and An...   
1  Can we make this quick? Roxanne Korrine and An...   
2  Can we make this quick? Roxanne Korrine and An...   
3  You're asking me out. That's so cute. What's y...   
4  No no it's my fault we didn't have a proper in...   

                                            response  
0  Well I thought we'd start with pronunciation i...  
1  Not the hacking and gagging and spitting part....  
2  Okay... then how 'bout we try out some French ...  
3                                         Forget it.  
4                                           Cameron.  
Execution Time: 4.8355553150177 seconds


In [8]:
chatbot_dataset.head()

Unnamed: 0,conversation_history,response
0,Can we make this quick? Roxanne Korrine and An...,Well I thought we'd start with pronunciation i...
1,Can we make this quick? Roxanne Korrine and An...,Not the hacking and gagging and spitting part....
2,Can we make this quick? Roxanne Korrine and An...,Okay... then how 'bout we try out some French ...
3,You're asking me out. That's so cute. What's y...,Forget it.
4,No no it's my fault we didn't have a proper in...,Cameron.


In [9]:
chatbot_dataset.to_csv("/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/attempt_2.csv", index=False)

## Model


In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.optim import AdamW
from tqdm import tqdm
import os



In [11]:
# Load your dataset (adjust the path as necessary)
# chatbot_dataset = pd.read_csv('path/to/your/cornell_movie_chatbot_dataset.csv')


In [12]:
# Split the dataset into train and test sets (80% train, 20% test)
train_data, test_data = train_test_split(chatbot_dataset, test_size=0.2, random_state=42)

# Further split the train data into training and validation sets (80% train, 20% validation)
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

# Display the sizes of the datasets
print(f"Training set size: {len(train_data)}")
print(f"Validation set size: {len(val_data)}")
print(f"Test set size: {len(test_data)}")



Training set size: 141674
Validation set size: 35419
Test set size: 44274


In [13]:
# Load the GPT-2 tokenizer with left padding
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', padding_side='left')
# Set pad token to be the same as the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the datasets with time tracking
def tokenize_data(data):
    start_time = time.time()  # Start the timer
    # Use tqdm to show progress while processing the dataset
    encodings = tokenizer(
        list(tqdm(data['conversation_history'].tolist(), desc="Tokenizing")),  # Wrap the list with tqdm
        truncation=True,
        padding='max_length',  # Ensure all sequences are the same length
        max_length=100,  # Set maximum length for the sequences
        return_tensors='pt',  # Return PyTorch tensors
        return_attention_mask=True,  # Include the attention mask
        add_special_tokens=True,  # Add special tokens if necessary
    )
    end_time = time.time()  # End the timer
    print(f"Time taken for tokenization: {end_time - start_time:.2f} seconds")  # Print the elapsed time
    return encodings

# Tokenize each split
train_encodings = tokenize_data(train_data)
val_encodings = tokenize_data(val_data)
test_encodings = tokenize_data(test_data)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Tokenizing: 100%|██████████| 141674/141674 [00:00<00:00, 2821964.21it/s]


Time taken for tokenization: 42.36 seconds


Tokenizing: 100%|██████████| 35419/35419 [00:00<00:00, 2567010.88it/s]


Time taken for tokenization: 10.03 seconds


Tokenizing: 100%|██████████| 44274/44274 [00:00<00:00, 2672730.11it/s]


Time taken for tokenization: 12.28 seconds


In [14]:
class ChatbotDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings['input_ids'])

# Create datasets for training, validation, and testing
train_dataset = ChatbotDataset(train_encodings)
val_dataset = ChatbotDataset(val_encodings)
test_dataset = ChatbotDataset(test_encodings)


In [15]:
# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, num_workers=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, num_workers=4, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, num_workers=4, shuffle=False)


In [16]:
# Load the pre-trained GPT-2 model
model = GPT2LMHeadModel.from_pretrained('gpt2')

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [20]:
# Step 9: Check for GPU Availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define the checkpoint directory in Google Drive
checkpoint_dir = "/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt_checkpoint"
os.makedirs(checkpoint_dir, exist_ok=True)

# Move the model to the GPU
model.to(device)

# Step 10: Define Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training Loop with Validation and Early Stopping
best_val_loss = float('inf')
patience = 3  # Number of epochs to wait for improvement
epochs_without_improvement = 0

# Step 11: Training Loop with Validation
for epoch in range(50):  # Number of epochs
    start_time = time.time()  # Start time for the epoch
    model.train()  # Set model to training mode

    # Training phase
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}"):
        optimizer.zero_grad()

        # Move input data to the GPU
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)


                # Compute the loss
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Validation step
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            # Move input data to the GPU
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)  # Move attention mask to the device

            # Compute the loss
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
            val_loss += outputs.loss.item()

    # Calculate the time taken for the epoch
    end_time = time.time()
    epoch_time = end_time - start_time

    # Print training and validation losses along with epoch time
    print(f"Epoch: {epoch + 1}, Training Loss: {loss.item():.4f}, Validation Loss: {val_loss / len(val_loader):.4f}, Time: {epoch_time:.2f} seconds")

    # Check for improvement
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_without_improvement = 0  # Reset the counter
        checkpoint_path = os.path.join(checkpoint_dir, f"model_epoch_{epoch + 1}.pt")
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'best_val_loss': best_val_loss,
        }, checkpoint_path)
        print(f"Checkpoint saved at {checkpoint_path}")
    else:
        epochs_without_improvement += 1
        if epochs_without_improvement >= patience:
            print(f"Early stopping triggered after {epoch + 1} epochs.")
            break


Using device: cuda


Training Epoch 1: 100%|██████████| 4428/4428 [31:41<00:00,  2.33it/s]
Validating: 100%|██████████| 1107/1107 [02:30<00:00,  7.33it/s]


Epoch: 1, Training Loss: 1.1776, Validation Loss: 1.1920, Time: 2052.94 seconds
Checkpoint saved at /content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt_checkpoint/model_epoch_1.pt


Training Epoch 2: 100%|██████████| 4428/4428 [31:42<00:00,  2.33it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.32it/s]


Epoch: 2, Training Loss: 1.3148, Validation Loss: 1.0511, Time: 2053.52 seconds
Checkpoint saved at /content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt_checkpoint/model_epoch_2.pt


Training Epoch 3: 100%|██████████| 4428/4428 [31:42<00:00,  2.33it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.32it/s]


Epoch: 3, Training Loss: 1.0799, Validation Loss: 0.9433, Time: 2054.26 seconds
Checkpoint saved at /content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt_checkpoint/model_epoch_3.pt


Training Epoch 4: 100%|██████████| 4428/4428 [31:43<00:00,  2.33it/s]
Validating: 100%|██████████| 1107/1107 [02:30<00:00,  7.33it/s]


Epoch: 4, Training Loss: 0.5881, Validation Loss: 0.8634, Time: 2054.01 seconds
Checkpoint saved at /content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt_checkpoint/model_epoch_4.pt


Training Epoch 5: 100%|██████████| 4428/4428 [31:42<00:00,  2.33it/s]
Validating: 100%|██████████| 1107/1107 [02:30<00:00,  7.33it/s]


Epoch: 5, Training Loss: 0.9873, Validation Loss: 0.7996, Time: 2053.75 seconds
Checkpoint saved at /content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt_checkpoint/model_epoch_5.pt


Training Epoch 6: 100%|██████████| 4428/4428 [31:42<00:00,  2.33it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.33it/s]


Epoch: 6, Training Loss: 0.5071, Validation Loss: 0.7484, Time: 2054.00 seconds
Checkpoint saved at /content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt_checkpoint/model_epoch_6.pt


Training Epoch 7: 100%|██████████| 4428/4428 [31:43<00:00,  2.33it/s]
Validating: 100%|██████████| 1107/1107 [02:30<00:00,  7.33it/s]


Epoch: 7, Training Loss: 0.8941, Validation Loss: 0.7069, Time: 2054.04 seconds
Checkpoint saved at /content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt_checkpoint/model_epoch_7.pt


Training Epoch 8: 100%|██████████| 4428/4428 [31:42<00:00,  2.33it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.33it/s]


Epoch: 8, Training Loss: 0.6186, Validation Loss: 0.6714, Time: 2054.04 seconds
Checkpoint saved at /content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt_checkpoint/model_epoch_8.pt


Training Epoch 9: 100%|██████████| 4428/4428 [31:42<00:00,  2.33it/s]
Validating: 100%|██████████| 1107/1107 [02:30<00:00,  7.33it/s]


Epoch: 9, Training Loss: 0.7212, Validation Loss: 0.6417, Time: 2053.68 seconds
Checkpoint saved at /content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt_checkpoint/model_epoch_9.pt


Training Epoch 10: 100%|██████████| 4428/4428 [31:42<00:00,  2.33it/s]
Validating: 100%|██████████| 1107/1107 [02:30<00:00,  7.33it/s]


Epoch: 10, Training Loss: 0.2442, Validation Loss: 0.6153, Time: 2053.66 seconds
Checkpoint saved at /content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt_checkpoint/model_epoch_10.pt


Training Epoch 11: 100%|██████████| 4428/4428 [31:42<00:00,  2.33it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.33it/s]


Epoch: 11, Training Loss: 0.3690, Validation Loss: 0.5932, Time: 2053.99 seconds
Checkpoint saved at /content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt_checkpoint/model_epoch_11.pt


Training Epoch 12: 100%|██████████| 4428/4428 [31:42<00:00,  2.33it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.33it/s]


Epoch: 12, Training Loss: 0.5515, Validation Loss: 0.5799, Time: 2053.88 seconds
Checkpoint saved at /content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt_checkpoint/model_epoch_12.pt


Training Epoch 13: 100%|██████████| 4428/4428 [31:42<00:00,  2.33it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.32it/s]


Epoch: 13, Training Loss: 0.3935, Validation Loss: 0.5619, Time: 2053.79 seconds
Checkpoint saved at /content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt_checkpoint/model_epoch_13.pt


Training Epoch 14: 100%|██████████| 4428/4428 [31:42<00:00,  2.33it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.32it/s]


Epoch: 14, Training Loss: 0.3121, Validation Loss: 0.5436, Time: 2053.97 seconds
Checkpoint saved at /content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt_checkpoint/model_epoch_14.pt


Training Epoch 15: 100%|██████████| 4428/4428 [31:42<00:00,  2.33it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.33it/s]


Epoch: 15, Training Loss: 0.3989, Validation Loss: 0.5304, Time: 2053.77 seconds
Checkpoint saved at /content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt_checkpoint/model_epoch_15.pt


Training Epoch 16: 100%|██████████| 4428/4428 [31:42<00:00,  2.33it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.32it/s]


Epoch: 16, Training Loss: 0.5311, Validation Loss: 0.5237, Time: 2054.18 seconds
Checkpoint saved at /content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt_checkpoint/model_epoch_16.pt


Training Epoch 17: 100%|██████████| 4428/4428 [31:42<00:00,  2.33it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.32it/s]


Epoch: 17, Training Loss: 0.3987, Validation Loss: 0.5107, Time: 2054.10 seconds
Checkpoint saved at /content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt_checkpoint/model_epoch_17.pt


Training Epoch 18: 100%|██████████| 4428/4428 [31:42<00:00,  2.33it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.32it/s]


Epoch: 18, Training Loss: 0.7259, Validation Loss: 0.5042, Time: 2053.88 seconds
Checkpoint saved at /content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt_checkpoint/model_epoch_18.pt


Training Epoch 19: 100%|██████████| 4428/4428 [31:43<00:00,  2.33it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.32it/s]


Epoch: 19, Training Loss: 0.2596, Validation Loss: 0.4955, Time: 2054.29 seconds
Checkpoint saved at /content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt_checkpoint/model_epoch_19.pt


Training Epoch 20: 100%|██████████| 4428/4428 [31:42<00:00,  2.33it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.32it/s]


Epoch: 20, Training Loss: 0.1848, Validation Loss: 0.5003, Time: 2053.86 seconds


Training Epoch 21: 100%|██████████| 4428/4428 [31:43<00:00,  2.33it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.31it/s]


Epoch: 21, Training Loss: 0.2736, Validation Loss: 0.4862, Time: 2054.49 seconds
Checkpoint saved at /content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt_checkpoint/model_epoch_21.pt


Training Epoch 22: 100%|██████████| 4428/4428 [31:43<00:00,  2.33it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.32it/s]


Epoch: 22, Training Loss: 0.1344, Validation Loss: 0.4834, Time: 2054.29 seconds
Checkpoint saved at /content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt_checkpoint/model_epoch_22.pt


Training Epoch 23: 100%|██████████| 4428/4428 [31:42<00:00,  2.33it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.32it/s]


Epoch: 23, Training Loss: 0.2159, Validation Loss: 0.4891, Time: 2054.02 seconds


Training Epoch 24: 100%|██████████| 4428/4428 [31:42<00:00,  2.33it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.33it/s]


Epoch: 24, Training Loss: 0.1509, Validation Loss: 0.4830, Time: 2053.96 seconds
Checkpoint saved at /content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt_checkpoint/model_epoch_24.pt


Training Epoch 25: 100%|██████████| 4428/4428 [31:42<00:00,  2.33it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.32it/s]


Epoch: 25, Training Loss: 0.3293, Validation Loss: 0.4725, Time: 2053.91 seconds
Checkpoint saved at /content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt_checkpoint/model_epoch_25.pt


Training Epoch 26: 100%|██████████| 4428/4428 [31:42<00:00,  2.33it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.32it/s]


Epoch: 26, Training Loss: 0.3247, Validation Loss: 0.4772, Time: 2054.05 seconds


Training Epoch 27: 100%|██████████| 4428/4428 [31:42<00:00,  2.33it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.32it/s]


Epoch: 27, Training Loss: 0.2051, Validation Loss: 0.4747, Time: 2054.15 seconds


Training Epoch 28: 100%|██████████| 4428/4428 [31:42<00:00,  2.33it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.32it/s]

Epoch: 28, Training Loss: 0.1361, Validation Loss: 0.4775, Time: 2054.10 seconds
Early stopping triggered after 28 epochs.





In [21]:
# Saving the model
gpt_model = "/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt/base_gpt_50_epoch"
gpt_tokenizer = "/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt/base_gpt_50_epoch"

# Save the model and tokenizer
model.save_pretrained(gpt_model)
tokenizer.save_pretrained(gpt_tokenizer)


('/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt/base_gpt_50_epoch/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt/base_gpt_50_epoch/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt/base_gpt_50_epoch/vocab.json',
 '/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt/base_gpt_50_epoch/merges.txt',
 '/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt/base_gpt_50_epoch/added_tokens.json')

In [22]:
pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=98750f13b13222ac6a8a8d921fc438aa860ec4dbe81d2800434261bcc91719b1
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [23]:
import numpy as np
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from nltk.translate.meteor_score import meteor_score

In [24]:
# Sample function to compute the average loss and perplexity
def compute_average_loss_and_perplexity(loader, model, device):
    model.eval()
    total_loss = 0
    num_batches = 0

    # Start timing
    start_time = time.time()

    with torch.no_grad():
        # Use tqdm to show progress
        for batch in tqdm(loader, desc="Computing Average Loss and Perplexity"):
            input_ids = batch['input_ids'].to(device)
            outputs = model(input_ids=input_ids, labels=input_ids)
            loss = outputs.loss
            total_loss += loss.item()
            num_batches += 1

    avg_loss = total_loss / num_batches
    perplexity = np.exp(avg_loss)

    # Calculate the time taken
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Time taken: {elapsed_time:.2f} seconds")

    return avg_loss, perplexity

# Assuming you have a validation DataLoader
avg_loss, perplexity = compute_average_loss_and_perplexity(val_loader, model, device)
print(f"Average Loss: {avg_loss:.4f}, Perplexity: {perplexity:.4f}")

Computing Average Loss and Perplexity: 100%|██████████| 1107/1107 [02:29<00:00,  7.39it/s]

Time taken: 149.73 seconds
Average Loss: 1.1925, Perplexity: 3.2953





In [25]:
# Function to compute BLEU score
def compute_bleu(predictions, references):
    # Calculate BLEU score for each prediction
    bleu_scores = [sentence_bleu([ref.split()], pred.split()) for pred, ref in zip(predictions, references)]
    avg_bleu = np.mean(bleu_scores)  # Average BLEU score across all predictions
    return avg_bleu


In [26]:
import time
from tqdm import tqdm
import torch

# Start the timer
start_time = time.time()

# Generate predictions
model.eval()
predictions = []
references = []

# Use tqdm for progress tracking
with torch.no_grad():
    with tqdm(total=len(test_loader), desc="Generating Predictions", bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}]", leave=True) as pbar:
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Generate predictions
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=50,
                num_return_sequences=1
            )

            # Decode the generated output
            generated_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
            predictions.extend(generated_texts)

            # Collect reference texts
            reference_texts = [tokenizer.decode(ids, skip_special_tokens=True) for ids in input_ids]
            references.extend(reference_texts)

            # Update the progress bar
            pbar.update(1)

# End the timer
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken for generating predictions: {elapsed_time:.2f} seconds")



Generating Predictions:   0%|          | 0/1384 [00:00]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Predictions:   0%|          | 1/1384 [00:01]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Predictions:   0%|          | 2/1384 [00:01]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Predictions:   0%|          | 3/1384 [00:02]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Predictions:   0%|          | 4/1384 [00:03]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Predictions:   0%|          | 5/1384 [00:03]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Predictions:   0%|          | 6/1384 [00:04]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Generating Predictions:   1%|          | 7/1384 [00:05]Setting `pad_token_id` to `eos_token_id`:50256 for open-

Time taken for generating predictions: 977.32 seconds





In [27]:
from nltk.translate.bleu_score import corpus_bleu
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

# Prepare the references in the required format for BLEU calculation
formatted_references = [[word_tokenize(ref)] for ref in references]  # Tokenize reference texts
formatted_predictions = [word_tokenize(pred) for pred in predictions]  # Tokenize generated texts

# Calculate BLEU score
bleu_score = corpus_bleu(formatted_references, formatted_predictions)
print(f"BLEU Score: {bleu_score:.4f}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


BLEU Score: 0.4063


In [28]:
pip install rouge-score



In [29]:
from rouge_score import rouge_scorer

# Initialize the ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Calculate ROUGE scores
rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}

for ref, pred in zip(references, predictions):
    scores = scorer.score(ref, pred)
    rouge_scores['rouge1'].append(scores['rouge1'].fmeasure)
    rouge_scores['rouge2'].append(scores['rouge2'].fmeasure)
    rouge_scores['rougeL'].append(scores['rougeL'].fmeasure)

# Calculate average ROUGE scores
average_rouge1 = sum(rouge_scores['rouge1']) / len(rouge_scores['rouge1'])
average_rouge2 = sum(rouge_scores['rouge2']) / len(rouge_scores['rouge2'])
average_rougeL = sum(rouge_scores['rougeL']) / len(rouge_scores['rougeL'])

# Print average ROUGE scores
print(f"Average ROUGE-1 Score: {average_rouge1:.4f}")
print(f"Average ROUGE-2 Score: {average_rouge2:.4f}")
print(f"Average ROUGE-L Score: {average_rougeL:.4f}")

Average ROUGE-1 Score: 0.4765
Average ROUGE-2 Score: 0.4605
Average ROUGE-L Score: 0.4765


In [30]:
import nltk
nltk.download('wordnet')
from nltk.translate import meteor_score
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [31]:
# Calculate METEOR scores
meteor_scores = []

# Tokenize references and predictions
for ref, pred in zip(references, predictions):
    # Tokenize both reference and prediction
    tokenized_ref = word_tokenize(ref)
    tokenized_pred = word_tokenize(pred)

    # Calculate the METEOR score using tokenized inputs (as lists)
    score = meteor_score.single_meteor_score(tokenized_ref, tokenized_pred)
    meteor_scores.append(score)

# Calculate average METEOR score
average_meteor = sum(meteor_scores) / len(meteor_scores)

# Print average METEOR score
print(f"Average METEOR Score: {average_meteor:.4f}")

Average METEOR Score: 0.6655


In [33]:
# Load the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Set the pad token to the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token  # Assign pad token
tokenizer.pad_token_id = tokenizer.eos_token_id  # Ensure padding ID is the same as EOS

# Set the model to evaluation mode
model.eval()

print("Chatbot is ready! Type 'exit' to stop the conversation.")

while True:
    # Get user input
    input_text = input("You: ")

    # Check for exit condition
    if input_text.lower() in ['exit', 'quit']:
        print("Chatbot: Goodbye!")
        break

    # Tokenize the input
    encoded_input = tokenizer.encode_plus(
        input_text,
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        max_length=50,
        clean_up_tokenization_spaces=True  # Set to avoid warnings
    )

    input_ids = encoded_input['input_ids'].to(device)
    attention_mask = encoded_input['attention_mask'].to(device)

    # Generate output
    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=50,  # Generate up to 50 new tokens
            num_return_sequences=1
        )

    # Decode the generated output
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    print(f"Chatbot: {response}")

Chatbot is ready! Type 'exit' to stop the conversation.
You: hi


Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Chatbot: hi. It's okay. I'm here. Where are you? We don't see you. I'm three quarters of the way up the lot behind the concession stand. Stay there. I don't want you within two hundred feet. Park down by
You: what are you doing


Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Chatbot: what are you doingto meet the man who's gonna kill you? You know the rules. What about a drink on the way home? I'll meet him at his place at eleven o'clock. He'll come into the house. I don't think so.
You: what drink


Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Chatbot: what drink? What? What? Oh! Who would jack us back? Hell! A umanu ujacku... You! I worked a gig with an actress! Four years!... You can't work in a gig with an actress! I...
You: who is jack


Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Chatbot: who is jackandewizard? I cannot tell you. I cannot tell you. You've got to get somebody to talk to you. If we're not talking to each other you are. You must be crazy. She's my sister. She's the one
You: who is your sister


Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Chatbot: who is your sister? My sister. There are many a beautiful women in this city. And you have no control over her. Believe me. She's got no idea what she's in. No? No. She knows things that nobody else knows. She is a
You: who is jack again


Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Chatbot: who is jack again? The guy who called you the other day. He's been making phone calls for weeks. He had a girlfriend but she broke up with him last month. You know who it was? He's not a girlfriend. I'm sorry. Look I


KeyboardInterrupt: Interrupted by user