In [1]:
import pandas as pd
import time
from tqdm import tqdm
import re

In [2]:

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Define the file paths
lines_file = "/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/Cornell_Movie_Dialogue_Corpus/movie_lines.txt"
conversation_file = "/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/Cornell_Movie_Dialogue_Corpus/movie_conversations.txt"

# Read the lines file with specified encoding
lines = pd.read_csv(lines_file, sep=r'\s*\+\+\+\$\+\+\+\s*', header=None, engine='python', encoding='ISO-8859-1')
lines.columns = ['line_id', 'character_id', 'movie_id', 'character', 'text']

# Read the conversations file with specified encoding
conversations = pd.read_csv(conversation_file, sep=r'\s*\+\+\+\$\+\+\+\s*', header=None, engine='python', encoding='ISO-8859-1')
conversations.columns = ['character1_id', 'character2_id', 'movie_id', 'utterance_ids']

# Display the first few rows of each DataFrame
print(lines.head())
print(conversations.head())


  line_id character_id movie_id character          text
0   L1045           u0       m0    BIANCA  They do not!
1   L1044           u2       m0   CAMERON   They do to!
2    L985           u0       m0    BIANCA    I hope so.
3    L984           u2       m0   CAMERON     She okay?
4    L925           u0       m0    BIANCA     Let's go.
  character1_id character2_id movie_id                     utterance_ids
0            u0            u2       m0  ['L194', 'L195', 'L196', 'L197']
1            u0            u2       m0                  ['L198', 'L199']
2            u0            u2       m0  ['L200', 'L201', 'L202', 'L203']
3            u0            u2       m0          ['L204', 'L205', 'L206']
4            u0            u2       m0                  ['L207', 'L208']


In [4]:
# Function to clean the text
def clean_text(text):
    # Retain specific punctuation and remove others
    text = re.sub(r"[^\w\s.!?'’]", '', text)  # Keep punctuation and apostrophes
    text = text.strip()  # Strip whitespace
    text = ' '.join(text.split())  # Normalize whitespace
    return text

In [5]:
# Create a function to extract the conversation history and responses
def create_dataset(conversations, lines):
    # Create a dictionary for fast line access
    line_dict = dict(zip(lines['line_id'], lines['text']))

    dataset = []

    # Use tqdm to show progress
    for _, row in tqdm(conversations.iterrows(), total=conversations.shape[0], desc="Processing conversations"):
        utterance_ids = row['utterance_ids'].strip('[]').replace("'", "").split(', ')

        # Initialize conversation history
        conversation_history = []

        for utterance_id in utterance_ids:
            # Access the text from the dictionary
            text = line_dict.get(utterance_id)
            if isinstance(text, str):  # Ensure text is a string
                # Add the current line to the history
                conversation_history.append(text)

                # Create a record with the current history and response
                if len(conversation_history) > 1:
                    # The last entry is the response
                    response = conversation_history[-1]
                    history = conversation_history[:-1]

                    # Ensure that the history is made of strings
                    dataset.append({
                        'conversation_history': ' '.join(h for h in history if isinstance(h, str)),
                        'response': response
                    })

    return pd.DataFrame(dataset)




In [6]:
# Create the dataset
start_time = time.time()
chatbot_dataset = create_dataset(conversations, lines)
end_time = time.time()

# Clean the conversation history and responses, replacing the original columns
chatbot_dataset['conversation_history'] = chatbot_dataset['conversation_history'].apply(clean_text)
chatbot_dataset['response'] = chatbot_dataset['response'].apply(clean_text)

# Display the first few rows of the cleaned dataset
print(chatbot_dataset[['conversation_history', 'response']].head())
print(f"Execution Time: {end_time - start_time} seconds")



Processing conversations: 100%|██████████| 83097/83097 [00:04<00:00, 17931.99it/s]


                                conversation_history  \
0  Can we make this quick? Roxanne Korrine and An...   
1  Can we make this quick? Roxanne Korrine and An...   
2  Can we make this quick? Roxanne Korrine and An...   
3  You're asking me out. That's so cute. What's y...   
4  No no it's my fault we didn't have a proper in...   

                                            response  
0  Well I thought we'd start with pronunciation i...  
1  Not the hacking and gagging and spitting part....  
2  Okay... then how 'bout we try out some French ...  
3                                         Forget it.  
4                                           Cameron.  
Execution Time: 4.973172664642334 seconds


In [7]:
chatbot_dataset.head()

Unnamed: 0,conversation_history,response
0,Can we make this quick? Roxanne Korrine and An...,Well I thought we'd start with pronunciation i...
1,Can we make this quick? Roxanne Korrine and An...,Not the hacking and gagging and spitting part....
2,Can we make this quick? Roxanne Korrine and An...,Okay... then how 'bout we try out some French ...
3,You're asking me out. That's so cute. What's y...,Forget it.
4,No no it's my fault we didn't have a proper in...,Cameron.


In [8]:
chatbot_dataset.to_csv("/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/attempt_2.csv", index=False)

## Model


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.optim import AdamW
from tqdm import tqdm




In [10]:
# Load your dataset (adjust the path as necessary)
# chatbot_dataset = pd.read_csv('path/to/your/cornell_movie_chatbot_dataset.csv')


In [11]:
# Assuming chatbot_dataset is already defined
# Split the dataset into train and test sets (80% train, 20% test)
train_data, test_data = train_test_split(chatbot_dataset, test_size=0.2, random_state=42)

# Further split the train data into training and validation sets (80% train, 20% validation)
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

# Display the sizes of the datasets
print(f"Training set size: {len(train_data)}")
print(f"Validation set size: {len(val_data)}")
print(f"Test set size: {len(test_data)}")




Training set size: 141674
Validation set size: 35419
Test set size: 44274


In [12]:
# Load the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Set pad token to be the same as the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [13]:
# Define a new dataset class to maintain context
class ContextualChatbotDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=100):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, idx):
        # Get the conversation history and the response
        conversation_history = self.data['conversation_history'].iloc[idx]
        response = self.data['response'].iloc[idx]

        # Prepare the input text (include the history)
        input_text = f"{conversation_history} {response}"

        # Tokenize the input
        encodings = self.tokenizer.encode_plus(
            input_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
            return_attention_mask=True,
            add_special_tokens=True,
        )

        return {
            'input_ids': encodings['input_ids'].squeeze(0),  # Remove batch dimension
            'attention_mask': encodings['attention_mask'].squeeze(0),  # Remove batch dimension
            'labels': encodings['input_ids'].squeeze(0),  # Set labels to input_ids for language modeling
        }

    def __len__(self):
        return len(self.data)

# Create datasets for training, validation, and testing
train_dataset = ContextualChatbotDataset(train_data, tokenizer)
val_dataset = ContextualChatbotDataset(val_data, tokenizer)
test_dataset = ContextualChatbotDataset(test_data, tokenizer)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, num_workers=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, num_workers=4, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, num_workers=4, shuffle=False)


In [14]:
# Load the pre-trained GPT-2 model
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Check for GPU Availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Move the model to the GPU
model.to(device)

# Define Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training Loop with Validation and Early Stopping
best_val_loss = float('inf')
patience = 3  # Number of epochs to wait for improvement
epochs_without_improvement = 0

# Training Loop
for epoch in range(50):  # Number of epochs
    start_time = time.time()  # Start time for the epoch
    model.train()  # Set model to training mode

    # Training phase
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}"):
        optimizer.zero_grad()

        # Move input data to the GPU
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # Compute the loss
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Validation step
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            # Move input data to the GPU
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Compute the loss
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
            val_loss += outputs.loss.item()

    # Calculate the time taken for the epoch
    end_time = time.time()
    epoch_time = end_time - start_time

    # Print training and validation losses along with epoch time
    print(f"Epoch: {epoch + 1}, Training Loss: {loss.item():.4f}, Validation Loss: {val_loss / len(val_loader):.4f}, Time: {epoch_time:.2f} seconds")

    # Check for improvement
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_without_improvement = 0  # Reset the counter
        # Optionally, save the model checkpoint here
    else:
        epochs_without_improvement += 1
        if epochs_without_improvement >= patience:
            print(f"Early stopping triggered after {epoch + 1} epochs.")
            break

Using device: cuda


Training Epoch 1: 100%|██████████| 4428/4428 [31:14<00:00,  2.36it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.32it/s]


Epoch: 1, Training Loss: 1.6789, Validation Loss: 1.5777, Time: 2025.58 seconds


Training Epoch 2: 100%|██████████| 4428/4428 [31:19<00:00,  2.36it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.31it/s]


Epoch: 2, Training Loss: 1.6970, Validation Loss: 1.4361, Time: 2030.89 seconds


Training Epoch 3: 100%|██████████| 4428/4428 [31:19<00:00,  2.36it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.31it/s]


Epoch: 3, Training Loss: 2.1421, Validation Loss: 1.3199, Time: 2031.24 seconds


Training Epoch 4: 100%|██████████| 4428/4428 [31:20<00:00,  2.36it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.32it/s]


Epoch: 4, Training Loss: 1.4596, Validation Loss: 1.2304, Time: 2031.24 seconds


Training Epoch 5: 100%|██████████| 4428/4428 [31:19<00:00,  2.36it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.32it/s]


Epoch: 5, Training Loss: 1.0711, Validation Loss: 1.1561, Time: 2030.87 seconds


Training Epoch 6: 100%|██████████| 4428/4428 [31:19<00:00,  2.36it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.31it/s]


Epoch: 6, Training Loss: 0.8944, Validation Loss: 1.0941, Time: 2031.00 seconds


Training Epoch 7: 100%|██████████| 4428/4428 [31:20<00:00,  2.36it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.32it/s]


Epoch: 7, Training Loss: 0.9886, Validation Loss: 1.0460, Time: 2031.49 seconds


Training Epoch 8: 100%|██████████| 4428/4428 [31:19<00:00,  2.36it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.32it/s]


Epoch: 8, Training Loss: 1.0996, Validation Loss: 1.0001, Time: 2030.86 seconds


Training Epoch 9: 100%|██████████| 4428/4428 [31:19<00:00,  2.36it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.32it/s]


Epoch: 9, Training Loss: 0.7152, Validation Loss: 0.9626, Time: 2030.84 seconds


Training Epoch 10: 100%|██████████| 4428/4428 [31:19<00:00,  2.36it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.32it/s]


Epoch: 10, Training Loss: 1.0288, Validation Loss: 0.9329, Time: 2030.87 seconds


Training Epoch 11: 100%|██████████| 4428/4428 [31:20<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.32it/s]


Epoch: 11, Training Loss: 0.7160, Validation Loss: 0.9009, Time: 2031.68 seconds


Training Epoch 12: 100%|██████████| 4428/4428 [31:20<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.32it/s]


Epoch: 12, Training Loss: 0.7676, Validation Loss: 0.8870, Time: 2031.65 seconds


Training Epoch 13: 100%|██████████| 4428/4428 [31:19<00:00,  2.36it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.32it/s]


Epoch: 13, Training Loss: 0.9956, Validation Loss: 0.8626, Time: 2031.05 seconds


Training Epoch 14: 100%|██████████| 4428/4428 [31:19<00:00,  2.36it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.32it/s]


Epoch: 14, Training Loss: 0.6742, Validation Loss: 0.8410, Time: 2030.95 seconds


Training Epoch 15: 100%|██████████| 4428/4428 [31:20<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.31it/s]


Epoch: 15, Training Loss: 0.6049, Validation Loss: 0.8207, Time: 2031.85 seconds


Training Epoch 16: 100%|██████████| 4428/4428 [31:19<00:00,  2.36it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.32it/s]


Epoch: 16, Training Loss: 0.4724, Validation Loss: 0.8050, Time: 2031.21 seconds


Training Epoch 17: 100%|██████████| 4428/4428 [31:20<00:00,  2.36it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.31it/s]


Epoch: 17, Training Loss: 0.5962, Validation Loss: 0.7919, Time: 2031.64 seconds


Training Epoch 18: 100%|██████████| 4428/4428 [31:20<00:00,  2.36it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.31it/s]


Epoch: 18, Training Loss: 0.5263, Validation Loss: 0.7786, Time: 2031.65 seconds


Training Epoch 19: 100%|██████████| 4428/4428 [31:20<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.32it/s]


Epoch: 19, Training Loss: 0.5476, Validation Loss: 0.7729, Time: 2031.67 seconds


Training Epoch 20: 100%|██████████| 4428/4428 [31:20<00:00,  2.36it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.32it/s]


Epoch: 20, Training Loss: 0.3703, Validation Loss: 0.7674, Time: 2031.41 seconds


Training Epoch 21: 100%|██████████| 4428/4428 [31:20<00:00,  2.36it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.31it/s]


Epoch: 21, Training Loss: 0.3620, Validation Loss: 0.7505, Time: 2031.52 seconds


Training Epoch 22: 100%|██████████| 4428/4428 [31:19<00:00,  2.36it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.32it/s]


Epoch: 22, Training Loss: 0.5583, Validation Loss: 0.7560, Time: 2031.13 seconds


Training Epoch 23: 100%|██████████| 4428/4428 [31:19<00:00,  2.36it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.31it/s]


Epoch: 23, Training Loss: 0.3637, Validation Loss: 0.7457, Time: 2030.80 seconds


Training Epoch 24: 100%|██████████| 4428/4428 [31:19<00:00,  2.36it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.33it/s]


Epoch: 24, Training Loss: 0.3741, Validation Loss: 0.7438, Time: 2030.55 seconds


Training Epoch 25: 100%|██████████| 4428/4428 [31:19<00:00,  2.36it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.32it/s]


Epoch: 25, Training Loss: 0.5933, Validation Loss: 0.7375, Time: 2030.26 seconds


Training Epoch 26: 100%|██████████| 4428/4428 [31:19<00:00,  2.36it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.32it/s]


Epoch: 26, Training Loss: 0.2775, Validation Loss: 0.7286, Time: 2030.29 seconds


Training Epoch 27: 100%|██████████| 4428/4428 [31:06<00:00,  2.37it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.30it/s]


Epoch: 27, Training Loss: 0.3608, Validation Loss: 0.7277, Time: 2018.62 seconds


Training Epoch 28: 100%|██████████| 4428/4428 [31:22<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.29it/s]


Epoch: 28, Training Loss: 0.4108, Validation Loss: 0.7298, Time: 2034.62 seconds


Training Epoch 29: 100%|██████████| 4428/4428 [31:22<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.30it/s]


Epoch: 29, Training Loss: 0.6891, Validation Loss: 0.7223, Time: 2034.19 seconds


Training Epoch 30: 100%|██████████| 4428/4428 [31:22<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.29it/s]


Epoch: 30, Training Loss: 0.4125, Validation Loss: 0.7341, Time: 2034.29 seconds


Training Epoch 31: 100%|██████████| 4428/4428 [31:22<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.29it/s]


Epoch: 31, Training Loss: 0.5769, Validation Loss: 0.7377, Time: 2033.84 seconds


Training Epoch 32: 100%|██████████| 4428/4428 [31:22<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.30it/s]

Epoch: 32, Training Loss: 0.2043, Validation Loss: 0.7225, Time: 2034.19 seconds
Early stopping triggered after 32 epochs.





In [15]:
# Saving the model
gpt_model = "/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt/base_gpt_model_2_dot_long"
gpt_tokenizer = "/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt/base_gpt_tokenizer_2_dot_long"

# Save the model and tokenizer
model.save_pretrained(gpt_model)
tokenizer.save_pretrained(gpt_tokenizer)


('/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt/base_gpt_tokenizer_2_dot_long/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt/base_gpt_tokenizer_2_dot_long/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt/base_gpt_tokenizer_2_dot_long/vocab.json',
 '/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt/base_gpt_tokenizer_2_dot_long/merges.txt',
 '/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt/base_gpt_tokenizer_2_dot_long/added_tokens.json')

In [16]:
pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=88eb1ac8098f2fcbbfe5648b82e10bb084cab8eab45be122eeeb5e58ba0895ab
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [17]:
import numpy as np
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from nltk.translate.meteor_score import meteor_score

In [18]:
# Sample function to compute the average loss and perplexity
def compute_average_loss_and_perplexity(loader, model, device):
    model.eval()
    total_loss = 0
    num_batches = 0

    # Start timing
    start_time = time.time()

    with torch.no_grad():
        # Use tqdm to show progress
        for batch in tqdm(loader, desc="Computing Average Loss and Perplexity"):
            input_ids = batch['input_ids'].to(device)
            outputs = model(input_ids=input_ids, labels=input_ids)
            loss = outputs.loss
            total_loss += loss.item()
            num_batches += 1

    avg_loss = total_loss / num_batches
    perplexity = np.exp(avg_loss)

    # Calculate the time taken
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Time taken: {elapsed_time:.2f} seconds")

    return avg_loss, perplexity

# Assuming you have a validation DataLoader
avg_loss, perplexity = compute_average_loss_and_perplexity(val_loader, model, device)
print(f"Average Loss: {avg_loss:.4f}, Perplexity: {perplexity:.4f}")

Computing Average Loss and Perplexity: 100%|██████████| 1107/1107 [02:30<00:00,  7.37it/s]

Time taken: 150.13 seconds
Average Loss: 0.7231, Perplexity: 2.0608





In [27]:
# Function to compute BLEU score
def compute_bleu(predictions, references):
    # Calculate BLEU score for each prediction
    bleu_scores = [sentence_bleu([ref.split()], pred.split()) for pred, ref in zip(predictions, references)]
    avg_bleu = np.mean(bleu_scores)  # Average BLEU score across all predictions
    return avg_bleu

In [28]:
import time
from tqdm import tqdm
import torch

# Start the timer
start_time = time.time()

# Generate predictions
model.eval()
predictions = []
references = []

# Use tqdm for progress tracking
with torch.no_grad():
    with tqdm(total=len(test_loader), desc="Generating Predictions", bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}]", leave=True) as pbar:
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Generate predictions
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=50,
                num_return_sequences=1
            )

            # Decode the generated output
            generated_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
            predictions.extend(generated_texts)

            # Collect reference texts
            reference_texts = [tokenizer.decode(ids, skip_special_tokens=True) for ids in input_ids]
            references.extend(reference_texts)

            # Update the progress bar
            pbar.update(1)

# End the timer
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken for generating predictions: {elapsed_time:.2f} seconds")

Generating Predictions:   0%|          | 0/1384 [00:00]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Generating Predictions:   0%|          | 1/1384 [00:00]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Generating Predictions:   0%|          | 2/1384 [00:01]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Generating Predictions:   0%|          | 3/1384 [00:02]Setting `pad_token_id` to `eos_token_id`:502

Time taken for generating predictions: 771.53 seconds





In [29]:
from nltk.translate.bleu_score import corpus_bleu
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

# Prepare the references in the required format for BLEU calculation
formatted_references = [[word_tokenize(ref)] for ref in references]  # Tokenize reference texts
formatted_predictions = [word_tokenize(pred) for pred in predictions]  # Tokenize generated texts

# Calculate BLEU score
bleu_score = corpus_bleu(formatted_references, formatted_predictions)
print(f"BLEU Score: {bleu_score:.4f}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


BLEU Score: 0.9452


In [31]:
from rouge_score import rouge_scorer

# Initialize the ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Calculate ROUGE scores
rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}

for ref, pred in zip(references, predictions):
    scores = scorer.score(ref, pred)
    rouge_scores['rouge1'].append(scores['rouge1'].fmeasure)
    rouge_scores['rouge2'].append(scores['rouge2'].fmeasure)
    rouge_scores['rougeL'].append(scores['rougeL'].fmeasure)

# Calculate average ROUGE scores
average_rouge1 = sum(rouge_scores['rouge1']) / len(rouge_scores['rouge1'])
average_rouge2 = sum(rouge_scores['rouge2']) / len(rouge_scores['rouge2'])
average_rougeL = sum(rouge_scores['rougeL']) / len(rouge_scores['rougeL'])

# Print average ROUGE scores
print(f"Average ROUGE-1 Score: {average_rouge1:.4f}")
print(f"Average ROUGE-2 Score: {average_rouge2:.4f}")
print(f"Average ROUGE-L Score: {average_rougeL:.4f}")

Average ROUGE-1 Score: 0.9888
Average ROUGE-2 Score: 0.9887
Average ROUGE-L Score: 0.9888


In [30]:
import nltk
nltk.download('wordnet')
from nltk.translate import meteor_score
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [32]:
# Calculate METEOR scores
meteor_scores = []

# Tokenize references and predictions
for ref, pred in zip(references, predictions):
    # Tokenize both reference and prediction
    tokenized_ref = word_tokenize(ref)
    tokenized_pred = word_tokenize(pred)

    # Calculate the METEOR score using tokenized inputs (as lists)
    score = meteor_score.single_meteor_score(tokenized_ref, tokenized_pred)
    meteor_scores.append(score)

# Calculate average METEOR score
average_meteor = sum(meteor_scores) / len(meteor_scores)

# Print average METEOR score
print(f"Average METEOR Score: {average_meteor:.4f}")

Average METEOR Score: 0.9920


In [40]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Set the pad token to the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

# Move model to the device (CPU or GPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Set the model to evaluation mode
model.eval()

# Initialize conversation history
conversation_history = []

print("Chatbot is ready! Type 'exit' to stop the conversation.")

while True:
    # Get user input
    user_input = input("You: ")

    # Check for exit condition
    if user_input.lower() in ['exit', 'quit']:
        print("Chatbot: Goodbye!")
        break

    # Update conversation history
    conversation_history.append(f"User: {user_input}\n")

    # Keep the conversation history manageable (limiting to the last 5 exchanges)
    if len(conversation_history) > 10:  # 5 turns (user + bot)
        conversation_history = conversation_history[-10:]

    # Prepare the input by joining the conversation history
    input_text = "".join(conversation_history) + "Chatbot:"

    # Tokenize the input
    encoded_input = tokenizer.encode_plus(
        input_text,
        return_tensors='pt',
        padding='max_length',  # Pad input if necessary
        truncation=True,       # Truncate to max_length
        max_length=150         # Adjust max length as needed
    )

    input_ids = encoded_input['input_ids'].to(device)
    attention_mask = encoded_input['attention_mask'].to(device)

    # Generate output
    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=50,  # Generate up to 50 new tokens
            num_return_sequences=1,
            temperature=0.7,    # Control randomness in output
            top_k=50,           # Top-k sampling for variety
            top_p=0.95,         # Nucleus sampling
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode the generated output
    response = tokenizer.decode(output[0], skip_special_tokens=True)

    # Print chatbot's response
    print(f"Chatbot: {response}")

    # Append only the chatbot's latest response to the conversation history
    conversation_history.append(f"Chatbot: {response}\n")


Chatbot is ready! Type 'exit' to stop the conversation.
You: What is the meaning of life?
Chatbot: User: What is the meaning of life?
Chatbot:
Chatbot: What is the meaning of life?
Chatbot: What is the meaning of life?
Chatbot: What is the meaning of life?
Chatbot: What is the meaning of life?
Chatbot: What is
You: What is the meaning of life?
Chatbot: User: What is the meaning of life?
Chatbot: User: What is the meaning of life?
Chatbot:
Chatbot: What is the meaning of life?
Chatbot: What is the meaning of life?
Chatbot: What is the meaning of life?
Chatbot: What is the meaning of life?
Chatbot: What is
User: What is the meaning of life?
Chatbot:The meaning of life is
Chatbot:The meaning of life is
Chatbot:The meaning of life is
Chatbot:The meaning of life is
Chatbot:The meaning of life is
Chatbot:The meaning of life is


KeyboardInterrupt: Interrupted by user