In [None]:
import pandas as pd
import time
from tqdm import tqdm
import re

In [None]:

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Show all columns
pd.set_option('display.max_columns', None)

# Show all rows (adjust if needed, can use 'None' for no limit)
pd.set_option('display.max_rows', None)

# Set the maximum width of each column (adjust if needed)
pd.set_option('display.max_colwidth', None)

In [None]:
# Define the file paths
lines_file = "/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/Cornell_Movie_Dialogue_Corpus/movie_lines.txt"
conversation_file = "/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/Cornell_Movie_Dialogue_Corpus/movie_conversations.txt"

# Read the lines file with specified encoding
lines = pd.read_csv(lines_file, sep=r'\s*\+\+\+\$\+\+\+\s*', header=None, engine='python', encoding='ISO-8859-1')
lines.columns = ['line_id', 'character_id', 'movie_id', 'character', 'text']

# Read the conversations file with specified encoding
conversations = pd.read_csv(conversation_file, sep=r'\s*\+\+\+\$\+\+\+\s*', header=None, engine='python', encoding='ISO-8859-1')
conversations.columns = ['character1_id', 'character2_id', 'movie_id', 'utterance_ids']

# Display the first few rows of each DataFrame
print(lines.head())
print(conversations.head())


  line_id character_id movie_id character          text
0   L1045           u0       m0    BIANCA  They do not!
1   L1044           u2       m0   CAMERON   They do to!
2    L985           u0       m0    BIANCA    I hope so.
3    L984           u2       m0   CAMERON     She okay?
4    L925           u0       m0    BIANCA     Let's go.
  character1_id character2_id movie_id                     utterance_ids
0            u0            u2       m0  ['L194', 'L195', 'L196', 'L197']
1            u0            u2       m0                  ['L198', 'L199']
2            u0            u2       m0  ['L200', 'L201', 'L202', 'L203']
3            u0            u2       m0          ['L204', 'L205', 'L206']
4            u0            u2       m0                  ['L207', 'L208']


In [None]:
# Function to clean the text
def clean_text(text):
    # Retain specific punctuation and remove others
    text = re.sub(r"[^\w\s.!?'’]", '', text)  # Keep punctuation and apostrophes
    text = text.strip()  # Strip whitespace
    text = ' '.join(text.split())  # Normalize whitespace
    return text

In [None]:
def create_dataset(conversations, lines):
    # Create a dictionary for fast line access
    line_dict = dict(zip(lines['line_id'], lines['text']))

    dataset = []

    # Use tqdm to show progress
    for _, row in tqdm(conversations.iterrows(), total=conversations.shape[0], desc="Processing conversations"):
        utterance_ids = row['utterance_ids'].strip('[]').replace("'", "").split(', ')

        # Initialize conversation history
        conversation_history = []

        for utterance_id in utterance_ids:
            # Access the text from the dictionary
            text = line_dict.get(utterance_id)
            if isinstance(text, str):  # Ensure text is a string
                # Add the current line to the history with [SEP] separator between turns
                conversation_history.append(f"{text} [SEP]")

                # Create a record with the current history and response
                if len(conversation_history) > 1:
                    # The last entry is the response
                    response = conversation_history[-1]
                    history = conversation_history[:-1]

                    # Ensure that the history is made of strings
                    dataset.append({
                        'conversation_history': ' '.join(h for h in history if isinstance(h, str)),
                        'response': response.strip(' [SEP]')  # Remove the last [SEP] in the response
                    })

    return pd.DataFrame(dataset)




In [None]:
# Create the dataset
start_time = time.time()
chatbot_dataset = create_dataset(conversations, lines)
end_time = time.time()

# Clean the conversation history and responses, replacing the original columns
chatbot_dataset['conversation_history'] = chatbot_dataset['conversation_history'].apply(clean_text)
chatbot_dataset['response'] = chatbot_dataset['response'].apply(clean_text)

# Display the first few rows of the cleaned dataset
#print(chatbot_dataset[['conversation_history', 'response']].head())
print(f"Execution Time: {end_time - start_time} seconds")



Processing conversations: 100%|██████████| 83097/83097 [00:04<00:00, 17739.94it/s]


Execution Time: 4.995898246765137 seconds


In [None]:
chatbot_dataset.head()

Unnamed: 0,conversation_history,response
0,Can we make this quick? Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break up on the quad. Again. SEP,Well I thought we'd start with pronunciation if that's okay with you.
1,Can we make this quick? Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break up on the quad. Again. SEP Well I thought we'd start with pronunciation if that's okay with you. SEP,Not the hacking and gagging and spitting part. Please.
2,Can we make this quick? Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break up on the quad. Again. SEP Well I thought we'd start with pronunciation if that's okay with you. SEP Not the hacking and gagging and spitting part. Please. SEP,Okay... then how 'bout we try out some French cuisine. Saturday? Night?
3,You're asking me out. That's so cute. What's your name again? SEP,Forget it.
4,No no it's my fault we didn't have a proper introduction SEP,Cameron.


In [None]:
chatbot_dataset.to_csv("/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/attempt_3.csv", index=False)

## Model


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.optim import AdamW
from tqdm import tqdm
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torch.nn.functional as F




In [None]:
# Load your dataset (adjust the path as necessary)
# chatbot_dataset = pd.read_csv('path/to/your/cornell_movie_chatbot_dataset.csv')


In [None]:
# Assuming chatbot_dataset is already defined
# Sample 30% of the dataset before splitting
sampled_dataset = chatbot_dataset.sample(frac=0.3, random_state=42)

# Split the dataset into train and test sets (80% train, 20% test)
train_data, test_data = train_test_split(sampled_dataset, test_size=0.2, random_state=42)

# Further split the train data into training and validation sets (80% train, 20% validation)
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

# Display the sizes of the sampled datasets
print(f"Training set size: {len(train_data)}")
print(f"Validation set size: {len(val_data)}")
print(f"Test set size: {len(test_data)}")




Training set size: 42502
Validation set size: 10626
Test set size: 13282


In [None]:
# Load the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Set pad token to be the same as the end-of-sequence token
tokenizer.padding_side = 'left'  # Ensure left-padding
tokenizer.pad_token = tokenizer.eos_token

# Adding special tokens to the tokenizer
special_tokens = {'additional_special_tokens': ['[USER]', '[BOT]', '[SEP]']}
tokenizer.add_special_tokens(special_tokens)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



3

In [None]:
# Define a new dataset class to maintain context with limited history and turn markers
class ContextualChatbotDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=100, history_window=3):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.history_window = history_window  # Number of turns to retain in history

    def __getitem__(self, idx):
        # Get the conversation history and the response
        # Split conversation history by the [SEP] token between turns
        conversation_history = self.data['conversation_history'].iloc[idx].split(" [SEP] ")
        response = self.data['response'].iloc[idx]

        # Limit the conversation history to the last `history_window` turns
        limited_history = conversation_history[-self.history_window:]

        # Add special tokens for user and bot turns
        input_text = f"[USER] {' [BOT] '.join(limited_history)} [BOT] {response}"

        # Tokenize the input
        encodings = self.tokenizer.encode_plus(
            input_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
            return_attention_mask=True,
            add_special_tokens=True,
        )

        return {
            'input_ids': encodings['input_ids'].squeeze(0),  # Remove batch dimension
            'attention_mask': encodings['attention_mask'].squeeze(0),  # Remove batch dimension
            'labels': encodings['input_ids'].squeeze(0),  # Set labels to input_ids for language modeling
        }

    def __len__(self):
        return len(self.data)

# Create datasets for training, validation, and testing
train_dataset = ContextualChatbotDataset(train_data, tokenizer, history_window=3)  # Limit to last 3 turns
val_dataset = ContextualChatbotDataset(val_data, tokenizer, history_window=3)
test_dataset = ContextualChatbotDataset(test_data, tokenizer, history_window=3)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, num_workers=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, num_workers=4, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, num_workers=4, shuffle=False)


In [15]:
# Load the pre-trained GPT-2 model
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer))

# Check for GPU Availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Move the model to the GPU
model.to(device)

# Define Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Define ReduceLROnPlateau scheduler
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, verbose=True)

# Gradient Accumulation Settings
gradient_accumulation_steps = 4  # Adjust as needed
best_val_loss = float('inf')
patience = 3  # Number of epochs to wait for improvement
epochs_without_improvement = 0

# Training Loop with Gradient Accumulation
for epoch in range(50):  # Number of epochs
    start_time = time.time()  # Start the timer for the epoch

    model.train()  # Set model to training mode
    total_loss = 0

    # Training phase
    for step, batch in enumerate(tqdm(train_loader, desc=f"Training Epoch {epoch + 1}")):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss  # Use standard cross-entropy loss

        # Normalize the loss to account for gradient accumulation
        loss = loss / gradient_accumulation_steps
        total_loss += loss.item()

        loss.backward()  # Backpropagate gradients

        # Update weights after accumulating gradients
        if (step + 1) % gradient_accumulation_steps == 0:
            optimizer.step()
            scheduler.step(total_loss / (step + 1))  # Update the scheduler with the average loss

    print(f"Epoch {epoch + 1}, Average Training Loss: {total_loss / len(train_loader):.4f}")

    # Validation phase
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Forward pass for validation
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
            val_loss += outputs.loss.item()

    val_loss /= len(val_loader)
    scheduler.step(val_loss)

    # Calculate the time taken for the epoch
    end_time = time.time()
    epoch_time = end_time - start_time  # Calculate how long the epoch took

    # Print training and validation losses along with epoch time
    print(f"Epoch {epoch + 1}, Validation Loss: {val_loss:.4f}, Time: {epoch_time:.2f} seconds")

    # Early stopping check
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_without_improvement = 0  # Reset the patience counter
    else:
        epochs_without_improvement += 1
        if epochs_without_improvement >= patience:
            print(f"Early stopping triggered after {epoch + 1} epochs.")
            break

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Using device: cuda


Training Epoch 1: 100%|██████████| 1329/1329 [08:39<00:00,  2.56it/s]


Epoch 1, Average Training Loss: 0.9076


Validating: 100%|██████████| 333/333 [00:45<00:00,  7.31it/s]


Epoch 1, Validation Loss: 2.1885, Time: 565.42 seconds


Training Epoch 2: 100%|██████████| 1329/1329 [08:47<00:00,  2.52it/s]


Epoch 2, Average Training Loss: 0.5969


Validating: 100%|██████████| 333/333 [00:45<00:00,  7.33it/s]


Epoch 2, Validation Loss: 2.1888, Time: 572.60 seconds


Training Epoch 3: 100%|██████████| 1329/1329 [08:46<00:00,  2.52it/s]


Epoch 3, Average Training Loss: 0.5967


Validating: 100%|██████████| 333/333 [00:45<00:00,  7.29it/s]


Epoch 3, Validation Loss: 2.1884, Time: 572.09 seconds


Training Epoch 4: 100%|██████████| 1329/1329 [08:46<00:00,  2.52it/s]


Epoch 4, Average Training Loss: 0.5968


Validating: 100%|██████████| 333/333 [00:45<00:00,  7.30it/s]


Epoch 4, Validation Loss: 2.1879, Time: 572.44 seconds


Training Epoch 5: 100%|██████████| 1329/1329 [08:47<00:00,  2.52it/s]


Epoch 5, Average Training Loss: 0.5966


Validating: 100%|██████████| 333/333 [00:45<00:00,  7.32it/s]


Epoch 5, Validation Loss: 2.1873, Time: 573.00 seconds


Training Epoch 6: 100%|██████████| 1329/1329 [08:46<00:00,  2.52it/s]


Epoch 6, Average Training Loss: 0.5964


Validating: 100%|██████████| 333/333 [00:45<00:00,  7.31it/s]


Epoch 6, Validation Loss: 2.1866, Time: 572.35 seconds


Training Epoch 7: 100%|██████████| 1329/1329 [08:47<00:00,  2.52it/s]


Epoch 7, Average Training Loss: 0.5964


Validating: 100%|██████████| 333/333 [00:45<00:00,  7.30it/s]


Epoch 7, Validation Loss: 2.1859, Time: 573.17 seconds


Training Epoch 8: 100%|██████████| 1329/1329 [08:47<00:00,  2.52it/s]


Epoch 8, Average Training Loss: 0.5962


Validating: 100%|██████████| 333/333 [00:45<00:00,  7.33it/s]


Epoch 8, Validation Loss: 2.1853, Time: 573.07 seconds


Training Epoch 9: 100%|██████████| 1329/1329 [08:47<00:00,  2.52it/s]


Epoch 9, Average Training Loss: 0.5960


Validating: 100%|██████████| 333/333 [00:45<00:00,  7.31it/s]


Epoch 9, Validation Loss: 2.1846, Time: 572.92 seconds


Training Epoch 10: 100%|██████████| 1329/1329 [08:47<00:00,  2.52it/s]


Epoch 10, Average Training Loss: 0.5961


Validating: 100%|██████████| 333/333 [00:45<00:00,  7.30it/s]


Epoch 10, Validation Loss: 2.1840, Time: 573.31 seconds


Training Epoch 11: 100%|██████████| 1329/1329 [08:47<00:00,  2.52it/s]


Epoch 11, Average Training Loss: 0.5960


Validating: 100%|██████████| 333/333 [00:45<00:00,  7.30it/s]


Epoch 11, Validation Loss: 2.1835, Time: 573.56 seconds


Training Epoch 12: 100%|██████████| 1329/1329 [08:46<00:00,  2.52it/s]


Epoch 12, Average Training Loss: 0.5958


Validating: 100%|██████████| 333/333 [00:45<00:00,  7.30it/s]


Epoch 12, Validation Loss: 2.1829, Time: 572.58 seconds


Training Epoch 13: 100%|██████████| 1329/1329 [08:47<00:00,  2.52it/s]


Epoch 13, Average Training Loss: 0.5955


Validating: 100%|██████████| 333/333 [00:45<00:00,  7.30it/s]


Epoch 13, Validation Loss: 2.1825, Time: 573.17 seconds


Training Epoch 14: 100%|██████████| 1329/1329 [08:47<00:00,  2.52it/s]


Epoch 14, Average Training Loss: 0.5958


Validating: 100%|██████████| 333/333 [00:45<00:00,  7.30it/s]


Epoch 14, Validation Loss: 2.1820, Time: 573.49 seconds


Training Epoch 15: 100%|██████████| 1329/1329 [08:46<00:00,  2.52it/s]


Epoch 15, Average Training Loss: 0.5955


Validating: 100%|██████████| 333/333 [00:45<00:00,  7.31it/s]


Epoch 15, Validation Loss: 2.1817, Time: 572.39 seconds


Training Epoch 16: 100%|██████████| 1329/1329 [08:47<00:00,  2.52it/s]


Epoch 16, Average Training Loss: 0.5956


Validating: 100%|██████████| 333/333 [00:45<00:00,  7.33it/s]


Epoch 16, Validation Loss: 2.1813, Time: 572.57 seconds


Training Epoch 17: 100%|██████████| 1329/1329 [08:46<00:00,  2.52it/s]


Epoch 17, Average Training Loss: 0.5953


Validating: 100%|██████████| 333/333 [00:45<00:00,  7.30it/s]


Epoch 17, Validation Loss: 2.1811, Time: 572.47 seconds


Training Epoch 18: 100%|██████████| 1329/1329 [08:47<00:00,  2.52it/s]


Epoch 18, Average Training Loss: 0.5955


Validating: 100%|██████████| 333/333 [00:45<00:00,  7.31it/s]


Epoch 18, Validation Loss: 2.1808, Time: 572.83 seconds


Training Epoch 19: 100%|██████████| 1329/1329 [08:47<00:00,  2.52it/s]


Epoch 19, Average Training Loss: 0.5953


Validating: 100%|██████████| 333/333 [00:45<00:00,  7.30it/s]


Epoch 19, Validation Loss: 2.1806, Time: 573.37 seconds


Training Epoch 20: 100%|██████████| 1329/1329 [08:47<00:00,  2.52it/s]


Epoch 20, Average Training Loss: 0.5951


Validating: 100%|██████████| 333/333 [00:45<00:00,  7.30it/s]


Epoch 20, Validation Loss: 2.1804, Time: 573.44 seconds


Training Epoch 21: 100%|██████████| 1329/1329 [08:48<00:00,  2.52it/s]


Epoch 21, Average Training Loss: 0.5952


Validating: 100%|██████████| 333/333 [00:45<00:00,  7.30it/s]


Epoch 21, Validation Loss: 2.1801, Time: 573.68 seconds


Training Epoch 22: 100%|██████████| 1329/1329 [08:48<00:00,  2.52it/s]


Epoch 22, Average Training Loss: 0.5952


Validating: 100%|██████████| 333/333 [00:45<00:00,  7.31it/s]


Epoch 22, Validation Loss: 2.1799, Time: 573.58 seconds


Training Epoch 23: 100%|██████████| 1329/1329 [08:48<00:00,  2.52it/s]


Epoch 23, Average Training Loss: 0.5952


Validating: 100%|██████████| 333/333 [00:45<00:00,  7.31it/s]


Epoch 23, Validation Loss: 2.1797, Time: 573.59 seconds


Training Epoch 24: 100%|██████████| 1329/1329 [08:47<00:00,  2.52it/s]


Epoch 24, Average Training Loss: 0.5953


Validating: 100%|██████████| 333/333 [00:45<00:00,  7.30it/s]


Epoch 24, Validation Loss: 2.1795, Time: 573.49 seconds


Training Epoch 25: 100%|██████████| 1329/1329 [08:47<00:00,  2.52it/s]


Epoch 25, Average Training Loss: 0.5950


Validating: 100%|██████████| 333/333 [00:45<00:00,  7.29it/s]


Epoch 25, Validation Loss: 2.1793, Time: 573.50 seconds


Training Epoch 26: 100%|██████████| 1329/1329 [08:47<00:00,  2.52it/s]


Epoch 26, Average Training Loss: 0.5949


Validating: 100%|██████████| 333/333 [00:45<00:00,  7.30it/s]


Epoch 26, Validation Loss: 2.1790, Time: 573.52 seconds


Training Epoch 27: 100%|██████████| 1329/1329 [08:47<00:00,  2.52it/s]


Epoch 27, Average Training Loss: 0.5946


Validating: 100%|██████████| 333/333 [00:45<00:00,  7.29it/s]


Epoch 27, Validation Loss: 2.1788, Time: 573.58 seconds


Training Epoch 28:  90%|████████▉ | 1191/1329 [07:53<00:54,  2.52it/s]


KeyboardInterrupt: 

In [None]:
# Saving the model
gpt_model = "/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt/base_gpt_model_3_1_1"
gpt_tokenizer = "/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt/base_gpt_tokenizer_3_1_1"

# Save the model and tokenizer
model.save_pretrained(gpt_model)
tokenizer.save_pretrained(gpt_tokenizer)


('/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt/base_gpt_tokenizer_3_1/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt/base_gpt_tokenizer_3_1/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt/base_gpt_tokenizer_3_1/vocab.json',
 '/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt/base_gpt_tokenizer_3_1/merges.txt',
 '/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt/base_gpt_tokenizer_3_1/added_tokens.json')

In [None]:
pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=9636436b04b7c6d0aaceeb0b84353a69c794b43f8f4e94e4508f06ca6f508ee7
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
import numpy as np
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from nltk.translate.meteor_score import meteor_score

In [None]:
# Sample function to compute the average loss and perplexity
def compute_average_loss_and_perplexity(loader, model, device):
    model.eval()
    total_loss = 0
    num_batches = 0

    # Start timing
    start_time = time.time()

    with torch.no_grad():
        # Use tqdm to show progress
        for batch in tqdm(loader, desc="Computing Average Loss and Perplexity"):
            input_ids = batch['input_ids'].to(device)
            outputs = model(input_ids=input_ids, labels=input_ids)
            loss = outputs.loss
            total_loss += loss.item()
            num_batches += 1

    avg_loss = total_loss / num_batches
    perplexity = np.exp(avg_loss)

    # Calculate the time taken
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Time taken: {elapsed_time:.2f} seconds")

    return avg_loss, perplexity

# Assuming you have a validation DataLoader
avg_loss, perplexity = compute_average_loss_and_perplexity(val_loader, model, device)
print(f"Average Loss: {avg_loss:.4f}, Perplexity: {perplexity:.4f}")

Computing Average Loss and Perplexity: 100%|██████████| 333/333 [00:45<00:00,  7.32it/s]

Time taken: 45.51 seconds
Average Loss: 4.6321, Perplexity: 102.7302





In [None]:
from rouge_score import rouge_scorer

# Initialize the ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Initialize dictionaries to store ROUGE scores
rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}

for ref, pred in zip(references, predictions):
    # Ensure both reference and prediction are not empty before scoring
    if ref.strip() and pred.strip():
        scores = scorer.score(ref, pred)
        rouge_scores['rouge1'].append(scores['rouge1'].fmeasure)
        rouge_scores['rouge2'].append(scores['rouge2'].fmeasure)
        rouge_scores['rougeL'].append(scores['rougeL'].fmeasure)

# Safeguard against division by zero by checking lengths
def safe_average(scores):
    return sum(scores) / len(scores) if scores else 0.0

# Calculate average ROUGE scores
average_rouge1 = safe_average(rouge_scores['rouge1'])
average_rouge2 = safe_average(rouge_scores['rouge2'])
average_rougeL = safe_average(rouge_scores['rougeL'])

# Print average ROUGE scores
print(f"Average ROUGE-1 Score: {average_rouge1:.4f}")
print(f"Average ROUGE-2 Score: {average_rouge2:.4f}")
print(f"Average ROUGE-L Score: {average_rougeL:.4f}")


Average ROUGE-1 Score: 0.9888
Average ROUGE-2 Score: 0.9887
Average ROUGE-L Score: 0.9888


In [None]:
# Required libraries
import torch  # For working with the model and tensors
from transformers import GPT2LMHeadModel, GPT2Tokenizer  # For GPT-2 model and tokenizer
from tqdm import tqdm  # For progress bars
import numpy as np  # For numerical operations
import nltk  # For tokenization and BLEU score calculation
from nltk.tokenize import word_tokenize  # For word tokenization
from nltk.translate.bleu_score import corpus_bleu  # For corpus-level BLEU score
import re  # For text normalization and cleaning

# For NLTK, download the 'punkt' tokenizer
nltk.download('punkt')


In [None]:
# Function to clean and normalize text for BLEU score calculation
def clean_text_eval(text):
    # Lowercase the text for case-insensitive comparison
    text = text.lower()

    # Retain specific punctuation and remove others
    text = re.sub(r"[^\w\s.!?'’]", '', text)  # Keep punctuation and apostrophes

    # Separate punctuation from words (optional but can help with BLEU n-gram matching)
    text = re.sub(r"([.!?'’])", r" \1", text)

    text = text.strip()  # Strip whitespace
    text = ' '.join(text.split())  # Normalize whitespace
    return text

# Generate predictions and references with normalization
model.eval()
predictions = []
references = []

start_time = time.time()

# Use tqdm for progress tracking
with torch.no_grad():
    with tqdm(total=len(test_loader), desc="Generating Predictions", bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}]", leave=True) as pbar:
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Generate predictions
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=50,
                num_return_sequences=1
            )

            # Decode the generated output
            generated_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

            # Clean and normalize predictions
            predictions.extend([clean_text_eval(pred) for pred in generated_texts])

            # Collect and normalize reference texts
            reference_texts = [tokenizer.decode(ids, skip_special_tokens=True) for ids in input_ids]
            references.extend([clean_text_eval(ref) for ref in reference_texts])

            # Update the progress bar
            pbar.update(1)

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken for generating predictions: {elapsed_time:.2f} seconds")

# Prepare the references in the required format for BLEU calculation
formatted_references = [[word_tokenize(ref)] for ref in references]  # Tokenize reference texts
formatted_predictions = [word_tokenize(pred) for pred in predictions]  # Tokenize generated texts

# Calculate BLEU score
bleu_score = corpus_bleu(formatted_references, formatted_predictions)
print(f"Corpus BLEU Score: {bleu_score:.4f}")


In [None]:
import nltk
nltk.download('wordnet')
nltk.download('punkt')  # Ensure 'punkt' is downloaded for tokenization
from nltk.translate import meteor_score
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Calculate METEOR scores with the unified clean_text_eval function
meteor_scores = []

# Tokenize references and predictions
for ref, pred in zip(references, predictions):
    # Clean and normalize both reference and prediction using clean_text_eval
    ref_clean = clean_text_eval(ref)
    pred_clean = clean_text_eval(pred)

    # Ensure neither reference nor prediction is empty before calculating METEOR
    if ref_clean.strip() and pred_clean.strip():
        # Tokenize both reference and prediction
        tokenized_ref = word_tokenize(ref_clean)
        tokenized_pred = word_tokenize(pred_clean)

        # Calculate the METEOR score using tokenized inputs
        score = meteor_score.single_meteor_score(tokenized_ref, tokenized_pred)
        meteor_scores.append(score)

# Safeguard against division by zero when averaging
average_meteor = sum(meteor_scores) / len(meteor_scores) if meteor_scores else 0.0

# Print average METEOR score
print(f"Average METEOR Score: {average_meteor:.4f}")

Average METEOR Score: 0.9920


In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Set the pad token to the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

# Move model to the device (CPU or GPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Set the model to evaluation mode
model.eval()

# Initialize conversation history
conversation_history = []

print("Chatbot is ready! Type 'exit' to stop the conversation.")

while True:
    # Get user input
    user_input = input("You: ")

    # Check for exit condition
    if user_input.lower() in ['exit', 'quit']:
        print("Chatbot: Goodbye!")
        break

    # Update conversation history
    conversation_history.append(f"User: {user_input}\n")

    # Keep the conversation history manageable (limiting to the last 5 exchanges)
    if len(conversation_history) > 10:  # 5 turns (user + bot)
        conversation_history = conversation_history[-10:]

    # Prepare the input by joining the conversation history
    input_text = "".join(conversation_history) + "Chatbot:"

    # Tokenize the input
    encoded_input = tokenizer.encode_plus(
        input_text,
        return_tensors='pt',
        padding='max_length',  # Pad input if necessary
        truncation=True,       # Truncate to max_length
        max_length=150         # Adjust max length as needed
    )

    input_ids = encoded_input['input_ids'].to(device)
    attention_mask = encoded_input['attention_mask'].to(device)

    # Generate output
    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=50,  # Generate up to 50 new tokens
            num_return_sequences=1,
            temperature=0.7,    # Control randomness in output
            top_k=50,           # Top-k sampling for variety
            top_p=0.95,         # Nucleus sampling
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode the generated output
    response = tokenizer.decode(output[0], skip_special_tokens=True)

    # Print chatbot's response
    print(f"Chatbot: {response}")

    # Append only the chatbot's latest response to the conversation history
    conversation_history.append(f"Chatbot: {response}\n")


Chatbot is ready! Type 'exit' to stop the conversation.
You: What is the meaning of life?
Chatbot: User: What is the meaning of life?
Chatbot:
Chatbot: What is the meaning of life?
Chatbot: What is the meaning of life?
Chatbot: What is the meaning of life?
Chatbot: What is the meaning of life?
Chatbot: What is
You: What is the meaning of life?
Chatbot: User: What is the meaning of life?
Chatbot: User: What is the meaning of life?
Chatbot:
Chatbot: What is the meaning of life?
Chatbot: What is the meaning of life?
Chatbot: What is the meaning of life?
Chatbot: What is the meaning of life?
Chatbot: What is
User: What is the meaning of life?
Chatbot:The meaning of life is
Chatbot:The meaning of life is
Chatbot:The meaning of life is
Chatbot:The meaning of life is
Chatbot:The meaning of life is
Chatbot:The meaning of life is


KeyboardInterrupt: Interrupted by user