In [1]:
import pandas as pd

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Define the file paths
lines_file = "/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/Cornell_Movie_Dialogue_Corpus/movie_lines.txt"
conversation_file = "/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/Cornell_Movie_Dialogue_Corpus/movie_conversations.txt"

# Read the lines file with specified encoding
lines = pd.read_csv(lines_file, sep=r'\s*\+\+\+\$\+\+\+\s*', header=None, engine='python', encoding='ISO-8859-1')
lines.columns = ['line_id', 'character_id', 'movie_id', 'character', 'text']

# Read the conversations file with specified encoding
conversations = pd.read_csv(conversation_file, sep=r'\s*\+\+\+\$\+\+\+\s*', header=None, engine='python', encoding='ISO-8859-1')
conversations.columns = ['character1_id', 'character2_id', 'movie_id', 'utterance_ids']

# Display the first few rows of each DataFrame
print(lines.head())
print(conversations.head())


  line_id character_id movie_id character          text
0   L1045           u0       m0    BIANCA  They do not!
1   L1044           u2       m0   CAMERON   They do to!
2    L985           u0       m0    BIANCA    I hope so.
3    L984           u2       m0   CAMERON     She okay?
4    L925           u0       m0    BIANCA     Let's go.
  character1_id character2_id movie_id                     utterance_ids
0            u0            u2       m0  ['L194', 'L195', 'L196', 'L197']
1            u0            u2       m0                  ['L198', 'L199']
2            u0            u2       m0  ['L200', 'L201', 'L202', 'L203']
3            u0            u2       m0          ['L204', 'L205', 'L206']
4            u0            u2       m0                  ['L207', 'L208']


In [4]:
from tqdm import tqdm

# Create a function to extract the conversation history and responses
def create_dataset(conversations, lines):
    # Create a dictionary for fast line access
    line_dict = dict(zip(lines['line_id'], lines['text']))

    dataset = []

    # Use tqdm to show progress
    for _, row in tqdm(conversations.iterrows(), total=conversations.shape[0], desc="Processing conversations"):
        utterance_ids = row['utterance_ids'].strip('[]').replace("'", "").split(', ')

        # Initialize conversation history
        conversation_history = []

        for utterance_id in utterance_ids:
            # Access the text from the dictionary
            text = line_dict.get(utterance_id)
            if isinstance(text, str):  # Ensure text is a string
                # Add the current line to the history
                conversation_history.append(text)

                # Create a record with the current history and response
                if len(conversation_history) > 1:
                    # The last entry is the response
                    response = conversation_history[-1]
                    history = conversation_history[:-1]

                    # Ensure that the history is made of strings
                    dataset.append({
                        'conversation_history': ' '.join(h for h in history if isinstance(h, str)),
                        'response': response
                    })

    return pd.DataFrame(dataset)

# Create the dataset
import time

start_time = time.time()
chatbot_dataset = create_dataset(conversations, lines)
end_time = time.time()

# Display the first few rows of the dataset
print(f"Execution Time: {end_time - start_time} seconds")




Processing conversations: 100%|██████████| 83097/83097 [00:04<00:00, 18233.39it/s]


Execution Time: 4.896346092224121 seconds


In [5]:
chatbot_dataset.head()

Unnamed: 0,conversation_history,response
0,Can we make this quick? Roxanne Korrine and A...,"Well, I thought we'd start with pronunciation,..."
1,Can we make this quick? Roxanne Korrine and A...,Not the hacking and gagging and spitting part....
2,Can we make this quick? Roxanne Korrine and A...,Okay... then how 'bout we try out some French ...
3,You're asking me out. That's so cute. What's ...,Forget it.
4,"No, no, it's my fault -- we didn't have a prop...",Cameron.


In [6]:
chatbot_dataset.to_csv("/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/attempt_2.csv", index=False)

## Model


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW
from tqdm import tqdm


In [8]:
# Load your dataset (adjust the path as necessary)
# chatbot_dataset = pd.read_csv('path/to/your/cornell_movie_chatbot_dataset.csv')


In [9]:
# Split the dataset into train and test sets (80% train, 20% test)
train_data, test_data = train_test_split(chatbot_dataset, test_size=0.2, random_state=42)

# Further split the train data into training and validation sets (80% train, 20% validation)
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

# Display the sizes of the datasets
print(f"Training set size: {len(train_data)}")
print(f"Validation set size: {len(val_data)}")
print(f"Test set size: {len(test_data)}")



Training set size: 141674
Validation set size: 35419
Test set size: 44274


In [10]:
# Load the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Set pad token to be the same as the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the datasets
def tokenize_data(data):
    return tokenizer(
        data['conversation_history'].tolist(),
        truncation=True,
        padding='max_length',
        max_length=100,
        return_tensors='pt'
        )

# Tokenize each split
train_encodings = tokenize_data(train_data)
val_encodings = tokenize_data(val_data)
test_encodings = tokenize_data(test_data)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [11]:
class ChatbotDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings['input_ids'])

# Create datasets for training, validation, and testing
train_dataset = ChatbotDataset(train_encodings)
val_dataset = ChatbotDataset(val_encodings)
test_dataset = ChatbotDataset(test_encodings)


In [12]:
# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, num_workers=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, num_workers=4, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, num_workers=4, shuffle=False)


In [13]:
# Load the pre-trained GPT-2 model
model = GPT2LMHeadModel.from_pretrained('gpt2')


In [14]:
# Step 9: Check for GPU Availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Move the model to the GPU
model.to(device)

# Step 10: Define Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Step 11: Training Loop with Validation
for epoch in range(3):  # Number of epochs
    start_time = time.time()  # Start time for the epoch
    model.train()  # Set model to training mode

    # Training phase
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}"):
        optimizer.zero_grad()

        # Move input data to the GPU
        input_ids = batch['input_ids'].to(device)

        outputs = model(input_ids=input_ids, labels=input_ids)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Validation step
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            # Move input data to the GPU
            input_ids = batch['input_ids'].to(device)
            outputs = model(input_ids=input_ids, labels=input_ids)
            val_loss += outputs.loss.item()

    # Calculate the time taken for the epoch
    end_time = time.time()
    epoch_time = end_time - start_time

    # Print training and validation losses along with epoch time
    print(f"Epoch: {epoch + 1}, Training Loss: {loss.item():.4f}, Validation Loss: {val_loss / len(val_loader):.4f}, Time: {epoch_time:.2f} seconds")


Using device: cuda


Training Epoch 1: 100%|██████████| 8855/8855 [30:56<00:00,  4.77it/s]
Validating: 100%|██████████| 2214/2214 [02:20<00:00, 15.78it/s]


Epoch: 1, Training Loss: 1.7537, Validation Loss: 1.1485, Time: 1996.72 seconds


Training Epoch 2: 100%|██████████| 8855/8855 [30:56<00:00,  4.77it/s]
Validating: 100%|██████████| 2214/2214 [02:20<00:00, 15.76it/s]


Epoch: 2, Training Loss: 1.1251, Validation Loss: 0.9990, Time: 1996.63 seconds


Training Epoch 3: 100%|██████████| 8855/8855 [30:56<00:00,  4.77it/s]
Validating: 100%|██████████| 2214/2214 [02:20<00:00, 15.75it/s]

Epoch: 3, Training Loss: 0.9020, Validation Loss: 0.8982, Time: 1997.07 seconds





In [15]:
# Saving the model
gpt_model = "/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt/base_gpt_model"
gpt_tokenizer = "/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt/base_gpt_tokenizer"

# Save the model and tokenizer
model.save_pretrained(gpt_model)
tokenizer.save_pretrained(gpt_tokenizer)


('/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt/base_gpt_tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt/base_gpt_tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt/base_gpt_tokenizer/vocab.json',
 '/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt/base_gpt_tokenizer/merges.txt',
 '/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt/base_gpt_tokenizer/added_tokens.json')

In [25]:
pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=3a3dd6912fbe83279b5ab3c24548933089068a3e2fb36ed515a655a855f74568
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [26]:
import numpy as np
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from nltk.translate.meteor_score import meteor_score

In [28]:
# Sample function to compute the average loss and perplexity
def compute_average_loss_and_perplexity(loader, model, device):
    model.eval()
    total_loss = 0
    num_batches = 0

    # Start timing
    start_time = time.time()

    with torch.no_grad():
        # Use tqdm to show progress
        for batch in tqdm(loader, desc="Computing Average Loss and Perplexity"):
            input_ids = batch['input_ids'].to(device)
            outputs = model(input_ids=input_ids, labels=input_ids)
            loss = outputs.loss
            total_loss += loss.item()
            num_batches += 1

    avg_loss = total_loss / num_batches
    perplexity = np.exp(avg_loss)

    # Calculate the time taken
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Time taken: {elapsed_time:.2f} seconds")

    return avg_loss, perplexity

# Assuming you have a validation DataLoader
avg_loss, perplexity = compute_average_loss_and_perplexity(val_loader, model, device)
print(f"Average Loss: {avg_loss:.4f}, Perplexity: {perplexity:.4f}")

Computing Average Loss and Perplexity: 100%|██████████| 2214/2214 [02:20<00:00, 15.75it/s]

Time taken: 140.54 seconds
Average Loss: 0.8982, Perplexity: 2.4551





In [35]:
import numpy as np
import torch
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from nltk.translate.meteor_score import meteor_score
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the GPT-2 tokenizer with left padding
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', padding_side='left')
tokenizer.pad_token = tokenizer.eos_token  # Assign pad token
tokenizer.pad_token_id = tokenizer.eos_token_id  # Ensure padding ID is the same as EOS

# Load the pre-trained GPT-2 model
model = GPT2LMHeadModel.from_pretrained('gpt2')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Step 3: Generate Predictions for the Test Dataset
model.eval()
predictions = []
references = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Generating Predictions"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)  # Ensure the attention mask is used

        # Generate outputs
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=50,
            num_return_sequences=1
        )

        # Decode the generated output
        generated_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
        predictions.extend(generated_texts)

        # Collect reference texts (assuming input_ids corresponds to the expected outputs)
        reference_texts = [tokenizer.decode(ids, skip_special_tokens=True) for ids in input_ids]
        references.extend(reference_texts)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Generating Predictions:  10%|▉         | 269/2768 [02:38<24:27,  1.70it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Generating Predictions:  10%|▉         | 270/2768 [02:39<24:26,  1.70it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Generating Predictions:  10%|▉         | 271/2768 [02:39<24:26,  1.70it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initia

In [36]:
# Function to compute BLEU, ROUGE, and METEOR scores
def compute_nlp_metrics(predictions, references):
    # Start timing
    start_time = time.time()

    # BLEU score
    bleu_scores = []
    for pred, ref in tqdm(zip(predictions, references), desc="Calculating BLEU"):
        bleu_scores.append(sentence_bleu([ref.split()], pred.split()))
    avg_bleu = np.mean(bleu_scores)

    # ROUGE score
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = []
    for pred, ref in tqdm(zip(predictions, references), desc="Calculating ROUGE"):
        rouge_scores.append(scorer.score(ref, pred))
    avg_rouge = {
        'rouge1': np.mean([score['rouge1'].fmeasure for score in rouge_scores]),
        'rouge2': np.mean([score['rouge2'].fmeasure for score in rouge_scores]),
        'rougeL': np.mean([score['rougeL'].fmeasure for score in rouge_scores]),
    }

    # METEOR score
    meteor_scores = []
    for pred, ref in tqdm(zip(predictions, references), desc="Calculating METEOR"):
        meteor_scores.append(meteor_score(ref.split(), pred.split()))
    avg_meteor = np.mean(meteor_scores)

    # Calculate total time taken
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Time taken for metric calculations: {elapsed_time:.2f} seconds")

    return avg_bleu, avg_rouge, avg_meteor

# Compute BLEU, ROUGE, and METEOR scores using the test dataset
avg_bleu, avg_rouge, avg_meteor = compute_nlp_metrics(predictions, references)

# Print the results
print(f"Average BLEU: {avg_bleu:.4f}, Average ROUGE: {avg_rouge}, Average METEOR: {avg_meteor:.4f}")

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Calculating BLEU: 44274it [00:10, 4044.22it/s]
Calculating ROUGE: 44274it [01:32, 479.85it/s]
Calculating METEOR: 0it [00:00, ?it/s]


TypeError: "reference" expects pre-tokenized reference (Iterable[str]): Are

In [23]:
# Load the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Set the pad token to the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token  # Assign pad token
tokenizer.pad_token_id = tokenizer.eos_token_id  # Ensure padding ID is the same as EOS

# Set the model to evaluation mode
model.eval()

print("Chatbot is ready! Type 'exit' to stop the conversation.")

while True:
    # Get user input
    input_text = input("You: ")

    # Check for exit condition
    if input_text.lower() in ['exit', 'quit']:
        print("Chatbot: Goodbye!")
        break

    # Tokenize the input
    encoded_input = tokenizer.encode_plus(
        input_text,
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        max_length=50,
        clean_up_tokenization_spaces=True  # Set to avoid warnings
    )

    input_ids = encoded_input['input_ids'].to(device)
    attention_mask = encoded_input['attention_mask'].to(device)

    # Generate output
    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=50,  # Generate up to 50 new tokens
            num_return_sequences=1
        )

    # Decode the generated output
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    print(f"Chatbot: {response}")


Chatbot is ready! Type 'exit' to stop the conversation.
You: Hello


Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Chatbot: Hello, I'm calling from... If you're selling something, I'm late for work... I'm calling from Professional Nurse Quarterly... From the magazine? This is Mr. Wallace in Subscriptions? How come you're calling me from...? This is
You: what are you talking about


Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Chatbot: what are you talking about
You: who are you calling?


Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Chatbot: who are you calling?
You: you are copying me


Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Chatbot: you are copying me
You: stop


Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Chatbot: stop, I'm sorry.  I was just... I was just... I was just... I was just... I was just... I was just so excited.  I was so excited.  I was so excited to get out of here. 
You: you are getting out of the computer?


Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Chatbot: you are getting out of the computer?
You: what?


Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Chatbot: what?
You: help me


Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Chatbot: help me
You: please


Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Chatbot: please, I'm sorry.  I was just... I was just... I was just... I was just... I was just so excited.  I was so excited to get out of here.  I was so excited to get back.  And
You: no


Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Chatbot: no, I'm not.  I'm not.  I'm not.  I'm not.  I'm not.  I'm not.  I'm not.  I'm not.  I'm not.  I'm not.
You: exit
Chatbot: Goodbye!
