In [4]:
#!pip install torch transformers nltk

import nltk
nltk.download('punkt')

# Set the device (GPU if available, else CPU)
#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#print(f"Using device: {device}")

[nltk_data] Downloading package punkt to C:\Users\Pete
[nltk_data]     P\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Data Preprocessing

In [5]:
# Load and Parse Data
import os
import torch
from transformers import GPT2Tokenizer

# Initialize the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Set the pad_token to the eos_token and specify padding side
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left'  # For decoder-only models like GPT-2

# Define the paths to the dataset files
lines_file = "dataset/movie_lines.txt"
conversations_file = "dataset/movie_conversations.txt"

# Function to load and parse the movie lines
def load_lines(file_path):
    """
    Load and parse the movie lines from the dataset.
    Returns a dictionary mapping line IDs to text.
    """
    lines = {}
    # Open the file with the appropriate encoding to handle special characters
    with open(file_path, encoding='iso-8859-1') as f:
        for line in f:
            # Split each line into its components
            parts = line.strip().split(" +++$+++ ")
            if len(parts) == 5:
                line_id = parts[0]  # Unique identifier for the line
                text = parts[4]     # The actual dialogue text
                lines[line_id] = text  # Store in a dictionary
    return lines

# Function to load and parse the movie conversations
def load_conversations(file_path):
    """
    Load and parse the movie conversations from the dataset.
    Returns a list of conversations, each conversation is a list of line IDs.
    """
    conversations = []
    # Open the file with the appropriate encoding
    with open(file_path, encoding='iso-8859-1') as f:
        for line in f:
            # Split each line into its components
            parts = line.strip().split(" +++$+++ ")
            if len(parts) == 4:
                # The fourth part contains the list of line IDs in a conversation
                line_ids = eval(parts[3])  # Convert the string representation of the list to an actual list
                conversations.append(line_ids)
    return conversations

# Function to extract dialogues based on the conversations and lines
def extract_dialogues(conversations, lines):
    """
    Extract the dialogues from the conversations using the line IDs.
    Returns a list of dialogues, each dialogue is a list of utterances.
    """
    dialogues = []
    for conv in conversations:
        conv_dialogues = []
        for line_id in conv:
            if line_id in lines:
                conv_dialogues.append(lines[line_id])  # Append the dialogue text for each line ID
        dialogues.append(conv_dialogues)
    return dialogues

# Load the data
lines = load_lines(lines_file)
conversations = load_conversations(conversations_file)
dialogues = extract_dialogues(conversations, lines)

# Display sample dialogues for inspection
print("Sample Dialogues:")
for i in range(2):
    print(f"Conversation {i+1}: {dialogues[i]}")




Sample Dialogues:
Conversation 1: ['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.', "Well, I thought we'd start with pronunciation, if that's okay with you.", 'Not the hacking and gagging and spitting part.  Please.', "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?"]
Conversation 2: ["You're asking me out.  That's so cute. What's your name again?", 'Forget it.']


#### Explanation:
- The pad_token is set to the eos_token (<|endoftext|>) to handle padding.
- The padding side is set to 'left' because GPT-2 is a decoder-only model.
- Load the dataset files and parse them to extract dialogues.
- The load_lines function reads movie_lines.txt and stores each line's text with its unique ID.
- The load_conversations function reads movie_conversations.txt and extracts sequences of line IDs representing conversations.
- The extract_dialogues function links line IDs to their actual text to reconstruct the conversations.

### Prepare Data for Training

In [6]:
# Set the maximum number of previous exchanges to use as context
max_history = 5

# Function to build input-target pairs from dialogues
def build_inputs_targets(dialogues):
    """
    Build input-target pairs for training.
    Each input consists of a context (history of utterances),
    and the target is the next reply.
    """
    inputs = []
    for conv in dialogues:
        for i in range(1, len(conv)):
            # Use up to max_history previous exchanges as context
            history = conv[max(0, i - max_history):i]
            reply = conv[i]  # The current reply
            inputs.append((history, reply))  # Store the context and the reply
    return inputs

# Build the dataset
data = build_inputs_targets(dialogues)

# Initialize the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left'

# Function to tokenize and encode the data using GPT-2 tokenizer
def encode_data(data, tokenizer):
    encoded_inputs = []
    for history, reply in data:
        input_text = "<|endoftext|>".join(history) + "<|endoftext|>"
        full_text = input_text + reply + "<|endoftext|>"
        encoded = tokenizer.encode(
            full_text,
            add_special_tokens=False,
            max_length=1024,
            truncation=True
        )
        encoded_inputs.append(torch.tensor(encoded))
    return encoded_inputs

# Encode the data
encoded_data = encode_data(data, tokenizer)

# Display sample encoded data
print("\nSample Encoded Data:")
for i in range(2):
    print(f"Encoded Sample {i+1}: {encoded_data[i][:10]}... (length: {len(encoded_data[i])})")


Sample Encoded Data:
Encoded Sample 1: tensor([ 6090,   356,   787,   428,  2068,    30,   220, 34821, 21952, 14769])... (length: 49)
Encoded Sample 2: tensor([ 6090,   356,   787,   428,  2068,    30,   220, 34821, 21952, 14769])... (length: 63)


- Prepare the data by creating input-target pairs, where the input is the conversation history and the target is the reply.
- The encode_data function tokenizes and encodes the combined text using the GPT-2 tokenizer.
- Ensure that the input sequences are truncated to a maximum length to fit the model's requirements.

### Create Dataset and DataLoader

In [7]:
from torch.utils.data import Dataset, DataLoader

# Custom dataset class for our chatbot data
class ChatDataset(Dataset):
    def __init__(self, encoded_data):
        self.data = encoded_data  # Store the encoded data

    def __len__(self):
        return len(self.data)  # Return the total number of samples

    def __getitem__(self, idx):
        return self.data[idx]  # Retrieve a sample by index

# Collate function to pad sequences within a batch and create attention masks and labels
def collate_fn(batch):
    """
    Collate function to process batches of data.
    Pads input sequences, creates attention masks and labels.
    """
    # Pad sequences to the maximum length in the batch
    input_ids = torch.nn.utils.rnn.pad_sequence(
        batch, batch_first=True, padding_value=tokenizer.pad_token_id
    )
    # Create attention masks (1 where input_ids is not pad_token_id, 0 otherwise)
    attention_mask = (input_ids != tokenizer.pad_token_id).long()
    # Create labels; set pad_token_id to -100 so that they are ignored in loss computation
    labels = input_ids.clone()
    labels[input_ids == tokenizer.pad_token_id] = -100
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}

# To use a subset of the data for faster training
subset_size = 50000  # Adjust based on our computational resources
encoded_data = encoded_data[:subset_size]


# Split the data into training and validation sets
from sklearn.model_selection import train_test_split

# First, split the data into training and test sets (80% train, 20% test)
train_data, test_data = train_test_split(encoded_data, test_size=0.2, random_state=42)

# Then, split the training data into training and validation sets (80% train, 20% val)
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

# Verify the sizes of the splits
print(f"Training set size: {len(train_data)}")
print(f"Validation set size: {len(val_data)}")
print(f"Test set size: {len(test_data)}")

# Create dataset instances
train_dataset = ChatDataset(train_data)
val_dataset = ChatDataset(val_data)
test_dataset = ChatDataset(test_data)

# Define batch size
batch_size = 6  # Adjust based on available resources

# Create data loaders
train_loader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn
)
val_loader = DataLoader(
    val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn
)
test_loader = DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn
)

Training set size: 32000
Validation set size: 8000
Test set size: 10000


- Create a custom ChatDataset class to handle our data.
- The collate_fn function pads sequences in a batch to the same length.
- Use DataLoader to efficiently load data during training.
- A subset of the data is used to speed up training when resources are limited

## Model Design and Initialization

In [8]:
from transformers import GPT2LMHeadModel

# Specify the model name; use a smaller model if resources are limited
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Initialize the GPT-2 model
model_name = 'gpt2-medium'  # We can use 'gpt2-medium' if using a GPU

model = GPT2LMHeadModel.from_pretrained(model_name)
model.to(device)

# Display model size
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Total Parameters: {count_parameters(model):,}")

Total Parameters: 354,823,168


Initialize the pre-trained GPT-2 model and move it to the appropriate device.

Load the GPT-2 model (chose 'gpt2-medium') and set up the device (CPU).

### Define Evaluation Metrics

In [11]:
from tqdm import tqdm
import math

# Function to evaluate the model on a validation set
def evaluate(model, dataloader):
    """
    Evaluate the model on the validation set.
    Computes perplexity as the evaluation metric.
    """
    model.eval()  # Set the model to evaluation mode
    total_loss = 0
    total_tokens = 0
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            batch = {k: v.to(device) for k, v in batch.items()}
            # Forward pass with labels to compute loss
            outputs = model(**batch)
            loss = outputs.loss  # Mean loss per token in batch
            num_tokens = batch['attention_mask'].sum().item()
            # Accumulate loss and token counts
            total_loss += loss.item() * num_tokens  # Total loss over all tokens in batch
            total_tokens += num_tokens
    # Calculate perplexity
    avg_loss = total_loss / total_tokens
    perplexity = math.exp(avg_loss)
    return perplexity


- The evaluate function computes the perplexity of the model on the validation set.
- Perplexity is a measure of how well the model predicts the sample; lower values indicate better performance.
- Use torch.no_grad() to disable gradient computation during evaluation for efficiency.

### Training Loop

In [None]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

# Set the number of training epochs
epochs = 5  # Increase if computational resources allow

# Initialize the optimizer
optimizer = AdamW(model.parameters(), lr=3e-5)

# Total number of training steps
total_steps = len(train_loader) * epochs

# Create a learning rate scheduler
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=500, num_training_steps=total_steps
)

# Initialize variables for tracking best performance
best_val_perplexity = float('inf')
best_model_state = None

# Training loop
for epoch in range(epochs):
    total_train_loss = 0
    total_train_tokens = 0  # Initialize total_train_tokens
    model.train()  # Set the model to training mode
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()  # Reset gradients
        # Forward pass with labels to compute loss
        outputs = model(**batch)
        loss = outputs.loss  # Mean loss per token
        num_tokens = batch['attention_mask'].sum().item()
        # Backward pass to compute gradients
        loss.backward()
        # Update parameters
        optimizer.step()
        # Update learning rate
        scheduler.step()
        # Accumulate training loss
        total_train_loss += loss.item() * num_tokens
        total_train_tokens += num_tokens
    # Calculate average training loss
    avg_train_loss = total_train_loss / total_train_tokens
    print(f"Epoch {epoch+1}, Average Training Loss: {avg_train_loss:.4f}")
    # Evaluate on validation set
    val_perplexity = evaluate(model, val_loader)
    print(f"Epoch {epoch+1}, Validation Perplexity: {val_perplexity:.4f}")
    if val_perplexity < best_val_perplexity:
        best_val_perplexity = val_perplexity
        best_model_state = model.state_dict()
        print(f"Validation perplexity improved; model saved.")
    else:
        print(f"No improvement in validation perplexity.")

# Load the best model state before testing
if best_model_state is not None:
    model.load_state_dict(best_model_state)
else:
    print("No improvement during training; using last epoch model.")

# Save the best model
model_save_path = 'trained_model_best.pt'
torch.save(model.state_dict(), model_save_path)
print(f"Best model saved to {model_save_path}")

Training Epoch 1:   0%|                                                            | 6/5334 [05:55<73:28:03, 49.64s/it]

- Train the model using the AdamW optimizer and a linear learning rate scheduler.
- The training loop iterates over epochs and batches, computing loss, performing backpropagation, and updating model parameters.
- After each epoch, evaluate the model on the validation set and compute perplexity.
- The trained model is saved to a file to avoid retraining in the future.

## Implement Evaluation Metrics

In [31]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import nltk

# Download NLTK data required for BLEU score calculation
nltk.download('punkt')

# Evaluate the final model on the test set
test_perplexity = evaluate(model, test_loader)
print(f"Final Test Perplexity: {test_perplexity:.4f}")


# Function to calculate the BLEU score on the validation set
def calculate_bleu(model, dataloader, tokenizer):
    """
    Calculate the average BLEU score on the validation set.
    """
    model.eval()  # Set the model to evaluation mode
    bleu_scores = []
    smooth_fn = SmoothingFunction().method1  # Smoothing function for BLEU score
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Calculating BLEU"):
            batch = {k: v.to(device) for k, v in batch.items()}
            # Generate predictions
            outputs = model.generate(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                max_length=batch['input_ids'].shape[1] + 50,
                pad_token_id=tokenizer.pad_token_id,
                do_sample=False,
            )
            for i in range(batch['input_ids'].size(0)):
                # Decode the reference and hypothesis texts
                input_len = (batch['input_ids'][i] != tokenizer.pad_token_id).sum().item()
                reference_ids = batch['input_ids'][i][input_len:].tolist()
                hypothesis_ids = outputs[i][input_len:].tolist()
                reference = tokenizer.decode(reference_ids, skip_special_tokens=True)
                hypothesis = tokenizer.decode(hypothesis_ids, skip_special_tokens=True)
                # Tokenize the texts
                reference_tokens = nltk.word_tokenize(reference)
                hypothesis_tokens = nltk.word_tokenize(hypothesis)
                # Compute BLEU score
                bleu = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smooth_fn)
                bleu_scores.append(bleu)
    # Calculate average BLEU score
    average_bleu = sum(bleu_scores) / len(bleu_scores)
    return average_bleu

# Calculate BLEU score on the validation set
bleu_score = calculate_bleu(model, val_loader, tokenizer)
print(f"Validation BLEU Score: {bleu_score:.4f}")

[nltk_data] Downloading package punkt to C:\Users\Pete
[nltk_data]     P\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Evaluating:   0%|▏                                                                 | 6/1667 [02:47<12:50:35, 27.84s/it]

KeyboardInterrupt



- The calculate_bleu function evaluates the model's predictions against the ground truth using the BLEU score.
- BLEU score measures the similarity between the generated text and reference text.
- Smoothing is applied to handle cases where the hypothesis and reference have few overlapping n-grams.

## Interaction with the Chatbot

In [32]:
# Implement Content Filtering
import re

# List of disallowed words or phrases to filter out
disallowed_words = [
    # Offensive Language
    r'self_harm', r'suicide', r'death', r'kill',
    # Profanity (replace with actual words or leave as *)
    #r'\b(f**k|s**t|b***h|d**n|a**hole|c**t|sl*t|pr*ck|b**tard)\b',
    # Slurs or Hate Speech 
    #r'\b(racial_slur1|racial_slur2|homophobic_slur1|homophobic_slur2|gender_slur|ethnic_slur|religious_slur)\b',  
    # Explicit Sexual Content
    r'\b(porn|nude|sex|explicit|xxx|erotic|fetish|incest|rape|molest)\b',
    # Spam or Malicious Content
    #r'\b(spam|phishing|malware|virus|adware|trojan|scam|fraud)\b',
    # Violence or Graphic Content
    r'\b(assault|gun|shooting|bomb|explosion|attack|blood|mutilation|massacre)\b',
    # Misinformation and Conspiracy Theories
    #r'\b(fake_news|hoax|conspiracy|anti_vax|flat_earth|election_fraud)\b',
    # Harmful Behavior or Self-Harm
    r'\b(cutting|eating_disorder|anorexia|bulimia|starving|addiction)\b',
]

# Escape special regex characters and compile the pattern
pattern = re.compile(r'\b(' + '|'.join([re.escape(word) for word in disallowed_words]) + r')\b', re.IGNORECASE)

def contains_disallowed_content(text):
    return bool(pattern.search(text))

- Define patterns for disallowed content to prevent the chatbot from generating inappropriate responses.
- The contains_disallowed_content function checks if the generated text contains any disallowed words or phrases.

### Chatbot Class Implementation

In [33]:
class Chatbot:
    def __init__(self, model, tokenizer, max_history=4):
        self.model = model  # The language generation model
        self.tokenizer = tokenizer  # Tokenizer to encode/decode text
        self.max_history = max_history  # Number of previous exchanges to consider
        self.chat_history = []  # Stores the conversation history
        self.device = device  # Ensure device is set
        # Set pad_token to eos_token for models like GPT-2 that do not have a pad_token
        self.tokenizer.pad_token = self.tokenizer.eos_token

    def reset_history(self):
        self.chat_history = []

    def get_response(self, user_input):
        # Detect inappropriate or harmful content
        if contains_disallowed_content(user_input):
            return "I'm sorry, but I cannot assist with that request."
            
        self.chat_history.append(user_input)
        # Keep only the last max_history exchanges
        history = self.chat_history[-self.max_history:]
        # Combine the history into a single input string
        input_text = "<|endoftext|>".join(history) + "<|endoftext|>"
        
        # Encode the input text with truncation and padding
        encoding = self.tokenizer(
            input_text,
            return_tensors='pt',
            max_length=512,  # Truncate if necessary
            truncation=True,
            padding=True
        )
        input_ids = encoding['input_ids'].to(self.device)
        attention_mask = encoding['attention_mask'].to(self.device)
        # Generate a response
        output = self.model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=50,  # Generate up to 50 new tokens
            pad_token_id=self.tokenizer.pad_token_id,
            do_sample=True,  # Use sampling for more varied responses
            top_p=0.92,      # Nucleus sampling parameter
            top_k=40,        # Top-k sampling parameter
            temperature=0.8, # Sampling temperature
            eos_token_id=self.tokenizer.eos_token_id,
            num_return_sequences=1,  # Generate one response at a time
            repetition_penalty=1.3,  # Penalize repetition for better response diversity
            no_repeat_ngram_size=3   # Prevent repetition of n-grams
        )
        # Extract the generated tokens beyond the input length
        generated_tokens = output[0][input_ids.size(-1):]
        # Decode the generated tokens to text
        reply = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
        # Apply content filtering
        if contains_disallowed_content(reply):
            reply = "I'm sorry, but I can't assist with that request."
            
        self.chat_history.append(reply)
        return reply.strip()

- The Chatbot class encapsulates the chatbot's functionality.
- It maintains a conversation history to provide context for multi-turn conversations.
- The get_response method generates a response using the model and applies content filtering.
- Sampling parameters (top_p, top_k, temperature) control the diversity of the generated responses.

## Save and Load the Trained Model

In [34]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Set the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Specify the model name used during training
model_name = 'gpt2-medium'  # Replace with 'gpt2', 'gpt2-large', etc., as appropriate

# Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left'

# Load the model architecture
model = GPT2LMHeadModel.from_pretrained(model_name)

# Load the saved model weights
model_load_path = 'trained_model.pt'
try:
    model.load_state_dict(torch.load(model_load_path, map_location=device))
    print("Model loaded successfully!")
except RuntimeError as e:
    print(f"Error loading model state_dict: {e}")
    print("Please ensure that the model architecture matches the saved model weights.")
    exit(1)

model.to(device)
model.eval()  # Set the model to evaluation mode

  model.load_state_dict(torch.load(model_load_path, map_location=device))


Model loaded successfully!


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50257, bias=False)
)

- Load the saved model weights into the model architecture.
- The model is set to evaluation mode to disable dropout and other training-specific layers.
- This allows us to use the trained model for generating responses.

### Interact with the Chatbot via Console

In [36]:
# Instantiate the chatbot
chatbot = Chatbot(model, tokenizer)

# Start the conversation loop
print("Bot: Hello! How can I assist you today?")
while True:
    user_input = input("You: ").strip()
    if user_input.lower() in ["exit", "quit", "goodbye"]:
        print("Bot: Goodbye! Have a great day.")
        break
    response = chatbot.get_response(user_input)
    print(f"Bot: {response}")

Bot: Hello! How can I assist you today?


You:  Who are you?


Bot: My name is Marcus. I am a professional thief and carjacker who likes to gamble. I'm also an ex-cop, but I still like the idea of a good story for my clients and myself in this area... So if we got


You:  Are you good with your work? Sounds not fun haha


Bot: And what's better than good news at that moment? News from a young woman whose life has been turned upside down when she finds out her mother was killed by another man named Frank.  She wants revenge! What does she want? Is it love


You:  Do take care of your self


Bot: You will never find me without a watch. I was given one as part 'a kindness from a friend'.  Now listen carefully: "A dear old lady gave us a crystal timepiece." That's all there would be. All those little words


You:  bye


Bot: I had the same problem with my original order, but they sent an e-mail to let you know we were back on track and they wanted something in return. They offered up ten thousand for whatever this is -- which sounds like a lot to you


You:  goodbye


Bot: Goodbye! Have a great day.


- Create an instance of the Chatbot class.
- A loop allows the user to interact with the chatbot via the console.
- The conversation continues until the user types an exit command.