# Modeling

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.optim import AdamW
from tqdm import tqdm


# Set Pandas options to display the full output
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.max_rows', None)     # Show all rows
pd.set_option('display.max_colwidth', None) # Display full content of each column

In [7]:
# Load your dataset (adjust the path as necessary)
chatbot_dataset = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/dataset.csv")

chatbot_dataset.head()


Unnamed: 0,conversation_history,response
0,Can we make this quick? Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break up on the quad. Again. SEP,Well I thought we'd start with pronunciation if that's okay with you.
1,Can we make this quick? Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break up on the quad. Again. SEP Well I thought we'd start with pronunciation if that's okay with you. SEP,Not the hacking and gagging and spitting part. Please.
2,Can we make this quick? Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break up on the quad. Again. SEP Well I thought we'd start with pronunciation if that's okay with you. SEP Not the hacking and gagging and spitting part. Please. SEP,Okay... then how 'bout we try out some French cuisine. Saturday? Night?
3,You're asking me out. That's so cute. What's your name again? SEP,Forget it.
4,No no it's my fault we didn't have a proper introduction SEP,Cameron.


In [None]:
# Assuming chatbot_dataset is already defined
# Sample 30% of the dataset before splitting
# sampled_dataset = chatbot_dataset.sample(frac=0.3, random_state=42)

# Split the dataset into train and test sets (80% train, 20% test)
train_data, test_data = train_test_split(chatbot_dataset, test_size=0.2, random_state=42)

# Further split the train data into training and validation sets (80% train, 20% validation)
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

# Display the sizes of the sampled datasets
print(f"Training set size: {len(train_data)}")
print(f"Validation set size: {len(val_data)}")
print(f"Test set size: {len(test_data)}")




Training set size: 141674
Validation set size: 35419
Test set size: 44274


In [None]:
# Load the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Set pad token to be the same as the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left'  # Use left-padding for generation

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



In [None]:
# Define a new dataset class to maintain context with limited history and turn markers
class ContextualChatbotDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=100, history_window=3):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.history_window = history_window  # Number of turns to retain in history

    def __getitem__(self, idx):
        # Get the conversation history and the response
        # Split conversation history by the [SEP] token between turns
        conversation_history = self.data['conversation_history'].iloc[idx].split(" [SEP] ")
        response = self.data['response'].iloc[idx]

        # Limit the conversation history to the last `history_window` turns
        limited_history = conversation_history[-self.history_window:]

        # Add special tokens for user and bot turns
        input_text = f"[USER] {' [BOT] '.join(limited_history)} [BOT] {response}"

        # Tokenize the input
        encodings = self.tokenizer.encode_plus(
            input_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
            return_attention_mask=True,
            add_special_tokens=True,
        )

        return {
            'input_ids': encodings['input_ids'].squeeze(0),  # Remove batch dimension
            'attention_mask': encodings['attention_mask'].squeeze(0),  # Remove batch dimension
            'labels': encodings['input_ids'].squeeze(0),  # Set labels to input_ids for language modeling
        }

    def __len__(self):
        return len(self.data)

# Create datasets for training, validation, and testing
train_dataset = ContextualChatbotDataset(train_data, tokenizer, history_window=3)  # Limit to last 3 turns
val_dataset = ContextualChatbotDataset(val_data, tokenizer, history_window=3)
test_dataset = ContextualChatbotDataset(test_data, tokenizer, history_window=3)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, num_workers=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, num_workers=4, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, num_workers=4, shuffle=False)


In [None]:
# Load the pre-trained GPT-2 model
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Check for GPU Availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Move the model to the GPU
model.to(device)

# Define Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training Loop with Validation and Early Stopping
best_val_loss = float('inf')
patience = 3  # Number of epochs to wait for improvement
epochs_without_improvement = 0

# Training Loop
for epoch in range(50):  # Number of epochs
    start_time = time.time()  # Start time for the epoch
    model.train()  # Set model to training mode

    # Training phase
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}"):
        optimizer.zero_grad()

        # Move input data to the GPU
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # Compute the loss
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Validation step
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            # Move input data to the GPU
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Compute the loss
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
            val_loss += outputs.loss.item()

    # Calculate the time taken for the epoch
    end_time = time.time()
    epoch_time = end_time - start_time

    # Print training and validation losses along with epoch time
    print(f"Epoch: {epoch + 1}, Training Loss: {loss.item():.4f}, Validation Loss: {val_loss / len(val_loader):.4f}, Time: {epoch_time:.2f} seconds")

    # Check for improvement
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_without_improvement = 0  # Reset the counter
        # Optionally, save the model checkpoint here
    else:
        epochs_without_improvement += 1
        if epochs_without_improvement >= patience:
            print(f"Early stopping triggered after {epoch + 1} epochs.")
            break

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Using device: cuda


Training Epoch 1: 100%|██████████| 4428/4428 [31:22<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:30<00:00,  7.34it/s]


Epoch: 1, Training Loss: 1.9753, Validation Loss: 1.5568, Time: 2033.25 seconds


Training Epoch 2: 100%|██████████| 4428/4428 [31:22<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:30<00:00,  7.35it/s]


Epoch: 2, Training Loss: 1.0295, Validation Loss: 1.4147, Time: 2033.42 seconds


Training Epoch 3: 100%|██████████| 4428/4428 [31:23<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:30<00:00,  7.34it/s]


Epoch: 3, Training Loss: 1.4691, Validation Loss: 1.2996, Time: 2034.06 seconds


Training Epoch 4: 100%|██████████| 4428/4428 [31:35<00:00,  2.34it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.30it/s]


Epoch: 4, Training Loss: 1.4890, Validation Loss: 1.2096, Time: 2047.23 seconds


Training Epoch 5: 100%|██████████| 4428/4428 [31:23<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:30<00:00,  7.34it/s]


Epoch: 5, Training Loss: 1.2297, Validation Loss: 1.1398, Time: 2034.61 seconds


Training Epoch 6: 100%|██████████| 4428/4428 [31:24<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.32it/s]


Epoch: 6, Training Loss: 0.9790, Validation Loss: 1.0894, Time: 2035.17 seconds


Training Epoch 7: 100%|██████████| 4428/4428 [31:24<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:30<00:00,  7.33it/s]


Epoch: 7, Training Loss: 0.8298, Validation Loss: 1.0339, Time: 2035.26 seconds


Training Epoch 8: 100%|██████████| 4428/4428 [31:24<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:30<00:00,  7.34it/s]


Epoch: 8, Training Loss: 0.8078, Validation Loss: 0.9942, Time: 2034.94 seconds


Training Epoch 9: 100%|██████████| 4428/4428 [31:23<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:30<00:00,  7.34it/s]


Epoch: 9, Training Loss: 0.9924, Validation Loss: 0.9551, Time: 2034.52 seconds


Training Epoch 10: 100%|██████████| 4428/4428 [31:24<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.32it/s]


Epoch: 10, Training Loss: 0.6302, Validation Loss: 0.9290, Time: 2035.61 seconds


Training Epoch 11: 100%|██████████| 4428/4428 [31:24<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:30<00:00,  7.33it/s]


Epoch: 11, Training Loss: 0.6371, Validation Loss: 0.9002, Time: 2035.28 seconds


Training Epoch 12: 100%|██████████| 4428/4428 [31:24<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:30<00:00,  7.34it/s]


Epoch: 12, Training Loss: 1.2498, Validation Loss: 0.8765, Time: 2035.66 seconds


Training Epoch 13: 100%|██████████| 4428/4428 [31:24<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:30<00:00,  7.33it/s]


Epoch: 13, Training Loss: 1.1027, Validation Loss: 0.8538, Time: 2035.16 seconds


Training Epoch 14: 100%|██████████| 4428/4428 [31:25<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.33it/s]


Epoch: 14, Training Loss: 0.6438, Validation Loss: 0.8412, Time: 2037.05 seconds


Training Epoch 15: 100%|██████████| 4428/4428 [31:26<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:30<00:00,  7.33it/s]


Epoch: 15, Training Loss: 0.8549, Validation Loss: 0.8239, Time: 2037.23 seconds


Training Epoch 16: 100%|██████████| 4428/4428 [31:25<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:30<00:00,  7.33it/s]


Epoch: 16, Training Loss: 0.7272, Validation Loss: 0.8041, Time: 2036.85 seconds


Training Epoch 17: 100%|██████████| 4428/4428 [31:25<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:30<00:00,  7.34it/s]


Epoch: 17, Training Loss: 0.4471, Validation Loss: 0.7976, Time: 2036.59 seconds


Training Epoch 18: 100%|██████████| 4428/4428 [31:25<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:30<00:00,  7.34it/s]


Epoch: 18, Training Loss: 0.4901, Validation Loss: 0.7819, Time: 2036.23 seconds


Training Epoch 19: 100%|██████████| 4428/4428 [31:25<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:30<00:00,  7.33it/s]


Epoch: 19, Training Loss: 0.7294, Validation Loss: 0.7705, Time: 2036.43 seconds


Training Epoch 20: 100%|██████████| 4428/4428 [31:25<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:30<00:00,  7.34it/s]


Epoch: 20, Training Loss: 0.3933, Validation Loss: 0.7735, Time: 2036.46 seconds


Training Epoch 21: 100%|██████████| 4428/4428 [31:25<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:30<00:00,  7.34it/s]


Epoch: 21, Training Loss: 0.6442, Validation Loss: 0.7598, Time: 2036.82 seconds


Training Epoch 22: 100%|██████████| 4428/4428 [31:25<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:30<00:00,  7.34it/s]


Epoch: 22, Training Loss: 0.7830, Validation Loss: 0.7555, Time: 2035.98 seconds


Training Epoch 23: 100%|██████████| 4428/4428 [31:24<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:30<00:00,  7.34it/s]


Epoch: 23, Training Loss: 0.6921, Validation Loss: 0.7431, Time: 2035.70 seconds


Training Epoch 24: 100%|██████████| 4428/4428 [31:36<00:00,  2.33it/s]
Validating: 100%|██████████| 1107/1107 [02:31<00:00,  7.30it/s]


Epoch: 24, Training Loss: 0.2847, Validation Loss: 0.7417, Time: 2048.35 seconds


Training Epoch 25: 100%|██████████| 4428/4428 [31:24<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:30<00:00,  7.34it/s]


Epoch: 25, Training Loss: 0.4038, Validation Loss: 0.7323, Time: 2035.66 seconds


Training Epoch 26: 100%|██████████| 4428/4428 [31:25<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:30<00:00,  7.34it/s]


Epoch: 26, Training Loss: 0.5987, Validation Loss: 0.7405, Time: 2035.84 seconds


Training Epoch 27: 100%|██████████| 4428/4428 [31:24<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:30<00:00,  7.36it/s]


Epoch: 27, Training Loss: 0.6194, Validation Loss: 0.7352, Time: 2035.43 seconds


Training Epoch 28: 100%|██████████| 4428/4428 [31:24<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:30<00:00,  7.35it/s]


Epoch: 28, Training Loss: 0.6040, Validation Loss: 0.7301, Time: 2034.59 seconds


Training Epoch 29: 100%|██████████| 4428/4428 [31:23<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:30<00:00,  7.36it/s]


Epoch: 29, Training Loss: 0.2755, Validation Loss: 0.7301, Time: 2034.18 seconds


Training Epoch 30: 100%|██████████| 4428/4428 [31:23<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:30<00:00,  7.35it/s]


Epoch: 30, Training Loss: 0.3038, Validation Loss: 0.7261, Time: 2034.31 seconds


Training Epoch 31: 100%|██████████| 4428/4428 [31:23<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:30<00:00,  7.36it/s]


Epoch: 31, Training Loss: 0.3820, Validation Loss: 0.7286, Time: 2034.43 seconds


Training Epoch 32: 100%|██████████| 4428/4428 [31:23<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:30<00:00,  7.36it/s]


Epoch: 32, Training Loss: 0.2933, Validation Loss: 0.7237, Time: 2034.46 seconds


Training Epoch 33: 100%|██████████| 4428/4428 [31:23<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:30<00:00,  7.36it/s]


Epoch: 33, Training Loss: 0.3148, Validation Loss: 0.7330, Time: 2034.19 seconds


Training Epoch 34: 100%|██████████| 4428/4428 [31:24<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:30<00:00,  7.36it/s]


Epoch: 34, Training Loss: 0.3571, Validation Loss: 0.7299, Time: 2034.56 seconds


Training Epoch 35: 100%|██████████| 4428/4428 [31:23<00:00,  2.35it/s]
Validating: 100%|██████████| 1107/1107 [02:30<00:00,  7.36it/s]

Epoch: 35, Training Loss: 0.3516, Validation Loss: 0.7330, Time: 2034.02 seconds
Early stopping triggered after 35 epochs.





In [None]:
# Saving the model
gpt_model = "/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt/base_gpt_model_3_actual"
gpt_tokenizer = "/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt/base_gpt_tokenizer_3"

# Save the model and tokenizer
model.save_pretrained(gpt_model)
tokenizer.save_pretrained(gpt_tokenizer)


('/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt/base_gpt_tokenizer_3_actual_leftpad/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt/base_gpt_tokenizer_3_actual_leftpad/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt/base_gpt_tokenizer_3_actual_leftpad/vocab.json',
 '/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt/base_gpt_tokenizer_3_actual_leftpad/merges.txt',
 '/content/drive/MyDrive/Colab Notebooks/AAI-520/Final Project/GenAI-Chatbot/eval/gpt/base_gpt_tokenizer_3_actual_leftpad/added_tokens.json')