In [1]:
# Download the PIPPA conversation dataset (pippa.jsonl) from the Hugging Face Hub
# into /kaggle/working/pippa.jsonl, which is the default DATASET_PATH read by the
# training cell. `-O` writes to that exact path, overwriting any earlier download.
!wget -O /kaggle/working/pippa.jsonl "https://huggingface.co/datasets/PygmalionAI/PIPPA/resolve/main/pippa.jsonl?download=true"

--2025-05-21 11:30:18--  https://huggingface.co/datasets/PygmalionAI/PIPPA/resolve/main/pippa.jsonl?download=true
Resolving huggingface.co (huggingface.co)... 3.169.137.19, 3.169.137.111, 3.169.137.119, ...
Connecting to huggingface.co (huggingface.co)|3.169.137.19|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cas-bridge.xethub.hf.co/xet-bridge-us/64d19b3873dc458c1658fa1e/406cf96ad5700c7616998c15f7abf19738e5013c560df2094999f458ac54c4ee?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=cas%2F20250521%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250521T113018Z&X-Amz-Expires=3600&X-Amz-Signature=453b210c10a9957eff4956e0423f49a12d80f9ad2bee75b4e67a4f2adbb44ebf&X-Amz-SignedHeaders=host&X-Xet-Cas-Uid=public&response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27pippa.jsonl%3B+filename%3D%22pippa.jsonl%22%3B&x-id=GetObject&Expires=1747830618&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4i

In [5]:
import os
import json
import re
from datasets import Dataset, DatasetDict
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.optim import AdamW
import torch
from torch.utils.data import DataLoader
from transformers import get_cosine_schedule_with_warmup, DataCollatorForLanguageModeling
import logging
from retry import retry
import shutil
import numpy as np

# Logging: timestamped INFO-level records shared by every step of this cell
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Filesystem locations; each can be overridden through an environment variable
# of the same name.
DATASET_PATH = os.environ.get("DATASET_PATH", "/kaggle/working/pippa.jsonl")
SAVE_DIR = os.environ.get("SAVE_DIR", "/kaggle/working/conversational_ai_v1")
SAVE_ZIP = os.environ.get("SAVE_ZIP", "/kaggle/working/conversational_ai_v1.zip")
DRIVE_SAVE_PATH = os.environ.get("DRIVE_SAVE_PATH", "/content/drive/My Drive/conversational_ai_v1.zip")
BEST_MODEL_DIR = os.environ.get("BEST_MODEL_DIR", "/kaggle/working/conversational_ai_v1_best")

# Training hyperparameters, also overridable through the environment.
# Environment values arrive as strings, hence the explicit int()/float() casts.
BATCH_SIZE = int(os.environ.get("BATCH_SIZE", "4"))
EPOCHS = int(os.environ.get("EPOCHS", "3"))
ACCUMULATION_STEPS = int(os.environ.get("ACCUMULATION_STEPS", "4"))
LEARNING_RATE = float(os.environ.get("LEARNING_RATE", "5e-5"))
MAX_LENGTH = int(os.environ.get("MAX_LENGTH", "512"))
MAX_CONVERSATIONS = int(os.environ.get("MAX_CONVERSATIONS", "10000"))
WARMUP_STEPS = int(os.environ.get("WARMUP_STEPS", "1000"))
EARLY_STOPPING_PATIENCE = int(os.environ.get("EARLY_STOPPING_PATIENCE", "1"))

# Optional Google Drive mount. Any failure, including google.colab not being
# importable, is downgraded to a warning so the cell keeps running.
try:
    from google.colab import drive
    drive.mount('/content/drive')
except Exception:
    logger.warning("Google Drive not mounted; saving outputs to /kaggle/working/")
else:
    logger.info("Google Drive mounted for saving outputs.")

# The dataset file must already be on disk; stop early with guidance otherwise
if not os.path.exists(DATASET_PATH):
    logger.error(f"Dataset '{DATASET_PATH}' not found.")
    raise FileNotFoundError(
        f"Dataset '{DATASET_PATH}' not found. Ensure 'pippa.jsonl' is downloaded to /kaggle/working/pippa.jsonl."
    )

# Pretrained distilgpt2 weights and tokenizer. The end-of-sequence token is
# reused as the padding token before the tokenizer is used for padding below.
try:
    tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
    tokenizer.pad_token = tokenizer.eos_token
    model = GPT2LMHeadModel.from_pretrained("distilgpt2")
except Exception as load_error:
    logger.error(f"Failed to load model or tokenizer: {load_error}")
    raise

# Pick the compute device and move the model onto it
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)
logger.info(f"Using device: {device}")

if device == "cuda":
    # Report GPU memory usage (in MB) right after the model has been moved
    logger.info(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
    logger.info(f"GPU memory reserved: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")
else:
    logger.warning("GPU unavailable; using CPU. Ensure Accelerator is set to 'GPU P100' in Kaggle settings.")

# Optimizer and scaler
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
# Loss scaling is only needed for CUDA mixed precision. Enable the scaler only
# when training actually runs on the GPU, so a CPU run gets a disabled
# (pass-through) scaler instead of an enabled CUDA scaler.
scaler = torch.amp.GradScaler('cuda', enabled=(device == "cuda"))

# Learning rate scheduler: warmup for WARMUP_STEPS optimizer steps, then cosine decay.
# NOTE(review): total_steps is an estimate derived from MAX_CONVERSATIONS, not
# from the data that is actually trained on. Fewer conversations may survive
# filtering, and 10% of them are held out for validation further down, so the
# real number of optimizer steps is smaller than this. With the defaults the
# estimate is 1875 steps, of which 1000 are warmup — confirm this is intended.
total_steps = (MAX_CONVERSATIONS // (BATCH_SIZE * ACCUMULATION_STEPS)) * EPOCHS
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=total_steps)

# System prompt prepended to every training conversation
SYSTEM_PROMPT = (
    "[SYSTEM]: You are a conversational AI assistant designed to provide helpful and accurate responses. "
    "Respond in a formal, polite, and professional manner, suitable for general conversation."
)

# Clean response
def clean_response(text):
    """Normalise one chat message.

    Steps, in order:
      1. replace ``{{...}}`` template placeholders with the word ``User``;
      2. drop ``*...*`` spans (text wrapped in asterisks);
      3. trim surrounding whitespace;
      4. upper-case the first character;
      5. append a full stop unless the text already ends with ``.``, ``!`` or ``?``.

    Args:
        text: Raw message string.

    Returns:
        The cleaned message, or an empty string when nothing is left after
        removing placeholders/asterisk spans and trimming.
    """
    text = re.sub(r'\{\{.*?\}\}', 'User', text)
    text = re.sub(r'\*.*?\*', '', text)
    # Trim BEFORE capitalising and punctuating: removing a leading "*...*" span
    # leaves a leading space, and a trailing space used to hide an existing
    # full stop (producing "Hello. .").
    text = text.strip()
    if not text:
        # Nothing meaningful left; do not fabricate a lone "."
        return text
    text = text[0].upper() + text[1:]
    # Questions and exclamations are already terminated; only add a full stop
    # when no sentence-ending punctuation is present.
    if text[-1] not in '.!?':
        text += '.'
    return text

# Preprocess dataset with content filter
def process_line(line):
    """Turn one JSONL record into a single training string.

    The record's ``"conversation"`` list is validated, screened against a small
    keyword blocklist, and flattened into
    ``SYSTEM_PROMPT [USER]: ... [BOT]: ...`` with each message passed through
    ``clean_response``.

    Args:
        line: One raw line of the JSONL dataset.

    Returns:
        ``{"text": <flattened conversation>}``, or ``None`` when the line is
        skipped (invalid JSON, fewer than two messages, a non-string message,
        a blocked keyword, fewer than 50 tokens, or any other error). Every
        skip is logged as a warning.
    """
    blocked_terms = ['explicit', 'adult', 'nsfw', 'offensive']
    try:
        record = json.loads(line)
        turns = record.get("conversation", [])
        if not turns or len(turns) < 2:
            logger.warning("Skipping empty or short conversation")
            return None

        # Validate message by message so the first problem found decides the
        # warning that is logged.
        for turn in turns:
            body = turn.get('message')
            if not isinstance(body, str):
                logger.warning("Skipping conversation with invalid message type")
                return None
            lowered = turn['message'].lower()
            if any(term in lowered for term in blocked_terms):
                logger.warning("Skipping conversation with sensitive content")
                return None

        # Flatten the turns; each segment keeps its trailing space separator
        segments = []
        for turn in turns:
            tag = "[USER]" if turn["is_human"] else "[BOT]"
            cleaned = clean_response(turn['message'])
            segments.append(f"{tag}: {cleaned} ")
        text = SYSTEM_PROMPT + " " + "".join(segments)

        # Token count is measured after truncation to MAX_LENGTH
        encoded = tokenizer(text, truncation=True, max_length=MAX_LENGTH)
        if len(encoded["input_ids"]) < 50:
            logger.warning("Skipping conversation with insufficient tokens")
            return None
        return {"text": text.strip()}
    except json.JSONDecodeError:
        logger.warning("Skipping invalid JSON line")
        return None
    except Exception as exc:
        logger.warning(f"Skipping line due to error: {exc}")
        return None

# Load dataset with limit
def load_dataset_chunked(file_path, chunk_size=1000, max_conversations=MAX_CONVERSATIONS):
    """Stream the JSONL file and yield processed conversations in chunks.

    Each non-blank line is passed to ``process_line``; accepted results are
    buffered and yielded as a list once the buffer reaches ``chunk_size``.
    Reading stops as soon as ``max_conversations`` have been accepted, and any
    remaining buffered items are yielded last.

    Args:
        file_path: Path of the UTF-8 JSONL dataset.
        chunk_size: Buffer length at which a chunk is yielded.
        max_conversations: Upper bound on accepted conversations.

    Yields:
        Lists of the dicts returned by ``process_line``.

    Raises:
        Exception: Any error while reading is logged and re-raised.
    """
    buffer, accepted = [], 0
    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            for raw_line in handle:
                # Blank lines are never processed but still reach the flush check
                record = process_line(raw_line) if raw_line.strip() else None
                if record:
                    buffer.append(record)
                    accepted += 1
                    if accepted >= max_conversations:
                        break
                chunk_full = len(buffer) >= chunk_size
                if chunk_full or accepted >= max_conversations:
                    yield buffer
                    buffer = []
            # Flush whatever is left after the loop ends or breaks
            if buffer:
                yield buffer
        logger.info(f"Processed {accepted} valid conversations")
    except Exception as exc:
        logger.error(f"Failed to load dataset: {exc}")
        raise

# Load and split dataset
# The split is seeded so the validation set is the same on every run; without a
# seed, validation losses from different runs are measured on different data.
SPLIT_SEED = int(os.getenv("SPLIT_SEED", 42))

logger.info("Loading and preprocessing dataset...")
try:
    chunks = list(load_dataset_chunked(DATASET_PATH))
    dataset = Dataset.from_list([item for chunk in chunks for item in chunk if item])
    # Fail with a clear message before writing anything if every line was skipped
    if len(dataset) == 0:
        raise ValueError(f"No valid conversations were found in '{DATASET_PATH}'.")
    dataset.save_to_disk("/kaggle/working/preprocessed_dataset")
    # Hold out 10% of the conversations for validation
    train_test_split = dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)
    dataset_dict = DatasetDict({"train": train_test_split["train"], "validation": train_test_split["test"]})
except Exception as e:
    logger.error(f"Dataset preprocessing failed: {e}")
    raise

# Tokenize dataset with explicit padding
def tokenize_function(examples):
    """Tokenize a batch of conversations for causal language modelling.

    Every text is truncated and padded to exactly ``MAX_LENGTH`` tokens, and
    ``labels`` is a copy of ``input_ids``.

    Plain Python lists are returned on purpose. The previous version requested
    PyTorch tensors and called ``.squeeze()`` on them, which turns a batch that
    contains a single example from shape ``(1, MAX_LENGTH)`` into
    ``(MAX_LENGTH,)``. That breaks the batched mapping whenever a split size
    leaves a one-row final batch.

    Args:
        examples: Mapping with a ``"text"`` entry; a list of strings when used
            with batched mapping, or a single string otherwise.

    Returns:
        Dict with ``input_ids``, ``attention_mask`` and ``labels``. For a list
        input each value has one row per example; for a single string each
        value is one flat list.

    Raises:
        Exception: Any tokenizer error is logged and re-raised.
    """
    try:
        texts = examples["text"]
        tokenized = tokenizer(
            texts,
            truncation=True,
            padding="max_length",
            max_length=MAX_LENGTH,
        )
        input_ids = tokenized["input_ids"]
        # Copy rather than alias so labels and inputs never share list objects
        if isinstance(texts, (list, tuple)):
            labels = [list(row) for row in input_ids]
        else:
            labels = list(input_ids)
        return {
            "input_ids": input_ids,
            "attention_mask": tokenized["attention_mask"],
            "labels": labels,
        }
    except Exception as e:
        logger.error(f"Tokenization failed: {e}")
        raise

# Tokenize both splits in batches, drop the raw "text" column, and cache the
# result on disk. Any failure is logged and re-raised.
try:
    tokenized_datasets = dataset_dict.map(
        tokenize_function,
        batched=True,
        remove_columns=["text"],
    )
    tokenized_datasets.save_to_disk("/kaggle/working/tokenized_dataset")
    logger.info("Dataset tokenized and cached.")
except Exception as tokenize_error:
    logger.error(f"Tokenization failed: {tokenize_error}")
    raise

# Collator for causal language modelling (mlm=False).
# NOTE(review): tokenize_function already pads every example to MAX_LENGTH, so
# the batches reaching this collator all have the same length.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# DataLoader
def create_dataloader(dataset, batch_size=BATCH_SIZE):
    """Wrap a tokenized split in a shuffling ``DataLoader``.

    The dataset's output format is switched in place to torch tensors for the
    three model columns before the loader is built.

    Args:
        dataset: Tokenized split with ``input_ids``, ``attention_mask`` and
            ``labels`` columns.
        batch_size: Number of examples per batch.

    Returns:
        A ``DataLoader`` that shuffles and collates with ``data_collator``.
    """
    tensor_columns = ["input_ids", "attention_mask", "labels"]
    dataset.set_format(type="torch", columns=tensor_columns)
    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=data_collator,
    )
    return loader

# One loader per split, train first and then validation; any failure is logged
# and re-raised.
try:
    train_dataloader, val_dataloader = (
        create_dataloader(tokenized_datasets["train"]),
        create_dataloader(tokenized_datasets["validation"]),
    )
except Exception as loader_error:
    logger.error(f"DataLoader creation failed: {loader_error}")
    raise

# Save checkpoint
@retry(tries=3, delay=2, backoff=2)
def save_checkpoint(epoch, is_best=False):
    """Save the model and tokenizer, zip them, and try to copy the zip to Drive.

    The local save is mandatory: a failure is logged and re-raised, which lets
    the ``retry`` decorator try again. The Drive copy is best-effort: a failure
    there is only logged as a warning.

    The Drive copy goes into the folder of ``DRIVE_SAVE_PATH``, so overriding
    that setting also redirects checkpoints instead of leaving them pointed at
    a hard-coded Drive folder.

    Args:
        epoch: 1-based epoch number, used in the directory name of regular
            checkpoints.
        is_best: When true, save to ``BEST_MODEL_DIR`` instead of a per-epoch
            directory.

    Raises:
        Exception: If saving or archiving locally fails.
    """
    checkpoint_dir = BEST_MODEL_DIR if is_best else f"{SAVE_DIR}_epoch_{epoch}"
    try:
        model.save_pretrained(checkpoint_dir)
        tokenizer.save_pretrained(checkpoint_dir)
        shutil.make_archive(checkpoint_dir, 'zip', checkpoint_dir)
    except Exception as e:
        logger.error(f"Checkpoint saving failed: {e}")
        raise

    local_zip = f"{checkpoint_dir}.zip"
    drive_dir = os.path.dirname(DRIVE_SAVE_PATH) or "."
    drive_path = os.path.join(drive_dir, os.path.basename(local_zip))
    # Never delete or copy the archive onto itself when the "Drive" folder is
    # the same folder the checkpoint was written to.
    if os.path.abspath(drive_path) == os.path.abspath(local_zip):
        logger.info(f"Saved checkpoint at '{checkpoint_dir}'")
        return
    try:
        if not os.path.isdir(drive_dir):
            raise FileNotFoundError(f"folder '{drive_dir}' is not available")
        if os.path.exists(drive_path):
            os.remove(drive_path)
            logger.info(f"Deleted existing checkpoint at '{drive_path}'")
        shutil.copy(local_zip, drive_path)
        logger.info(f"Saved checkpoint at '{checkpoint_dir}' and uploaded to '{drive_path}'")
    except Exception as e:
        # Best-effort only: keep training, but say why the copy did not happen
        logger.warning(f"Drive save failed for checkpoint ({e}); saved locally at '{local_zip}'")

# Training function with early stopping
def _validation_loss():
    """Return the mean per-batch loss over ``val_dataloader``.

    Puts the model in eval mode and runs without gradient tracking; the caller
    is responsible for switching back to train mode.

    Raises:
        ValueError: If the validation dataloader yields no batches.
    """
    model.eval()
    total_loss = 0.0
    batches = 0
    with torch.no_grad():
        for batch in val_dataloader:
            inputs = {k: v.to(device) for k, v in batch.items()}
            total_loss += model(**inputs).loss.item()
            batches += 1
    if batches == 0:
        raise ValueError("Validation dataloader produced no batches.")
    return total_loss / batches


def train_model(epochs=EPOCHS, accumulation_steps=ACCUMULATION_STEPS):
    """Fine-tune the global ``model`` with gradient accumulation and early stopping.

    For every epoch the function trains over ``train_dataloader``, evaluates on
    ``val_dataloader``, logs losses and perplexity, saves a per-epoch
    checkpoint, and additionally saves a "best" checkpoint whenever the
    validation loss improves. Training stops early once the validation loss has
    failed to improve for ``EARLY_STOPPING_PATIENCE`` consecutive epochs.

    Mixed precision (autocast) is only enabled when running on CUDA, and the
    CUDA cache is released once per epoch rather than after every batch.

    Args:
        epochs: Maximum number of epochs.
        accumulation_steps: Number of batches whose gradients are accumulated
            before one optimizer/scheduler step.

    Returns:
        ``(train_losses, val_losses)``: per-epoch average losses.

    Raises:
        ValueError: If ``accumulation_steps`` is below 1 or a dataloader is empty.
        Exception: Any error during an epoch is logged and re-raised.
    """
    # Module-level counter of processed batches. It is deliberately not reset
    # between epochs, so gradient accumulation continues across epoch boundaries.
    global grad_step_counter
    if accumulation_steps < 1:
        raise ValueError("accumulation_steps must be at least 1.")

    use_amp = device == "cuda"
    model.train()
    grad_step_counter = 0
    train_losses = []
    val_losses = []
    best_val_loss = float('inf')
    patience_counter = 0

    for epoch in range(epochs):
        epoch_loss = 0.0
        batch_count = 0
        try:
            for batch in train_dataloader:
                inputs = {k: v.to(device) for k, v in batch.items()}
                with torch.amp.autocast(device_type=device, enabled=use_amp):
                    outputs = model(**inputs)
                    # Scale down so accumulated gradients average over the window
                    loss = outputs.loss / accumulation_steps

                scaler.scale(loss).backward()
                grad_step_counter += 1
                if grad_step_counter % accumulation_steps == 0:
                    scaler.step(optimizer)
                    scaler.update()
                    optimizer.zero_grad()
                    scheduler.step()

                # Undo the accumulation scaling so the logged loss is per batch
                epoch_loss += loss.item() * accumulation_steps
                batch_count += 1

            if batch_count == 0:
                raise ValueError("Training dataloader produced no batches.")
            avg_train_loss = epoch_loss / batch_count
            train_losses.append(avg_train_loss)

            # Validation
            avg_val_loss = _validation_loss()
            val_losses.append(avg_val_loss)
            perplexity = torch.exp(torch.tensor(avg_val_loss)).item()
            if device == "cuda":
                torch.cuda.empty_cache()

            logger.info(
                f"Epoch {epoch+1}/{epochs} - Train Loss: {avg_train_loss:.4f}, "
                f"Val Loss: {avg_val_loss:.4f}, Perplexity: {perplexity:.2f}"
            )

            # Save checkpoint
            save_checkpoint(epoch + 1)

            # Save best model
            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                save_checkpoint(epoch + 1, is_best=True)
                logger.info(f"New best model saved with val loss: {best_val_loss:.4f}")
                patience_counter = 0
            else:
                patience_counter += 1
                logger.info(f"No improvement in val loss. Patience: {patience_counter}/{EARLY_STOPPING_PATIENCE}")

            # Early stopping
            if patience_counter >= EARLY_STOPPING_PATIENCE:
                logger.info("Early stopping triggered.")
                break

            if device == "cuda":
                logger.info(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")

            # Back to training mode for the next epoch
            model.train()
        except Exception as e:
            logger.error(f"Training failed at epoch {epoch+1}: {e}")
            raise

    return train_losses, val_losses

# Kick off fine-tuning. The per-epoch loss histories are kept at module level
# for later inspection; any failure is logged and re-raised.
logger.info("Starting training...")
try:
    train_losses, val_losses = train_model()
except Exception as training_error:
    logger.error(f"Training failed: {training_error}")
    raise
else:
    logger.info("Training completed.")

# Save final model
# The archive must end up at SAVE_ZIP, because that is the path logged here and
# copied to Drive afterwards. Previously it was always written to
# SAVE_DIR + ".zip", which only matches SAVE_ZIP with the default settings.
try:
    model.save_pretrained(SAVE_DIR)
    tokenizer.save_pretrained(SAVE_DIR)
    # make_archive appends ".zip" itself, so strip it from the target name first
    archive_base = SAVE_ZIP[:-len(".zip")] if SAVE_ZIP.endswith(".zip") else SAVE_ZIP
    archive_path = shutil.make_archive(archive_base, 'zip', SAVE_DIR)
    if os.path.abspath(archive_path) != os.path.abspath(SAVE_ZIP):
        # SAVE_ZIP was configured without a ".zip" suffix; honour the exact name
        shutil.move(archive_path, SAVE_ZIP)
    logger.info(f"Saved '{SAVE_ZIP}'")
except Exception as e:
    logger.error(f"Final model save failed: {e}")
    raise

# Upload to Drive (optional)
def save_to_drive():
    """Copy the final archive ``SAVE_ZIP`` to ``DRIVE_SAVE_PATH`` (best-effort).

    This function never raises. If the destination folder does not exist
    (Drive is not mounted) it logs a warning and returns immediately.
    Otherwise the copy is attempted up to three times with exponential
    back-off, and a final failure is logged as a warning.

    The retry wraps only the copy itself. Previously the decorator sat on a
    function that swallowed every exception, so no retry could ever happen.
    """
    drive_dir = os.path.dirname(DRIVE_SAVE_PATH) or "."
    if not os.path.isdir(drive_dir):
        # Nothing to retry when the destination folder is simply not there
        logger.warning(f"Drive save failed; outputs saved to /kaggle/working/")
        return
    if os.path.abspath(DRIVE_SAVE_PATH) == os.path.abspath(SAVE_ZIP):
        # Source and destination are the same file; removing it would lose the archive
        logger.info(f"'{SAVE_ZIP}' is already at '{DRIVE_SAVE_PATH}'")
        return

    @retry(tries=3, delay=2, backoff=2)
    def _copy_archive():
        # Replace any previous upload, then copy the fresh archive
        if os.path.exists(DRIVE_SAVE_PATH):
            os.remove(DRIVE_SAVE_PATH)
            logger.info(f"Deleted existing file at '{DRIVE_SAVE_PATH}'")
        shutil.copy(f"{SAVE_ZIP}", DRIVE_SAVE_PATH)
        logger.info(f"Copied '{SAVE_ZIP}' to '{DRIVE_SAVE_PATH}'")

    try:
        _copy_archive()
    except Exception as e:
        logger.warning(f"Drive save failed ({e}); outputs saved to /kaggle/working/")

# The upload is optional, so a failure here must not fail the cell
try:
    save_to_drive()
except Exception:
    logger.warning("Drive save failed; outputs saved to /kaggle/working/")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


In [None]:
import os
import re
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import logging

# Logging: timestamped INFO-level records for the chat cell
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Directory holding the fine-tuned model; overridable via the MODEL_DIR
# environment variable.
MODEL_DIR = os.environ.get("MODEL_DIR", "/kaggle/working/conversational_ai_v1")

# System prompt; must stay identical to the one used to build the training text
SYSTEM_PROMPT = " ".join((
    "[SYSTEM]: You are a conversational AI assistant designed to provide helpful and accurate responses.",
    "Respond in a formal, polite, and professional manner, suitable for general conversation.",
))

# Clean response (same as training)
def clean_response(text):
    """Normalise one chat message.

    Steps, in order:
      1. replace ``{{...}}`` template placeholders with the word ``User``;
      2. drop ``*...*`` spans (text wrapped in asterisks);
      3. trim surrounding whitespace;
      4. upper-case the first character;
      5. append a full stop unless the text already ends with ``.``, ``!`` or ``?``.

    Args:
        text: Raw message string.

    Returns:
        The cleaned message, or an empty string when nothing is left after
        removing placeholders/asterisk spans and trimming.
    """
    text = re.sub(r'\{\{.*?\}\}', 'User', text)
    text = re.sub(r'\*.*?\*', '', text)
    # Trim BEFORE capitalising and punctuating: removing a leading "*...*" span
    # leaves a leading space, and a trailing space used to hide an existing
    # full stop (producing "Hello. .").
    text = text.strip()
    if not text:
        # Nothing meaningful left; do not fabricate a lone "."
        return text
    text = text[0].upper() + text[1:]
    # Questions and exclamations are already terminated; only add a full stop
    # when no sentence-ending punctuation is present.
    if text[-1] not in '.!?':
        text += '.'
    return text

# Load model and tokenizer
def load_model_and_tokenizer(model_dir):
    """Load the fine-tuned GPT-2 tokenizer and model from one directory.

    Args:
        model_dir: Directory passed to both ``from_pretrained`` calls.

    Returns:
        ``(tokenizer, model)``, in that order.

    Raises:
        Exception: Any loading error is logged and re-raised.
    """
    try:
        logger.info(f"Loading tokenizer from {model_dir}")
        loaded_tokenizer = GPT2Tokenizer.from_pretrained(model_dir)
        logger.info(f"Loading model from {model_dir}")
        loaded_model = GPT2LMHeadModel.from_pretrained(model_dir)
    except Exception as load_error:
        logger.error(f"Failed to load model or tokenizer: {load_error}")
        raise
    return loaded_tokenizer, loaded_model

# Generate response
def generate_response(model, tokenizer, user_input, device, max_length=256, max_new_tokens=100):
    """Generate the bot's reply to a single user message.

    The prompt is ``SYSTEM_PROMPT [USER]: <input> [BOT]:``. Only the tokens
    generated after the prompt are decoded, and the reply is cut at the first
    speaker tag the model invents, so hallucinated follow-up turns are not
    shown as part of the answer.

    Args:
        model: Causal language model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``.
        user_input: Raw text typed by the user.
        device: Device string the inputs are moved to.
        max_length: Maximum number of prompt tokens; longer prompts are truncated.
        max_new_tokens: Maximum number of tokens to generate.

    Returns:
        The cleaned reply, or a fixed apology string if anything fails.
    """
    try:
        # Format input with system prompt and user message. The prompt ends
        # right after "[BOT]:" with no trailing space, because the training
        # text is built as "[BOT]: <message>" and the separating space is
        # expected at the start of the reply.
        input_text = f"{SYSTEM_PROMPT} [USER]: {user_input.strip()} [BOT]:"
        inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=max_length).to(device)
        prompt_length = inputs["input_ids"].shape[1]

        # Generate
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_new_tokens,
            num_beams=5,
            no_repeat_ngram_size=2,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
            top_p=0.9,
            temperature=0.7
        )

        # Decode only the continuation. The returned sequence starts with the
        # prompt, and splitting the full text on "[BOT]:" would pick the wrong
        # segment if the user's message or the model output repeats that tag.
        generated_ids = outputs[0][prompt_length:]
        bot_response = tokenizer.decode(generated_ids, skip_special_tokens=True)

        # Keep only the bot's own turn: stop at the next speaker tag, if any
        bot_response = re.split(r'\[(?:USER|BOT|SYSTEM)\]:', bot_response, maxsplit=1)[0].strip()
        bot_response = clean_response(bot_response)

        return bot_response
    except Exception as e:
        logger.error(f"Error generating response: {e}")
        return "I'm sorry, I encountered an error. Please try again."

# Interactive chat loop
def chat_loop(model, tokenizer, device):
    """Run an interactive prompt/response loop until the user types ``exit``.

    Surrounding whitespace is ignored, so ``"exit "`` also quits. Blank input
    is skipped instead of being sent to the model, and end-of-input on stdin
    ends the chat the same way ``exit`` does.

    Args:
        model: Model forwarded to ``generate_response``.
        tokenizer: Tokenizer forwarded to ``generate_response``.
        device: Device string forwarded to ``generate_response``.
    """
    logger.info("Starting chat. Type 'exit' to quit.")
    while True:
        try:
            user_input = input("You: ").strip()
        except EOFError:
            # stdin was closed (e.g. non-interactive run): leave cleanly
            logger.info("Exiting chat.")
            break
        if not user_input:
            continue
        if user_input.lower() == 'exit':
            logger.info("Exiting chat.")
            break
        response = generate_response(model, tokenizer, user_input, device)
        print(f"Bot: {response}")

# Main function
def main():
    """Entry point: load the fine-tuned model and start the interactive chat.

    Raises:
        FileNotFoundError: If ``MODEL_DIR`` does not exist.
        Exception: Any non-interrupt error from the chat loop is logged and
            re-raised. A keyboard interrupt only ends the chat.
    """
    # The model directory must exist before anything is loaded
    model_dir_present = os.path.exists(MODEL_DIR)
    if not model_dir_present:
        logger.error(f"Model directory '{MODEL_DIR}' not found.")
        raise FileNotFoundError(
            f"Model directory '{MODEL_DIR}' not found. Ensure 'conversational_ai_v1.zip' is unzipped."
        )

    chat_tokenizer, chat_model = load_model_and_tokenizer(MODEL_DIR)

    # Pick the compute device and move the model onto it
    if torch.cuda.is_available():
        target_device = "cuda"
    else:
        target_device = "cpu"
    chat_model.to(target_device)
    logger.info(f"Using device: {target_device}")
    if target_device != "cuda":
        logger.warning("GPU unavailable; using CPU. Responses may be slower.")
    else:
        logger.info(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")

    # Hand over to the interactive loop
    try:
        chat_loop(chat_model, chat_tokenizer, target_device)
    except KeyboardInterrupt:
        logger.info("Chat interrupted by user.")
    except Exception as chat_error:
        logger.error(f"Chat loop failed: {chat_error}")
        raise


if __name__ == "__main__":
    main()

You:  Hey there, how are you doing?


Bot: I've been working hard to become the best I can be, but I'm still struggling to make it to the top of my game. I've had a lot of work to do, so I feel like I should be more than happy to be part of the team. It's been a long time since I started working on this game, I don't think I'll ever get the chance to play again. 


So, what do you think of me? Do you have.


In [4]:
!pip install retry

Collecting retry
  Downloading retry-0.9.2-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting py<2.0.0,>=1.4.26 (from retry)
  Downloading py-1.11.0-py2.py3-none-any.whl.metadata (2.8 kB)
Downloading retry-0.9.2-py2.py3-none-any.whl (8.0 kB)
Downloading py-1.11.0-py2.py3-none-any.whl (98 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: py, retry
Successfully installed py-1.11.0 retry-0.9.2
