In [None]:
# Socrates Dialogue Fine-tuning for Kaggle
# This notebook fine-tunes a language model on Plato's Socrates dialogues

# Import necessary libraries
import contextlib
import json
import re
import warnings # to ignore warnings
from collections import Counter

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
from tqdm import tqdm # for progress bar

# These are the libraries we will use to fine-tune the model
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments
)


# Pick the compute device once; later cells reuse `device`.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

print(
    "=== SOCRATES DIALOGUE FINE-TUNING ===",
    "=== GPU SETUP ===",
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {device == 'cuda'}",
    sep="\n",
)

if device != 'cuda':
    print("WARNING: CUDA not available, will use CPU (training will be slow)")
else:
    # Report the properties of GPU 0 and the number of visible GPUs.
    print(
        f"CUDA version: {torch.version.cuda}",
        f"GPU device: {torch.cuda.get_device_name(0)}",
        f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB",
        f"Number of GPUs: {torch.cuda.device_count()}",
        sep="\n",
    )
    # Release cached allocator blocks for a fresh start
    torch.cuda.empty_cache()
    print("GPU cache cleared")

print(f"Device selected: {device}")
print(
    "\nStep 1: Loading Dataset",
    "Loading Socrates dialogue dataset from Hugging Face...",
    sep="\n",
)

# Fetch the corpus with the Hugging Face datasets library and turn its
# 'train' split into a pandas DataFrame for inspection.
dataset = load_dataset("tylercross/platos_socrates_no_context")
df = dataset['train'].to_pandas()

# Summarise what was loaded
print(
    f"Dataset shape: {df.shape}",
    f"Columns: {df.columns.tolist()}",
    "\nFirst few rows:",
    sep="\n",
)
df.head()


  from .autonotebook import tqdm as notebook_tqdm


=== SOCRATES DIALOGUE FINE-TUNING ===
Step 1: Loading Dataset
Loading Socrates dialogue dataset from Hugging Face...
Dataset shape: (3412, 3)
Columns: ['instruction', 'input', 'output']

First few rows:


Unnamed: 0,instruction,input,output
0,Respond as Socrates,"CRITO: Who was the person, Socrates, with whom...","SOCRATES: There were two, Crito; which of them..."
1,Respond as Socrates,CRITO: The one whom I mean was seated second f...,"SOCRATES: He whom you mean, Crito, is Euthydem..."
2,Respond as Socrates,"CRITO: Neither of them are known to me, Socrat...","SOCRATES: As to their origin, I believe that t..."
3,Respond as Socrates,"CRITO: But, Socrates, are you not too old? the...","SOCRATES: Certainly not, Crito; as I will prov..."
4,Respond as Socrates,"CRITO: I see no objection, Socrates, if you li...",SOCRATES: In less than no time you shall hear;...


In [2]:
print("\n=== Step 2: Data Preprocessing ===")
print("Removing unnecessary columns...")

# Remove the 'instruction' column as it's not needed for our dialogue training.
# A non-mutating drop with errors='ignore' keeps this cell idempotent: the original
# in-place drop raised KeyError when the cell was executed a second time.
df = df.drop(columns=['instruction'], errors='ignore')

# The following cells read exactly these two columns, so fail early if either is absent.
missing_columns = {'input', 'output'} - set(df.columns)
assert not missing_columns, f"missing expected columns: {sorted(missing_columns)}"

print(f"Dataset shape after removing 'instruction' column: {df.shape}")
print("Remaining columns:", list(df.columns))

# Check for any missing values
print(f"\nMissing values:")
print(df.isnull().sum())

df.head()



=== Step 2: Data Preprocessing ===
Removing unnecessary columns...
Dataset shape after removing 'instruction' column: (3412, 2)
Remaining columns: ['input', 'output']

Missing values:
input     0
output    0
dtype: int64


Unnamed: 0,input,output
0,"CRITO: Who was the person, Socrates, with whom...","SOCRATES: There were two, Crito; which of them..."
1,CRITO: The one whom I mean was seated second f...,"SOCRATES: He whom you mean, Crito, is Euthydem..."
2,"CRITO: Neither of them are known to me, Socrat...","SOCRATES: As to their origin, I believe that t..."
3,"CRITO: But, Socrates, are you not too old? the...","SOCRATES: Certainly not, Crito; as I will prov..."
4,"CRITO: I see no objection, Socrates, if you li...",SOCRATES: In less than no time you shall hear;...


In [3]:
print("\n=== Step 3: Data Formatting ===")
print("Converting DataFrame to training format...")

# Convert the DataFrame into a list of records suitable for fine-tuning.
# Each entry has a 'prompt' (the 'input' column) and a 'response' (the 'output' column).
# Zipping the two columns yields the same records as the previous iterrows() loop
# without building a pandas Series for every row.
json_data = [
    {"prompt": prompt_text, "response": response_text}
    for prompt_text, response_text in zip(df['input'], df['output'])
]

print(f"Created {len(json_data)} training examples")
print("\nSample entry structure:")
print(json.dumps(json_data[0], indent=2))




=== Step 3: Data Formatting ===
Converting DataFrame to training format...
Created 3412 training examples

Sample entry structure:
{
  "prompt": "CRITO: Who was the person, Socrates, with whom you were talking yesterday at the Lyceum? There was such a crowd around you that I could not get within hearing, but I caught a sight of him over their heads, and I made out, as I thought, that he was a stranger with whom you were talking: who was he?",
  "response": "SOCRATES: There were two, Crito; which of them do you mean?"
}


In [5]:
# Print the first three prompt/response pairs to show what the records look like
print("Sample dialogue entries (first 3):")
for entry_number, sample_entry in enumerate(json_data[:3], start=1):
    print(
        f"\n--- Entry {entry_number} ---",
        f"Prompt: {sample_entry['prompt']}",
        f"Response: {sample_entry['response']}",
        sep="\n",
    )

Sample dialogue entries (first 3):

--- Entry 1 ---
Prompt: CRITO: Who was the person, Socrates, with whom you were talking yesterday at the Lyceum? There was such a crowd around you that I could not get within hearing, but I caught a sight of him over their heads, and I made out, as I thought, that he was a stranger with whom you were talking: who was he?
Response: SOCRATES: There were two, Crito; which of them do you mean?

--- Entry 2 ---
Prompt: CRITO: The one whom I mean was seated second from you on the right-hand side. In the middle was Cleinias the young son of Axiochus, who has wonderfully grown; he is only about the age of my own Critobulus, but he is much forwarder and very good-looking: the other is thin and looks younger than he is.
Response: SOCRATES: He whom you mean, Crito, is Euthydemus; and on my left hand there was his brother Dionysodorus, who also took part in the conversation.

--- Entry 3 ---
Prompt: CRITO: Neither of them are known to me, Socrates; they are a ne

In [None]:
# Section banner for the speaker-name analysis
print(
    "\n=== Step 4: Data Analysis ===",
    "Analyzing speaker names in the dialogue...",
    sep="\n",
)

def extract_speaker_names(json_data):
    """Extract every speaker label (e.g. "SOCRATES:") found in the dialogue text.

    A speaker label is a run of two or more capital ASCII letters that starts at a
    word boundary and is immediately followed by a colon.

    The text is scanned string by string rather than through a JSON dump of the
    whole structure. Serialising first escapes a newline into a backslash followed
    by the letter "n"; that letter is a word character, so a label at the start of
    a new line had no word boundary in front of it and was never matched.

    Args:
        json_data: A string, or any nesting of lists/tuples/dicts whose values
            contain strings (typically a list of {"prompt", "response"} dicts).
            Dict keys are not scanned; non-string leaves are ignored.

    Returns:
        list[str]: All matches, in document order, each including the trailing
        colon. Duplicates are kept so callers can count occurrences.
    """
    # This pattern matches names like "SOCRATES:", "MENO:", etc.
    speaker_pattern = re.compile(r'\b[A-Z]{2,}:')

    def _iter_strings(node):
        """Yield every string value reachable from node, preserving order."""
        if isinstance(node, str):
            yield node
        elif isinstance(node, dict):
            for value in node.values():
                yield from _iter_strings(value)
        elif isinstance(node, (list, tuple)):
            for item in node:
                yield from _iter_strings(item)

    matches = []
    for text in _iter_strings(json_data):
        matches.extend(speaker_pattern.findall(text))

    return matches

# Find all speaker names in the dataset
speaker_names = extract_speaker_names(json_data)

# Count frequency of each speaker. Counter is imported with the other modules at the
# top of the notebook rather than in the middle of this cell.
speaker_counts = Counter(speaker_names)
unique_speakers = set(speaker_counts)

print(f"Found {len(unique_speakers)} unique speakers in the dialogues:")
print(sorted(unique_speakers))
print(f"\nTotal speaker name occurrences: {len(speaker_names)}")

print(f"\nMost common speakers:")
for speaker, count in speaker_counts.most_common(5):
    print(f"  {speaker} {count} times")

{'ANYTUS:', 'MENEXENUS:', 'POLUS:', 'THEODORUS:', 'LACHES:', 'EUDICUS:', 'CRITIAS:', 'ION:', 'CALLICLES:', 'CHAEREPHON:', 'CRATYLUS:', 'PROTARCHUS:', 'LYSIMACHUS:', 'DIALOGUE:', 'NICIAS:', 'SOCRATES:', 'MELESIAS:', 'BOY:', 'MENO:', 'THEAETETUS:', 'PHILEBUS:', 'EUTHYPHRO:', 'CRITO:', 'STRANGER:', 'TIMAEUS:', 'GORGIAS:', 'HIPPIAS:', 'PHAEDRUS:'}


In [None]:
print("\n=== Step 5: Data Cleaning ===")
print("Removing speaker names from dialogue text...")

# Speaker labels such as "SOCRATES: " (two or more capitals, a colon, optional whitespace)
speaker_label = re.compile(r'\b[A-Z]{2,}:\s*')

# Strip the labels and surrounding whitespace from both sides of every pair,
# keeping only pairs where both prompt and response still have content.
cleaned_json_data = []
for raw_entry in json_data:
    prompt_text = speaker_label.sub('', raw_entry['prompt']).strip()
    response_text = speaker_label.sub('', raw_entry['response']).strip()
    if not (prompt_text and response_text):
        continue
    cleaned_json_data.append({"prompt": prompt_text, "response": response_text})

print(f"Cleaned {len(cleaned_json_data)} entries")
print(f"Removed {len(json_data) - len(cleaned_json_data)} empty entries")

# Before/after comparison of the first prompt (first 100 characters of each)
print("\n--- Before Cleaning ---")
print(f"Original: {json_data[0]['prompt'][:100]}...")
print("\n--- After Cleaning ---")
print(f"Cleaned: {cleaned_json_data[0]['prompt'][:100]}...")

# From here on json_data refers to the cleaned records
json_data = cleaned_json_data

Cleaned 3412 entries


# Show the first two cleaned pairs as a visual check of the cleaning step
print("=== Sample Cleaned Entries ===")
for sample_number, cleaned_sample in enumerate(json_data[:2], start=1):
    print(
        f"\n--- Sample {sample_number} ---",
        f"Prompt: {cleaned_sample['prompt']}",
        f"Response: {cleaned_sample['response']}",
        "-" * 50,
        sep="\n",
    )

In [None]:
print(
    "\n=== Step 6: Train/Validation Split ===",
    "Splitting data into training and validation sets...",
    sep="\n",
)

# Hold out 10% of the records for validation; the fixed random_state makes
# the shuffle repeatable between runs.
train_data, val_data = train_test_split(json_data, test_size=0.1, random_state=42)

for split_label, split_records in (("Training", train_data), ("Validation", val_data)):
    print(f"{split_label} set size: {len(split_records)}")

print(
    "\n=== Step 7: Model Setup ===",
    "Initializing pre-trained model and tokenizer...",
    sep="\n",
)

# Hub checkpoint to fine-tune - change this to use a different model
MODEL_NAME = "tiiuae/falcon-7b"
print(f"Loading model: {MODEL_NAME}", "Loading tokenizer...", sep="\n")

# The tokenizer is loaded before the model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Padding requires a pad token; reuse the EOS token when none is defined
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print("Added padding token")

# Load the model weights, placing them on the GPU when one is available
print("Loading model...", f"Target device: {device}", sep="\n")

if torch.cuda.is_available():
    # Free cached GPU memory before the large allocation below
    torch.cuda.empty_cache()
    print("GPU cache cleared before model loading")

# NOTE(review): trust_remote_code=True allows Python code shipped with the Hub
# repository to run locally - confirm it is still needed for this checkpoint.
# NOTE(review): weights are loaded in float16; confirm the training configuration
# does not also enable fp16 mixed precision on top of half-precision weights.
model_load_options = {
    "torch_dtype": torch.float16,  # half precision to save memory
    "device_map": "auto" if torch.cuda.is_available() else None,  # automatic GPU placement if available
    "trust_remote_code": True,  # required for some models
    "low_cpu_mem_usage": True,  # reduce CPU memory usage during loading
}
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **model_load_options)

print(
    "Model loaded successfully!",
    f"Model parameters: {model.num_parameters():,}",
    f"Model device: {next(model.parameters()).device}",
    sep="\n",
)

if torch.cuda.is_available():
    print(
        f"GPU memory allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB",
        f"GPU memory cached: {torch.cuda.memory_reserved() / 1e9:.2f} GB",
        sep="\n",
    )

    # Use the first parameter's device as a proxy for where the model lives
    model_device = next(model.parameters()).device
    if model_device.type == 'cuda':
        print("Model already on GPU ✓")
    else:
        print("Moving model to GPU...")
        model = model.to(device)
        print(f"Model moved to: {next(model.parameters()).device}")

tokenizer_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/281 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

2025-08-05 14:13:15.150991: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754403195.337468      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754403195.397005      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.48G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

In [None]:
# Section banner for the tokenization step
print(
    "\n=== Step 8: Data Tokenization ===",
    "Converting data to HuggingFace Dataset format and tokenizing...",
    sep="\n",
)

def tokenize_function(examples, max_length=1024):
    """Tokenize a batch of prompt/response pairs for causal language modeling.

    Each training text is "<prompt>\\n<response>" followed by the tokenizer's EOS
    token, so the model sees an explicit end-of-answer marker. Sequences are
    truncated and padded to max_length tokens.

    Labels are a copy of input_ids except at padded positions, which are set to
    -100 so they are ignored by the loss. The mask is taken from attention_mask
    rather than by comparing against the pad id, because this notebook reuses the
    EOS token as the pad token when none is defined, and the real EOS must stay
    in the labels.

    Args:
        examples: A batch mapping with "prompt" and "response" lists of strings,
            as passed by Dataset.map(..., batched=True).
        max_length: Fixed sequence length to pad/truncate to. Defaults to 1024,
            chosen for longer philosophical discussions.

    Returns:
        The tokenizer output with "input_ids", "attention_mask" and "labels",
        each a list of equal-length integer lists.
    """
    # Fall back to no terminator if the tokenizer defines no EOS token
    eos_text = tokenizer.eos_token or ""

    # Keep the format: just text input -> text output (no speaker names)
    texts = [
        f"{prompt}\n{response}{eos_text}"
        for prompt, response in zip(examples["prompt"], examples["response"])
    ]

    # Plain Python lists are returned (no return_tensors) since Dataset.map stores lists
    tokenized = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_attention_mask=True,
    )

    # Padding positions (attention_mask == 0) get -100 so the loss skips them
    tokenized["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]

    return tokenized

# Wrap the train/validation record lists in HuggingFace Dataset objects
print("Creating Dataset objects...")
train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)

print(
    f"Training dataset created with {len(train_dataset)} examples",
    f"Validation dataset created with {len(val_dataset)} examples",
    f"Dataset features: {train_dataset.features}",
    sep="\n",
)


def _tokenize_split(split, progress_label):
    """Run tokenize_function over a split in batches, dropping the raw text columns."""
    return split.map(
        tokenize_function,
        batched=True,
        remove_columns=["prompt", "response"],
        desc=progress_label,
    )


# Tokenize both splits with identical settings
print("Tokenizing training dataset...")
tokenized_train_dataset = _tokenize_split(train_dataset, "Tokenizing training data")

print("Tokenizing validation dataset...")
tokenized_val_dataset = _tokenize_split(val_dataset, "Tokenizing validation data")

print(
    f"Tokenized training dataset size: {len(tokenized_train_dataset)}",
    f"Tokenized validation dataset size: {len(tokenized_val_dataset)}",
    f"Tokenized features: {tokenized_train_dataset.features}",
    sep="\n",
)

# Inspect the first tokenized training example
print("\nSample tokenized entry from training set:")
sample = tokenized_train_dataset[0]
sample_ids = sample['input_ids']
print(f"Input IDs shape: {len(sample_ids)}")
print(f"First 10 tokens: {sample_ids[:10]}")
print(f"Decoded sample: {tokenizer.decode(sample_ids[:50], skip_special_tokens=True)}")

TypeError: list indices must be integers or slices, not str

In [None]:
print("\n=== Step 9: Training Configuration ===")
print("Setting up training arguments and trainer...")

# Check GPU memory before training
if torch.cuda.is_available():
    print(f"GPU memory before training setup: {torch.cuda.memory_allocated() / 1e9:.2f} GB allocated")
    print(f"GPU memory available: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB total")

# With load_best_model_at_end=True, TrainingArguments requires save_steps to be a round
# multiple of eval_steps. The previous 250/100 pairing fails that validation.
EVAL_STEPS = 100
SAVE_STEPS = 200
assert SAVE_STEPS % EVAL_STEPS == 0, "save_steps must be a multiple of eval_steps"

# fp16 mixed precision keeps float32 master weights and unscales their gradients.
# If the model was already loaded in half precision, enabling it makes the Trainer
# fail when it tries to unscale float16 gradients, so only turn it on for float32 weights.
# NOTE(review): training directly on float16 weights runs but is numerically fragile;
# prefer float32 weights (or a parameter-efficient method) when memory allows.
use_fp16_amp = torch.cuda.is_available() and next(model.parameters()).dtype == torch.float32

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`;
# use whichever field the installed TrainingArguments dataclass defines.
_training_arg_fields = getattr(TrainingArguments, "__dataclass_fields__", {})
eval_strategy_key = "eval_strategy" if "eval_strategy" in _training_arg_fields else "evaluation_strategy"

# Training configuration optimized for GPU with validation
training_args = TrainingArguments(
    output_dir="./socrates-finetuned-model",  # Output directory
    per_device_train_batch_size=2,  # Batch size per device (adjust based on GPU memory)
    per_device_eval_batch_size=2,   # Evaluation batch size
    gradient_accumulation_steps=4,  # Effective batch size = 2 * 4 = 8
    num_train_epochs=3,  # Number of training epochs
    logging_dir="./logs",  # Logging directory
    save_total_limit=2,  # Keep only the last 2 checkpoints to save space
    logging_steps=50,  # Log every 50 steps
    save_steps=SAVE_STEPS,  # Save checkpoint every 200 steps (multiple of eval_steps)
    eval_steps=EVAL_STEPS,  # Evaluate every 100 steps
    warmup_steps=100,  # Number of warmup steps for learning rate scheduler
    learning_rate=2e-5,  # Learning rate (reduced for fine-tuning)
    weight_decay=0.01,  # Weight decay for regularization
    fp16=use_fp16_amp,  # Mixed precision only on GPU with float32 weights
    dataloader_pin_memory=torch.cuda.is_available(),  # Enable pin memory for GPU
    dataloader_num_workers=0,  # Reduce workers to save memory
    save_strategy="steps",  # Save based on steps
    load_best_model_at_end=True,  # Load the best model at the end
    metric_for_best_model="eval_loss",  # Use evaluation loss as metric
    greater_is_better=False,  # Lower loss is better
    remove_unused_columns=False,  # Keep all columns
    report_to="none",  # Disable wandb/tensorboard logging
    push_to_hub=False,  # Don't push to model hub
    optim="adamw_torch",  # Use PyTorch AdamW optimizer
    gradient_checkpointing=True,  # Save memory by trading compute for memory
    **{eval_strategy_key: "steps"},  # Enable evaluation during training
)

print("Training arguments configured successfully!")
print(f"Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
print(f"Total training steps: {len(tokenized_train_dataset) // (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps) * training_args.num_train_epochs}")

# Build the Trainer from the model, the arguments and both tokenized splits
print("Initializing trainer...")
# NOTE(review): newer transformers releases call the `tokenizer` argument
# `processing_class` - confirm against the installed version.
trainer_options = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,  # training split
    eval_dataset=tokenized_val_dataset,     # validation split
    tokenizer=tokenizer,                    # passed along for proper saving
)
trainer = Trainer(**trainer_options)

print(
    "Trainer initialized successfully!",
    f"Training dataset size: {len(tokenized_train_dataset)}",
    f"Validation dataset size: {len(tokenized_val_dataset)}",
    sep="\n",
)

# Announce the run and its size
print(
    "\n=== Step 10: Model Training ===",
    "Starting model training with validation monitoring...",
    "This may take a while depending on your hardware.",
    f"Training on {len(tokenized_train_dataset)} examples for {training_args.num_train_epochs} epochs",
    f"Validating on {len(tokenized_val_dataset)} examples",
    sep="\n",
)

if not torch.cuda.is_available():
    print("Training will use CPU (this will be slower)")
else:
    print(f"GPU memory before training: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
    print("Training will use GPU acceleration ✓")
    # Release cached GPU memory right before the training loop starts
    torch.cuda.empty_cache()

# Run the fine-tuning loop
trainer.train()

print(
    "Training completed successfully!",
    "Best model has been loaded based on validation loss.",
    sep="\n",
)

if torch.cuda.is_available():
    print(f"GPU memory after training: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
    # Release cached GPU memory once training has finished
    torch.cuda.empty_cache()
    print("GPU cache cleared after training")

NameError: name 'tokenized_dataset' is not defined

In [None]:
print(
    "\n=== Step 11: Save Fine-tuned Model ===",
    "Saving the fine-tuned model and tokenizer...",
    sep="\n",
)

# Write the model (through the trainer) and the tokenizer to the same directory
model_save_path = "socrates-finetuned-model"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(
    f"Model saved successfully to: {model_save_path}",
    "The model can now be loaded for inference or further training.",
    "Note: The best model (based on validation loss) has been saved.",
    sep="\n",
)

In [None]:
# Section banner for the inference smoke test
print(
    "\n=== Step 12: Test the Fine-tuned Model ===",
    "Testing the model with philosophical questions...",
    sep="\n",
)

def generate_socrates_response(prompt, max_new_tokens=100):
    """Generate a response to prompt with the fine-tuned Socrates model.

    The prompt is formatted as in training ("<prompt>\\n"), tokenized together
    with its attention mask, and sampled from on whichever device holds the
    model's first parameter. Only the newly generated tokens are decoded, so the
    result never depends on the decoded prompt matching the input text
    character for character.

    Args:
        prompt: The user's text (no speaker name).
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        str: The generated continuation with special tokens removed and
        surrounding whitespace stripped.
    """
    # Format the input to match training format
    formatted_input = f"{prompt}\n"

    # Tokenize with an explicit attention mask: the pad token id passed below is the
    # EOS id, so the mask cannot be inferred reliably from the ids alone.
    model_device = next(model.parameters()).device
    encoded = tokenizer(
        formatted_input,
        return_tensors="pt",
        return_attention_mask=True,
    ).to(model_device)
    input_ids = encoded["input_ids"]

    # Mixed-precision autocast only applies when the model sits on a CUDA device
    if model_device.type == "cuda":
        autocast_ctx = torch.autocast(device_type="cuda")
    else:
        autocast_ctx = contextlib.nullcontext()

    # Switch to inference mode (e.g. disables dropout) before sampling
    model.eval()

    with torch.no_grad(), autocast_ctx:
        output = model.generate(
            input_ids=input_ids,
            attention_mask=encoded["attention_mask"],
            max_new_tokens=max_new_tokens,
            temperature=0.7,  # Add some randomness
            do_sample=True,   # Use sampling instead of greedy
            pad_token_id=tokenizer.eos_token_id,
            no_repeat_ngram_size=2,  # Avoid repetition
            eos_token_id=tokenizer.eos_token_id,
            use_cache=True,  # Enable KV cache for faster generation
        )

    # The returned sequence starts with the prompt tokens; keep only what follows them
    generated_ids = output[0][input_ids.shape[-1]:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

# Test the model with philosophical questions
test_prompts = [
    "What is the meaning of life?",
    "How should one live a good life?", 
    "What is wisdom?",
    "Tell me about virtue.",
    "What is justice?"
]

print("Testing the fine-tuned Socrates model:")
print("Format: Your text input → Model's text output (no speaker names)")
if torch.cuda.is_available():
    print(f"Running inference on GPU: {torch.cuda.get_device_name(0)} ✓")
else:
    print("Running inference on CPU")
print("=" * 70)

# Generation errors are reported per prompt so one failure does not stop the loop,
# and are remembered so the summary below does not claim success after a failure.
failed_prompts = []
for i, prompt in enumerate(test_prompts, 1):
    print(f"\n{i}. Input: {prompt}")
    try:
        response = generate_socrates_response(prompt, max_new_tokens=75)
    except Exception as e:
        failed_prompts.append(prompt)
        print(f"   Error generating response: {e}")
    else:
        print(f"   Output: {response}")
    print("-" * 50)

print("\nModel testing completed!")
if failed_prompts:
    print(f"WARNING: {len(failed_prompts)} of {len(test_prompts)} prompts failed to generate a response.")
else:
    print("Your Socrates dialogue model is ready for use!")
    print("The model takes text input and returns text output, following the Socratic dialogue style.")

if torch.cuda.is_available():
    print(f"\n=== Final GPU Status ===")
    print(f"GPU memory in use: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
    print(f"GPU memory cached: {torch.cuda.memory_reserved() / 1e9:.2f} GB")
    # Only announce overall success when every prompt produced a response
    if not failed_prompts:
        print("All operations completed successfully on GPU! 🚀")