In [11]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv
/kaggle/input/llm-detect-ai-generated-text/train_prompts.csv
/kaggle/input/llm-detect-ai-generated-text/test_essays.csv
/kaggle/input/llm-detect-ai-generated-text/train_essays.csv
/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv


In [None]:
# -*- coding: utf-8 -*-
"""
LLM Detect AI Generated Text - Colab Baseline

This notebook provides a baseline solution for the Kaggle competition
"LLM - Detect AI Generated Text", designed to run on Google Colab's free tier.
It fine-tunes a DeBERTa-v3-base model.
"""

# -----------------------------------------------------------------------------
# 1. Setup & Installs
# -----------------------------------------------------------------------------
!pip install -q transformers datasets evaluate accelerate scikit-learn pandas

import pandas as pd
import numpy as np
import torch
import gc # Garbage collector
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)
from datasets import Dataset, DatasetDict

print("Libraries imported.")

# Check GPU availability and clear cache (good practice on Colab)
if torch.cuda.is_available():
    print(f"GPU detected: {torch.cuda.get_device_name(0)}")
    torch.cuda.empty_cache()
    USE_FP16 = True # Enable Mixed Precision if T4/P100/V100 is available
    # Note: K80 GPUs on Colab might not support FP16 well.
    # The Trainer will often handle this, but manual checks can be added if needed.
else:
    print("No GPU detected, running on CPU (will be very slow). FP16 disabled.")
    USE_FP16 = False

# -----------------------------------------------------------------------------
# 2. Configuration
# -----------------------------------------------------------------------------
class CFG:
    MODEL_NAME = "microsoft/deberta-v3-base" # Good balance of performance/size
    # MODEL_NAME = "roberta-base" # Alternative good choice
    MAX_LENGTH = 512        # Max sequence length for tokenizer
    TRAIN_BATCH_SIZE = 8    # Adjust based on GPU VRAM (4, 8, 16 are common)
    EVAL_BATCH_SIZE = 16
    GRAD_ACCUM_STEPS = 2    # Effective batch size = TRAIN_BATCH_SIZE * GRAD_ACCUM_STEPS
    LEARNING_RATE = 2e-5
    EPOCHS = 3              # Start with a few epochs, increase if needed/possible
    WEIGHT_DECAY = 0.01
    RANDOM_SEED = 42
    VAL_SPLIT = 0.2         # Use 20% of training data for validation
    OUTPUT_DIR = "llm-detect-output"
    DATA_PATH = "/kaggle/input/llm-detect-ai-generated-text/" # Default Kaggle path

# Set seed for reproducibility
np.random.seed(CFG.RANDOM_SEED)
torch.manual_seed(CFG.RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(CFG.RANDOM_SEED)

# -----------------------------------------------------------------------------
# 3. Load & Prepare Data
# -----------------------------------------------------------------------------
print("Loading data...")
# --- IMPORTANT: Adapt this path if running locally or on Colab ---
# If on Colab, you might need to upload data or mount Google Drive
# Example for Colab if data is uploaded to session storage:
# CFG.DATA_PATH = "./" # Assuming files are in the root runtime directory

# Load the primary training data
try:
    # Assuming execution within a Kaggle Notebook environment
    train_essays = pd.read_csv(f"{CFG.DATA_PATH}train_essays.csv")
    test_essays = pd.read_csv(f"{CFG.DATA_PATH}test_essays.csv")
    sample_submission = pd.read_csv(f"{CFG.DATA_PATH}sample_submission.csv")

    # Load external dataset (if available and beneficial)
    # Be mindful of Colab disk space!
    try:
         # This specific dataset was commonly used in this competition
         external_df = pd.read_csv("/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv")
         # Prepare external data similar to train_essays if structure differs
         # Example: Rename columns if necessary, select relevant text
         # For simplicity here, assume it has 'text' and 'label' columns
         if 'text' not in external_df.columns or 'label' not in external_df.columns:
             print("Warning: External dataset columns might need adjustment.")
             # Add specific renaming/selection logic here if needed
             external_df = external_df[['text', 'label']].copy() # Basic example

         print(f"Loaded external dataset with {len(external_df)} rows.")
         # Combine with train_essays
         train_essays = pd.concat([
    # 1. Select 'text', 'generated' from train_essays and rename 'generated' to 'label'
    train_essays[['text', 'generated']].rename(columns={'generated': 'label'}),

    # 2. Use all columns from external_df as they are
    external_df

], ignore_index=True)
         # This will display all rows where the 'label' column is NaN
         print("Rows with NaN in 'label' column:")
         print(train_essays[train_essays['label'].isnull()])
         # Make sure label column is consistent (0 or 1)
         train_essays['label'] = train_essays['label'].astype(int)
         print(f"Combined training data shape: {train_essays.shape}")

    except FileNotFoundError:
        print("Optional external dataset not found. Proceeding with train_essays only.")
        # Rename 'generated' column to 'label' for consistency
        train_essays.rename(columns={'generated': 'label'}, inplace=True)

except FileNotFoundError:
    print("ERROR: Could not find competition data.")
    print(f"Looked in: {CFG.DATA_PATH}")
    print("Please ensure the data files (train_essays.csv, test_essays.csv) are accessible.")
    # You might need to upload manually or adjust the DATA_PATH if not on Kaggle
    # Example for local execution: CFG.DATA_PATH = "./data/"
    raise SystemExit("Data files not found.")


print(f"Train essays shape: {train_essays.shape}")
print(f"Test essays shape: {test_essays.shape}")
print(train_essays.head())

# Split training data for validation
print(f"Splitting data (Validation size: {CFG.VAL_SPLIT})...")
train_df, val_df = train_test_split(
    train_essays,
    test_size=CFG.VAL_SPLIT,
    random_state=CFG.RANDOM_SEED,
    stratify=train_essays['label'] # Important for classification
)

print(f"Train split shape: {train_df.shape}")
print(f"Validation split shape: {val_df.shape}")

# Convert pandas DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_essays) # Keep test separate

# Create a DatasetDict if needed (optional, but good practice)
# ds = DatasetDict({
#     'train': train_dataset,
#     'validation': val_dataset,
#     'test': test_dataset
# })
# print(ds)

# Clean up memory
del train_df, val_df, train_essays
gc.collect()

# -----------------------------------------------------------------------------
# -----------------------------------------------------------------------------
# 4. Tokenization
# -----------------------------------------------------------------------------
print(f"Loading tokenizer: {CFG.MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(CFG.MODEL_NAME)

def tokenize_function(examples):
    """Applies tokenizer to text data."""
    return tokenizer(
        examples["text"],
        truncation=True,
        padding=False, # Pad dynamically later with DataCollator
        max_length=CFG.MAX_LENGTH,
        return_token_type_ids=False, # Not needed for DeBERTa v3/RoBERTa
    )

print("Tokenizing datasets...")

# --- Dynamically determine columns to remove ---
# It's safer to check actual columns before removing

# For train/val: remove text (used by tokenizer) and any extra columns
# that came from concatenation (like __index_level_0__ or from external_df)
# DO NOT try to remove 'id' as it's not present after the concat operation.
train_val_potential_remove = ['text', '__index_level_0__']
# Add columns from external_df if they exist and aren't needed (check external_df source)
# Example: train_val_potential_remove.extend(['prompt_name', 'source', 'RDizzl3_seven'])

train_cols_to_remove = [col for col in train_val_potential_remove if col in train_dataset.column_names]
val_cols_to_remove = [col for col in train_val_potential_remove if col in val_dataset.column_names]

# For test: remove only 'text', keep 'id' for submission
test_cols_to_remove = ['text']
if 'text' not in test_dataset.column_names:
     print("Warning: 'text' column not found in test_dataset before mapping.")
     test_cols_to_remove = [] # Avoid error if text somehow missing

print(f"Columns to remove from train_dataset: {train_cols_to_remove}")
print(f"Columns to remove from val_dataset: {val_cols_to_remove}")
print(f"Columns to remove from test_dataset: {test_cols_to_remove}")


# Apply tokenization with the CORRECTED lists
train_tokenized = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=train_cols_to_remove # Use calculated list
)
val_tokenized = val_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=val_cols_to_remove   # Use calculated list
)
test_tokenized = test_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=test_cols_to_remove    # Only removes 'text'
)

# Data collator handles dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

print("Tokenization complete.")
# print(train_tokenized[0]) # Check an example
# print(test_tokenized[0]) # Check an example, 'id' should be present if it was in test_essays

# -----------------------------------------------------------------------------
# 5. Model Loading & Training Setup
# -----------------------------------------------------------------------------
print(f"Loading model: {CFG.MODEL_NAME}")
model = AutoModelForSequenceClassification.from_pretrained(
    CFG.MODEL_NAME,
    num_labels=2, # Binary classification (human=0, AI=1)
    # ignore_mismatched_sizes=True # Use if encountering checkpoint size issues (less common now)
)

# Define metrics - AUC is the competition metric
def compute_metrics(eval_pred):
    """Computes AUC score for evaluation."""
    predictions, labels = eval_pred
    # Sigmoid / Softmax needed depending on model output type
    # Trainer often handles logits, so apply softmax or sigmoid
    # If logits:
    # probs = torch.nn.functional.softmax(torch.Tensor(predictions), dim=-1)[:, 1].numpy()
    # If probabilities directly (less common from Trainer):
    # probs = predictions[:, 1]

    # Assuming predictions are logits (most common from HF Trainer)
    if isinstance(predictions, tuple): # Handle potential tuple output
        logits = predictions[0]
    else:
        logits = predictions
    
    probs = torch.tensor(logits).softmax(dim=-1)[:, 1].numpy() # Probability of class 1
    
    auc = roc_auc_score(labels, probs)
    # You could add accuracy, f1 etc. here too
    # from sklearn.metrics import accuracy_score
    # preds = np.argmax(logits, axis=1)
    # acc = accuracy_score(labels, preds)
    return {
        'auc': auc,
        # 'accuracy': acc
    }

print("Setting up Training Arguments...")
training_args = TrainingArguments(
    output_dir=CFG.OUTPUT_DIR,
    learning_rate=CFG.LEARNING_RATE,
    per_device_train_batch_size=CFG.TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=CFG.EVAL_BATCH_SIZE,
    gradient_accumulation_steps=CFG.GRAD_ACCUM_STEPS,
    num_train_epochs=CFG.EPOCHS,
    weight_decay=CFG.WEIGHT_DECAY,
    eval_strategy="epoch",        # Evaluate every epoch
    save_strategy="epoch",              # Save model every epoch
    load_best_model_at_end=True,        # Load the best model based on metric
    metric_for_best_model="auc",        # Competition metric
    greater_is_better=True,
    fp16=USE_FP16,                      # Enable Mixed Precision
    # fp16_full_eval=USE_FP16,          # Use FP16 for evaluation too (if memory allows)
    logging_strategy="steps",           # Log periodically
    logging_steps=50,                  # Log every 50 steps
    report_to="none",                   # Disable WandB/Tensorboard logging for simplicity
    save_total_limit=1,                 # Only keep the best checkpoint
    seed=CFG.RANDOM_SEED,
    # dataloader_num_workers=2,         # Can sometimes speed up data loading
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)] # Stop if no improvement for 3 epochs
)

# Clean cache before training
gc.collect()
torch.cuda.empty_cache()

# -----------------------------------------------------------------------------
# 6. Train the Model
# -----------------------------------------------------------------------------
print("Starting training...")
train_result = trainer.train()

# Print training summary
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
print("\nTraining complete.")

# -----------------------------------------------------------------------------
# 7. Evaluate on Validation Set
# -----------------------------------------------------------------------------
print("\nEvaluating on validation set...")
eval_metrics = trainer.evaluate()
trainer.log_metrics("eval", eval_metrics)
trainer.save_metrics("eval", eval_metrics)
print(f"Validation Metrics: {eval_metrics}")

# Optional: Save the best model explicitly (Trainer already saves it)
# trainer.save_model(f"{CFG.OUTPUT_DIR}/best_model")
# tokenizer.save_pretrained(f"{CFG.OUTPUT_DIR}/best_model")

# -----------------------------------------------------------------------------
# 8. Generate Predictions for Submission
# -----------------------------------------------------------------------------
print("\nGenerating predictions on the test set...")

# Ensure model is on the correct device (GPU if available)
if torch.cuda.is_available():
    model.to('cuda')

# Get predictions
# The trainer.predict() method returns prediction outputs, which include logits
predictions_output = trainer.predict(test_tokenized)
test_logits = predictions_output.predictions

# Convert logits to probabilities (probability of being AI-generated, i.e., class 1)
test_probs = torch.tensor(test_logits).softmax(dim=-1)[:, 1].numpy()

# Create submission DataFrame
submission_df = pd.DataFrame({
    'id': test_essays['id'],
    'generated': test_probs
})

# Save submission file
submission_df.to_csv("submission.csv", index=False)

print("\nSubmission file created: submission.csv")
print(submission_df.head())

# -----------------------------------------------------------------------------
# 9. Clean up
# -----------------------------------------------------------------------------
print("\nCleaning up memory...")
del model, tokenizer, trainer, train_tokenized, val_tokenized, test_tokenized
gc.collect()
torch.cuda.empty_cache()
print("Done.")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Libraries imported.
GPU detected: Tesla T4
Loading data...
Loaded external dataset with 44868 rows.
Rows with NaN in 'label' column:
Empty DataFrame
Columns: [text, label, prompt_name, source, RDizzl3_seven]
Index: []
Combined training data shape: (46246, 5)
Train essays shape: (46246, 5)
Test essays shape: (3, 3)
                                                text  label prompt_name  \
0  Cars. Cars have been around since they became ...      0         NaN   
1  Transportation is a large necessity in most co...      0         NaN   
2  "America's love affair with it's vehicles seem...      0         NaN   
3  How often do you ride in a car? Do you drive a...      0         NaN   
4  Cars are a wonderful thing. They are perhaps o...      0         NaN   

  source RDizzl3_seven  
0    NaN           NaN  
1    NaN           NaN  
2    NaN           NaN  
3    NaN           NaN  
4    NaN           NaN  
Splitting data (Validation size: 0.2)...
Train split shape: (36996, 5)
Validation s



Tokenizing datasets...
Columns to remove from train_dataset: ['text', '__index_level_0__']
Columns to remove from val_dataset: ['text', '__index_level_0__']
Columns to remove from test_dataset: ['text']


Map:   0%|          | 0/36996 [00:00<?, ? examples/s]

Map:   0%|          | 0/9250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Tokenization complete.
Loading model: microsoft/deberta-v3-base


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Setting up Training Arguments...
Starting training...




Epoch,Training Loss,Validation Loss
