# Step 1: Install Dependencies

In [None]:
# Install necessary libraries for text classification and deployment
!pip install -q transformers datasets gradio

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.3/62.3 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m322.1/322.1 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.9/94.9 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.2/11.2 MB[0m [31m48.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.0/72.0 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# Step 2: Load & Prepare Dataset

In [None]:
from datasets import Dataset

# Toy corpus of AI-generated sentences (e.g. from ChatGPT, DeepSeek, Claude)
ai_text_samples = [
    "Artificial intelligence is revolutionizing industries by automating processes.",
    "The universe is vast, with countless galaxies and undiscovered planets.",
    "Quantum computing has the potential to solve complex problems exponentially faster.",
    "AI-generated text can sometimes mimic human writing patterns convincingly.",
]

# Toy corpus of human-written sentences (e.g. from Wikipedia, books, articles)
human_text_samples = [
    "The history of ancient civilizations dates back thousands of years.",
    "Cooking requires a balance of flavors, textures, and techniques.",
    "A compelling novel keeps readers engaged with strong characters and plot development.",
    "Traveling allows individuals to experience different cultures and traditions.",
]

# Labels line up with the concatenation order used below:
# 1 = AI-generated, 0 = human-written
labels = [1] * len(ai_text_samples) + [0] * len(human_text_samples)

# Build a Hugging Face Dataset with a "text" column and a "label" column
dataset = Dataset.from_dict(
    {"text": ai_text_samples + human_text_samples, "label": labels}
)

# Shuffle and split into train (80%) / test (20%).
# A fixed seed makes the split identical on every run; without it the
# examples landing in the tiny test set change each time the cell is executed,
# so evaluation numbers are not comparable between runs.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

# Step 3: Load the `ModernBERT-base` Tokenizer & Tokenize the Dataset

In [None]:
from transformers import AutoTokenizer

# Hub identifier of the ModernBERT checkpoint used for both tokenizer and model
model_name = "answerdotai/ModernBERT-base"

# Fetch the tokenizer that matches the checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)


def _tokenize_batch(batch):
    """Tokenize the "text" column of a batch, padding/truncating to 128 tokens."""
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )


# Apply the tokenizer to every split in batched mode
tokenized_datasets = dataset.map(_tokenize_batch, batched=True)

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

# Step 4: Fine-Tune `ModernBERT-base`

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load ModernBERT with a 2-class sequence-classification head
# (label 1 = AI-generated, label 0 = human-written, as defined in the dataset)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Training configuration
training_args = TrainingArguments(
    output_dir="./ai_detector",  # Directory to save model checkpoints
    # Fine-tuning a pretrained encoder normally uses a small step size
    # (roughly 1e-5 to 1e-4). The previous value of 1e-3 is large enough to
    # wipe out the pretrained weights and make the loss diverge.
    learning_rate=5e-5,
    weight_decay=0.01,  # Regularization to prevent overfitting
    per_device_train_batch_size=2,  # Batch size for training
    per_device_eval_batch_size=2,  # Batch size for evaluation
    num_train_epochs=3,  # Number of passes over the training set
    eval_strategy="epoch",  # Evaluate at the end of each epoch
    save_strategy="epoch",  # Checkpoint at the end of each epoch (same cadence as eval)
    # Log once per epoch. `logging_steps` only takes effect with
    # logging_strategy="steps", so it is intentionally not set here.
    logging_strategy="epoch",
    logging_dir="./logs",  # Directory to store training logs
    load_best_model_at_end=True,  # Reload the best checkpoint after training
    # Sends metrics to Weights & Biases.
    # NOTE(review): presumably needs the `wandb` package and a login — confirm;
    # the install cell does not install it.
    report_to=["wandb"],
    # push_to_hub=True,  # Uncomment to upload model to Hugging Face Hub
)

# Wire the model, configuration, data splits and tokenizer into a Trainer
trainer = Trainer(
    model=model,  # ModernBERT model
    args=training_args,  # Training configuration
    train_dataset=tokenized_datasets["train"],  # Training data
    eval_dataset=tokenized_datasets["test"],  # Evaluation data
    processing_class=tokenizer,  # Tokenizer for processing text
)

# Run fine-tuning
trainer.train()

# Step 5: Test the Model

In [None]:
import numpy as np
from transformers import pipeline

# Wrap the fine-tuned model and its tokenizer in a text-classification pipeline
classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
)

# Sample passage used as a quick smoke test of the classifier
test_text = (
    "Deep learning is a subset of machine learning that uses artificial neural "
    "networks to model and solve complex problems. It is characterized by deep "
    "architectures with multiple layers of interconnected neurons, allowing it "
    "to automatically learn hierarchical representations from data."
)

# The pipeline returns a list with one {"label", "score"} dict per input text
result = classifier(test_text)
top_prediction = result[0]

# Map the raw class name onto a readable label ("LABEL_1" is the AI class)
if top_prediction["label"] == "LABEL_1":
    label = "AI-Generated"
else:
    label = "Human-Written"
confidence = np.round(top_prediction["score"], 3)

# Report the outcome
print(f"Prediction: {label} (Confidence: {confidence})")

# Step 6: Deploy as a Gradio Web App

In [None]:
import gradio as gr

# Define function for real-time AI text detection
def predict_ai_text(input_text):
    """Classify a piece of text as AI-generated or human-written.

    Args:
        input_text: Text entered by the user. May be empty or ``None`` when
            the UI field is left blank.

    Returns:
        A display string of the form ``"<label> (Confidence: <score>)"``, or a
        short prompt asking for input when no text was supplied.
    """
    # Guard against blank submissions instead of classifying an empty string
    if not input_text or not input_text.strip():
        return "Please enter some text to analyze."

    # truncation=True is forwarded to the tokenizer so over-long pasted text
    # is cut to the model's maximum length rather than raising an error
    result = classifier(input_text, truncation=True)
    top_prediction = result[0]

    # "LABEL_1" is the AI-generated class; anything else is treated as human
    label = "AI-Generated" if top_prediction["label"] == "LABEL_1" else "Human-Written"
    confidence = np.round(top_prediction["score"], 3)
    return f"{label} (Confidence: {confidence})"

# Build a text-in / text-out Gradio interface around predict_ai_text and serve it
demo = gr.Interface(
    fn=predict_ai_text,
    inputs="text",
    outputs="text",
    title="AI Text Detector",
)
demo.launch()