In [None]:
%%capture 
pip install transformers datasets

In [None]:
pip install --upgrade transformers

In [None]:
cd /kaggle/input/daigt-proper-train-dataset

In [None]:
# ======= Load Libraries =======
# This cell runs before the notebook's main import cell, so it imports the
# names it uses itself; otherwise a fresh kernel fails here with NameError.
import pandas as pd
from datasets import load_dataset

# ======= Load DAIGT Dataset =======
# Absolute paths so the cell does not depend on the current working directory.
DAIGT_DIR = "/kaggle/input/daigt-proper-train-dataset"
DAIGT_FILES = [
    "train_drcat_01.csv",
    "train_drcat_02.csv",
    "train_drcat_03.csv",
    "train_drcat_04.csv",
]
# Read every shard once and concatenate in a single call.
df = pd.concat(
    [pd.read_csv(f"{DAIGT_DIR}/{file}") for file in DAIGT_FILES],
    ignore_index=True,
)

# Binary label: 1 = AI, 0 = human
df["label"] = (df["label"] >= 0.95).astype(int)

# Balance: N_PER_CLASS rows per class (a tiny smoke-test sample), then shuffle.
N_PER_CLASS = 10
sampled_df = pd.concat([
    df[df["label"] == 0].sample(n=N_PER_CLASS, random_state=42),
    df[df["label"] == 1].sample(n=N_PER_CLASS, random_state=42)
]).sample(frac=1, random_state=42).reset_index(drop=True)

# Split 80/20 train/val
train_df = sampled_df.sample(frac=0.8, random_state=42)
val_df = sampled_df.drop(train_df.index)

# ======= Load Deepfake Dataset (OOD Test) =======
deepfake_ds = load_dataset("yaful/MAGE", split="train")
sampled_deepfake_df = pd.DataFrame(deepfake_ds.select(range(100)))
# MAGE contains both human and machine text and ships its own `label` column,
# so rows must not all be marked AI-generated. Its polarity is the reverse of
# the one used above (MAGE: 1 = human, 0 = machine, per the dataset card --
# NOTE(review): confirm), hence the flip to 1 = AI, 0 = human.
sampled_deepfake_df["label"] = 1 - sampled_deepfake_df["label"]

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import AlbertTokenizer, AlbertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import load_dataset
import numpy as np
import random
import pandas as pd

# Reproducibility: seed the stdlib, NumPy and PyTorch generators with the
# same fixed value.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Device configuration: use CUDA when PyTorch reports it as available,
# otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Custom Dataset class
class TextDataset(Dataset):
    """Dataset that tokenizes one text per lookup.

    Stores parallel sequences of texts and labels together with a tokenizer
    callable. Each item is a dict with ``input_ids``, ``attention_mask`` and
    ``labels`` tensors; inputs are padded/truncated to ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        # One sample per stored text.
        return len(self.texts)

    def __getitem__(self, idx):
        # str() coerces non-string entries before they reach the tokenizer.
        raw_text = str(self.texts[idx])
        target = self.labels[idx]

        encoded = self.tokenizer(
            raw_text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # flatten() collapses each returned tensor to 1-D.
        sample = {
            key: encoded[key].flatten()
            for key in ("input_ids", "attention_mask")
        }
        sample["labels"] = torch.tensor(target, dtype=torch.long)
        return sample

In [None]:
# Load and sample datasets
def load_and_sample_data(
    n_per_class=40000,
    data_dir="/kaggle/input/daigt-proper-train-dataset",
):
    """Load the four DAIGT CSV shards and draw a balanced, shuffled sample.

    Args:
        n_per_class: Number of rows to draw from each class (0 = human,
            1 = AI). Defaults to 40000, i.e. 80000 rows in total.
        data_dir: Directory containing ``train_drcat_01.csv`` ..
            ``train_drcat_04.csv``.

    Returns:
        A ``(texts, labels)`` tuple of equal-length lists. ``labels`` holds
        ints where 1 means the source label was >= 0.95 and 0 otherwise.

    Raises:
        ValueError: If either class has fewer than ``n_per_class`` rows.
    """
    import os

    shard_names = [
        "train_drcat_01.csv",
        "train_drcat_02.csv",
        "train_drcat_03.csv",
        "train_drcat_04.csv",
    ]
    # Read each shard exactly once from data_dir. The previous loop
    # concatenated a notebook-global `df` (not the local frame) and used
    # cwd-relative paths, so the result depended on hidden kernel state.
    daigt_df = pd.concat(
        [pd.read_csv(os.path.join(data_dir, name)) for name in shard_names],
        ignore_index=True,
    )

    # Binary label: 1 = AI, 0 = human
    daigt_df["label"] = (daigt_df["label"] >= 0.95).astype(int)

    class_0 = daigt_df[daigt_df["label"] == 0]
    class_1 = daigt_df[daigt_df["label"] == 1]

    if len(class_0) < n_per_class or len(class_1) < n_per_class:
        raise ValueError(
            f"Not enough samples: class 0 has {len(class_0)}, "
            f"class 1 has {len(class_1)}. Need {n_per_class} each."
        )

    # Balance the classes, then shuffle so they are interleaved.
    daigt_sample = (
        pd.concat([
            class_0.sample(n=n_per_class, random_state=42),
            class_1.sample(n=n_per_class, random_state=42),
        ])
        .sample(frac=1, random_state=42)
        .reset_index(drop=True)
    )

    daigt_texts = daigt_sample["text"].tolist()
    daigt_labels = daigt_sample["label"].tolist()
    return daigt_texts, daigt_labels

In [None]:
# mage_dataset = load_dataset("yaful/MAGE", split="train")
# mage_df = pd.DataFrame(mage_dataset)
# mage_df.head()

In [None]:
# Load MAGE test set
def load_mage_test_data(tokenizer, sample_size=1000):
    """Build an out-of-distribution test set from the yaful/MAGE train split.

    Args:
        tokenizer: Tokenizer handed to ``TextDataset``.
        sample_size: Number of rows to sample (default 1000). Clamped to the
            number of rows available.

    Returns:
        A ``TextDataset`` over the sampled texts, with labels converted to the
        convention used for training in this notebook (1 = AI, 0 = human).

    Raises:
        RuntimeError: If the dataset cannot be loaded; the underlying error is
            attached as ``__cause__``.
    """
    try:
        mage_dataset = load_dataset("yaful/MAGE", split="train")
    except Exception as e:
        # Chain the original error so the real cause stays in the traceback.
        raise RuntimeError("Ensure you have access to yaful/MAGE.") from e
    mage_df = pd.DataFrame(mage_dataset)

    # Fixed seed for a repeatable sample.
    n_rows = min(sample_size, len(mage_df))
    mage_test = mage_df.sample(n=n_rows, random_state=43)
    texts = mage_test["text"].tolist()
    # MAGE's label polarity is the reverse of this notebook's (MAGE: 1 = human,
    # 0 = machine, per the dataset card -- NOTE(review): confirm), so flip it;
    # otherwise every metric computed against these labels is inverted.
    labels = [1 - label for label in mage_test["label"].tolist()]
    return TextDataset(texts, labels, tokenizer)

In [None]:
# Evaluation function
def evaluate_model(model, test_loader):
    """Run ``model`` over ``test_loader`` and compute classification metrics.

    The model is switched to eval mode and gradients are disabled. The
    predicted class is the argmax over the logits of each row.

    Returns:
        Dict with ``accuracy``, ``precision``, ``recall`` and ``f1``
        (precision/recall/F1 use ``average="binary"``).
    """
    model.eval()
    predicted, expected = [], []

    with torch.no_grad():
        for batch in test_loader:
            targets = batch["labels"].to(device)
            logits = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            ).logits

            predicted.extend(torch.argmax(logits, dim=1).cpu().numpy())
            expected.extend(targets.cpu().numpy())

    # Calculate metrics
    accuracy = accuracy_score(expected, predicted)
    precision, recall, f1, _support = precision_recall_fscore_support(
        expected, predicted, average="binary"
    )

    return dict(accuracy=accuracy, precision=precision, recall=recall, f1=f1)

In [None]:
# Main function to fine-tune ALBERT
def main(
    num_epochs=5,
    batch_size=16,
    learning_rate=2e-5,
    output_dir="/kaggle/working/albert_finetuned",
):
    """Fine-tune ``albert-base-v2`` on the DAIGT sample, evaluate and save it.

    Steps: load tokenizer/model, load the balanced DAIGT sample, make a
    stratified 80/20 split, train with AdamW, print metrics on the held-out
    20%, then save the model and tokenizer.

    Args:
        num_epochs: Number of passes over the training split.
        batch_size: Batch size for the train and test loaders.
        learning_rate: AdamW learning rate.
        output_dir: Directory that receives BOTH the model and the tokenizer.
            The default is the path the inference cells of this notebook load
            from; saving the two artifacts to separate
            ``albert_finetuned_model`` / ``albert_finetuned_tokenizer``
            directories left that path non-existent.
    """
    # Load tokenizer and model
    tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
    model = AlbertForSequenceClassification.from_pretrained(
        "albert-base-v2",
        num_labels=2,
        hidden_dropout_prob=0.1
    ).to(device)

    # Load and preprocess data
    texts, labels = load_and_sample_data()

    # Train-test split (80:20), stratified so both splits stay balanced
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        texts, labels, test_size=0.2, random_state=42, stratify=labels
    )

    # Create datasets
    train_dataset = TextDataset(train_texts, train_labels, tokenizer)
    test_dataset = TextDataset(test_texts, test_labels, tokenizer)
    # NOTE: the MAGE out-of-distribution evaluation is disabled. The MAGE
    # dataset used to be downloaded here even though nothing consumed it, so
    # that load was removed. To re-enable, wrap
    # load_mage_test_data(tokenizer) in a DataLoader and pass it to
    # evaluate_model().

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Set up optimizer
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    # Training loop
    for epoch in range(num_epochs):
        # Set train mode each epoch so the loop stays correct if evaluation
        # (which switches to eval mode) is ever run between epochs.
        model.train()
        total_loss = 0.0
        for batch in train_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            # Distinct name: `labels` above is the full list of dataset labels.
            batch_labels = batch["labels"].to(device)

            optimizer.zero_grad()
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=batch_labels
            )
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

    # Evaluate on 80:20 test set
    test_metrics = evaluate_model(model, test_loader)
    print("\n80:20 Test Set Evaluation Results:")
    print(f"Accuracy: {test_metrics['accuracy']:.4f}")
    print(f"Precision: {test_metrics['precision']:.4f}")
    print(f"Recall: {test_metrics['recall']:.4f}")
    print(f"F1 Score: {test_metrics['f1']:.4f}")

    # Save model and tokenizer side by side so a single from_pretrained path
    # restores both.
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

In [None]:
# Run the fine-tuning pipeline defined in main(): train, print test-set
# metrics, and save the model and tokenizer.
main()

In [None]:
!zip -r /kaggle/working/ALBERT.zip /kaggle/working

In [None]:
pip install fastapi

In [None]:
pip install uvicorn

In [None]:
pip install pyngrok

In [None]:
from fastapi import FastAPI
from pydantic import BaseModel
import torch
from transformers import AlbertTokenizer, AlbertForSequenceClassification
import numpy as np
import nest_asyncio
import uvicorn
from pyngrok import ngrok
import threading

# Apply nest_asyncio to allow nested event loops, so the Uvicorn server
# started further down can run from inside the notebook.
nest_asyncio.apply()

# Initialize the FastAPI app; the /classify route below is registered on it.
app = FastAPI()

# Input data model for the /classify endpoint: a single string field
# carrying the text to classify.
class TextInput(BaseModel):
    text: str  # raw text forwarded to predict_text

# Restore the fine-tuned tokenizer and classifier from the same directory,
# move the model to the GPU when CUDA is available (CPU otherwise) and put it
# in eval mode for inference.
model_path = "/kaggle/working/albert_finetuned"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AlbertTokenizer.from_pretrained(model_path)
model = AlbertForSequenceClassification.from_pretrained(model_path).to(device)
model.eval()

# Prediction function
def predict_text(text: str):
    """Classify ``text`` with the module-level tokenizer and model.

    The text is padded/truncated to 128 tokens, run through the model without
    gradients, and the logits are softmaxed.

    Returns:
        ``{"label": ..., "confidence": ...}`` where the label is ``"human"``
        for predicted class 0 and ``"machine-generated"`` otherwise, and the
        confidence is the softmax probability of the predicted class.
    """
    tokens = tokenizer(
        text,
        add_special_tokens=True,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )

    with torch.no_grad():
        logits = model(
            input_ids=tokens["input_ids"].to(device),
            attention_mask=tokens["attention_mask"].to(device),
        ).logits
        # Single input, so take row 0 of the probabilities.
        probs = torch.softmax(logits, dim=1).cpu().numpy()[0]

    pred_class = int(np.argmax(probs))
    # Class 0 is "human"; anything else maps to "machine-generated".
    label = {0: "human"}.get(pred_class, "machine-generated")

    return {"label": label, "confidence": float(probs[pred_class])}

# API endpoint: POST /classify takes a TextInput body and responds with the
# label/confidence dict produced by predict_text.
@app.post("/classify")
async def classify_text(input: TextInput):
    return predict_text(input.text)

# Function to run Uvicorn server
def run_server():
    """Serve ``app`` with Uvicorn on host 0.0.0.0, port 8000."""
    bind_host, bind_port = "0.0.0.0", 8000
    uvicorn.run(app, host=bind_host, port=bind_port)

# Main execution
if __name__ == "__main__":
    import os

    # pyngrok is already imported at the top of this cell, so the previous
    # "pip install on ImportError" fallback here could never run and is gone.

    # The ngrok authtoken is a credential and must not live in the notebook.
    # Read it from the environment instead. The token that used to be
    # hard-coded here has been exposed and should be revoked.
    ngrok_token = os.environ.get("NGROK_AUTHTOKEN")
    if not ngrok_token:
        raise RuntimeError(
            "NGROK_AUTHTOKEN is not set; export your ngrok authtoken in the "
            "environment before starting the tunnel."
        )
    ngrok.set_auth_token(ngrok_token)

    # Start ngrok tunnel
    public_url = ngrok.connect(8000, bind_tls=True)
    print(f"Ngrok tunnel started at: {public_url}")

    # Run Uvicorn server in a separate thread to avoid blocking
    server_thread = threading.Thread(target=run_server)
    server_thread.start()

    # Keep the main thread alive until the server stops or is interrupted
    try:
        server_thread.join()
    except KeyboardInterrupt:
        print("Shutting down server...")
    finally:
        # disconnect() expects the tunnel's URL string. Recent pyngrok returns
        # a tunnel object from connect() (URL in `.public_url`), while older
        # versions returned the string itself, so handle both.
        ngrok.disconnect(getattr(public_url, "public_url", public_url))
        ngrok.kill()

In [None]:
# Load the train_drcat_04.csv shard; one of its rows is used further down as
# a sample query for predict_text.
df_test = pd.read_csv("/kaggle/input/daigt-proper-train-dataset/train_drcat_04.csv")


In [None]:
# Display the text at positional row 44204 (the same row that is later passed
# to predict_text as the test query).
df_test.iloc[44204]["text"]

In [None]:
import torch
from transformers import AlbertTokenizer, AlbertForSequenceClassification
import numpy as np

# Restore the fine-tuned tokenizer and classifier from the same directory,
# move the model to the GPU when CUDA is available (CPU otherwise) and put it
# in eval mode for inference.
model_path = "/kaggle/working/albert_finetuned"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AlbertTokenizer.from_pretrained(model_path)
model = AlbertForSequenceClassification.from_pretrained(model_path).to(device)
model.eval()

# Prediction function
def predict_text(text: str):
    """Classify ``text`` with the module-level tokenizer and model.

    Returns:
        ``{"label": ..., "confidence": ...}`` where the label is
        ``"machine-generated"`` for predicted class 1 and ``"human"``
        otherwise, and the confidence is the softmax probability of the
        predicted class.
    """
    # Tokenize, padding/truncating to 128 tokens.
    tokens = tokenizer(
        text,
        add_special_tokens=True,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    ids = tokens["input_ids"].to(device)
    mask = tokens["attention_mask"].to(device)

    # Forward pass without gradients; row 0 holds the single input's scores.
    with torch.no_grad():
        scores = model(input_ids=ids, attention_mask=mask).logits
        probs = torch.softmax(scores, dim=1).cpu().numpy()[0]
    pred_class = int(np.argmax(probs))

    # Map class to label
    if pred_class == 1:
        label = "machine-generated"
    else:
        label = "human"

    return {"label": label, "confidence": float(probs[pred_class])}

# Spot check: classify one row of df_test and print the outcome.
query = df_test.iloc[44204]["text"]
result = predict_text(query)
print(
    "Prediction Result:",
    f"Label: {result['label']}",
    f"Confidence: {result['confidence']:.4f}",
    sep="\n",
)