In [9]:
import torch
from transformers import DebertaTokenizer, DebertaForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
from torch.optim import AdamW

In [10]:
# Configuration
class Config:
    """Hyper-parameters and file locations shared by the notebook.

    All values are plain class attributes and are read as ``Config.<name>``.
    The two dataset locations can be overridden without editing the notebook
    by setting the ``FAKE_NEWS_REAL_DATA_PATH`` / ``FAKE_NEWS_FAKE_DATA_PATH``
    environment variables before this cell runs; when they are unset the
    original hard-coded locations are used.
    """
    # Model settings
    model_name = "microsoft/deberta-base"
    num_labels = 2  # Binary classification (real vs fake)

    # Training parameters
    batch_size = 8
    epochs = 3
    learning_rate = 2e-5
    max_length = 256  # Max token length

    # Dataset locations (.csv or .txt). The defaults are machine-specific, so
    # prefer the environment variables on any other machine.
    real_data_path = os.environ.get(
        "FAKE_NEWS_REAL_DATA_PATH",
        "/Users/fenilvadher/Documents/Collage Data/SEM - 6/AI/AI Project/Fake-Real News Dataset/True.csv",
    )
    fake_data_path = os.environ.get(
        "FAKE_NEWS_FAKE_DATA_PATH",
        "/Users/fenilvadher/Documents/Collage Data/SEM - 6/AI/AI Project/Fake-Real News Dataset/Fake.csv",
    )

    # Output directories for the fine-tuned model and its tokenizer
    model_save_path = "models/deberta_fake_text_detector"
    tokenizer_save_path = "models/tokenizer"

In [11]:
# Custom Dataset Class
class TextClassificationDataset(Dataset):
    """Dataset that tokenizes a single text each time an item is requested.

    Args:
        texts: Iterable of documents. Each one is converted with ``str``
            before it is handed to the tokenizer.
        labels: Iterable of integer class ids, one per entry of ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding="max_length", truncation=True, return_tensors="pt")``.
            Its result must expose ``'input_ids'`` and ``'attention_mask'``
            entries that support ``.flatten()``.
        max_length: Optional truncation/padding length. When left as ``None``
            the current value of ``Config.max_length`` is looked up on every
            item access, exactly as before this parameter existed.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=None):
        # Copy into lists so that ``self.texts[idx]`` is always positional,
        # even when the caller passes a container indexed by label
        # (for example a pandas Series with a shuffled index).
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(self.texts)} and {len(self.labels)})"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        # Resolve the fallback lazily so later edits to Config.max_length
        # still take effect for datasets built without an explicit length.
        max_length = Config.max_length if self.max_length is None else self.max_length
        encoding = self.tokenizer(
            text,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        return {
            # flatten() collapses the tokenizer output to one dimension
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

In [4]:
# --- 3. Dataset class with a per-instance max_length ---
class TextDataset(Dataset):
    """Tokenize-on-access dataset with its own ``max_length`` setting.

    ``texts`` and ``labels`` are stored as given and indexed with the same
    ``idx``. ``tokenizer`` is called once per item with padding to
    ``max_length``, truncation enabled and ``return_tensors="pt"``.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokenized = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        # Collapse each tokenizer output to one dimension via flatten().
        sample = {
            field: tokenized[field].flatten()
            for field in ("input_ids", "attention_mask")
        }
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

In [12]:
# Data Loading Function
def load_data(real_path, fake_path):
    """Load the real and fake text corpora from CSV or TXT files.

    A ``.csv`` file must contain a ``text`` column; rows whose ``text`` cell
    is missing are skipped. A ``.txt`` file contributes one sample per
    non-blank line, with surrounding whitespace stripped. The extension check
    is case-insensitive and ``pathlib.Path`` arguments are accepted.

    Args:
        real_path: Location of the file holding the real samples.
        fake_path: Location of the file holding the fake samples.

    Returns:
        Tuple ``(real_texts, fake_texts)`` of lists.

    Raises:
        ValueError: If a file has an unsupported extension, or if either
            file yields no samples.
        KeyError: If a CSV file has no ``text`` column.
    """
    def load_file(path):
        path = str(path)  # allow pathlib.Path as well as plain strings
        lowered = path.lower()
        if lowered.endswith('.csv'):
            frame = pd.read_csv(path)
            if 'text' not in frame.columns:
                raise KeyError(f"Column 'text' not found in {path}")
            # A missing cell is read as NaN and would otherwise turn into
            # the literal text "nan" once converted with str().
            return frame['text'].dropna().tolist()
        if lowered.endswith('.txt'):
            with open(path, 'r', encoding='utf-8') as f:
                stripped_lines = (line.strip() for line in f)
                return [line for line in stripped_lines if line]
        raise ValueError("File format not supported. Use .csv or .txt")

    print("Loading data...")
    real_texts = load_file(real_path)
    fake_texts = load_file(fake_path)

    # Verify data
    if not real_texts or not fake_texts:
        raise ValueError("No data loaded - check your file paths and contents")

    print(f"Loaded {len(real_texts)} real samples and {len(fake_texts)} fake samples")
    return real_texts, fake_texts

In [13]:
# Training Function
def train():
    """Fine-tune the DeBERTa classifier, evaluate it, and save the result.

    Steps, all driven by ``Config``:
      1. create the output directories;
      2. load the pretrained tokenizer and sequence-classification model;
      3. load both corpora and label them (0 = real, 1 = fake);
      4. hold out 20% of the samples (``random_state=42``);
      5. train for ``Config.epochs`` epochs with AdamW;
      6. print a classification report for the held-out split;
      7. save the model and tokenizer.

    Returns:
        None. Results are printed and written to disk.
    """
    # Create output directories
    os.makedirs(Config.model_save_path, exist_ok=True)
    os.makedirs(Config.tokenizer_save_path, exist_ok=True)

    # Initialize tokenizer and model
    tokenizer = DebertaTokenizer.from_pretrained(Config.model_name)
    model = DebertaForSequenceClassification.from_pretrained(
        Config.model_name,
        num_labels=Config.num_labels
    )

    # Load and prepare data
    real_texts, fake_texts = load_data(Config.real_data_path, Config.fake_data_path)

    # Create labels (0=real, 1=fake)
    texts = real_texts + fake_texts
    labels = [0]*len(real_texts) + [1]*len(fake_texts)

    # Split into train/test
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        texts, labels, test_size=0.2, random_state=42
    )

    # Create datasets
    train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer)
    test_dataset = TextClassificationDataset(test_texts, test_labels, tokenizer)

    # Create dataloaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=Config.batch_size,
        shuffle=True
    )
    test_loader = DataLoader(
        test_dataset,
        batch_size=Config.batch_size
    )

    # Set up training
    optimizer = AdamW(model.parameters(), lr=Config.learning_rate)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    print("\nStarting training...")
    for epoch in range(Config.epochs):
        _train_one_epoch(
            model,
            train_loader,
            optimizer,
            device,
            desc=f"Epoch {epoch + 1}/{Config.epochs}",
        )

    # Evaluation
    predictions, true_labels = _collect_predictions(model, test_loader, device)

    print("\nClassification Report:")
    print(classification_report(true_labels, predictions, target_names=["Real", "Fake"]))

    # Save model and tokenizer
    model.save_pretrained(Config.model_save_path)
    tokenizer.save_pretrained(Config.tokenizer_save_path)
    print(f"\nModel saved to {Config.model_save_path}")
    print(f"Tokenizer saved to {Config.tokenizer_save_path}")


def _batch_to_device(batch, device, keys):
    """Return ``{key: batch[key]}`` for the requested keys, moved to ``device``."""
    return {key: batch[key].to(device) for key in keys}


def _train_one_epoch(model, loader, optimizer, device, desc):
    """Run one optimisation pass over ``loader``, showing the running mean loss."""
    model.train()
    total_loss = 0
    progress_bar = tqdm(loader, desc=desc)

    for step, batch in enumerate(progress_bar, start=1):
        optimizer.zero_grad()

        inputs = _batch_to_device(
            batch, device, ('input_ids', 'attention_mask', 'labels')
        )

        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        # Average over an explicit step count: while iterating, tqdm only
        # refreshes its ``n`` attribute when it redraws the bar, so using
        # ``progress_bar.n`` as the divisor can lag behind the true count.
        progress_bar.set_postfix({'loss': total_loss / step})


def _collect_predictions(model, loader, device):
    """Return ``(predictions, true_labels)`` lists gathered over ``loader``."""
    model.eval()
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluating"):
            # Labels stay out of the forward pass; only logits are needed.
            inputs = _batch_to_device(batch, device, ('input_ids', 'attention_mask'))
            outputs = model(**inputs)
            preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
            predictions.extend(preds)
            true_labels.extend(batch['labels'].cpu().numpy())

    return predictions, true_labels

# Entry point: start training only when this code runs as the top-level
# module, not when it is imported from elsewhere.
if __name__ == "__main__":
    train()

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading data...
Loaded 21417 real samples and 23481 fake samples

Starting training...


Epoch 1/3:   0%|          | 14/4490 [00:51<4:33:41,  3.67s/it, loss=0.484]


KeyboardInterrupt: 