In [1]:
# --- 1. Environment Setup ---
import torch
from transformers import DebertaTokenizer, DebertaForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
from tqdm import tqdm
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Force CPU usage: every tensor and the model are placed on this device.
device = torch.device("cpu")

# Single source of truth for the intra-op thread count, so the value handed to
# PyTorch and the value reported below can never drift apart.
NUM_THREADS = 4  # Optimize for 4 CPU cores
torch.set_num_threads(NUM_THREADS)
print(f"Running on CPU with {NUM_THREADS} threads")


Running on CPU with 4 threads


In [4]:
# --- 2. Data Preparation with Custom Dataset ---
def _read_texts(path):
    """Read the text samples stored in a single CSV or TXT file.

    The format is chosen from this file's own extension (case-insensitive):
    a CSV file must have a 'text' column, a TXT file holds one sample per line.

    Raises:
        ValueError: if the extension is neither .csv nor .txt.
        KeyError: if a CSV file has no 'text' column.
    """
    extension = os.path.splitext(path)[1].lower()

    if extension == '.csv':
        frame = pd.read_csv(path)
        if 'text' not in frame.columns:
            raise KeyError(f"CSV file {path!r} has no 'text' column")
        # Empty cells are parsed as NaN; dropping them keeps them from being
        # stringified into the literal sample "nan" further down the pipeline.
        return frame['text'].dropna().tolist()

    if extension == '.txt':
        with open(path, 'r', encoding='utf-8') as f:
            # Blank / whitespace-only lines are not samples.
            return [line.strip() for line in f if line.strip()]

    raise ValueError("Unsupported file format. Use CSV or TXT files.")


def load_custom_data(real_data_path, fake_data_path, test_size=0.2, random_state=42):
    """
    Load your custom dataset from files and split it into train and test sets.

    Expected file formats: CSV or TXT (each file is detected independently,
    so the two paths may use different formats).

    CSV Format should have at least one column named 'text'
    TXT Format should have one text sample per line

    Args:
        real_data_path: file with the real samples (label 0).
        fake_data_path: file with the fake samples (label 1).
        test_size: fraction of samples held out for testing.
        random_state: seed passed to the split for reproducibility.

    Returns:
        (train_texts, test_texts, train_labels, test_labels) as produced by
        train_test_split.

    Raises:
        ValueError: if either file has an unsupported extension.
        KeyError: if a CSV file has no 'text' column.
        AssertionError: if either file yields no samples.
    """
    real_texts = _read_texts(real_data_path)
    fake_texts = _read_texts(fake_data_path)

    # Verify data loaded correctly
    print(f"Loaded {len(real_texts)} real samples and {len(fake_texts)} fake samples")
    if not real_texts or not fake_texts:
        # Raised explicitly (same exception type as before) so the check is
        # not stripped when Python runs with -O.
        raise AssertionError("No data loaded - check your files")

    texts = real_texts + fake_texts
    labels = [0]*len(real_texts) + [1]*len(fake_texts)

    return train_test_split(texts, labels, test_size=test_size, random_state=random_state)

# --- Dataset location ---
# Set the NEWS_DATA_DIR environment variable to the folder that holds True.csv
# and Fake.csv; when it is unset, the original author's local folder is used,
# so existing behaviour is unchanged.
DATA_DIR = os.environ.get(
    "NEWS_DATA_DIR",
    "/Users/fenilvadher/Documents/Collage Data/SEM - 6/AI/AI Project/Fake-Real News Dataset",
)
real_data_path = os.path.join(DATA_DIR, "True.csv")  # or .txt
fake_data_path = os.path.join(DATA_DIR, "Fake.csv")  # or .txt

# Real samples get label 0 and fake samples label 1 inside load_custom_data.
train_texts, test_texts, train_labels, test_labels = load_custom_data(real_data_path, fake_data_path)

Loaded 21417 real samples and 23481 fake samples


In [5]:
# --- 3. Dataset Class ---
class TextDataset(Dataset):
    """Dataset that pairs texts with labels and tokenizes each text on access.

    Args:
        texts: indexable collection of samples (each is converted with str()).
        labels: indexable collection of labels, aligned with ``texts``.
        tokenizer: callable invoked with the text plus max_length / padding /
            truncation / return_tensors keyword arguments.
        max_length: length every sample is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        # One item per text.
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the flattened input ids, attention mask and label for one sample."""
        encoded = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        # Flatten each tensor returned by the tokenizer to 1-D.
        item = {key: encoded[key].flatten() for key in ("input_ids", "attention_mask")}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item


In [6]:
# --- 4. Initialize DeBERTa ---
# Tokenizer and classifier are both loaded from the same pretrained checkpoint.
checkpoint = "microsoft/deberta-base"
tokenizer = DebertaTokenizer.from_pretrained(checkpoint)

# Two output labels, matching the 0/1 labels produced by load_custom_data.
model = DebertaForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
model = model.to(device)


Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Error while downloading from https://cdn-lfs.hf.co/microsoft/deberta-base/b8dd0f54523e221f5e4dc2457d61da3115ecfe859c01010954d39e25b0ecf271?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1744351576&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0NDM1MTU3Nn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9taWNyb3NvZnQvZGViZXJ0YS1iYXNlL2I4ZGQwZjU0NTIzZTIyMWY1ZTRkYzI0NTdkNjFkYTMxMTVlY2ZlODU5YzAxMDEwOTU0ZDM5ZTI1YjBlY2YyNzE%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=B%7EoDhgj0POWQEqwzWNbfXzXQZX5tqszPogihbKzUsXRWYSa0uGW8uKb53H1RjuC51qPMSKmM5HHZ7H8dMcdLcB-%7EI7jl79y3eDFzRg1zWdvpaCW%7EDz6gPMK2zK%7EjRWgsw63951yFgIiiCYd5DBCX9NxgYieE-AP-5Y4c2lm41Z7F8Kj79n41XCsmFsGZWnK2OYXVOFI2DJlBORQ6ACqacL6fz9jeAs1AnontcdtRLFsHUfJeotVX%7ElO1BLj1lKh3AFKsmYmtFGSUPlvPFT8-b3Bs3UekqYvNNWoAxjncn31%7E5%7ECcg1F4hLk8UmP6RjD3XsKHwKKEQp4-IN1ZXaCy4A__&Key-Pair-Id=K3RPWS32NSSJCE: HTTPSC

In [7]:
# --- 5. DataLoaders ---
# Wrap each split in a TextDataset (default max_length) sharing one tokenizer.
train_dataset = TextDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
test_dataset = TextDataset(texts=test_texts, labels=test_labels, tokenizer=tokenizer)

# Training batches are reshuffled every epoch; test batches keep dataset order.
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)


In [8]:
# --- 6. Training Loop ---
# Hyperparameters are named so they can be tuned in one place.
LEARNING_RATE = 1e-5
NUM_EPOCHS = 2

optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0.0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}")

    # Count steps explicitly: while iterating, tqdm only advances
    # `progress_bar.n` when it refreshes the display, so dividing by
    # `progress_bar.n + 1` can use a stale (too small) denominator and
    # overstate the running average loss.
    for step, batch in enumerate(progress_bar, start=1):
        optimizer.zero_grad()

        inputs = {
            "input_ids": batch["input_ids"].to(device),
            "attention_mask": batch["attention_mask"].to(device),
            "labels": batch["labels"].to(device)
        }

        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        # Running mean of the loss over the batches seen so far in this epoch.
        progress_bar.set_postfix({"loss": total_loss / step})

Epoch 1: 100%|██████████| 8980/8980 [4:48:10<00:00,  1.93s/it, loss=0.0087]      
Epoch 2: 100%|██████████| 8980/8980 [5:35:35<00:00,  2.24s/it, loss=0.00134]      


In [9]:
# --- 8. Save Model ---
# Persist only the weights (state_dict), not the whole model object.
torch.save(obj=model.state_dict(), f="custom_deberta_detector.pth")
print("Model saved to custom_deberta_detector.pth")

Model saved to custom_deberta_detector.pth


In [10]:
# --- 9. Prediction Function ---
def detect_deepfake(text):
    """Classify a single text as "Real" or "Fake" with the notebook's model.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: the text to classify (padded/truncated to 128 tokens).

    Returns:
        dict with:
            "prediction": "Fake" if the class-1 probability exceeds 0.5, else "Real"
            "confidence": the larger of the two class probabilities (float)
            "real_prob" / "fake_prob": class probabilities formatted as percentages
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        max_length=128,
        truncation=True,
        padding="max_length"
    ).to(device)

    # The training loop leaves the model in train mode, which keeps train-only
    # behaviour such as dropout active and makes predictions noisy. Switch to
    # eval mode for inference and restore the previous mode afterwards so
    # calling this function never changes how later training behaves.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**inputs)
            # Single input -> take the probabilities of the first (only) row.
            probs = torch.softmax(outputs.logits, dim=1)[0]
    finally:
        model.train(was_training)

    real_prob, fake_prob = probs[0].item(), probs[1].item()
    return {
        "prediction": "Fake" if fake_prob > 0.5 else "Real",
        "confidence": max(real_prob, fake_prob),
        "real_prob": f"{real_prob:.2%}",
        "fake_prob": f"{fake_prob:.2%}"
    }

In [11]:
# --- Example Usage with Custom Data ---
# Run the detector on the first held-out sample and print a short report.
print("\nTesting with sample from your dataset:")
sample_text = test_texts[0]  # Using your actual test data
result = detect_deepfake(sample_text)
print("\n".join([
    f"\nText: {sample_text}",
    f"Prediction: {result['prediction']}",
    f"Confidence: {result['confidence']:.2%}",
    f"Real Probability: {result['real_prob']}",
    f"Fake Probability: {result['fake_prob']}",
]))


Testing with sample from your dataset:

Prediction: Fake
Confidence: 100.00%
Real Probability: 0.00%
Fake Probability: 100.00%
