In [1]:
%pip install transformers torch pandas numpy scikit-learn tqdm


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load and prepare dataset
def load_data(
    real_path="/Users/fenilvadher/Documents/Collage Data/SEM - 6/AI/AI Project/Fake-Real News Dataset/True.csv",
    fake_path="/Users/fenilvadher/Documents/Collage Data/SEM - 6/AI/AI Project/Fake-Real News Dataset/Fake.csv",
):
    """Load the real and fake news CSVs into one labelled DataFrame.

    Args:
        real_path: CSV of genuine articles; its rows get ``label`` 0.
        fake_path: CSV of fabricated articles; its rows get ``label`` 1.
            Both default to the original author's local files, so pass
            explicit paths when running on any other machine.

    Returns:
        A DataFrame holding the rows of both files plus a ``label`` column,
        restricted to rows whose ``text`` is neither missing nor blank, with
        a fresh 0..n-1 index.

    Raises:
        KeyError: if either file lacks a ``text`` column.
        AssertionError: if the cleaned frame's index does not end at
            ``len(df) - 1``.
    """
    # assign() attaches the label without mutating the freshly read frames
    real_news = pd.read_csv(real_path).assign(label=0)
    fake_news = pd.read_csv(fake_path).assign(label=1)
    df = pd.concat([real_news, fake_news], ignore_index=True)

    # dropna() only catches NaN; whitespace-only bodies are just as empty and
    # would otherwise reach the tokenizer as content-free training examples.
    df = df.dropna(subset=['text'])
    df = df[df['text'].astype(str).str.strip().ne('')]
    df = df.reset_index(drop=True)

    # Verify data integrity
    assert df.index.max() == len(df) - 1, "Index mismatch detected"

    return df

In [4]:
# Hold out 20% of the articles for evaluation; the fixed seed keeps the split reproducible
df = load_data()
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df.loc[:, 'text'],
    df.loc[:, 'label'],
    test_size=0.2,
    random_state=42,
)

In [5]:
class NewsDataset(Dataset):
    """Torch dataset that tokenizes one news article per item.

    Args:
        texts: Article bodies. Anything ``pandas.Series`` accepts: a Series
            with an arbitrary index, a list, a tuple or an array.
        labels: Integer class ids aligned one-to-one with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')`` whose
            result exposes ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length every sequence is truncated/padded to.

    Raises:
        ValueError: if ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        # Wrapping in pd.Series lets plain lists/arrays through; resetting the
        # index discards whatever shuffled index train_test_split left behind.
        self.texts = pd.Series(texts).reset_index(drop=True)
        self.labels = pd.Series(labels).reset_index(drop=True)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(self.texts)} and {len(self.labels)})"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of articles."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized article at position ``idx`` as a dict of 1-D tensors."""
        # .iloc is positional and raises IndexError past the end, which is what
        # the sequence protocol (and plain ``for item in dataset``) relies on.
        text = str(self.texts.iloc[idx])
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        return {
            # The tokenizer output is flattened so the DataLoader can stack
            # items into (batch, max_length) tensors.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels.iloc[idx], dtype=torch.long)
        }

# Initialize tokenizer and datasets
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_dataset = NewsDataset(train_texts, train_labels, tokenizer)
test_dataset = NewsDataset(test_texts, test_labels, tokenizer)

# Create DataLoaders
# Training: reshuffle every epoch; dropping the ragged final batch keeps the batch size uniform.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, drop_last=True)
# Evaluation: keep every sample. With drop_last=True the final partial batch
# (up to 15 articles) would silently be left out of the reported metrics.
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, drop_last=False)

In [6]:
# Prefer CUDA when it is present, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Pretrained BERT encoder with a fresh two-class head, placed on the selected device
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model = model.to(device)

# Optimizer over all model parameters
optimizer = AdamW(params=model.parameters(), lr=5e-5)

Using device: cpu


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
def train_model(model, train_loader, optimizer, epochs=3, device=None):
    """Fine-tune ``model`` on ``train_loader`` for ``epochs`` passes.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes a scalar ``.loss``.
        train_loader: Iterable of dict batches holding ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer already bound to ``model``'s parameters.
        epochs: Number of full passes over ``train_loader``.
        device: Device the batches are moved to. Defaults to the device of
            the model's first parameter so the batches always follow the model.

    Returns:
        None. The model is updated in place and left in training mode.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}")

        # Count steps ourselves: tqdm only syncs ``progress_bar.n`` when it
        # redraws the bar, so dividing by it gives a wrong running average
        # whenever iterations are faster than the refresh interval.
        for step, batch in enumerate(progress_bar, start=1):
            optimizer.zero_grad()

            # Move batch to the model's device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Passing labels makes the model compute the loss itself
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            progress_bar.set_postfix({'loss': total_loss / step})

# Run fine-tuning with the default number of epochs
train_model(model=model, train_loader=train_loader, optimizer=optimizer)

Epoch 1/3:   9%|▉         | 211/2244 [9:51:18<94:57:13, 168.14s/it, loss=0.0526]   


KeyboardInterrupt: 

In [None]:
from sklearn.metrics import classification_report

def evaluate_model(model, test_loader, device=None):
    """Print a per-class precision/recall/F1 report for ``model`` on ``test_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``.logits`` with one row per sample.
        test_loader: Iterable of dict batches holding ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device the inputs are moved to. Defaults to the device of
            the model's first parameter.

    Returns:
        None. The report is printed; the model is left in eval mode.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            preds = torch.argmax(outputs.logits, dim=1)

            predictions.extend(preds.cpu().tolist())
            # Labels are only compared on the host, so they never need to
            # travel to the accelerator and back.
            true_labels.extend(batch['labels'].tolist())

    # Pinning labels=[0, 1] keeps target_names aligned even when one class is
    # absent from the evaluated data (otherwise sklearn raises on the mismatch).
    print(classification_report(
        true_labels,
        predictions,
        labels=[0, 1],
        target_names=["Real", "Fake"],
    ))

# Report held-out performance of the fine-tuned model
evaluate_model(model=model, test_loader=test_loader)

In [None]:
# Persist the fine-tuned weights and the tokenizer files in separate folders
model.save_pretrained(save_directory="./bert_fake_news_model")
tokenizer.save_pretrained(save_directory="./bert_fake_news_tokenizer")

# To restore both in a later session (example):
# model = BertForSequenceClassification.from_pretrained("./bert_fake_news_model").to(device)
# tokenizer = BertTokenizer.from_pretrained("./bert_fake_news_tokenizer")

In [None]:
def predict(text, model, tokenizer, device=None):
    """Classify a single article as ``"Fake"`` or ``"Real"``.

    Args:
        text: The article text to classify.
        model: Module called as ``model(**inputs)`` whose output exposes
            ``.logits``; class index 1 is read as "Fake".
        tokenizer: Callable returning an encoding that supports ``.to(device)``
            and can be unpacked into the model call.
        device: Device the encoded input is moved to. Defaults to the device
            of the model's first parameter.

    Returns:
        ``"Fake"`` when the highest-scoring class is 1, otherwise ``"Real"``.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    inputs = tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=512,
        return_tensors='pt'
    ).to(device)

    # Inference must run with dropout disabled; remember the caller's mode so
    # a prediction made mid-training does not silently leave the model in eval.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**inputs)
    finally:
        model.train(was_training)

    # Reduce over the class axis only, then take the first (only) sample,
    # instead of an argmax over the flattened logits.
    pred = torch.argmax(outputs.logits, dim=-1).reshape(-1)[0].item()

    return "Fake" if pred == 1 else "Real"

# Smoke test on an obviously sensational headline
sample_text = "Breaking: Scientists discover that chocolate cures all diseases!"
print(predict(text=sample_text, model=model, tokenizer=tokenizer))