In [2]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import re
import nltk
from nltk.corpus import stopwords

# Ask NLTK for each resource in turn; NLTK reports packages that are
# already present as up-to-date instead of fetching them again.
for _resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(_resource)

def load_data(file_path):
    """Read a header-less CSV and name its fourteen columns.

    Args:
        file_path: Location of the CSV, passed straight to ``pandas.read_csv``.

    Returns:
        A DataFrame whose columns are, in order: id, label, statement,
        subject, speaker, job, state, party, the five ``*_counts`` columns
        and context.

    Raises:
        ValueError: Raised by pandas on the column assignment when the file
            does not contain exactly fourteen columns.
    """
    column_names = [
        'id', 'label', 'statement', 'subject', 'speaker', 'job', 'state',
        'party', 'barely_true_counts', 'false_counts', 'half_true_counts',
        'mostly_true_counts', 'pants_fire_counts', 'context',
    ]

    # The file carries no header row, so every line is data.
    frame = pd.read_csv(file_path, header=None)
    frame.columns = column_names
    return frame


def preprocess_text(text):
    """Lowercase a statement and keep only ASCII letters and whitespace.

    Args:
        text: The raw statement. Anything that is not a ``str`` (for
            example a missing value) is treated as empty.

    Returns:
        The lowercased text with every character other than ``a-z`` /
        whitespace deleted, or ``""`` for non-string input.
    """
    if isinstance(text, str):
        # Characters are deleted, not replaced by a space, so "it's"
        # becomes "its" and "user-friendly" becomes "userfriendly".
        return re.sub(r'[^a-zA-Z\s]', '', text.lower())
    return ""

def binarize_labels(label):
    """Collapse a multi-class truthfulness label into a binary target.

    Args:
        label: One of the dataset's label strings.

    Returns:
        1 (fake) when the label is 'pants-fire', 'false' or 'barely-true';
        0 (not fake) for every other value.
    """
    fake_labels = ('pants-fire', 'false', 'barely-true')
    return 1 if label in fake_labels else 0

def prepare_dataset(df):
    """Add the cleaned-text and binary-label columns to ``df``.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame with at least 'statement' and 'label' columns.

    Returns:
        The same DataFrame, now also holding 'processed_statement'
        (cleaned statement text) and 'binary_label' (1 = fake, 0 = not fake).
    """
    # (new column, source column, per-value transform), applied in this order
    derived_columns = (
        ('processed_statement', 'statement', preprocess_text),
        ('binary_label', 'label', binarize_labels),
    )
    for target, source, transform in derived_columns:
        df[target] = df[source].apply(transform)

    return df

# Torch dataset that tokenizes statements on demand for BERT
class NewsDataset(Dataset):
    """Pairs statement texts with labels and tokenizes one item per lookup.

    Args:
        texts: Indexable collection of statements.
        labels: Indexable collection of integer labels, aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus`` (e.g. a BERT tokenizer).
        max_len: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of statements."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the text, token ids, attention mask and label for ``idx``."""
        sample_text = str(self.texts[idx])
        sample_label = self.labels[idx]

        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(sample_text, **tokenizer_options)

        item = {'text': sample_text}
        # The tokenizer output is flattened to 1-D here, one vector per sample.
        for key in ('input_ids', 'attention_mask'):
            item[key] = encoded[key].flatten()
        item['label'] = torch.tensor(sample_label, dtype=torch.long)
        return item

def train_model(model, data_loader, optimizer, scheduler, device):
    """Run one pass over ``data_loader`` and return the mean batch loss.

    Args:
        model: Callable model accepting ``input_ids``, ``attention_mask`` and
            ``labels`` keyword arguments and returning an object with ``.loss``.
        data_loader: Sized iterable of batches; each batch is a mapping with
            'input_ids', 'attention_mask' and 'label' tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of the per-batch losses divided by ``len(data_loader)``.
    """
    model.train()
    batch_losses = []

    for batch in data_loader:
        optimizer.zero_grad()

        model_inputs = {
            'input_ids': batch['input_ids'].to(device),
            'attention_mask': batch['attention_mask'].to(device),
            'labels': batch['label'].to(device),
        }
        loss = model(**model_inputs).loss
        batch_losses.append(loss.item())

        loss.backward()
        # Cap the global gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

    return sum(batch_losses) / len(data_loader)

def evaluate_model(model, data_loader, device):
    """Score ``model`` on every batch of ``data_loader``.

    Args:
        model: Callable model accepting ``input_ids`` and ``attention_mask``
            keyword arguments and returning an object with ``.logits``.
        data_loader: Iterable of batches; each batch is a mapping with
            'input_ids', 'attention_mask' and 'label' tensors.
        device: Device the batch tensors are moved to.

    Returns:
        A ``(accuracy, report)`` pair: the accuracy score and the text
        classification report computed over all batches.
    """
    model.eval()
    predicted, expected = [], []

    with torch.no_grad():
        for batch in data_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            gold = batch['label'].to(device)

            logits = model(input_ids=ids, attention_mask=mask).logits
            # Predicted class = position of the largest logit in each row.
            batch_preds = torch.max(logits, dim=1).indices

            predicted += batch_preds.cpu().tolist()
            expected += gold.cpu().tolist()

    accuracy = accuracy_score(expected, predicted)
    report = classification_report(expected, predicted)
    return accuracy, report

def predict_fake_news(statement, model, tokenizer, device):
    """Classify a single statement as "Fake" or "Not Fake".

    Args:
        statement: Raw statement text; it is cleaned with
            ``preprocess_text`` before tokenization.
        model: Callable model accepting ``input_ids`` and ``attention_mask``
            keyword arguments and returning an object with ``.logits``.
        tokenizer: Object exposing ``encode_plus``.
        device: Device the encoded tensors are moved to.

    Returns:
        ``(result, confidence)`` where ``result`` is "Fake" when class 1 has
        the largest logit and "Not Fake" otherwise, and ``confidence`` is the
        softmax probability of the predicted class.
    """
    model.eval()

    cleaned = preprocess_text(statement)
    encoded = tokenizer.encode_plus(
        cleaned,
        add_special_tokens=True,
        max_length=128,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    with torch.no_grad():
        outputs = model(
            input_ids=encoded['input_ids'].to(device),
            attention_mask=encoded['attention_mask'].to(device),
        )
        class_index = torch.max(outputs.logits, dim=1).indices
        class_probs = torch.nn.functional.softmax(outputs.logits, dim=1)

    winner = class_index.item()
    confidence = class_probs[0][winner].item()

    if winner == 1:
        return "Fake", confidence
    return "Not Fake", confidence

def main(file_path, test_statement=None, *, num_epochs=3, batch_size=16):
    """Fine-tune ``bert-base-uncased`` as a binary fake / not-fake classifier.

    Loads the CSV at ``file_path``, holds out 20% of the rows
    (``random_state=42``), fine-tunes for ``num_epochs`` epochs and prints the
    training loss plus held-out accuracy and classification report after
    every epoch. When ``test_statement`` is given, its prediction is printed
    once training is done.

    Args:
        file_path: Path of the header-less CSV understood by ``load_data``.
        test_statement: Optional statement to classify after training.
        num_epochs: Number of passes over the training split (default 3).
        batch_size: Batch size of both data loaders (default 16).

    Returns:
        ``(model, tokenizer)``: the fine-tuned model and its tokenizer.

    Raises:
        ValueError: If ``num_epochs`` or ``batch_size`` is smaller than 1.
    """
    if num_epochs < 1:
        raise ValueError(f"num_epochs must be at least 1, got {num_epochs}")
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    print("Loading and preparing data...")
    df = load_data(file_path)
    df = prepare_dataset(df)

    print(f"Dataset shape: {df.shape}")
    print(f"Label distribution:\n{df['label'].value_counts()}")
    print(f"Binary label distribution:\n{df['binary_label'].value_counts()}")

    # Split data
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        df['processed_statement'],
        df['binary_label'],
        test_size=0.2,
        random_state=42
    )

    # Load the BERT tokenizer and model
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
    model.to(device)

    # Create datasets and dataloaders
    train_dataset = NewsDataset(train_texts.values, train_labels.values, tokenizer)
    test_dataset = NewsDataset(test_texts.values, test_labels.values, tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Set up optimizer and scheduler.
    # torch.optim.AdamW is referenced explicitly: the bare name ``AdamW`` is
    # bound to the deprecated transformers optimizer by this file's imports
    # and is rebound to torch's class by a later cell, which rejects the
    # ``correct_bias`` keyword. weight_decay=0.0 matches the default of the
    # transformers optimizer that was used before.
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.0)
    # The scheduler is stepped once per batch, so its horizon is
    # batches-per-epoch times the number of epochs.
    total_steps = len(train_loader) * num_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

    # Train the model
    print("\nTraining BERT model...")
    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        train_loss = train_model(model, train_loader, optimizer, scheduler, device)
        print(f"Training loss: {train_loss}")

        accuracy, report = evaluate_model(model, test_loader, device)
        print(f"Accuracy: {accuracy}")
        print(f"Classification Report:\n{report}")

    if test_statement:
        result, confidence = predict_fake_news(test_statement, model, tokenizer, device)
        print(f"\nStatement: {test_statement}")
        print(f"Prediction: {result} (Confidence: {confidence:.2f})")

    return model, tokenizer

if __name__ == "__main__":
    file_path = "valid.csv"
    test_statement = "The United States has the highest tax rate in the world."

    # main() prints the statement and its prediction itself when it is
    # handed a test statement, so no separate prediction call is needed here.
    model, tokenizer = main(file_path, test_statement)

    # Resolve the device once instead of rebuilding it for every prediction.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    print("\nEnter statements to check (type 'exit' to quit):")
    while True:
        try:
            user_input = input("> ").strip()
        except (EOFError, KeyboardInterrupt):
            # End of input or an interrupt ends the session without a traceback.
            break

        # Whitespace is stripped above so " exit " also quits.
        if user_input.lower() == 'exit':
            break
        if not user_input:
            # Nothing to classify on an empty line; prompt again.
            continue

        result, confidence = predict_fake_news(user_input, model, tokenizer, device)
        print(f"Prediction: {result} (Confidence: {confidence:.2f})")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dkmr0\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dkmr0\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dkmr0\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Loading and preparing data...
Dataset shape: (1284, 16)
Label distribution:
label
false          263
mostly-true    251
half-true      248
barely-true    237
true           169
pants-fire     116
Name: count, dtype: int64
Binary label distribution:
binary_label
0    668
1    616
Name: count, dtype: int64


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training BERT model...
Epoch 1/3
Training loss: 0.6971156963935266
Accuracy: 0.5719844357976653
Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.66      0.61       130
           1       0.58      0.48      0.53       127

    accuracy                           0.57       257
   macro avg       0.57      0.57      0.57       257
weighted avg       0.57      0.57      0.57       257

Epoch 2/3
Training loss: 0.5373813069783724
Accuracy: 0.5914396887159533
Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.74      0.65       130
           1       0.62      0.44      0.52       127

    accuracy                           0.59       257
   macro avg       0.60      0.59      0.58       257
weighted avg       0.60      0.59      0.58       257

Epoch 3/3
Training loss: 0.2819602919312624
Accuracy: 0.5875486381322957
Classification Report:
              precision    recall 

>  This new miracle supplement cured my diabetes in just three days! I lost 20 pounds without changing my diet or exercising. Every doctor should be prescribing this!


Prediction: Fake (Confidence: 0.77)


>  I started taking this supplement as recommended by my doctor. Over the past few weeks, I’ve noticed a slight improvement in my energy levels. It’s not a miracle, but it seems to be helping when combined with proper diet and exercise.


Prediction: Not Fake (Confidence: 0.82)


>  This phone lasts for 10 days on a single 5-minute charge and has better features than any iPhone or Samsung ever made! It's basically free because it pays for itself in rewards!


Prediction: Not Fake (Confidence: 0.90)


>   Over-the-top claims, unrealistic battery life, and vague buzzwords are typical red flags.


Prediction: Not Fake (Confidence: 0.90)


>  The battery life on this phone is decent—I can usually get through a full day with moderate use. The camera performs well in good lighting, and the interface is user-friendly.


Prediction: Not Fake (Confidence: 0.92)


>  This product is absolutely amazing! It saved my life! I can't believe how much I love it.


Prediction: Fake (Confidence: 0.59)


>  exit


In [4]:
from torch.optim.adamw import AdamW