In [7]:
import pandas as pd
import torch
import os
from transformers import BertForSequenceClassification, BertTokenizer
from safetensors.torch import load_file
from sklearn.metrics import classification_report, accuracy_score
from tqdm import tqdm
import numpy as np

In [8]:
# --- 1. Load Model and Tokenizer ---
model_path = "/Users/fenilvadher/Documents/Collage Data/SEM - 6/AI/AI Project/bert_fake_news_model/model.safetensors"
tokenizer_path = "/Users/fenilvadher/Documents/Collage Data/SEM - 6/AI/AI Project/bert_fake_news_tokenizer"

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2
)
model.load_state_dict(load_file(model_path))
tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
model.eval()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [9]:
# --- 2. Improved Data Loader (Handles All Formats) ---
def load_new_data(data_path):
    """Load CSV or folder-based dataset, auto-converting labels to 0/1"""
    if data_path.endswith('.csv'):
        df = pd.read_csv(data_path)
        texts = df['text'].tolist()
        
        # Convert any label format to 0/1
        if 'label' in df.columns:
            labels = df['label'].apply(lambda x: 0 if str(x).lower() in ['0', 'real', 'true'] else 1).tolist()
        else:
            raise ValueError("CSV must contain 'text' and 'label' columns")
    else:
        # Folder-based loading
        texts, labels = [], []
        label_map = {'real': 0, 'fake': 1}
        for label_name, label_val in label_map.items():
            folder_path = os.path.join(data_path, label_name)
            if os.path.exists(folder_path):
                for file in os.listdir(folder_path):
                    with open(os.path.join(folder_path, file), 'r', encoding='utf-8') as f:
                        texts.append(f.read())
                        labels.append(label_val)
    
    return texts, np.array(labels)  # Ensure labels are numpy array

In [11]:
# --- 3. Batch Prediction ---
def predict_batch(texts, batch_size=8):
    predictions = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Predicting"):
        batch = texts[i:i + batch_size]
        inputs = tokenizer(
            batch,
            return_tensors="pt",
            max_length=512,
            truncation=True,
            padding="max_length"
        )
        with torch.no_grad():
            outputs = model(**inputs)
            batch_preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
            predictions.extend(batch_preds)
    return np.array(predictions)


In [12]:
# --- 4. Main Execution ---
try:
    # Load data (replace with your path)
    new_texts, new_labels = load_new_data("/Users/fenilvadher/Documents/Collage Data/SEM - 6/AI/AI Project/news.csv")  # or folder path
    
    # Predict
    new_predictions = predict_batch(new_texts)
    
    # Evaluate
    print("\nPerformance on New Dataset:")
    print(classification_report(new_labels, new_predictions, 
                              target_names=["Real", "Fake"],
                              digits=4))
    print(f"Accuracy: {accuracy_score(new_labels, new_predictions):.2%}")
    
    # Error analysis
    errors = pd.DataFrame({
        'text': [t[:200] + "..." for t in new_texts],
        'true': new_labels,
        'predicted': new_predictions
    }).query("true != predicted")
    
    if not errors.empty:
        print(f"\nError Analysis ({len(errors)} misclassified):")
        print(errors.sample(min(3, len(errors))[['text', 'true', 'predicted']]))
    else:
        print("\nPerfect accuracy on this dataset!")

except Exception as e:
    print(f"Error: {str(e)}")
    print("Check:")
    print("- Your data path exists")
    print("- CSV has 'text' column")
    print("- Labels are in [0,1,'real','fake','true','false'] format")

Predicting: 100%|██████████| 792/792 [51:46<00:00,  3.92s/it]    


Performance on New Dataset:
              precision    recall  f1-score   support

        Real     0.4588    0.0369    0.0683      3171
        Fake     0.4977    0.9564    0.6547      3164

    accuracy                         0.4961      6335
   macro avg     0.4783    0.4966    0.3615      6335
weighted avg     0.4782    0.4961    0.3612      6335

Accuracy: 49.61%

Error Analysis (3192 misclassified):
Error: 'int' object is not subscriptable
Check:
- Your data path exists
- CSV has 'text' column
- Labels are in [0,1,'real','fake','true','false'] format





In [13]:
# --- 5. Analyze Mistakes ---
mistakes = []
for text, true_label, pred in zip(new_texts[:100], new_labels[:100], new_predictions[:100]):
    if true_label != pred:
        mistakes.append({
            'text': text[:200] + "...",
            'true': "Real" if true_label == 0 else "Fake",
            'predicted': "Real" if pred == 0 else "Fake"
        })

print("\nSample Misclassified Texts:")
for i, mistake in enumerate(mistakes[:3]):
    print(f"\nCase {i+1}:")
    print(f"Text: {mistake['text']}")
    print(f"True: {mistake['true']}, Predicted: {mistake['predicted']}")


Sample Misclassified Texts:

Case 1:
Text: U.S. Secretary of State John F. Kerry said Monday that he will stop in Paris later this week, amid criticism that no top American officials attended Sunday’s unity march against terrorism.

Kerry said...
True: Real, Predicted: Fake

Case 2:
Text: It's primary day in New York and front-runners Hillary Clinton and Donald Trump are leading in the polls.

Trump is now vowing to win enough delegates to clinch the Republican nomination and prevent a...
True: Real, Predicted: Fake

Case 3:
Text: A Czech stockbroker who saved more than 650 Jewish children from Nazi Germany has died at the age of 106. Dubbed “Britain’s Schindler,” Nicholas Winton arranged to transport Jewish youngsters from Pra...
True: Real, Predicted: Fake
