In [62]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification,pipeline
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from tqdm import tqdm

In [63]:
import os
# Switch off parallelism in the Hugging Face `tokenizers` backend. This is set
# before any tokenizer is created so the value is in place on first use.
os.environ.update(TOKENIZERS_PARALLELISM='false')

In [64]:
# Load the labelled evaluation set; the path is resolved relative to the
# notebook's working directory.
df_test_set = pd.read_csv(filepath_or_buffer='Test_dataset(FINAL).csv')

In [65]:
# Preview the first five rows to confirm the expected columns were loaded.
df_test_set.head(n=5)

Unnamed: 0,Headline,Source,Question_phi,Question_Mistral,Label
0,NASA’s Perseverance rover finds its first poss...,sciencenews.org,"""Has NASA's Perseverance rover discovered evid...","""Has NASA officially announced the discovery o...",True
1,Sepsis tests take days putting patients at ri...,sciencenews.org,"""What is the current average wait time for sep...","""Is there a recent study or research that show...",True
2,Nasa's DART asteroid unlocks complex history o...,https://timesofindia.indiatimes.com/,"""What is the history of NASA's DART mission an...","""Has NASA's DART mission provided evidence of ...",True
3,Say goodbye to back pain patients go for adva...,https://timesofindia.indiatimes.com/,"""What are the benefits of advanced endoscopy s...","""Has 'advanced endoscopy spine surgery for sci...",True
4,Neurodivergent children more likely to develop...,https://timesofindia.indiatimes.com/,"""What does the study find about the likelihood...","""Is there a peer-reviewed study titled 'Neurod...",True


In [66]:
# Sanity check: every entry of the 'Label' column must be a Python bool,
# because the evaluation below compares labels against boolean predictions.
all_boolean = df_test_set['Label'].apply(lambda value: isinstance(value, bool)).all()

print(
    "All values in df['label'] are boolean (True/False)."
    if all_boolean
    else "Not all values in df['label'] are boolean. There may be other data types present."
)

All values in df['label'] are boolean (True/False).


Hugging Face Link - https://huggingface.co/Arjun24420/BERT-FakeNews-BinaryClassification

In [67]:
# Single checkpoint identifier shared by the tokenizer and the classifier, so
# both are always loaded from the same Hugging Face repository.
MODEL_ID_BERT_BINARY_FAKEREAL = "Arjun24420/BERT-FakeNews-BinaryClassification"

tokenizer_Bert_Binary_FakeReal = AutoTokenizer.from_pretrained(MODEL_ID_BERT_BINARY_FAKEREAL)
model_Bert_Binary_FakeReal = AutoModelForSequenceClassification.from_pretrained(
    MODEL_ID_BERT_BINARY_FAKEREAL)

# Logit index -> human-readable class name; these names become the keys of the
# probability dict returned by the prediction helper.
# NOTE(review): confirm this index order against the checkpoint's own
# `config.id2label` -- a swapped mapping would silently invert every prediction.
class_mapping_Bert_Binary_FakeReal = {1: 'Reliable', 0: 'Unreliable'}

def predict_Bert_Binary_FakeOrReal(input_headline):
    """Score a headline with the fine-tuned BERT fake-news classifier.

    Args:
        input_headline: Headline text to classify. Only row 0 of the resulting
            probability matrix is reported, i.e. the first tokenized sequence.

    Returns:
        dict: Maps each class name in ``class_mapping_Bert_Binary_FakeReal``
        to its softmax probability as a Python float.
    """
    # Local import: the notebook's import cell does not bring torch into scope.
    import torch

    # Tokenize the input text (truncated to at most 512 tokens)
    inputs = tokenizer_Bert_Binary_FakeReal(input_headline, padding=True, truncation=True,
                       max_length=512, return_tensors="pt")

    # Inference only: disable autograd so no computation graph is built and
    # kept alive for every headline that is scored.
    with torch.no_grad():
        # Get model output (logits)
        outputs = model_Bert_Binary_FakeReal(**inputs)

        # Calculate probabilities across the class dimension
        probs = outputs.logits.softmax(1)

    # Get the probabilities for each class
    class_probabilities = {class_mapping_Bert_Binary_FakeReal[i]: probs[0, i].item()
                           for i in range(probs.shape[1])}

    return class_probabilities

In [68]:
# Example usage: score one hand-written headline and display the resulting
# class-probability dict as the cell output.
example_headline = "Paris 2024 Olympics: Leon Marchand fails to achieve any medals in the competition."
predict_Bert_Binary_FakeOrReal(example_headline)

{'Unreliable': 0.5826311707496643, 'Reliable': 0.4173688292503357}

In [69]:
def evaluate_bert_model(df):
    """Run the BERT classifier over every headline and score it against the labels.

    Args:
        df: DataFrame with a 'Headline' column (text passed to
            ``predict_Bert_Binary_FakeOrReal``) and a 'Label' column (ground
            truth; True is compared against a "Reliable" prediction).

    Returns:
        tuple: ``(accuracy, y_true, y_pred)`` where ``accuracy`` is the
        fraction of rows whose prediction equals the label, and ``y_true`` /
        ``y_pred`` are lists of labels in row order. For an empty ``df`` the
        result is ``(0.0, [], [])`` instead of a ZeroDivisionError.
    """
    total_predictions = len(df)
    if total_predictions == 0:
        # Nothing to score; accuracy is reported as 0.0 rather than dividing by zero.
        return 0.0, [], []

    y_true = []
    y_pred = []

    # Iterate only the two columns that are needed instead of materialising a
    # full row Series per record with iterrows().
    labelled_headlines = zip(df['Headline'], df['Label'])
    for headline, true_label in tqdm(labelled_headlines, total=total_predictions,
                                     desc="Evaluating"):
        # Get prediction from BERT model
        bert_proba = predict_Bert_Binary_FakeOrReal(headline)

        # Predicted True only when 'Reliable' is strictly more probable;
        # an exact tie therefore counts as not reliable.
        predicted_label = bert_proba['Reliable'] > bert_proba['Unreliable']

        y_true.append(true_label)
        y_pred.append(predicted_label)

    # Count matches with a plain int so accuracy stays a Python float.
    correct_predictions = sum(1 for truth, guess in zip(y_true, y_pred) if guess == truth)

    accuracy = correct_predictions / total_predictions
    return accuracy, y_true, y_pred

# Evaluate the model on the full test set
accuracy, y_test_set, y_test_set_pred = evaluate_bert_model(df_test_set)

print(f"Model Accuracy: {accuracy:.4f}")

# Calculate additional metrics; the positive class is True, i.e. a headline
# labelled / predicted as reliable.
precision = precision_score(y_test_set, y_test_set_pred)
recall = recall_score(y_test_set, y_test_set_pred)
f1 = f1_score(y_test_set, y_test_set_pred)

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")

# Create and display confusion matrix.
# Label order is pinned so the result is always 2x2 -- rows are true labels,
# columns are predicted labels, both ordered [False, True] -- even when only
# one class happens to appear in the labels or predictions.
conf_matrix = confusion_matrix(y_test_set, y_test_set_pred, labels=[False, True])

print("\nConfusion Matrix:")
print(conf_matrix)

Evaluating: 100%|██████████| 688/688 [00:30<00:00, 22.93it/s]

Model Accuracy: 0.5334
Precision: 0.52
Recall: 0.71
F1-Score: 0.60

Confusion Matrix:
[[124 220]
 [101 243]]



