In [8]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [9]:
# Download required NLTK datasets
# - punkt / punkt_tab: sentence/word tokenizer models used by word_tokenize.
#   Recent NLTK releases (3.9+) load the tokenizer from 'punkt_tab'; older
#   releases use 'punkt'. Requesting both keeps the notebook runnable on either
#   (an unknown package id only logs an error and returns False).
# - stopwords: English stop-word list used by clean_words.
# - wordnet: lexical database used by WordNetLemmatizer.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/ameet/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/ameet/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ameet/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
# Function to clean and preprocess text data
_CLEAN_WORDS_RESOURCES = {}


def _get_clean_words_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, creating it on first use."""
    if not _CLEAN_WORDS_RESOURCES:
        # Build both objects before storing them so a failed load leaves the
        # cache empty and the next call retries.
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        _CLEAN_WORDS_RESOURCES.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _CLEAN_WORDS_RESOURCES['stop_words'], _CLEAN_WORDS_RESOURCES['lemmatizer']


def clean_words(new_tokens):
    """Normalize a sequence of tokens for vectorization.

    Each token is lower-cased, dropped if it is an English stop word or is not
    purely alphabetic, and otherwise lemmatized with WordNetLemmatizer.

    Args:
        new_tokens: Iterable of string tokens (e.g. the output of word_tokenize).

    Returns:
        list[str]: The surviving tokens, lemmatized, in their original order.
    """
    # The stop-word set and lemmatizer are identical for every call, so they
    # are built once instead of once per statement/headline.
    stop_words, lemmatizer = _get_clean_words_resources()

    cleaned = []
    for token in new_tokens:
        token = token.lower()
        # Stop-word test runs on the lower-cased token, as does isalpha().
        if token in stop_words or not token.isalpha():
            continue
        cleaned.append(lemmatizer.lemmatize(token))
    return cleaned

In [11]:
# Load the training data; later cells read its 'statement' and 'label' columns.
df = pd.read_csv('../Liar.csv')  # Read the Liar Dataset CSV

In [12]:
# Create a custom mapping for the labels for binary classification:
# the six string labels are collapsed into True (true / mostly-true / half-true)
# and False (barely-true / false / pants-fire).
label_mapping = {
    'true': True,
    'mostly-true': True,
    'half-true': True,
    'barely-true': False,
    'false': False,
    'pants-fire': False
}

# Map only while the column still holds the raw string labels. Series.map turns
# every value that is not a key of the dict into NaN, so re-running this cell on
# an already-converted (boolean) column would silently wipe out all labels.
if df['label'].dtype != bool:
    mapped_labels = df['label'].map(label_mapping)

    # Fail fast on labels the mapping does not know (including missing values)
    # instead of letting NaN flow into model training.
    unmapped_values = df.loc[mapped_labels.isna(), 'label'].unique()
    if len(unmapped_values) > 0:
        raise ValueError(
            f"df['label'] contains values with no binary mapping: {list(unmapped_values)}"
        )

    df['label'] = mapped_labels.astype(bool)


In [13]:
# Show how many rows fall into each of the two classes after the label mapping
df['label'].value_counts()

label
True     7133
False    5655
Name: count, dtype: int64

In [14]:
# Verify that every entry of df['label'] is a Python bool
all_boolean = df['label'].apply(lambda value: isinstance(value, bool)).all()

# Report the outcome with a single print call
print(
    "All values in df['label'] are boolean (True/False)."
    if all_boolean
    else "Not all values in df['label'] are boolean. There may be other data types present."
)


All values in df['label'] are boolean (True/False).


In [15]:
# Load the headline test set used to evaluate the model;
# later cells read its 'Headline' and 'Label' columns.
df_test_set = pd.read_csv('../Test_dataset(FINAL).csv')

In [16]:
# Check if all values in df_test_set['Label'] are also boolean, so they are
# directly comparable with the boolean predictions of the model
all_boolean = df_test_set['Label'].apply(lambda x: isinstance(x, bool)).all()

# The messages name the column that was actually checked (df_test_set['Label'])
if all_boolean:
    print("All values in df_test_set['Label'] are boolean (True/False).")
else:
    print("Not all values in df_test_set['Label'] are boolean. There may be other data types present.")

All values in df['label'] are boolean (True/False).


In [17]:
# --- Training data: tokenize, clean and re-join every statement ---
X = []
for text in df['statement']:
    X.append(' '.join(clean_words(word_tokenize(text))))
y = df['label']

# --- TF-IDF features (terms present in more than 70% of documents are ignored) ---
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
X_vectorized = vectorizer.fit_transform(X)

# --- Fit the Naive Bayes classifier (fit returns the estimator itself) ---
clf = MultinomialNB().fit(X_vectorized, y)

# --- Test data: same preprocessing, then reuse the already-fitted vocabulary ---
df_test_set['cleaned_headline'] = df_test_set['Headline'].map(
    lambda headline: ' '.join(clean_words(word_tokenize(headline)))
)
X_test_set_vectorized = vectorizer.transform(df_test_set['cleaned_headline'])

# --- Predict and store the model decisions in the test DataFrame ---
y_test_set = df_test_set['Label']
y_test_set_pred = clf.predict(X_test_set_vectorized)
df_test_set['MultinomialNB Model Decision'] = y_test_set_pred

# --- Metrics ---
accuracy = accuracy_score(y_test_set, y_test_set_pred)
print(f"MultinomialNB Model Accuracy on Test Set: {accuracy * 100:.2f}%")

precision, recall, f1 = (
    metric(y_test_set, y_test_set_pred)
    for metric in (precision_score, recall_score, f1_score)
)

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")

# Rows are the actual classes, columns the predicted classes
conf_matrix = confusion_matrix(y_test_set, y_test_set_pred)

print("\nConfusion Matrix:")
print(conf_matrix)


MultinomialNB Model Accuracy on Test Set: 51.02%
Precision: 0.51
Recall: 0.64
F1-Score: 0.57

Confusion Matrix:
[[131 213]
 [124 220]]


In [18]:
# Load the evaluation table holding the per-headline decisions of the models
# evaluated so far (the preview shows LinearSVC and LogisticRegression columns).
# index_col=None: no CSV column is used as the DataFrame index.
df_eval_results = pd.read_csv('Classical_ml_EVAL.csv', index_col=None)
# Preview the first rows
df_eval_results.head()

Unnamed: 0,Headline,cleaned_headline,Source,Question_phi,Question_Mistral,Label,LinearSVC Model Decision,LogisticRegression Model Decision
0,NASA’s Perseverance rover finds its first poss...,nasa perseverance rover find first possible hi...,sciencenews.org,"""Has NASA's Perseverance rover discovered evid...","""Has NASA officially announced the discovery o...",True,True,True
1,Sepsis tests take days putting patients at ri...,sepsis test take day putting patient risk new ...,sciencenews.org,"""What is the current average wait time for sep...","""Is there a recent study or research that show...",True,True,True
2,Nasa's DART asteroid unlocks complex history o...,nasa dart asteroid unlocks complex history twi...,https://timesofindia.indiatimes.com/,"""What is the history of NASA's DART mission an...","""Has NASA's DART mission provided evidence of ...",True,False,True
3,Say goodbye to back pain patients go for adva...,say goodbye back pain patient go advanced endo...,https://timesofindia.indiatimes.com/,"""What are the benefits of advanced endoscopy s...","""Has 'advanced endoscopy spine surgery for sci...",True,False,False
4,Neurodivergent children more likely to develop...,neurodivergent child likely develop chronic fa...,https://timesofindia.indiatimes.com/,"""What does the study find about the likelihood...","""Is there a peer-reviewed study titled 'Neurod...",True,False,True


In [19]:
# Keep only the join key and this model's predictions from the test frame
decision_column = 'MultinomialNB Model Decision'
df_test_set = df_test_set[['Headline', decision_column]]

# Classical_ml_EVAL.csv is overwritten with the merged table further down, so on
# a later run (or when this cell is re-executed) df_eval_results may already
# contain the decision column. Drop any stale copy first; otherwise the merge
# would produce '<name>_x' / '<name>_y' columns instead of refreshing the values.
# NOTE(review): duplicate headlines in df_test_set would multiply rows in this
# left join - confirm that 'Headline' is unique there.
df_eval_results = pd.merge(
    df_eval_results.drop(columns=[decision_column], errors='ignore'),
    df_test_set,
    on='Headline',
    how='left',
)  # Left join to add to results

In [20]:
# Preview the first two rows to check the merged MultinomialNB decision column
df_eval_results.head(2)

Unnamed: 0,Headline,cleaned_headline,Source,Question_phi,Question_Mistral,Label,LinearSVC Model Decision,LogisticRegression Model Decision,MultinomialNB Model Decision
0,NASA’s Perseverance rover finds its first poss...,nasa perseverance rover find first possible hi...,sciencenews.org,"""Has NASA's Perseverance rover discovered evid...","""Has NASA officially announced the discovery o...",True,True,True,False
1,Sepsis tests take days putting patients at ri...,sepsis test take day putting patient risk new ...,sciencenews.org,"""What is the current average wait time for sep...","""Is there a recent study or research that show...",True,True,True,True


In [21]:
# Persist the table, including the MultinomialNB decisions, without writing the index.
# NOTE: this overwrites the same file that df_eval_results was read from, so a
# later run of the notebook starts from the already-augmented CSV.
df_eval_results.to_csv("Classical_ml_EVAL.csv", index=False)

In [22]:
def predict_headline(headline, vectorizer, model):
    """Classify a single headline and return the verdict as a string.

    The headline is tokenized with word_tokenize, cleaned with clean_words,
    re-joined with spaces and transformed by the given vectorizer before the
    model predicts on it.

    Args:
        headline: Raw headline text.
        vectorizer: Fitted vectorizer exposing ``transform``.
        model: Fitted classifier exposing ``predict``.

    Returns:
        str: "True" if the first prediction is truthy, otherwise "False".
    """
    tokens = clean_words(word_tokenize(headline))
    # transform expects an iterable of documents, hence the one-element list
    features = vectorizer.transform([' '.join(tokens)])
    is_true = model.predict(features)[0]
    if is_true:
        return "True"
    return "False"

# Example usage: classify one ad-hoc headline with the fitted TF-IDF vectorizer
# and MultinomialNB classifier, then print the verdict
headline = "Paris 2024 Olympics: Leon Marchand fails to achieve any medals in the competition."
result = predict_headline(headline, vectorizer, clf)
print(f"Prediction for the headline: {result}")

Prediction for the headline: True
