In [1]:
# Import necessary libraries
from functools import lru_cache

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
# Download the NLTK resources this notebook relies on:
#   punkt / punkt_tab - tokenizer models for word_tokenize (NLTK 3.9 and later
#                       load 'punkt_tab'; older releases load 'punkt')
#   stopwords         - English stop-word list used by clean_words
#   wordnet           - lexical database behind WordNetLemmatizer
NLTK_RESOURCES = ('punkt', 'punkt_tab', 'stopwords', 'wordnet')

# nltk.download returns True/False per resource; display the statuses so a
# failed download is visible instead of only the last call's result.
nltk_download_status = {name: nltk.download(name) for name in NLTK_RESOURCES}
nltk_download_status

[nltk_data] Downloading package punkt to /Users/ameet/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/ameet/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ameet/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
@lru_cache(maxsize=None)
def _english_stop_words():
    """Return the NLTK English stop-word list as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


@lru_cache(maxsize=None)
def _shared_lemmatizer():
    """Return one WordNetLemmatizer instance that is reused across calls."""
    return WordNetLemmatizer()


def clean_words(new_tokens):
    """Normalize a sequence of string tokens for text vectorization.

    Each token is lower-cased; English stop words and tokens that are not
    purely alphabetic (``str.isalpha``) are dropped; the remaining tokens are
    lemmatized with ``WordNetLemmatizer``.

    Args:
        new_tokens: Iterable of string tokens (e.g. the output of
            ``word_tokenize``).

    Returns:
        A new list with the cleaned, lemmatized tokens in their original order.
    """
    # The stop-word set and the lemmatizer do not depend on the input, so they
    # are fetched from cached helpers instead of being rebuilt on every call.
    stop_words = _english_stop_words()
    lemmatize = _shared_lemmatizer().lemmatize

    lowered = (token.lower() for token in new_tokens)
    return [
        lemmatize(token)
        for token in lowered
        if token not in stop_words and token.isalpha()
    ]

In [4]:
# Read the LIAR dataset CSV (path is relative to the current working directory).
liar_csv_path = '../Liar.csv'
df = pd.read_csv(liar_csv_path)

In [5]:
# Collapse the six truthfulness ratings into a binary target for
# classification: three ratings map to True and three map to False.
label_mapping = {
    'true': True,
    'mostly-true': True,
    'half-true': True,
    'barely-true': False,
    'false': False,
    'pants-fire': False
}

# Only map while the column still holds raw ratings. True/False are not keys of
# label_mapping, so mapping an already-converted column would turn every value
# into NaN; this guard keeps the cell safe to re-run.
if not df['label'].map(lambda value: isinstance(value, bool)).all():
    binary_labels = df['label'].map(label_mapping)
    unmapped = df.loc[binary_labels.isna(), 'label']
    if not unmapped.empty:
        # Fail loudly instead of silently carrying NaN labels into training.
        raise ValueError(
            f"Unmapped values in df['label']: {unmapped.unique().tolist()}"
        )
    df['label'] = binary_labels.astype(bool)


In [6]:
# Show how many rows fall into each class of the binary label.
label_counts = df['label'].value_counts()
label_counts

label
True     7133
False    5655
Name: count, dtype: int64

In [7]:
# Verify that every entry of df['label'] is a Python bool and report the result.
def _is_python_bool(value):
    """Return True when ``value`` is an instance of ``bool``."""
    return isinstance(value, bool)


all_boolean = df['label'].apply(_is_python_bool).all()

print(
    "All values in df['label'] are boolean (True/False)."
    if all_boolean
    else "Not all values in df['label'] are boolean. There may be other data types present."
)


All values in df['label'] are boolean (True/False).


In [8]:
# Read the separate test dataset CSV (path is relative to the current working directory).
test_csv_path = '../Test_dataset(FINAL).csv'
df_test_set = pd.read_csv(test_csv_path)

In [9]:
# Check that every value in df_test_set['Label'] is a Python bool as well.
all_boolean = df_test_set['Label'].apply(lambda x: isinstance(x, bool)).all()

# The messages name the column that was actually inspected (the test set's
# 'Label' column), not the training frame's 'label' column.
if all_boolean:
    print("All values in df_test_set['Label'] are boolean (True/False).")
else:
    print("Not all values in df_test_set['Label'] are boolean. There may be other data types present.")

All values in df['label'] are boolean (True/False).


In [10]:
# --- Feature preparation -----------------------------------------------------
def _normalise_statement(statement):
    """Tokenize one statement, clean its tokens and re-join them with spaces."""
    return ' '.join(clean_words(word_tokenize(statement)))


X = list(map(_normalise_statement, df['statement']))
y = df['label']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# --- Vectorization -----------------------------------------------------------
# The vocabulary and IDF weights are fitted on the training split only and then
# applied unchanged to the held-out split.
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# --- Model -------------------------------------------------------------------
clf = MultinomialNB().fit(X_train_vectorized, y_train)
y_test_pred = clf.predict(X_test_vectorized)

# --- Evaluation on the held-out split ----------------------------------------
accuracy = accuracy_score(y_test, y_test_pred)
print(f"MultinomialNB Model Accuracy on Test Set: {accuracy * 100:.2f}%")

precision, recall, f1 = (
    metric(y_test, y_test_pred)
    for metric in (precision_score, recall_score, f1_score)
)
print(
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1-Score: {f1:.2f}",
    sep="\n",
)

conf_matrix = confusion_matrix(y_test, y_test_pred)
print("\nConfusion Matrix:", conf_matrix, sep="\n")

MultinomialNB Model Accuracy on Test Set: 60.54%
Precision: 0.61
Recall: 0.83
F1-Score: 0.70

Confusion Matrix:
[[ 520 1133]
 [ 381 1803]]
