In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
# Download required NLTK datasets
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/ameet/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/ameet/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ameet/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Function to clean and preprocess text data
def clean_words(new_tokens):
    new_tokens = [t.lower() for t in new_tokens]
    stop_words = set(stopwords.words('english'))
    new_tokens = [t for t in new_tokens if t not in stop_words]
    new_tokens = [t for t in new_tokens if t.isalpha()]
    lemmatizer = WordNetLemmatizer()
    new_tokens = [lemmatizer.lemmatize(t) for t in new_tokens]
    return new_tokens

In [4]:
df = pd.read_csv('../Liar.csv')  # Read the Liar Dataset CSV 

In [5]:
# Create a custom mapping for the labels for binary classification
label_mapping = {
    'true': True,
    'mostly-true': True,
    'half-true': True,
    'barely-true': False,
    'false': False,
    'pants-fire': False
}

df['label'] = df['label'].map(label_mapping)


In [6]:
df['label'].value_counts()

label
True     7133
False    5655
Name: count, dtype: int64

In [7]:
# Check if all values in df['label'] are boolean
all_boolean = df['label'].apply(lambda x: isinstance(x, bool)).all()

if all_boolean:
    print("All values in df['label'] are boolean (True/False).")
else:
    print("Not all values in df['label'] are boolean. There may be other data types present.")

All values in df['label'] are boolean (True/False).


In [8]:
df_test_set = pd.read_csv('../Test_dataset(FINAL).csv')

In [9]:
# Check if all values in df_test_set['Label'] are also boolean
all_boolean = df_test_set['Label'].apply(lambda x: isinstance(x, bool)).all()

if all_boolean:
    print("All values in df['label'] are boolean (True/False).")
else:
    print("Not all values in df['label'] are boolean. There may be other data types present.")

All values in df['label'] are boolean (True/False).


In [12]:
# Step 1: Clean and vectorize the entire training dataset
X = [' '.join(clean_words(word_tokenize(text))) for text in df['statement']]
y = df['label']

vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
X_vectorized = vectorizer.fit_transform(X)

# Step 2: Train the model on the entire dataset
clf = LogisticRegression(max_iter=1000)
clf.fit(X_vectorized, y)

# Step 3: Clean and vectorize the test set (df_test_set)
df_test_set['cleaned_headline'] = df_test_set['Headline'].apply(lambda x: ' '.join(clean_words(word_tokenize(x))))
X_test_set_vectorized = vectorizer.transform(df_test_set['cleaned_headline'])

# Step 4: Predict the labels for the test set and evaluate the model
y_test_set = df_test_set['Label']
y_test_set_pred = clf.predict(X_test_set_vectorized)

# Step 5: Print classification metrics separately
accuracy = accuracy_score(y_test_set, y_test_set_pred)
print(f"LogisticRegression Model Accuracy on Test Set: {accuracy * 100:.2f}%")

precision = precision_score(y_test_set, y_test_set_pred)
recall = recall_score(y_test_set, y_test_set_pred)
f1 = f1_score(y_test_set, y_test_set_pred)

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")

conf_matrix = confusion_matrix(y_test_set, y_test_set_pred)

print("\nConfusion Matrix:")
print(conf_matrix)


LogisticRegression Model Accuracy on Test Set: 50.00%
Precision: 0.50
Recall: 0.59
F1-Score: 0.54

Confusion Matrix:
[[142 202]
 [142 202]]


In [11]:
def predict_headline(headline, vectorizer, model):
    # Clean and preprocess the headline
    cleaned_headline = ' '.join(clean_words(word_tokenize(headline)))
    headline_vectorized = vectorizer.transform([cleaned_headline])
    prediction = model.predict(headline_vectorized)[0]
    probability = model.predict_proba(headline_vectorized)[0]
    result = "True" if prediction else "False"
    confidence = probability[1] if prediction else probability[0]
    return result, confidence

# Example usage
headline = "Angela Carini triumphs in Olympic bout, defeating Imane Khelif in a record 46 seconds."
result, confidence = predict_headline(headline, vectorizer, clf)
print(f"Prediction for the headline: {result}")
print(f"Confidence: {confidence:.2f}")

Prediction for the headline: True
Confidence: 0.54
