# **Fake News Detection**

## Step 1: Import necessary libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import urllib.request
import zipfile
import os

## Step 2: Download NLTK resources

In [None]:
# Fetch every NLTK resource the preprocessing step relies on:
# tokenizer models, the stop-word lists and the WordNet data.
for _resource in ('punkt', 'stopwords', 'punkt_tab', 'wordnet'):
    nltk.download(_resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Step 3: Download and extract the LIAR dataset

In [None]:
def download_liar_dataset(
    url="https://www.cs.ucsb.edu/~william/data/liar_dataset.zip",
    dataset_path=r"C:\Users\varav\Documents\Project\Fake News Detection\liar_dataset.zip",
    extract_dir="liar_dataset",
):
    """Download the LIAR archive if it is missing and make sure it is extracted.

    Args:
        url: Location of the zip archive to download.
        dataset_path: Local file path where the archive is stored.
            NOTE(review): the default is a machine-specific Windows path kept
            only for backward compatibility; pass a portable path if needed.
        extract_dir: Directory the archive is extracted into.

    The download and the extraction are checked independently, so an archive
    that is already on disk is still extracted when the split files are
    missing from ``extract_dir``.
    """
    expected_files = ("train.tsv", "valid.tsv", "test.tsv")

    if not os.path.exists(dataset_path):
        print("Downloading LIAR dataset...")
        parent_dir = os.path.dirname(dataset_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        # Download under a temporary name and rename afterwards, so an
        # interrupted transfer is never mistaken for a complete archive.
        partial_path = dataset_path + ".part"
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, dataset_path)
    else:
        print("LIAR dataset already downloaded.")

    already_extracted = all(
        os.path.isfile(os.path.join(extract_dir, name)) for name in expected_files
    )
    if not already_extracted:
        with zipfile.ZipFile(dataset_path, 'r') as zip_ref:
            zip_ref.extractall(extract_dir)
        print("Dataset extracted.")

## Step 4: Load and preprocess the dataset

In [None]:
def load_data(data_dir="liar_dataset"):
    """Load the LIAR splits and attach a binary truthfulness label.

    Args:
        data_dir: Directory containing ``train.tsv``, ``valid.tsv`` and
            ``test.tsv`` (tab-separated, no header row).

    Returns:
        A ``(train, test)`` tuple of DataFrames, each with the columns
        ``statement`` and ``binary_label``. The train frame is the
        concatenation of the train and validation splits. ``binary_label`` is
        1 for the labels 'true', 'mostly-true' and 'half-true', and 0 for
        every other label.

    Both frames are independent copies, so callers may add columns to them
    without triggering pandas chained-assignment warnings.
    """
    columns = ['id', 'label', 'statement', 'subject', 'speaker', 'job', 'state', 'party',
               'barely_true', 'false', 'half_true', 'mostly_true', 'pants_fire', 'context']
    truthful_labels = ['true', 'mostly-true', 'half-true']
    wanted = ['statement', 'binary_label']

    def read_split(filename):
        # Read one split, name its columns and derive the binary label.
        frame = pd.read_csv(os.path.join(data_dir, filename), sep='\t', header=None)
        frame.columns = columns
        frame['binary_label'] = frame['label'].isin(truthful_labels).astype('int64')
        return frame

    train_df = read_split("train.tsv")
    valid_df = read_split("valid.tsv")
    test_df = read_split("test.tsv")

    df = pd.concat([train_df, valid_df], ignore_index=True)

    return df[wanted].copy(), test_df[wanted].copy()

## Step 5: Text preprocessing

In [None]:
# Compiled once: matches every character that is neither a letter nor whitespace.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')

# Lazily filled cache so the stop-word set and the lemmatizer are built once
# instead of on every call (this function runs once per statement).
_TEXT_TOOLS = {}


def _get_text_tools():
    """Return the cached (stop-word set, lemmatizer) pair, creating it on first use."""
    if not _TEXT_TOOLS:
        _TEXT_TOOLS['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_TOOLS['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_TOOLS['stop_words'], _TEXT_TOOLS['lemmatizer']


def preprocess_text(text):
    """Normalize a statement into a space-separated string of lemmatized tokens.

    Steps: lowercase the text, drop every character that is not a letter or
    whitespace, tokenize, remove English stop words and lemmatize the rest.

    Args:
        text: The raw statement. Non-string values (for example a NaN from a
            missing cell) are treated as empty text.

    Returns:
        The cleaned tokens joined by single spaces; an empty string when no
        tokens remain or the input is not a string.
    """
    if not isinstance(text, str):
        return ''

    stop_words, lemmatizer = _get_text_tools()

    cleaned = _NON_LETTER_RE.sub('', text.lower())
    tokens = word_tokenize(cleaned)
    return ' '.join(lemmatizer.lemmatize(t) for t in tokens if t not in stop_words)

## Step 6: Feature extraction and model training

In [None]:
def train_model(train_df, test_df):
    """Fit a TF-IDF + logistic-regression classifier on the training statements.

    Args:
        train_df: DataFrame with a 'statement' text column and a
            'binary_label' target column.
        test_df: DataFrame with the same two columns, used only to build the
            held-out feature matrix.

    Returns:
        A ``(model, vectorizer, X_test, y_test)`` tuple: the fitted
        LogisticRegression, the fitted TfidfVectorizer, the transformed test
        features and the test labels.

    The input frames are not modified: the preprocessed text is kept in local
    Series instead of being written back as a new column, which avoids
    mutating the caller's data (and the chained-assignment warning pandas
    emits when the inputs are slices of another frame).
    """
    train_text = train_df['statement'].apply(preprocess_text)
    test_text = test_df['statement'].apply(preprocess_text)

    # The vocabulary is learned from the training text only; the test text is
    # projected onto it with transform().
    vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
    X_train = vectorizer.fit_transform(train_text)
    X_test = vectorizer.transform(test_text)

    y_train = train_df['binary_label']
    y_test = test_df['binary_label']

    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    return model, vectorizer, X_test, y_test

## Step 7: Model evaluation

In [None]:
def evaluate_model(model, X_test, y_test):
    """Print accuracy, precision, recall, F1 and the confusion matrix.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix to predict on.
        y_test: True labels matching ``X_test``.
    """
    predictions = model.predict(X_test)

    matrix = confusion_matrix(y_test, predictions)
    # The fourth element returned here is not reported, so only the first
    # three are kept.
    precision, recall, f1 = precision_recall_fscore_support(
        y_test, predictions, average='binary'
    )[:3]
    accuracy = accuracy_score(y_test, predictions)

    report_lines = (
        "Evaluation Metrics:",
        f"Accuracy: {accuracy:.4f}",
        f"Precision: {precision:.4f}",
        f"Recall: {recall:.4f}",
        f"F1-Score: {f1:.4f}",
        "\nConfusion Matrix:",
    )
    for line in report_lines:
        print(line)
    print(matrix)

## Step 8: Inference function

In [None]:
def predict_fake_news(model, vectorizer, text):
    """Classify a single statement as "Real" or "Fake".

    Args:
        model: Fitted classifier exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``.
        text: Raw statement to classify.

    Returns:
        "Real" when the predicted label equals 1, otherwise "Fake".
    """
    features = vectorizer.transform([preprocess_text(text)])
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return "Real"
    return "Fake"

def main():
    """Run the whole pipeline: fetch data, train, evaluate, classify one sample."""
    download_liar_dataset()
    training_frame, holdout_frame = load_data()

    classifier, tfidf, holdout_features, holdout_labels = train_model(
        training_frame, holdout_frame
    )
    evaluate_model(classifier, holdout_features, holdout_labels)

    # Demonstrate inference on one hand-written statement.
    sample_text = "The economy is booming with record-low unemployment rates."
    prediction = predict_fake_news(classifier, tfidf, sample_text)
    print(f"\nSample Prediction: The statement '{sample_text}' is {prediction}.")


if __name__ == "__main__":
    main()


LIAR dataset already downloaded.
Evaluation Metrics:
Accuracy: 0.6219
Precision: 0.6400
Recall: 0.7521
F1-Score: 0.6916

Confusion Matrix:
[[251 302]
 [177 537]]

Sample Prediction: The statement 'The economy is booming with record-low unemployment rates.' is Real.
