#Fake News detection

In [None]:
import os
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import pickle
import warnings
import ssl
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# NOTE(security): this replaces the default HTTPS context factory with one that
# performs no certificate verification, for every later user of
# ssl._create_default_https_context in this process. It is a workaround for
# machines where the NLTK download fails with CERTIFICATE_VERIFY_FAILED;
# prefer fixing the local certificate store where possible.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # The attribute is missing on this interpreter: leave the SSL defaults alone.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

warnings.filterwarnings('ignore')

# nltk.download() signals failure by returning False rather than raising, so
# check the result. Warnings are silenced above, hence the plain print().
for _resource in ('stopwords', 'wordnet'):
    if not nltk.download(_resource):
        print(
            f"WARNING: could not download NLTK resource '{_resource}'. "
            "Text preprocessing will fail unless it is already installed locally."
        )

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1018)>
[nltk_data] Error loading wordnet: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1018)>


False

###Load Dataset

In [None]:
# Load the two source tables; the paths are relative to the working directory.
df_true = pd.read_csv('data/True.csv')
df_fake = pd.read_csv('data/Fake.csv')
# Last expression of the cell: shows the (rows, columns) shape of each table.
df_true.shape, df_fake.shape

###Drop columns "subject" and "date". Add column "label"

In [None]:
# Strip the columns that are not used as features from both tables, in place.
for _frame in (df_true, df_fake):
    _frame.drop(columns=['subject', 'date'], inplace=True)

# Target encoding used by the rest of the notebook: 1 = real, 0 = fake.
df_true['label'] = 1
df_fake['label'] = 0

# Preview the first rows of the real-news table.
df_true.head()

###Merge True and Fake dataset

In [None]:
# Stack the real and fake tables, then shuffle the rows so the two classes are
# interleaved. The shuffle is seeded so that the row order is the same on every
# run; without a seed, the seeded train/test split further down would still
# pick different articles each time.
df = (
    pd.concat([df_true, df_fake])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
df.head()

In [None]:
# Show the (rows, columns) shape of the merged frame.
df.shape

###Data preprocessing

In [24]:
# Matches every character that is neither an ASCII letter nor whitespace.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')

# Lazily filled cache so the NLTK resources are built once, not once per row.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the shared (lemmatizer, stop-word set) pair, building it on first use."""
    if not _TEXT_RESOURCES:
        # Build both objects before storing either, so a failed corpus lookup
        # cannot leave a half-filled cache behind.
        stop_words = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES.update(lemmatizer=WordNetLemmatizer(), stop_words=stop_words)
    return _TEXT_RESOURCES['lemmatizer'], _TEXT_RESOURCES['stop_words']


def preprocess_text(text):
    """Normalise a piece of text for TF-IDF vectorisation.

    The input is converted with ``str()``, lower-cased, stripped of every
    character that is not an ASCII letter or whitespace, split on whitespace,
    filtered against the NLTK English stop-word list and lemmatised.

    Args:
        text: The raw text. Any object is accepted; it is passed through ``str()``.

    Returns:
        str: The remaining lemmatised words joined by single spaces.
    """
    lemmatizer, stop_words = _get_text_resources()
    letters_only = _NON_LETTER_RE.sub('', str(text).lower())
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in letters_only.split()
        if word not in stop_words
    )
# Replace each raw 'text' entry with its cleaned form (preprocess_text runs once per row).
df['text'] = df['text'].apply(preprocess_text)

###Feature Extraction

In [25]:
# Turn the cleaned texts into a sparse TF-IDF matrix with a vocabulary capped
# at 5000 terms.
# NOTE(review): the vectorizer is fitted on every row of df here, i.e. before
# any train/test separation -- confirm this is intended.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X = tfidf_vectorizer.fit_transform(df['text'])
# Target vector: the 'label' column (1 = real, 0 = fake).
y = df['label']

In [26]:
# Split data
# Split the cleaned texts (not the already-vectorised matrix) so that the
# TF-IDF vocabulary and IDF weights can be learned from the training rows only.
# Fitting the vectorizer on all rows lets test documents influence the features
# and makes the reported test scores optimistic.
text_train, text_test, y_train, y_test = train_test_split(
    df['text'], y, test_size=0.2, random_state=42
)

# Re-fit the existing vectorizer on the training texts, then only transform the
# held-out texts. The same object is pickled later, so the saved vectorizer
# matches the features the model is trained on.
X_train = tfidf_vectorizer.fit_transform(text_train)
X_test = tfidf_vectorizer.transform(text_test)

#Model training

In [None]:
# Fit a logistic-regression classifier on the training features.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Score the held-out split once, then report the metrics.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred, digits=4)

print("\nLogistic Regression Performance:")
print(f"Accuracy: {test_accuracy:.4f}")
print("\nClassification Report:")
print(test_report)

In [28]:
# Persist the fitted classifier and the fitted vectorizer, one pickle file each,
# in the order (model first, vectorizer second).
for _artifact_path, _artifact in (
    ('model_input.dat', model),
    ('vector_input.dat', tfidf_vectorizer),
):
    with open(_artifact_path, 'wb') as _artifact_file:
        pickle.dump(_artifact, _artifact_file)

###Deploy

In [29]:
def detectNews(text):
    """Classify a news text as real or fake with the saved model.

    The classifier is unpickled from 'model_input.dat' and the vectorizer from
    'vector_input.dat' on every call. The text is cleaned with
    ``preprocess_text``, vectorised and scored.

    Args:
        text: The raw news text to classify.

    Returns:
        tuple: ``(label, confidence)``. ``label`` is "Real News" when the
        predicted class is 1 and "Fake News" otherwise. ``confidence`` is the
        probability the model assigns to the predicted class.

    Raises:
        FileNotFoundError: If either pickle file does not exist.
    """
    # NOTE(security): unpickling runs code embedded in the file. Only load
    # artefact files that were produced by a trusted source.
    with open('model_input.dat', 'rb') as f:
        model = pickle.load(f)

    with open('vector_input.dat', 'rb') as f:
        vectorizer = pickle.load(f)

    processed_text = preprocess_text(text)
    text_vector = vectorizer.transform([processed_text])
    predicted_class = model.predict(text_vector)[0]
    class_probabilities = model.predict_proba(text_vector)[0]

    # predict_proba columns are ordered like model.classes_, so look the column
    # up rather than assuming the label value doubles as its position.
    class_index = list(model.classes_).index(predicted_class)
    confidence = class_probabilities[class_index]

    label = "Real News" if predicted_class == 1 else "Fake News"

    return label, confidence

In [None]:

# Smoke-test the saved artefacts: classify one short string and print the
# returned label with its confidence rounded to two decimals.
sample_text = "hello world"
result, confidence = detectNews(sample_text)
print(f"Result: {result} with confidence: {confidence:.2f}")

###Check for null values

In [None]:
# Count the missing values in each column.
df.isnull().sum()

###Check for duplicated values

In [None]:
# Count the rows that exactly repeat an earlier row.
df.duplicated().sum()

In [14]:
# Remove exact duplicate rows in place, keeping the first occurrence; the
# surviving rows keep their original index labels.
df.drop_duplicates(inplace=True)

In [None]:
# Bar chart of how many articles fall into each class.
sns.countplot(x='label', data=df, color='g')
plt.title("Class distribution of news articles")
plt.xlabel("Article class")
plt.ylabel("Number of articles")
# Same tick mapping as before (position 0 -> Fake, position 1 -> Real), written
# in ascending order. Labels follow the encoding 0 = fake, 1 = real.
plt.xticks([0, 1], ['Fake', 'Real'])
plt.show()

#References:
1. https://www.kaggle.com/datasets/clmentbisaillon/fake-and-real-news-dataset/data
