In [1]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
import numpy as np
from nltk.corpus import stopwords
import pandas as pd

fake_news = pd.read_csv("Fake.csv")
real_news = pd.read_csv("True.csv")

documents = pd.concat([fake_news['text'], real_news['text']])

def preprocess(text):
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^\w\s]', '', text.lower())
    return text

processed_docs = [preprocess(doc) for doc in documents]

In [2]:
data = pd.DataFrame({'text': processed_docs})

stop_words = set(stopwords.words("english"))
data["cleaned_text"] = data["text"].apply(lambda x: " ".join([word.lower() for word in x.split() if word.isalpha() and word.lower() not in stop_words]))

tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(data["cleaned_text"]).toarray()

data['label'] = [0] * len(fake_news) + [1] * len(real_news)
y = data['label']

In [3]:
import warnings
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

warnings.filterwarnings("ignore")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

classifier = LogisticRegression(max_iter=1000) 
classifier.fit(X_train, y_train)

In [4]:
y_pred = classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.4f}%")

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=["Real News", "Fake News"]))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 98.8864%

Classification Report:
              precision    recall  f1-score   support

   Real News       0.99      0.99      0.99      4733
   Fake News       0.99      0.99      0.99      4247

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980


Confusion Matrix:
[[4681   52]
 [  48 4199]]


In [5]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

y_pred_nb = nb_classifier.predict(X_test)
print(f"Naive Bayes Accuracy: {accuracy_score(y_test, y_pred_nb) * 100:.4f}%")
print("Naive Bayes Classification Report:")
print(classification_report(y_test, y_pred_nb, target_names=["Real News", "Fake News"]))

Naive Bayes Accuracy: 93.4298%
Naive Bayes Classification Report:
              precision    recall  f1-score   support

   Real News       0.94      0.94      0.94      4733
   Fake News       0.93      0.93      0.93      4247

    accuracy                           0.93      8980
   macro avg       0.93      0.93      0.93      8980
weighted avg       0.93      0.93      0.93      8980



In [6]:
from xgboost import XGBClassifier

xgb_classifier = XGBClassifier(random_state=42)
xgb_classifier.fit(X_train, y_train)

y_pred_xgb = xgb_classifier.predict(X_test)
print(f"XGBoost Accuracy: {accuracy_score(y_test, y_pred_xgb) * 100:.4f}%")
print("XGBoost Classification Report:")
print(classification_report(y_test, y_pred_xgb, target_names=["Real News", "Fake News"]))

XGBoost Accuracy: 99.6659%
XGBoost Classification Report:
              precision    recall  f1-score   support

   Real News       1.00      1.00      1.00      4733
   Fake News       1.00      1.00      1.00      4247

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980



In [7]:
import matplotlib.pyplot as plt

In [12]:
from collections import Counter
import re

def calculate_vocab_size(dataset):
    token_counter = Counter()
    for text in dataset:
        tokens = re.findall(r'\b\w+\b', text.lower())
        token_counter.update(tokens)
    vocab_size = len(token_counter)
    return vocab_size

In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = torch.tensor(X_train, dtype=torch.long)
X_test = torch.tensor(X_test, dtype=torch.long)
y_train = torch.tensor(y_train, dtype=torch.float32)
y_test = torch.tensor(y_test.to_numpy(), dtype=torch.float32)

train_data = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)

class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.embedding(x)
        _, (hidden, _) = self.lstm(x)
        out = self.fc(hidden[-1])
        return self.sigmoid(out)

vocab_size = calculate_vocab_size(documents)
embed_dim = 128
hidden_dim = 64
output_dim = 1
model = LSTMModel(vocab_size, embed_dim, hidden_dim, output_dim)

In [None]:
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 5
for epoch in range(epochs):
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.squeeze(), labels)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

model.eval()
with torch.no_grad():
    test_outputs = model(X_test)
    test_outputs = (test_outputs > 0.5).float()
    accuracy = (test_outputs.squeeze() == y_test).float().mean()
    print(f"Test Accuracy: {accuracy.item() * 100:.4f}%")

Epoch 1/5, Loss: 0.6862496733665466
