The problem at hand is to develop a fake news detection model using a dataset obtained from Kaggle. The objective is to create a system that can effectively distinguish between genuine and fake news articles based on their titles and textual content. This project uses Natural Language Processing (NLP) techniques to preprocess and transform the text data, builds a machine learning model for classification, and then evaluates the model's performance.

In [None]:

# Import necessary libraries
import pandas as pd
import numpy as np
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Download NLTK data (if not already downloaded): the stop-word list and the
# Punkt tokenizer models used by word_tokenize in the preprocessing step
nltk.download('stopwords')
nltk.download('punkt')
# NLTK 3.9 and later load the Punkt models from the 'punkt_tab' resource
# instead of 'punkt'; without it word_tokenize raises LookupError. Both are
# fetched so the notebook works on old and new NLTK releases.
nltk.download('punkt_tab')

# Load the dataset (make sure you've downloaded it from Kaggle)
fake_data = pd.read_csv("path_to_fake_news_dataset.csv")

Text preprocessing

In [None]:
import nltk
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Keep only complete rows: any row containing a missing value is discarded
fake_data = fake_data.dropna()

# Merge each title with its article body into the single 'text' column
fake_data['text'] = fake_data['title'] + " " + fake_data['text']

# English stop words, held in a set for fast membership tests
stop_words = set(stopwords.words('english'))


def _drop_stop_words(document):
    """Return *document* rebuilt from its tokens that are not stop words.

    Tokens are compared against the stop-word set in lower case and the
    surviving tokens are joined back together with single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(document):
        if token.lower() not in stop_words:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)


# Tokenization and stop-word removal, applied document by document
fake_data['text'] = fake_data['text'].apply(_drop_stop_words)

Feature Extraction

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Split the data into training and testing sets.
# The split is done on the raw text *before* the vectorizer is fitted so that
# the TF-IDF vocabulary and IDF weights are learned from the training rows
# only; fitting on the full corpus would leak test-set statistics into the
# features the model is trained on and inflate the reported scores.
y = fake_data['label']
text_train, text_test, y_train, y_test = train_test_split(
    fake_data['text'], y, test_size=0.2, random_state=42
)

# Feature extraction using TF-IDF (vocabulary capped at 5000 terms):
# fit on the training text, then reuse the fitted vectorizer for the test text
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train = tfidf_vectorizer.fit_transform(text_train)
X_test = tfidf_vectorizer.transform(text_test)

# Full-corpus feature matrix, kept so the name `X` stays available to any
# later cell that relies on it
X = tfidf_vectorizer.transform(fake_data['text'])




Model Training

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
# Build a multinomial Naive Bayes classifier and fit it on the training
# features in one step (fit returns the fitted estimator itself)
classifier = MultinomialNB().fit(X_train, y_train)

Model Evaluation

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
# Predict once on the test features, then derive every metric from the
# same set of predictions
y_pred = classifier.predict(X_test)

classification_rep = classification_report(y_true=y_test, y_pred=y_pred)
confusion_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)


Print evaluation results

In [None]:
# Report the accuracy, then each remaining result under its own heading
# (sep="\n" puts the heading and its body on separate lines)
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:", confusion_mat, sep="\n")
print("Classification Report:", classification_rep, sep="\n")


Plot a confusion matrix

In [None]:
# Draw the confusion matrix as an annotated heatmap of integer counts.
# NOTE(review): the tick labels assume the classes sort as Real first, then
# Fake - confirm against the dataset's actual label encoding.
class_names = ['Real', 'Fake']

plt.figure(figsize=(8, 6))
ax = sns.heatmap(
    confusion_mat,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()