In [None]:
# Import necessary libraries
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from wordcloud import WordCloud

In [None]:
# Load the dataset
# The CSV location can be overridden through the FAKE_NEWS_DATA_PATH
# environment variable so the notebook is not tied to one machine's
# directory layout. When the variable is unset, the original absolute path
# is used unchanged.
DEFAULT_DATA_PATH = r"C:\Users\subra\Desktop\Projects\New Projects\Fake-News-Detection\data\data.csv"
data_path = os.environ.get("FAKE_NEWS_DATA_PATH", DEFAULT_DATA_PATH)
df = pd.read_csv(data_path)

In [None]:
# Display basic information about the dataset
print("First five rows of the dataset:")
print(df.head())

print("\nSummary statistics of the dataset:")
print(df.describe())

print("\nDataset information:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the report.
df.info()

In [None]:
# Report how many null entries each column contains
print("\nMissing values in the dataset:")
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# Remove incomplete rows: a row is dropped when any of the listed columns
# is null. The frame is modified in place, so later cells keep using `df`.
required_columns = ['title', 'text', 'label']
df.dropna(subset=required_columns, inplace=True)

In [None]:
# Exploratory Data Analysis (EDA)
# Word cloud of the most frequent words across the article bodies.
# The 'text' column is cast to str so that a stray non-string value cannot
# make str.join raise a TypeError.
text = " ".join(df['text'].astype(str))
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
# The cloud is built from df['text'] for every row (no label filter, and not
# from the 'title' column), so the title names article text rather than
# fake-news headlines.
plt.title("Word Cloud of News Article Text")
plt.show()

In [None]:
# Bar chart of how many rows carry each value of the 'label' column.
# NOTE(review): recent seaborn releases appear to warn when `palette` is
# passed without `hue` — confirm against the installed version.
ax = sns.countplot(x='label', data=df, palette='Set2')
ax.set_title("Distribution of News Categories (Fake/Real)")
ax.set_xlabel("Label (0 = Fake, 1 = Real)")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Pick the model input and the prediction target from the frame:
#   X - the 'text' column (article text)
#   y - the 'label' column (0 = Fake, 1 = Real)
X, y = df['text'], df['label']

In [None]:
# Hold out 30% of the samples for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)