In [48]:
# Step 1: Import the Necessary Libraries
import os
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [55]:
# Load the CSV files
# The dataset folder can be overridden with the FAKE_NEWS_DATA_DIR environment
# variable; when it is not set, the original local folder is used so existing
# setups keep working unchanged.
DATA_DIR = Path(os.environ.get(
    "FAKE_NEWS_DATA_DIR",
    "C:\\Users\\MAYANK SAXENA\\Desktop\\fake news detector dataset",
))
true_news = pd.read_csv(DATA_DIR / "true" / "True.csv")
fake_news = pd.read_csv(DATA_DIR / "fake" / "Fake.csv")

# Explore the datasets
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
print("True News Dataset Info:")
true_news.info()
print("\nFake News Dataset Info:")
fake_news.info()

# Check the first few rows of each dataset
# Only the last bare expression of a cell is rendered, so both previews are
# shown explicitly (display() is provided by the Jupyter/IPython kernel).
display(true_news.head())
display(fake_news.head())


True News Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    21417 non-null  object
 1   text     21417 non-null  object
 2   subject  21417 non-null  object
 3   date     21417 non-null  object
dtypes: object(4)
memory usage: 669.4+ KB
None

Fake News Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23481 entries, 0 to 23480
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    23481 non-null  object
 1   text     23481 non-null  object
 2   subject  23481 non-null  object
 3   date     23481 non-null  object
dtypes: object(4)
memory usage: 733.9+ KB
None


Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [56]:
# Step 3: Data Preprocessing

# Tag every row with its class: 1 for the true-news frame, 0 for the fake-news frame
for frame, class_id in ((true_news, 1), (fake_news, 0)):
    frame['label'] = class_id

# Stack both frames row-wise into a single frame with a fresh 0..n-1 index
news_data = pd.concat((true_news, fake_news), axis=0, ignore_index=True)

# Report how many missing values each column contains
missing_per_column = news_data.isna().sum()
print(missing_per_column)


title      0
text       0
subject    0
date       0
label      0
dtype: int64


In [None]:
# Step 4: Text Cleaning
import re
import nltk
from nltk.corpus import stopwords

# Download the stopwords corpus only when it is not already available locally,
# so re-running the notebook does not contact the download server every time.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Cache for the NLTK English stop-word set; filled on the first call that needs it
_ENGLISH_STOPWORDS = None


# Function to clean the text data
def clean_text(text, stop_words=None):
    """Normalize one article for vectorization.

    Steps, in order: strip every character that is neither a word character
    nor whitespace, strip digit runs, lowercase, then drop stop words and
    re-join the remaining tokens with single spaces.

    Parameters
    ----------
    text : str
        Raw article text. Any non-string value (e.g. None or NaN) is treated
        as missing and yields an empty string.
    stop_words : collection of str, optional
        Words to remove. When omitted, the NLTK English stop-word list is
        used; it is loaded once and cached as a frozenset.

    Returns
    -------
    str
        The cleaned, space-separated text.
    """
    global _ENGLISH_STOPWORDS

    if not isinstance(text, str):
        return ''

    if stop_words is None:
        # Loading the list once (instead of once per word) and using a set
        # makes the membership test below constant-time.
        if _ENGLISH_STOPWORDS is None:
            _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOPWORDS

    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.lower()  # Convert to lowercase
    return ' '.join(word for word in text.split() if word not in stop_words)

# Run clean_text over every article body and keep the result in a new column
news_data['cleaned_text'] = news_data['text'].map(clean_text)


In [None]:
# Step 5: Exploratory Data Analysis (EDA)
# Count of true and fake news
# The column is passed by keyword: newer seaborn releases treat the first
# positional argument as `data`, so a bare Series is not read as the variable
# to count.
fig, ax = plt.subplots()
sns.countplot(data=news_data, x='label', ax=ax)
ax.set_title('Distribution of True and Fake News')
ax.set_xlabel('label (0 = fake, 1 = true)')
ax.set_ylabel('Number of articles')
plt.show()

# Check the length of the articles (in characters)
news_data['text_length'] = news_data['text'].str.len()
fig, ax = plt.subplots()
sns.histplot(data=news_data, x='text_length', bins=50, ax=ax)
ax.set_title('Distribution of Article Lengths')
ax.set_xlabel('Article length (characters)')
ax.set_ylabel('Number of articles')
plt.show()


In [None]:
# Step 6: Feature Extraction
# Turn the cleaned article text into TF-IDF features and pick out the target column.

# Target labels: the 'label' column of news_data
y = news_data['label']

# TF-IDF vectorizer capped at 5000 features, using scikit-learn's built-in
# English stop-word list
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
)

# Learn the vocabulary from the cleaned text and build the feature matrix in one call
X = vectorizer.fit_transform(news_data['cleaned_text'])


In [None]:
# Step 7: Train-Test Split
# Split the data into training and testing sets to evaluate the model's performance.
#
# The cleaned text is split (rather than the already-vectorized matrix X) so
# that the TF-IDF vocabulary and IDF weights can be learned from the training
# rows only. Fitting the vectorizer on the full corpus lets test-set statistics
# leak into the features and makes the evaluation optimistic.
# test_size and random_state are unchanged.
text_train, text_test, y_train, y_test = train_test_split(
    news_data['cleaned_text'], y, test_size=0.2, random_state=42
)

# Re-fit the vectorizer on the training text only, then apply the fitted
# vectorizer to the held-out text.
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Check the size of the training and testing sets
print(X_train.shape, X_test.shape)


In [None]:
# Step 8: Model Building
# A Logistic Regression classifier with scikit-learn's default settings is used.
model = LogisticRegression()

# Fit the classifier on the training features and labels
model.fit(X=X_train, y=y_train)


In [None]:
# Step 9: Model Evaluation
# Evaluate the model's performance on the test set using accuracy, confusion matrix, and classification report.
# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy*100:.2f}%')

# Class ids and display names, in the order used for matrix rows/columns
# (labels were assigned as 0 = fake, 1 = true during preprocessing)
class_ids = [0, 1]
class_names = ['Fake', 'True']

# Confusion Matrix: rows are the actual classes, columns the predicted classes
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_ids)
fig, ax = plt.subplots()
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted label')
ax.set_ylabel('Actual label')
ax.set_title('Confusion Matrix')
plt.show()

# Classification Report
print(classification_report(y_test, y_pred, target_names=class_names))


In [None]:
# Step 10: Fine-Tuning
# You can experiment with different machine learning algorithms (e.g., Random Forest, Naive Bayes, SVM), hyperparameter tuning, and feature engineering to improve the model's performance.

# Example: Trying a Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

# random_state pins the forest's randomness so the reported accuracy is
# reproducible between runs; n_jobs=-1 trains the trees on all available cores.
rf_model = RandomForestClassifier(random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_rf = rf_model.predict(X_test)
print(f'Random Forest Accuracy: {accuracy_score(y_test, y_pred_rf)*100:.2f}%')


In [None]:
# Step 11: Save the Model
# Once you are satisfied with the model performance, save the trained model for future use.
import joblib

# The classifier was trained on TF-IDF vectors, so the fitted vectorizer has to
# be saved as well; without it, new raw text cannot be turned into the features
# the model expects.
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

# Save the model
joblib.dump(model, 'fake_news_detector.pkl')
