# Fake News Detection using NLP
### Advanced NLP-based Fake News Classification
**Author:** Ashish Jha

This notebook implements a fake news detection system using TF-IDF features and three classifiers: Logistic Regression, Multinomial Naive Bayes, and Random Forest.

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import warnings
warnings.filterwarnings('ignore')

# Download the NLTK 'stopwords' corpus, which stopwords.words('english')
# reads during text preprocessing. quiet=True asks NLTK to suppress its
# download progress output.
nltk.download('stopwords', quiet=True)

## 2. Text Preprocessing

In [None]:
ps = PorterStemmer()

# Build the stop-word lookup once. Calling stopwords.words('english') inside
# the per-token filter rebuilt the word list for every token and then scanned
# it linearly; a frozenset gives the same membership answers in O(1).
_STOP_WORDS = frozenset(stopwords.words('english'))

# Anything that is not an ASCII letter is treated as a separator.
_NON_LETTERS = re.compile(r'[^a-zA-Z]')


def preprocess_text(text):
    """Normalize raw text into a space-joined string of stemmed tokens.

    The input is coerced with ``str()``, every character outside ``a-z`` /
    ``A-Z`` is replaced by a space, the result is lowercased and split on
    whitespace, English stop words are dropped, and each remaining token is
    Porter-stemmed.

    Args:
        text: Value to clean. It is passed through ``str()`` first, so
            non-string inputs are accepted.

    Returns:
        str: The stemmed, stop-word-free tokens joined by single spaces
        (an empty string when no tokens survive).
    """
    # Remove special characters and convert to lowercase
    cleaned = _NON_LETTERS.sub(' ', str(text)).lower()

    # Tokenize, remove stopwords, and stem what is left
    return ' '.join(
        ps.stem(word) for word in cleaned.split() if word not in _STOP_WORDS
    )

## 3. Load and Process Data

In [None]:
# Load dataset; the code below reads its 'text' and 'label' columns
df = pd.read_csv('news_data.csv')

# Preprocess text. Blank out missing articles first: preprocess_text calls
# str() on its input, so a NaN would otherwise become the literal token "nan".
df['processed_text'] = df['text'].fillna('').apply(preprocess_text)

# Prepare features and target
X = df['processed_text']
y = df['label']

# Split data. stratify=y keeps the label proportions the same in the train
# and test partitions, so the reported metrics are not skewed by an
# unlucky shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# TF-IDF Vectorization: fit the vocabulary on the training split only, then
# reuse it for the test split so no test information leaks into the features.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(f'Training samples: {X_train_tfidf.shape[0]}')
print(f'Test samples: {X_test_tfidf.shape[0]}')
print(f'Features: {X_train_tfidf.shape[1]}')

## 4. Train Classification Models

In [None]:
def _fit_and_report(model, heading):
    """Fit ``model`` on the TF-IDF training matrix and print its test metrics.

    Prints ``heading``, the accuracy to four decimals, and the
    classification report for the held-out split.

    Returns:
        tuple: ``(predictions, accuracy)`` for the test split.
    """
    model.fit(X_train_tfidf, y_train)
    predictions = model.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, predictions)
    print(heading)
    print(f'Accuracy: {accuracy:.4f}')
    print(classification_report(y_test, predictions))
    return predictions, accuracy


# Logistic Regression
lr_model = LogisticRegression(max_iter=1000)
lr_pred, lr_accuracy = _fit_and_report(lr_model, 'Logistic Regression:')

# Naive Bayes
nb_model = MultinomialNB()
nb_pred, nb_accuracy = _fit_and_report(nb_model, '\nNaive Bayes:')

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_pred, rf_accuracy = _fit_and_report(rf_model, '\nRandom Forest:')

## 5. Save Best Model

In [None]:
# Save the best model and vectorizer.
# Previously this always pickled lr_model, whatever the measured accuracies
# were. Pick the classifier with the highest test accuracy instead; max()
# returns the first maximum, so ties still resolve to Logistic Regression.
candidates = [
    ('Logistic Regression', lr_model, lr_accuracy),
    ('Naive Bayes', nb_model, nb_accuracy),
    ('Random Forest', rf_model, rf_accuracy),
]
best_name, best_model, best_accuracy = max(candidates, key=lambda c: c[2])

# Output file names are unchanged so existing loaders keep working.
joblib.dump(best_model, 'fake_news_model.pkl')
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')
print(f'Best model: {best_name} (accuracy {best_accuracy:.4f})')
print('Models saved successfully!')