# Fake News Detection

### Import Libraries & Load Data

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# For text preprocessing
import re
import string

# For model building
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# for nlp
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Read the two source CSVs (real articles first, then fake ones)
true_df, fake_df = (pd.read_csv(csv_name) for csv_name in ("True.csv", "Fake.csv"))

# Quick sanity check on the size of each corpus
print("True news shape:", true_df.shape)
print("Fake news shape:", fake_df.shape)

True news shape: (21417, 4)
Fake news shape: (23481, 4)


### Combine, Label & Shuffle the Data

In [13]:
# Attach the target column to each source frame before merging
fake_df["label"] = 0    # 0 = Fake
true_df["label"] = 1    # 1 = Real

# Stack real on top of fake, shuffle reproducibly, then renumber the rows
df = (
    pd.concat([true_df, fake_df], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print("Combined dataset shape:", df.shape)
print(df.head(3))


Combined dataset shape: (44898, 5)
                                               title  \
0   BREAKING: GOP Chairman Grassley Has Had Enoug...   
1   Failed GOP Candidates Remembered In Hilarious...   
2   Mike Pence’s New DC Neighbors Are HILARIOUSLY...   

                                                text subject  \
0  Donald Trump s White House is in chaos, and th...    News   
1  Now that Donald Trump is the presumptive GOP n...    News   
2  Mike Pence is a huge homophobe. He supports ex...    News   

               date  label  
0     July 21, 2017      0  
1       May 7, 2016      0  
2  December 3, 2016      0  


### Clean and Prepare the Text

In [4]:
# Make sure the NLTK stop-word corpus is available locally
nltk.download("stopwords")

# Shared preprocessing resources used by clean_text
ps = PorterStemmer()
stop_words = set(stopwords.words("english"))

def clean_text(text):
    """Normalize raw text into a space-joined string of stemmed tokens.

    The text is lower-cased, every character that is not an ASCII letter is
    replaced by a space, the result is split on whitespace, tokens present in
    the module-level ``stop_words`` set are dropped, and the remaining tokens
    are stemmed with the module-level ``ps`` stemmer.

    Parameters
    ----------
    text : object
        Raw text. Non-string values are converted with ``str()``. Missing
        values (``None`` or a float NaN) are treated as empty text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces; ``""`` for missing input
        or when no token survives filtering.
    """
    # Missing values would otherwise be stringified to "none" / "nan" and
    # end up in the vocabulary as bogus tokens. NaN is the only float that
    # compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    lowered = str(text).lower()
    tokens = re.sub(r'[^a-zA-Z]', ' ', lowered).split()  # Keep only letters
    return " ".join(ps.stem(word) for word in tokens if word not in stop_words)

# Apply cleaning
# Fill missing titles/bodies with "" before concatenating: NaN + str yields
# NaN, which would discard the other field and later be stringified to "nan".
df['clean_text'] = (
    df['title'].fillna("").astype(str) + " " + df['text'].fillna("").astype(str)
)
df['clean_text'] = df['clean_text'].apply(clean_text)

print("Text cleaning done.")
print(df[['clean_text', 'label']].head(3))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Text cleaning done.
                                          clean_text  label
0  break gop chairman grassley enough demand trum...      0
1  fail gop candid rememb hilari mock eulog video...      0
2  mike penc new dc neighbor hilari troll homopho...      0


### Vectorize Text & Train Model

In [5]:
# Features (cleaned text) and target (0 = fake, 1 = real)
X = df['clean_text']
y = df['label']

# Hold out 25% of the rows for testing, keeping the class ratio in both splits
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

# TF-IDF over unigrams and bigrams, capped at 5000 features.
# The vocabulary is learned from the training split only.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Fit the logistic-regression classifier (fit returns the estimator itself)
model = LogisticRegression(max_iter=300).fit(X_train_tfidf, y_train)

# Score the held-out split
y_pred = model.predict(X_test_tfidf)

# Report results
print("Model training complete!")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Model training complete!
Accuracy: 0.9895768374164811

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      5871
           1       0.99      0.99      0.99      5354

    accuracy                           0.99     11225
   macro avg       0.99      0.99      0.99     11225
weighted avg       0.99      0.99      0.99     11225



### Save Model and Vectorizer

In [6]:
import joblib

# Persist the fitted classifier and the fitted vectorizer side by side;
# both are needed to score new text later.
joblib.dump(value=model, filename="fake_news_model.pkl")
joblib.dump(value=tfidf, filename="tfidf_vectorizer.pkl")

print("Model and vectorizer saved successfully!")

Model and vectorizer saved successfully!


### Streamlit app

In [7]:
# To launch the Streamlit app, run this from a terminal: streamlit run app.py