In [3]:

import pandas as pd
import numpy as np
import nltk
import string
import re

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TreebankWordTokenizer


In [4]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to C:\Users\ansh
[nltk_data]     nimbalkar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\ansh
[nltk_data]     nimbalkar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
import pandas as pd

# Load both datasets
df_fake = pd.read_csv("Fake.csv")
df_true = pd.read_csv("True.csv")

# Add label column: 0 = fake, 1 = real
df_fake['label'] = 0
df_true['label'] = 1

# Combine both into one DataFrame
df = pd.concat([df_fake, df_true], ignore_index=True)

# Shuffle the dataset (important)
df = df.sample(frac=1).reset_index(drop=True)

# Show shape and preview
print("🔍 Dataset Shape:", df.shape)
df[['text', 'label']].head()

🔍 Dataset Shape: (44898, 5)


Unnamed: 0,text,label
0,(Reuters) - U.S. President Donald Trump will a...,1
1,The Democrats doubled down on moving to the le...,0
2,"At this point in the race, it s pretty clear t...",0
3,(Reuters) - Wall Street’s predilection for a g...,1
4,WASHINGTON (Reuters) - U.S. Ambassador to the ...,1


In [6]:
import nltk

# Remove punkt if corrupted
nltk.data.find('tokenizers/punkt')  # Check if punkt is available

# Force reinstall punkt
nltk.download('punkt', force=True)


[nltk_data] Downloading package punkt to C:\Users\ansh
[nltk_data]     nimbalkar/nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [7]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TreebankWordTokenizer

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
tokenizer = TreebankWordTokenizer()

def preprocess(text):
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = tokenizer.tokenize(text)
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words and w not in string.punctuation]
    return ' '.join(words)


In [8]:
df['clean_text'] = df['text'].apply(preprocess)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform the clean text
X = vectorizer.fit_transform(df['clean_text'])

# Labels
y = df['label']


In [10]:
from sklearn.model_selection import train_test_split

# Split into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Train the model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Accuracy and report
print("🔍 Accuracy:", accuracy_score(y_test, y_pred))
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))


🔍 Accuracy: 0.9865256124721603

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.99      4627
           1       0.98      0.99      0.99      4353

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [12]:
import pickle

# Save model
with open('fake_news_model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Save vectorizer
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)


In [13]:
def predict_fake_news(text):
    # Clean the input using same preprocessing steps
    import re
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    from nltk.stem import WordNetLemmatizer
    import string

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    text = text.lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = word_tokenize(text)
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words and w not in string.punctuation]
    clean = ' '.join(words)

    # Vectorize
    vec = vectorizer.transform([clean])

    # Predict
    prediction = model.predict(vec)
    return "FAKE NEWS ❌" if prediction[0] == 0 else "REAL NEWS ✅"


In [14]:
import nltk
nltk.download('punkt')  # Re-downloads the correct punkt tokenizer


[nltk_data] Downloading package punkt to C:\Users\ansh
[nltk_data]     nimbalkar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [15]:
def predict_fake_news(text):
    import re
    import string
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    text = text.lower()
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # Remove non-alphabet characters

    words = text.split()  # ✅ Skip word_tokenize, use simple split
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words and w not in string.punctuation]

    clean = ' '.join(words)

    vec = vectorizer.transform([clean])
    prediction = model.predict(vec)
    return "FAKE NEWS ❌" if prediction[0] == 0 else "REAL NEWS ✅"


In [16]:
print(predict_fake_news("NASA announces new mission to search for life on Mars."))
print(predict_fake_news("Breaking: Aliens have landed in India and met the Prime Minister."))


FAKE NEWS ❌
FAKE NEWS ❌


In [17]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

# Evaluate
y_pred_svm = svm_model.predict(X_test)

print("🔍 SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred_svm))
print("\n📉 Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))


🔍 SVM Accuracy: 0.9928730512249443

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4627
           1       0.99      0.99      0.99      4353

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980


📉 Confusion Matrix:
 [[4588   39]
 [  25 4328]]
