In [4]:
import numpy as np
import pandas as pd
import re
import string
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

# NLTK for text preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# For splitting the data and classic models
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# For LSTM model using TensorFlow/Keras
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/muhammadalizaffar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/muhammadalizaffar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/muhammadalizaffar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/muhammadalizaffar/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [5]:
# Load the dataset
df = pd.read_csv('/Users/muhammadalizaffar/Developers_hub/Task 3/fake_or_real_news.csv')

# Preview the data
print("Dataset preview:")
print(df.head())

# Check class distribution
print("\nLabel distribution:")
print(df['label'].value_counts())


Dataset preview:
      id                                              title  \
0   8476                       You Can Smell Hillary’s Fear   
1  10294  Watch The Exact Moment Paul Ryan Committed Pol...   
2   3608        Kerry to go to Paris in gesture of sympathy   
3  10142  Bernie supporters on Twitter erupt in anger ag...   
4    875   The Battle of New York: Why This Primary Matters   

                                                text label  
0  Daniel Greenfield, a Shillman Journalism Fello...  FAKE  
1  Google Pinterest Digg Linkedin Reddit Stumbleu...  FAKE  
2  U.S. Secretary of State John F. Kerry said Mon...  REAL  
3  — Kaydee King (@KaydeeKing) November 9, 2016 T...  FAKE  
4  It's primary day in New York and front-runners...  REAL  

Label distribution:
label
REAL    3171
FAKE    3164
Name: count, dtype: int64


In [6]:
# Initialize stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    # Apply stemming
    tokens = [stemmer.stem(word) for word in tokens]
    # Apply lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Join tokens back to string
    return " ".join(tokens)

# Apply cleaning to the text column
df['cleaned_text'] = df['text'].apply(clean_text)
print("\nCleaned text preview:")
print(df[['text', 'cleaned_text']].head())



Cleaned text preview:
                                                text  \
0  Daniel Greenfield, a Shillman Journalism Fello...   
1  Google Pinterest Digg Linkedin Reddit Stumbleu...   
2  U.S. Secretary of State John F. Kerry said Mon...   
3  — Kaydee King (@KaydeeKing) November 9, 2016 T...   
4  It's primary day in New York and front-runners...   

                                        cleaned_text  
0  daniel greenfield shillman journal fellow free...  
1  googl pinterest digg linkedin reddit stumbleup...  
2  u secretari state john f kerri said monday sto...  
3  — kayde king kaydeek novemb lesson tonight dem...  
4  primari day new york frontrunn hillari clinton...  


In [7]:
X = df['cleaned_text']
y = df['label']  # Assuming labels are "FAKE" and "REAL"

# Split data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training samples: {len(X_train)}, Testing samples: {len(X_test)}")


Training samples: 5068, Testing samples: 1267


In [8]:
# Naïve Bayes Pipeline
pipeline_nb = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('clf', MultinomialNB())
])
pipeline_nb.fit(X_train, y_train)
print("Naïve Bayes training complete.")

# Random Forest Pipeline
pipeline_rf = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
])
pipeline_rf.fit(X_train, y_train)
print("Random Forest training complete.")


Naïve Bayes training complete.
Random Forest training complete.


In [9]:
def evaluate_model(model, X_test, y_test, model_name):
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, pos_label='REAL')
    print(f"{model_name} Accuracy: {acc:.4f}")
    print(f"{model_name} F1 Score: {f1:.4f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("-" * 50)

evaluate_model(pipeline_nb, X_test, y_test, "Naïve Bayes")
evaluate_model(pipeline_rf, X_test, y_test, "Random Forest")


Naïve Bayes Accuracy: 0.8903
Naïve Bayes F1 Score: 0.8901
Classification Report:
              precision    recall  f1-score   support

        FAKE       0.88      0.90      0.89       628
        REAL       0.90      0.88      0.89       639

    accuracy                           0.89      1267
   macro avg       0.89      0.89      0.89      1267
weighted avg       0.89      0.89      0.89      1267

--------------------------------------------------
Random Forest Accuracy: 0.9187
Random Forest F1 Score: 0.9184
Classification Report:
              precision    recall  f1-score   support

        FAKE       0.91      0.93      0.92       628
        REAL       0.93      0.91      0.92       639

    accuracy                           0.92      1267
   macro avg       0.92      0.92      0.92      1267
weighted avg       0.92      0.92      0.92      1267

--------------------------------------------------


In [10]:
# Parameters for tokenization and padding
max_features = 5000
max_len = 150  # maximum length of sequences
embedding_dim = 50

# Tokenize the text
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad sequences
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

# Build the LSTM model
model_lstm = Sequential()
model_lstm.add(Embedding(max_features, embedding_dim, input_length=max_len))
model_lstm.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model_lstm.add(Dense(1, activation='sigmoid'))

model_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_lstm.summary()

# Convert labels to binary (assume "REAL"=1, "FAKE"=0)
y_train_bin = y_train.apply(lambda x: 1 if x == 'REAL' else 0).values
y_test_bin = y_test.apply(lambda x: 1 if x == 'REAL' else 0).values

# Train LSTM with EarlyStopping to prevent overfitting
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
history = model_lstm.fit(X_train_pad, y_train_bin, 
                         epochs=10, batch_size=64, 
                         validation_split=0.2, callbacks=[es])




Epoch 1/10
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 137ms/step - accuracy: 0.6346 - loss: 0.6534 - val_accuracy: 0.7998 - val_loss: 0.5064
Epoch 2/10
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 133ms/step - accuracy: 0.8549 - loss: 0.4243 - val_accuracy: 0.8432 - val_loss: 0.3473
Epoch 3/10
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 123ms/step - accuracy: 0.8914 - loss: 0.2794 - val_accuracy: 0.8679 - val_loss: 0.3031
Epoch 4/10
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 130ms/step - accuracy: 0.9411 - loss: 0.1769 - val_accuracy: 0.8787 - val_loss: 0.3106
Epoch 5/10
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 125ms/step - accuracy: 0.9568 - loss: 0.1334 - val_accuracy: 0.8639 - val_loss: 0.3205
Epoch 6/10
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 130ms/step - accuracy: 0.9686 - loss: 0.0990 - val_accuracy: 0.8649 - val_loss: 0.3594
Epoch 6: early stopping


In [11]:
# Evaluate the LSTM model
loss, accuracy = model_lstm.evaluate(X_test_pad, y_test_bin)
print(f"LSTM Accuracy: {accuracy:.4f}")

# Optionally, you can print predictions for a few samples:
y_pred_lstm = (model_lstm.predict(X_test_pad) > 0.5).astype("int32")
print(classification_report(y_test_bin, y_pred_lstm))


[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step - accuracy: 0.8719 - loss: 0.3685
LSTM Accuracy: 0.8769
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 44ms/step
              precision    recall  f1-score   support

           0       0.89      0.86      0.87       628
           1       0.87      0.89      0.88       639

    accuracy                           0.88      1267
   macro avg       0.88      0.88      0.88      1267
weighted avg       0.88      0.88      0.88      1267



In [12]:
# Save classical models
joblib.dump(pipeline_nb, 'fake_news_nb.pkl')
joblib.dump(pipeline_rf, 'fake_news_rf.pkl')

# Save the tokenizer and LSTM model
joblib.dump(tokenizer, 'tokenizer.pkl')
model_lstm.save('fake_news_lstm.h5')

print("Models and tokenizer saved successfully.")




Models and tokenizer saved successfully.
