In [None]:
# Import libraries

import re # For removing unwanted characters from text
import os # For file and folder operations
import nltk # Natural Language Processing Library
import sklearn # For Evaluation metrics
import kagglehub # For download dataset from kaggle
import tensorflow as tf # Deep learning Library
import numpy as np # For numeric operations
import pandas as pd # For data manipulation

from nltk.corpus import stopwords # Stopword list
from tensorflow.keras.models import Sequential # Sequential model
from sklearn.metrics import accuracy_score # For measurement accuracy score
from sklearn.model_selection import train_test_split # For data splitting
from tensorflow.keras.preprocessing.text import Tokenizer # For converting words into token
from tensorflow.keras.preprocessing.sequence import pad_sequences # Make sequence same length
from sklearn.metrics import classification_report , confusion_matrix # For evaluation model
from tensorflow.keras.layers import LSTM , Embedding , Dense , Dropout # Neural Network layers


In [None]:
import kagglehub

# Kaggle handle ("owner/dataset-slug") of the fake-news dataset
dataset_handle = "emineyetm/fake-news-detection-datasets"

# Download the latest version; kagglehub returns the local directory it was stored in
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/emineyetm/fake-news-detection-datasets?dataset_version_number=1...


100%|██████████| 41.0M/41.0M [00:03<00:00, 14.1MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/emineyetm/fake-news-detection-datasets/versions/1


In [None]:
# List the entries at the top level of the downloaded dataset directory
os.listdir(path)

['News _dataset']

In [None]:
# List the files inside the dataset folder.
# NOTE: the folder name contains a space ("News _dataset"), exactly as
# os.listdir(path) reports it; spelling it "News_dataset" points at a
# directory that is not in that listing.
os.listdir(os.path.join(path, "News _dataset"))

['Fake.csv', 'True.csv']

In [None]:
# Folder that holds the CSV files. The name contains a space ("News _dataset"),
# exactly as os.listdir(path) reports it. os.path.join builds the paths so no
# separator has to be hard-coded.
news_dir = os.path.join(path, "News _dataset")

# Load the fake news articles and label them 0
Fake = pd.read_csv(os.path.join(news_dir, "Fake.csv"))
Fake["label"] = 0

# Load the real news articles and label them 1
Real = pd.read_csv(os.path.join(news_dir, "True.csv"))
Real["label"] = 1

In [None]:
# Preview the first five rows of the fake-news DataFrame (including the new label column)
Fake.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [None]:
# Stack the fake and real articles into a single DataFrame with a fresh 0..n-1 index
news_frames = [Fake, Real]
df = pd.concat(news_frames, ignore_index=True)

# Print the first rows of the combined dataset
print(df.head())

                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text subject  \
0  Donald Trump just couldn t wish all Americans ...    News   
1  House Intelligence Committee Chairman Devin Nu...    News   
2  On Friday, it was revealed that former Milwauk...    News   
3  On Christmas day, Donald Trump announced that ...    News   
4  Pope Francis used his annual Christmas Day mes...    News   

                date  label  
0  December 31, 2017      0  
1  December 31, 2017      0  
2  December 30, 2017      0  
3  December 29, 2017      0  
4  December 25, 2017      0  


In [None]:
# Keep only the first occurrence of each distinct article body ("text" column)
df = df.drop_duplicates(subset="text", keep="first")

In [None]:
# Download the NLTK "stopwords" corpus used for stopword removal
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# English stopwords from NLTK, stored as a set for fast membership checks
english_stopwords = stopwords.words("english")
stop_words = set(english_stopwords)

# Function for cleaning input text for model training
def clean_text(text, stopword_set=None):
    """Normalize one raw article string before tokenization.

    Steps, in order:
      1. drop every character that is not an ASCII letter or whitespace,
      2. lowercase the result,
      3. remove stopwords and collapse runs of whitespace to single spaces.

    Args:
        text: Raw text. Anything that is not a ``str`` (``None``, the float
            NaN pandas uses for missing cells, ...) is treated as missing.
        stopword_set: Optional collection of lowercase words to remove.
            Defaults to the notebook-level ``stop_words`` set.

    Returns:
        The cleaned text as a single space-separated string; ``""`` when the
        input is missing or nothing survives cleaning.
    """
    # re.sub raises TypeError on non-string input, so bail out early instead
    # of crashing the whole .apply() on one missing cell.
    if not isinstance(text, str):
        return ""

    # Resolve the default lazily so the global is only needed when the caller
    # does not supply a stopword collection.
    if stopword_set is None:
        stopword_set = stop_words

    # Removing all characters except alphabets and spaces
    text = re.sub(r"[^a-zA-Z\s]", "", text)

    # Convert text to lowercase
    text = text.lower()

    # Removing stopwords from text (split() also normalizes whitespace)
    return " ".join(word for word in text.split() if word not in stopword_set)

# Combine title and body into one string per article. Missing values are
# replaced with "" first: concatenating a string with NaN yields NaN for the
# whole row, which would throw away the field that is present.
df["full_text"] = df["title"].fillna("") + " " + df["text"].fillna("")

# Apply clean_text to every combined article
df["clean_text"] = df["full_text"].apply(clean_text)

In [None]:
# Hold out 20% of the cleaned articles for testing; the fixed random_state
# makes the shuffled split repeatable
features = df["clean_text"]
labels = df["label"]
X_train, X_test, Y_train, Y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

In [None]:
# Maximum sequence length for each news article
max_len = 200

# Maximum number of words to keep in the tokenizer vocabulary
max_words = 10_000

# Tokenizer that maps words to integer ids; the word index is fitted on the
# training texts only
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

In [None]:
# Turn the training and testing texts into sequences of integer word ids
x_train_seq, x_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Pad/truncate every sequence to max_len so all inputs have the same length
x_train_pad, x_test_pad = (
    pad_sequences(sequences, maxlen=max_len) for sequences in (x_train_seq, x_test_seq)
)

In [None]:
# Convert the training and testing labels to NumPy arrays for the model
Y_train, Y_test = map(np.array, (Y_train, Y_test))

In [None]:
# Creating sequential neural network model
model = Sequential()

# Declare the input shape explicitly: one row of max_len token ids per article.
# This replaces Embedding(..., input_length=max_len), which Keras 3 deprecates,
# and builds the model up front so model.summary() can report layer shapes and
# parameter counts before training.
model.add(tf.keras.Input(shape=(max_len,)))

# Embedding layer: maps each of the max_words token ids to a 64-dim dense vector
model.add(Embedding(max_words , 64))

# LSTM layer with 64 units; dropout is applied to the layer inputs only
# (recurrent_dropout stays 0)
model.add(LSTM(64 , dropout=0.2 , recurrent_dropout=0))

# Output layer: a single sigmoid unit for the binary fake (0) / real (1) label
model.add(Dense(1 , activation = "sigmoid"))



In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Display model architecture
model.summary()

In [None]:
# Stop once validation loss has not improved for 2 consecutive epochs and roll
# back to the best weights seen. In the recorded run val_loss was lowest at
# epoch 2 (0.0425) and had risen to 0.0638 by epoch 5, so keeping the
# last-epoch weights means evaluating and saving an overfitted model.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=2, restore_best_weights=True
)

# Train the model on the training data, holding out 20% of it for validation
history = model.fit(
    x_train_pad,
    Y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.2,
    callbacks=[early_stopping],
)

Epoch 1/5
[1m387/387[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 15ms/step - accuracy: 0.8932 - loss: 0.2560 - val_accuracy: 0.9859 - val_loss: 0.0517
Epoch 2/5
[1m387/387[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.9862 - loss: 0.0418 - val_accuracy: 0.9869 - val_loss: 0.0425
Epoch 3/5
[1m387/387[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.9932 - loss: 0.0256 - val_accuracy: 0.9867 - val_loss: 0.0451
Epoch 4/5
[1m387/387[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - accuracy: 0.9937 - loss: 0.0220 - val_accuracy: 0.9835 - val_loss: 0.0555
Epoch 5/5
[1m387/387[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.9764 - loss: 0.0650 - val_accuracy: 0.9829 - val_loss: 0.0638


In [None]:
# Score the trained model on the held-out test set and unpack loss / accuracy
test_metrics = model.evaluate(x_test_pad , Y_test)
loss , acc = test_metrics
print("Test Accuracy" , acc)

[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9768 - loss: 0.0761
Test Accuracy 0.9785252213478088


In [None]:
# Sigmoid outputs of the model for every test article
y_pred_prob = model.predict(x_test_pad)

# Threshold the probabilities at 0.5 to obtain hard 0/1 labels
above_threshold = y_pred_prob > 0.5
y_pred = above_threshold.astype(int)

# Compute both evaluation summaries, then print them
test_confusion = confusion_matrix(Y_test , y_pred)
test_report = classification_report(Y_test , y_pred)
print("Confusion Matrix : \n" , test_confusion)
print("Classification Report: \n" , test_report)

[1m242/242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
Confusion Matrix : 
 [[3417   83]
 [  83 4147]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.98      0.98      0.98      3500
           1       0.98      0.98      0.98      4230

    accuracy                           0.98      7730
   macro avg       0.98      0.98      0.98      7730
weighted avg       0.98      0.98      0.98      7730



In [None]:
# Save the trained model to a .keras file (relative path, so it lands in the working directory)
model.save("fake_news_detection_model.keras")

In [None]:
from tensorflow.keras.models import load_model

In [None]:
import pickle

# Persist the fitted tokenizer so the same word index can be reused with the saved model
with open("tokenizer.pkl", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)