In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from zipfile import ZipFile
import os

In [2]:
# Load the FA-KES dataset
df_fakes = pd.read_csv("FA-KES-Dataset.csv", encoding='ISO-8859-1')

In [3]:
# Dataset Splitting for FA-KES dataset
#train_test_split fcn scikit-learn library to split  into training and testing.
X_fakes = df_fakes["article_title"].values
y_fakes = df_fakes["labels"].values

X_fakes_train, X_fakes_test, y_fakes_train, y_fakes_test = train_test_split(X_fakes, y_fakes, test_size=0.2, random_state=42)

No stemming

In [4]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')

def text_preprocessing(text):
    text = re.sub(r'\b(?:\d{1,3}\.){3}\d{1,3}\b', '', text)  # Regular expression to remove any IP addresses
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # Remove URLs
    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

X_fakes_train = [text_preprocessing(text) for text in X_fakes_train]
X_fakes_test = [text_preprocessing(text) for text in X_fakes_test]


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bhavi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bhavi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
import requests
import zipfile
import os

# Define the URL of the GloVe embeddings
glove_url = "http://nlp.stanford.edu/data/glove.6B.zip"

# Define the local file name to save the downloaded embeddings
local_filename = "glove.6B.zip"

# Download the GloVe embeddings file
response = requests.get(glove_url, stream=True)
with open(local_filename, 'wb') as file:
    for chunk in response.iter_content(chunk_size=1024):
        if chunk:
            file.write(chunk)

# Unzip the downloaded file
with zipfile.ZipFile(local_filename, 'r') as zip_ref:
    zip_ref.extractall()

# Remove the zip file if needed
os.remove(local_filename)



In [7]:
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import os
import numpy as np

# Load the pre-trained GloVe word embeddings
embedding_dim = 100
glove_file = "glove.6B.100d.txt"

embeddings_index = {}
with open(glove_file, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))


Found 400000 word vectors.


In [8]:
# Create tokenizer for FA-KES dataset
tokenizer_fakes = Tokenizer()
tokenizer_fakes.fit_on_texts(X_fakes_train)

# Convert text to sequences of integers and apply post-padding
X_fakes_train_sequences = tokenizer_fakes.texts_to_sequences(X_fakes_train)
X_fakes_test_sequences = tokenizer_fakes.texts_to_sequences(X_fakes_test)

max_sequence_length = 300
X_fakes_train_padded = pad_sequences(X_fakes_train_sequences, maxlen=max_sequence_length)
X_fakes_test_padded = pad_sequences(X_fakes_test_sequences, maxlen=max_sequence_length)

In [9]:
# Prepare embedding matrix for FA-KES dataset
word_index_fakes = tokenizer_fakes.word_index
num_words_fakes = min(len(word_index_fakes), len(embeddings_index)) + 1
embedding_matrix_fakes = np.zeros((num_words_fakes, embedding_dim))

for word, i in word_index_fakes.items():
    if i >= num_words_fakes:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix_fakes[i] = embedding_vector

Hybrid CNN-RNN(LSTM)

In [26]:
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Dense

# Model Architecture
def create_model(embedding_matrix, max_sequence_length, embedding_dim):
    model = Sequential()
    model.add(Embedding(embedding_matrix.shape[0], embedding_dim, weights=[embedding_matrix], input_length=max_sequence_length, trainable=False))
    model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(LSTM(32))
    model.add(Dense(1, activation='sigmoid'))

    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model
# Create the model for FA-KES dataset
model_fakes = create_model(embedding_matrix_fakes, max_sequence_length, embedding_dim)

history_fakes = model_fakes.fit(X_fakes_train_padded, y_fakes_train, epochs=10, batch_size=64, validation_split=0.2, verbose=1)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [27]:
  # Evaluate the models on the test sets
y_fakes_pred_probs = model_fakes.predict(X_fakes_test_padded)
y_fakes_pred = np.argmax(y_fakes_pred_probs, axis=1)



In [28]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Convert predicted probabilities to binary predictions (0 or 1)
y_fakes_pred_binary = (y_fakes_pred_probs > 0.5).astype(int)

# Calculate accuracy
accuracy_fakes = accuracy_score(y_fakes_test, y_fakes_pred_binary)

# Calculate precision, recall, and F1 score
precision_fakes = precision_score(y_fakes_test, y_fakes_pred_binary)
recall_fakes = recall_score(y_fakes_test, y_fakes_pred_binary)
f1_score_fakes = f1_score(y_fakes_test, y_fakes_pred_binary)

print("Accuracy:", accuracy_fakes)
print("Precision:", precision_fakes)
print("Recall:", recall_fakes)
print("F1 Score:", f1_score_fakes)


Accuracy: 0.515527950310559
Precision: 0.5523809523809524
Recall: 0.651685393258427
F1 Score: 0.597938144329897


Hybrid CNN-RNN(BiLSTM)

In [29]:
from keras.layers import Embedding, Conv1D, MaxPooling1D, Bidirectional, LSTM, Dense

#Hybrid CNN-RNN BiLSTM

def create_model(embedding_matrix, max_sequence_length, embedding_dim):
    model = Sequential()
    model.add(Embedding(embedding_matrix.shape[0], embedding_dim, weights=[embedding_matrix], input_length=max_sequence_length, trainable=False))
    model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Bidirectional(LSTM(32)))  # Use Bidirectional LSTM
    model.add(Dense(1, activation='sigmoid'))

    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [30]:
# Create the model for FA-KES dataset with Bidirectional LSTM
model_fakes = create_model(embedding_matrix_fakes, max_sequence_length, embedding_dim)

# Train the model
history_fakes = model_fakes.fit(X_fakes_train_padded, y_fakes_train, epochs=10, batch_size=64, validation_split=0.2, verbose=1)

# Evaluate the model on the test set
y_fakes_pred_probs = model_fakes.predict(X_fakes_test_padded)
y_fakes_pred = (y_fakes_pred_probs > 0.5).astype(int)  # Convert predicted probabilities to binary predictions (0 or 1)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Calculate accuracy
accuracy_hcnnrnn = accuracy_score(y_fakes_test, y_fakes_pred)

# Calculate precision, recall, and F1 score
precision_hcnnrnn = precision_score(y_fakes_test, y_fakes_pred)
recall_hcnnrnn = recall_score(y_fakes_test, y_fakes_pred)
f1_hcnnrnn = f1_score(y_fakes_test, y_fakes_pred)

print("Accuracy:", accuracy_hcnnrnn)
print("Precision:", precision_hcnnrnn)
print("Recall:", recall_hcnnrnn)
print("F1 Score:", f1_hcnnrnn)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.453416149068323
Precision: 0.5061728395061729
Recall: 0.4606741573033708
F1 Score: 0.4823529411764706


Transformer

In [35]:
pip install transformers


Note: you may need to restart the kernel to use updated packages.


In [37]:
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [38]:

# Load a pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# Tokenize and encode the text data
X_train_encoded = tokenizer(X_fakes_train, padding=True, truncation=True, return_tensors='pt', max_length=128)
X_test_encoded = tokenizer(X_fakes_test, padding=True, truncation=True, return_tensors='pt', max_length=128)

# Convert labels to tensors
y_train_tensor = torch.tensor(y_fakes_train)
y_test_tensor = torch.tensor(y_fakes_test)

# Forward pass through the model
with torch.no_grad():
    logits = model(**X_test_encoded).logits

# Predictions
y_pred = torch.argmax(logits, dim=1).numpy()

# Calculate evaluation metrics
accuracy_tr = accuracy_score(y_test_tensor.numpy(), y_pred)
precision_tr = precision_score(y_test_tensor.numpy(), y_pred)
recall_tr = recall_score(y_test_tensor.numpy(), y_pred)
f1_tr = f1_score(y_test_tensor.numpy(), y_pred)

print("BERT Model Metrics:")
print("Accuracy:", accuracy_tr)
print("Precision:", precision_tr)
print("Recall:", recall_tr)
print("F1 Score:", f1_tr)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

BERT Model Metrics:
Accuracy: 0.5527950310559007
Precision: 0.5527950310559007
Recall: 1.0
F1 Score: 0.7120000000000001


CNN

In [39]:
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense

# Model Architecture for CNN-only
def create_cnn_model(embedding_matrix, max_sequence_length, embedding_dim):
    model = Sequential()
    model.add(Embedding(embedding_matrix.shape[0], embedding_dim, weights=[embedding_matrix], input_length=max_sequence_length, trainable=False))
    model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    # Compile
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# CNN-only model for FA-KES dataset
cnn_model = create_cnn_model(embedding_matrix_fakes, max_sequence_length, embedding_dim)

# Summary of CNN-only Model Architecture for FA-KES dataset
print("CNN-only Model Summary:")
cnn_model.summary()

# Train the CNN-only model
history_cnn = cnn_model.fit(X_fakes_train_padded, y_fakes_train, epochs=10, batch_size=64, validation_split=0.2, verbose=1)

# Evaluate the CNN-only model on the test set
y_cnn_pred = cnn_model.predict(X_fakes_test_padded)
y_cnn_pred = (y_cnn_pred > 0.5)  # Convert probabilities to binary predictions

# Calculate evaluation metrics
accuracy_cnn = accuracy_score(y_fakes_test, y_cnn_pred)
precision_cnn = precision_score(y_fakes_test, y_cnn_pred)
recall_cnn = recall_score(y_fakes_test, y_cnn_pred)
f1_score_cnn = f1_score(y_fakes_test, y_cnn_pred)

# Print the evaluation results
print("CNN-only Model Evaluation Results:")
print("Accuracy:", accuracy_cnn)
print("Precision:", precision_cnn)
print("Recall:", recall_cnn)
print("F1 Score:", f1_score_cnn)


CNN-only Model Summary:
Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 300, 100)          93000     
                                                                 
 conv1d_7 (Conv1D)           (None, 296, 128)          64128     
                                                                 
 max_pooling1d_7 (MaxPooling  (None, 148, 128)         0         
 1D)                                                             
                                                                 
 flatten (Flatten)           (None, 18944)             0         
                                                                 
 dense_7 (Dense)             (None, 64)                1212480   
                                                                 
 dense_8 (Dense)             (None, 1)                 65        
                              

RNN

In [40]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

# Model Architecture for RNN-only
def create_rnn_model(embedding_matrix, max_sequence_length, embedding_dim):
    model = Sequential()
    model.add(Embedding(embedding_matrix.shape[0], embedding_dim, weights=[embedding_matrix], input_length=max_sequence_length, trainable=False))
    model.add(LSTM(32))
    model.add(Dense(1, activation='sigmoid'))

    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Create the RNN-only model for FA-KES dataset
rnn_model = create_rnn_model(embedding_matrix_fakes, max_sequence_length, embedding_dim)

# Summary of RNN-only Model Architecture for FA-KES dataset
print("RNN-only Model Summary:")
rnn_model.summary()

# Train the RNN-only model
history_rnn = rnn_model.fit(X_fakes_train_padded, y_fakes_train, epochs=10, batch_size=64, validation_split=0.2, verbose=1)

# Evaluate the RNN-only model on the test set
y_rnn_pred = rnn_model.predict(X_fakes_test_padded)
y_rnn_pred = (y_rnn_pred > 0.5)  # Convert probabilities to binary predictions

# Calculate evaluation metrics
accuracy_rnn = accuracy_score(y_fakes_test, y_rnn_pred)
precision_rnn = precision_score(y_fakes_test, y_rnn_pred)
recall_rnn = recall_score(y_fakes_test, y_rnn_pred)
f1_score_rnn = f1_score(y_fakes_test, y_rnn_pred)

# Print the evaluation results
print("RNN-only Model Evaluation Results:")
print("Accuracy:", accuracy_rnn)
print("Precision:", precision_rnn)
print("Recall:", recall_rnn)
print("F1 Score:", f1_score_rnn)


RNN-only Model Summary:
Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 300, 100)          93000     
                                                                 
 lstm_7 (LSTM)               (None, 32)                17024     
                                                                 
 dense_9 (Dense)             (None, 1)                 33        
                                                                 
Total params: 110,057
Trainable params: 17,057
Non-trainable params: 93,000
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
RNN-only Model Evaluation Results:
Accuracy: 0.484472049689441
Precision: 0.5319148936170213
Recall: 0.5617977528089888
F1 Score: 0.5464480874316939


BiLSTM

In [41]:
# Create the model for FA-KES dataset with only Bidirectional LSTM
def create_bilstm_model(embedding_matrix, max_sequence_length, embedding_dim):
    model = Sequential()
    model.add(Embedding(embedding_matrix.shape[0], embedding_dim, weights=[embedding_matrix], input_length=max_sequence_length, trainable=False))
    model.add(Bidirectional(LSTM(32)))  # Use Bidirectional LSTM
    model.add(Dense(1, activation='sigmoid'))

    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Create and train the Bidirectional LSTM model
model_bilstm_fakes = create_bilstm_model(embedding_matrix_fakes, max_sequence_length, embedding_dim)
history_bilstm_fakes = model_bilstm_fakes.fit(X_fakes_train_padded, y_fakes_train, epochs=10, batch_size=64, validation_split=0.2, verbose=1)

# Evaluate the Bidirectional LSTM model on the test set
y_fakes_bilstm_pred_probs = model_bilstm_fakes.predict(X_fakes_test_padded)
y_fakes_bilstm_pred = (y_fakes_bilstm_pred_probs > 0.5).astype(int)

# Calculate evaluation metrics
accuracy_bilstm = accuracy_score(y_fakes_test, y_fakes_bilstm_pred)
precision_bilstm = precision_score(y_fakes_test, y_fakes_bilstm_pred)
recall_bilstm = recall_score(y_fakes_test, y_fakes_bilstm_pred)
f1_bilstm = f1_score(y_fakes_test, y_fakes_bilstm_pred)

print("Bidirectional LSTM Model Metrics:")
print("Accuracy:", accuracy_bilstm)
print("Precision:", precision_bilstm)
print("Recall:", recall_bilstm)
print("F1 Score:", f1_bilstm)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Bidirectional LSTM Model Metrics:
Accuracy: 0.5279503105590062
Precision: 0.5503875968992248
Recall: 0.797752808988764
F1 Score: 0.6513761467889908


Classifiers

In [42]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
from sklearn.linear_model import SGDClassifier
# Define classifiers
classifiers = {
    "LR": LogisticRegression(),
    "RF": RandomForestClassifier(),
    "MNB": MultinomialNB(),
    "SGD": SGDClassifier(),
    "KNNs": KNeighborsClassifier(),
    "DT": DecisionTreeClassifier(),
    "AB": AdaBoostClassifier(),
}
# Train and evaluate classifiers
results = {}
for clf_name, clf in classifiers.items():
    clf.fit(X_fakes_train_padded, y_fakes_train)
    y_pred = clf.predict(X_fakes_test_padded)

    accuracy = accuracy_score(y_fakes_test, y_pred)
    precision = precision_score(y_fakes_test, y_pred)
    recall = recall_score(y_fakes_test, y_pred)
    f1 = f1_score(y_fakes_test, y_pred)

    results[clf_name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
    }


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [43]:
hybrid_results = {
    "Hybrid CNN-RNN(LSTM)": {
        "Accuracy": accuracy_fakes,
        "Precision": precision_fakes,
        "Recall": recall_fakes,
        "F1 Score": f1_score_fakes,
    }
}
hybrid_bilstmresults={
    "Hybrid CNN-RNN(BiLSTM)":{
        "Accuracy": accuracy_hcnnrnn,
        "Precision": precision_hcnnrnn,
        "Recall": recall_hcnnrnn,
        "F1 Score": f1_hcnnrnn,
    }
}
transformer_results={
    "Transformers":{
        "Accuracy": accuracy_tr,
        "Precision": precision_tr,
        "Recall": recall_tr,
        "F1 Score": f1_tr,
    }
}
BilstmOnly_results={
    "BiLSTMOnly":{
        "Accuracy": accuracy_bilstm,
        "Precision": precision_bilstm,
        "Recall": recall_bilstm,
        "F1 Score": f1_bilstm,
    }
}

In [44]:
# Combine the results of all models
all_results = {
    **results,
    **hybrid_results,
    **hybrid_bilstmresults,
    **transformer_results,
    **BilstmOnly_results,
    'CNN-only': {
        'Accuracy': accuracy_cnn,
        'Precision': precision_cnn,
        'Recall': recall_cnn,
        'F1 Score': f1_score_cnn
    },
    'RNN-only': {
        'Accuracy': accuracy_rnn,
        'Precision': precision_rnn,
        'Recall': recall_rnn,
        'F1 Score': f1_score_rnn
    }

}

# Print the comparison table
print("Results of all models on the FA-KES dataset:")
print("Classifier\tAccuracy\tPrecision\tRecall\t\tF1 Score")
for clf_name, metrics in all_results.items():
    print(f"{clf_name}\t\t{metrics['Accuracy']:.2f}\t\t{metrics['Precision']:.2f}\t\t{metrics['Recall']:.2f}\t\t{metrics['F1 Score']:.2f}")


Results of all models on the FA-KES dataset:
Classifier	Accuracy	Precision	Recall		F1 Score
LR		0.50		0.54		0.65		0.59
RF		0.45		0.51		0.52		0.51
MNB		0.44		0.49		0.36		0.42
SGD		0.56		0.56		0.99		0.71
KNNs		0.50		0.54		0.56		0.55
DT		0.53		0.58		0.53		0.55
AB		0.45		0.51		0.44		0.47
Hybrid CNN-RNN(LSTM)		0.52		0.55		0.65		0.60
Hybrid CNN-RNN(BiLSTM)		0.45		0.51		0.46		0.48
Transformers		0.55		0.55		1.00		0.71
BiLSTMOnly		0.53		0.55		0.80		0.65
CNN-only		0.49		0.54		0.58		0.56
RNN-only		0.48		0.53		0.56		0.55
