# Importing Libraries

In [3]:
#importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
from tqdm import tqdm_notebook
import re
import nltk
nltk.download(['stopwords','punkt','wordnet','omw-1.4'])
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Bidirectional
from keras.layers import Dropout
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import GlobalAveragePooling1D
from keras.layers import Conv1D, MaxPooling1D
%pip install scikeras
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import GridSearchCV
%pip install keras-self-attention
from keras_self_attention import SeqSelfAttention


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Collecting scikeras
  Downloading scikeras-0.12.0-py3-none-any.whl (27 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.12.0
Collecting keras-self-attention
  Using cached keras_self_attention-0.51.0-py3-none-any.whl
Installing collected packages: keras-self-attention
Successfully installed keras-self-attention-0.51.0


# Importing Dataset

In [4]:
#importing dataset
# Load the labelled news articles from the CSV file in the working directory.
df = pd.read_csv('train.csv')

# A bare `df.head()` in the middle of a cell renders nothing (only the last
# expression of a cell is auto-displayed), so show the preview explicitly.
# `display` is provided by the IPython/Jupyter kernel.
display(df.head())

#exploratory data analysis
# Column dtypes and non-null counts; reveals which columns have missing values.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


# Data Preprocessing and Tokenization


In [5]:
# def preprocess_text(input_text, stopwords_removal=True):
#     result = ""
#     input_text = str(input_text).replace(r'http[\w:/\.]+', '')  # eliminate urls
#     input_text = str(input_text).replace(r'[^\.\w\s]', '')  # remove all but characters and punctuation
#     input_text = str(input_text).replace(r'\.\.+', '.')  # substitute multiple periods with a single one
#     input_text = str(input_text).replace(r'\.', ' . ')  # pad each period with a space on both sides
#     input_text = str(input_text).replace(r'\s\s+', ' ')  # replace multiple spaces with a single one
#     input_text = str(input_text).replace("\n", "")  # remove line breaks
#     input_text = re.sub(r'[^\w\s]', '', input_text).lower()  # strip remaining punctuation and convert text to lowercase

#     if stopwords_removal:
#         input_text = input_text.split(" ")
#         for word in input_text:
#             if word not in stopwords.words("english"):
#                 result = result + " " + word
#     else:
#         result = input_text

#     return ' '.join(result).strip()[1:-3].replace(" ", " ")

# texts = []
# x = df['text']
# for line in tqdm_notebook(x, total=df.shape[0]):
#  texts.append(preprocess_text(line))

In [6]:
#data preprocessing
# Some rows have no article text (the `text` column contains nulls). Replace the
# missing values with an empty string *before* splitting; otherwise the
# astype(str) cast below would turn each NaN into the literal word "nan" and
# the tokenizer would treat it as real article content.
x_train,x_test,y_train,y_test = train_test_split(df['text'].fillna(''),df['label'],
                                                 test_size=0.2,random_state=42)

#tokenization
max_length = 128          # every article is padded/truncated to this many tokens
trunc_type = 'post'       # drop tokens from the end of over-long articles
padding_type = 'post'     # append zeros to the end of short articles
oov_tok = "<OOV>"         # placeholder token for words outside the vocabulary
vocab_size = 1000         # cap on the tokenizer vocabulary (num_words)

tokenizer = Tokenizer(num_words = vocab_size, char_level=False,
                      oov_token=oov_tok)
# Guard against any remaining non-string values in the text column.
x_train = x_train.astype(str)
x_test = x_test.astype(str)
# The vocabulary is learned from the training split only.
tokenizer.fit_on_texts(x_train)

# Convert each split to integer sequences and force a fixed length.
training_sequences = tokenizer.texts_to_sequences(x_train)
training_sequences = pad_sequences(training_sequences, maxlen = max_length,
                                   padding = padding_type,
                                   truncating = trunc_type)

testing_sequences = tokenizer.texts_to_sequences(x_test)
testing_sequences = pad_sequences(testing_sequences, maxlen = max_length,
                                   padding = padding_type,
                                   truncating = trunc_type)


print("Shape of training sequence",training_sequences.shape)
print("Shape of testing sequence",testing_sequences.shape)

Shape of training sequence (16640, 128)
Shape of testing sequence (4160, 128)


# Model Initialization


## BiLSTM

### Model Architecture

In [None]:
#model architecture
embedding_dimension = 32

# Layer stack, in order: token embedding -> bidirectional LSTM that keeps the
# per-timestep outputs -> self-attention over those outputs -> average over
# the time axis -> dropout -> single sigmoid unit.
model = Sequential([
    Embedding(vocab_size, embedding_dimension, input_length=max_length),
    Bidirectional(LSTM(128, return_sequences=True)),
    SeqSelfAttention(attention_activation='sigmoid'),
    GlobalAveragePooling1D(),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy pairs with the single sigmoid output.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()



Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 128, 32)           32000     
                                                                 
 bidirectional_3 (Bidirecti  (None, 128, 256)          164864    
 onal)                                                           
                                                                 
 seq_self_attention_2 (SeqS  (None, 128, 256)          16449     
 elfAttention)                                                   
                                                                 
 global_average_pooling1d (  (None, 256)               0         
 GlobalAveragePooling1D)                                         
                                                                 
 dropout_2 (Dropout)         (None, 256)               0         
                                                      

In [None]:
#training model
# Stop early when the validation loss has not improved for two epochs in a row.
early_stop = EarlyStopping(monitor='val_loss', patience=2)

# NOTE(review): the test split is passed as validation data, so early stopping
# is steered by the same data later used to judge the model - confirm intended.
history = model.fit(
    x=training_sequences,
    y=y_train,
    validation_data=(testing_sequences, y_test),
    epochs=4,
    callbacks=[early_stop],
    verbose=2,
)

Epoch 1/4
520/520 - 328s - loss: 0.3572 - accuracy: 0.8424 - val_loss: 0.2785 - val_accuracy: 0.8990 - 328s/epoch - 631ms/step
Epoch 2/4
520/520 - 310s - loss: 0.2473 - accuracy: 0.8964 - val_loss: 0.1919 - val_accuracy: 0.9286 - 310s/epoch - 595ms/step
Epoch 3/4
520/520 - 310s - loss: 0.1833 - accuracy: 0.9305 - val_loss: 0.1830 - val_accuracy: 0.9370 - 310s/epoch - 596ms/step
Epoch 4/4
520/520 - 285s - loss: 0.1579 - accuracy: 0.9409 - val_loss: 0.1781 - val_accuracy: 0.9327 - 285s/epoch - 547ms/step


### Making Predictions with Model


In [None]:
#test
# Two hand-written samples: a long news-wire style report and a short headline.
news_samples = [
    "One person believed to be dead on a property near the U.S. border in Langley, B.C., after a fire broke out during a large-scale police operation Friday, the RCMP said Saturday. B.C.'s police watchdog, the Independent Investigations Office of B.C. (IIOBC), confirmed to CBC News that it is now investigating the incident, which left two police vehicles heavily damaged by fire. Officers responded to a report of a distraught individual on the 23000-block of 0 Avenue property, Friday after 10 a.m., according to an RCMP press release. But when officers arrived, they heard gunshots and called in the force's integrated emergency response team (IERT), the release stated.",
    "Alert! The president is dead",
]

for sample in news_samples:
    # Each sample is scored on its own, as a batch of one.
    test_news = [sample]

    # Same tokenizer and padding settings as the training data.
    test_news_sequences = pad_sequences(
        tokenizer.texts_to_sequences(test_news),
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )
    print(test_news_sequences.shape)

    test_news_pred = model.predict(test_news_sequences)
    print("Probability of fake news:",test_news_pred)

(1, 128)
Probability of fake news: [[0.02236203]]
(1, 128)
Probability of fake news: [[0.9973208]]


## CNN-LSTM


### Model Architecture

In [16]:
#model architecture
embedding_dimension = 32

# Layer stack, in order: token embedding -> 1D convolution (same padding) ->
# max pooling with window 2 -> LSTM returning only its final output ->
# single sigmoid unit.
# NOTE: this rebinds `model`, replacing the model built earlier in the notebook.
model = Sequential([
    Embedding(vocab_size, embedding_dimension, input_length=max_length),
    Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2),
    LSTM(128, return_sequences=False),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

#training model
# Stop early when the validation loss has not improved for two epochs in a row.
early_stop = EarlyStopping(monitor='val_loss', patience=2)

# NOTE(review): the test split is passed as validation data, so early stopping
# is steered by the same data later used to judge the model - confirm intended.
history = model.fit(
    x=training_sequences,
    y=y_train,
    validation_data=(testing_sequences, y_test),
    epochs=4,
    callbacks=[early_stop],
    verbose=2,
)


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 128, 32)           32000     
                                                                 
 conv1d_1 (Conv1D)           (None, 128, 32)           3104      
                                                                 
 max_pooling1d_1 (MaxPoolin  (None, 64, 32)            0         
 g1D)                                                            
                                                                 
 lstm (LSTM)                 (None, 128)               82432     
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 117665 (459.63 KB)
Trainable params: 117665 (459.63 KB)
Non-trainable params: 0 (0.00 Byte)
______________

In [None]:
# def create_model(learning_rate=0.01):
  # model = Sequential()
  # model.add(Embedding(vocab_size, embedding_dimension, input_length=max_length))
  # model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
  # model.add(MaxPooling1D(pool_size=2))
  # model.add(LSTM(128, return_sequences=False))
  # model.add(Dense(1, activation='sigmoid'))
  # model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=learning_rate), metrics=['accuracy'])
  # return model

# # Create a classifier with best parameters
# model = KerasClassifier(build_fn=create_model, learning_rate=0.01, verbose=0)

# # Define the grid search parameters
# learning_rate = [0.001, 0.01, 0.1]
# param_dist = dict(learning_rate=learning_rate)

# random_search = RandomizedSearchCV(estimator=model, param_distributions=param_dist, n_jobs=-1, cv=3)
# random_search_result = random_search.fit(training_sequences, y_train, epochs=4, validation_data=(testing_sequences, y_test), callbacks=[EarlyStopping(monitor='val_loss', patience=2)], verbose=2)

# # Summarize results
# print("Best: %f using %s" % (random_search_result.best_score_, random_search_result.best_params_))


### Making Predictions with Model


In [17]:
#test
# Two hand-written samples: a long news-wire style report and a short headline.
news_samples = [
    "One person believed to be dead on a property near the U.S. border in Langley, B.C., after a fire broke out during a large-scale police operation Friday, the RCMP said Saturday. B.C.'s police watchdog, the Independent Investigations Office of B.C. (IIOBC), confirmed to CBC News that it is now investigating the incident, which left two police vehicles heavily damaged by fire. Officers responded to a report of a distraught individual on the 23000-block of 0 Avenue property, Friday after 10 a.m., according to an RCMP press release. But when officers arrived, they heard gunshots and called in the force's integrated emergency response team (IERT), the release stated.",
    "Alert! The president is dead",
]

for sample in news_samples:
    # Each sample is scored on its own, as a batch of one.
    test_news = [sample]

    # Same tokenizer and padding settings as the training data.
    test_news_sequences = pad_sequences(
        tokenizer.texts_to_sequences(test_news),
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )
    print(test_news_sequences.shape)

    test_news_pred = model.predict(test_news_sequences)
    print("Probability of fake news:",test_news_pred)

(1, 128)
Probability of fake news: [[0.02587314]]
(1, 128)
Probability of fake news: [[0.99830014]]
