In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

In [None]:
# Read the WELFake CSV from its Kaggle input location and preview the first rows.
dataset_path = "/kaggle/input/fake-news-classification/WELFake_Dataset.csv"
df = pd.read_csv(dataset_path)
df.head()

In [None]:
# Number of missing values in each column (axis=0 is the default, spelled out here).
df.isnull().sum(axis=0)

In [None]:
# Discard every row containing at least one missing value (the dropna defaults,
# written out), then confirm that no nulls remain.
df = df.dropna(axis=0, how='any')
df.isnull().sum(axis=0)

In [None]:
# Remove the 'Unnamed: 0' column (presumably the row index saved into the CSV —
# confirm against the raw file).
# Rebinding `df` instead of using inplace=True, together with errors='ignore',
# keeps this cell safe to re-run: the in-place version raised KeyError on a
# second execution because the column had already been removed.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Preview the first five rows after cleaning.
df.head(n=5)

In [None]:
# 0 - Fake, 1 - Real
# Share of each label in the cleaned data.
# '%.2f%%' renders the wedge labels as percentages (the original showed bare
# numbers), the title lets the figure stand on its own, and the trailing ';'
# keeps the Axes repr out of the cell output.
df['label'].value_counts().plot.pie(autopct='%.2f%%', title='Label distribution');

#### The pie chart shows that the two classes are roughly balanced

In [None]:
# Separate the target column from the remaining (feature) columns.
y = df.loc[:, 'label']
X = df.drop('label', axis=1)

In [None]:
# First five rows of the feature frame.
X.head(n=5)

In [None]:
# First five target labels.
y.head(n=5)

In [None]:
# Shapes of the features and the target, one per line.
print(X.shape, y.shape, sep='\n')

In [None]:
import tensorflow as tf
# Show the installed TensorFlow version as the cell output.
tf.__version__

In [None]:
# Size of the integer index space for words. This is a chosen upper bound, not a
# count of the unique words actually present in the corpus: it is passed to
# one_hot() as the range of word indexes and to the Embedding layers as their
# input dimension.
vocab_size = 10_000

In [None]:
# Work on a re-indexed copy of the features. dropna() left gaps in the row
# labels, so the label-based lookups messages['title'][i] used further down
# would raise KeyError for the missing labels without a fresh 0..n-1 index.
# reset_index() returns a new frame (X itself is untouched) and keeps the old
# labels in an extra 'index' column.
messages = X.reset_index()

In [None]:
# Title of the row labelled 1, fetched with a single label-based lookup.
messages.loc[1, 'title']

In [None]:
import nltk
import re
from nltk.corpus import stopwords
# Fetch the NLTK 'stopwords' resource so stopwords.words('english') can be used
# in the preprocessing cell.
nltk.download('stopwords')

In [None]:
### Dataset Preprocessing (Lower casing text, Removing Stop Words, Stemming)

# We will perform classification of News on the basis of the "TITLE" of news

from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

# Build the stop-word lookup once. The original called
# stopwords.words('english') for every single word and then scanned the
# returned list, which dominates the runtime on a dataset of this size.
stop_words = set(stopwords.words('english'))

corpus = []
# Iterate over the titles directly instead of indexing by position.
for title in messages['title']:
    # Replace everything apart from (a-z, A-Z) with a space, lower-case the
    # text and split it into words.
    words = re.sub('[^a-zA-Z]', ' ', title).lower().split()

    # Keep only the words that are not stop words, reduced to their stems,
    # and join them back into a single cleaned string.
    corpus.append(' '.join(ps.stem(word) for word in words if word not in stop_words))

In [None]:
# Inspect the first 30 cleaned titles.
corpus[:30]

In [None]:
# Length, in whitespace-separated tokens, of the longest cleaned title.
token_counts = [len(sentence.split()) for sentence in corpus]
max_length = max(token_counts)

print("Maximum sentence length:", max_length)

In [None]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM,Bidirectional
from tensorflow.keras.layers import Dense, Dropout

In [None]:
# Here, we are converting the text (Corpus) into One Hot Representation for the given vocabulary size (i.e. 10000)
onehot_repr=[one_hot(words,vocab_size) for words in corpus] 

In [None]:
# Note: one_hot() returns, for each sentence, the list of integer indexes
# assigned to its words (one list per sentence, not a 0/1 vector).
onehot_repr[:5]

### **Embedding**

In [None]:
# Every encoded title is forced to exactly this many indexes so the network
# receives fixed-size inputs.
sentence_length = 50

# Shorter sequences get zeros prepended (padding='pre'). truncating='pre' is
# the Keras default, spelled out here: sequences longer than sentence_length
# lose their leading entries.
embedded_docs = pad_sequences(
    onehot_repr,
    maxlen=sentence_length,
    padding='pre',
    truncating='pre',
)
print(embedded_docs)

In [None]:
# Padded index sequence of the first title.
embedded_docs[0]

## **LSTM Model**

In [None]:
# Each word index is represented by a learned vector with 100 features.
embedding_vector_features = 100

# Embedding -> single LSTM -> dropout -> one sigmoid unit for the binary label.
model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_features, input_length=sentence_length))
model.add(LSTM(100))
model.add(Dropout(0.2))

# The LSTM output is fully connected to the Dense layer.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

In [None]:
# Materialise the padded sequences and the labels as NumPy arrays and show
# both shapes as the cell output.
X_final, y_final = np.array(embedded_docs), np.array(y)
(X_final.shape, y_final.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y_final,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Train for 10 epochs. The test split is also passed as validation data, so
# the val_* figures reported per epoch are computed on the same rows that the
# final metrics use.
model.fit(
    x=X_train,
    y=y_train,
    epochs=10,
    validation_data=(X_test, y_test),
)

### Performance Metrics

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score

# Predicted values above 0.5 are counted as class 1, everything else as class 0.
y_log = model.predict(X_test)
y_pred = np.where(y_log > 0.5, 1, 0)

# Confusion matrix and accuracy on the held-out split; the accuracy is kept in
# `acc` and printed by the next cell.
confusion_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)

print(confusion_mat)

In [None]:
# Accuracy of the single-LSTM model on the test split.
print(acc)

## **Stacked LSTM Model**

In [None]:
# Each word index is represented by a learned vector with 100 features.
embedding_vector_features = 100

# Three stacked LSTM layers (100 -> 50 -> 20 units), each followed by dropout.
# The first two use return_sequences=True so the next LSTM receives the whole
# sequence rather than only the final state.
model1 = Sequential()
model1.add(Embedding(vocab_size, embedding_vector_features, input_length=sentence_length))
model1.add(LSTM(100, return_sequences=True))
model1.add(Dropout(0.2))

model1.add(LSTM(50, return_sequences=True))
model1.add(Dropout(0.1))

model1.add(LSTM(20))
model1.add(Dropout(0.1))

model1.add(Dense(1, activation='sigmoid'))
model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model1.summary()

In [None]:
# Train the stacked model for 10 epochs, validating on the test split.
model1.fit(
    x=X_train,
    y=y_train,
    epochs=10,
    validation_data=(X_test, y_test),
)

In [None]:
# Predicted values above 0.5 are counted as class 1, everything else as class 0.
y_log_1 = model1.predict(X_test)
y_pred_1 = np.where(y_log_1 > 0.5, 1, 0)

# Accuracy and confusion matrix of the stacked model on the test split.
acc_1 = accuracy_score(y_true=y_test, y_pred=y_pred_1)
confusion_mat_1 = confusion_matrix(y_true=y_test, y_pred=y_pred_1)

print(acc_1)
print()
print(confusion_mat_1)

## **Bi-Directional LSTM Model**

In [None]:
# This model uses a wider embedding: 150 features per word index.
embedding_vector_features = 150

# Embedding -> bidirectional LSTM (200 units) -> dropout -> one sigmoid unit.
model2 = Sequential()
model2.add(Embedding(vocab_size, embedding_vector_features, input_length=sentence_length))
model2.add(Bidirectional(LSTM(200)))
model2.add(Dropout(0.2))
model2.add(Dense(1, activation='sigmoid'))
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model2.summary()

In [None]:
# Train the bidirectional model for 10 epochs in batches of 120 samples,
# validating on the test split.
model2.fit(
    x=X_train,
    y=y_train,
    batch_size=120,
    epochs=10,
    validation_data=(X_test, y_test),
)

In [None]:
# Predicted values above 0.5 are counted as class 1, everything else as class 0.
y_log_2 = model2.predict(X_test)
y_pred_2 = np.where(y_log_2 > 0.5, 1, 0)

# Accuracy and confusion matrix of the bidirectional model on the test split.
acc_2 = accuracy_score(y_true=y_test, y_pred=y_pred_2)
confusion_mat_2 = confusion_matrix(y_true=y_test, y_pred=y_pred_2)

print(acc_2)
print()
print(confusion_mat_2)