In [None]:
import pandas as pd

# Read the cleaned dataset from the working directory and preview its first rows.
df = pd.read_csv('Clean_Dataset.csv')
df.head(n=5)

In [None]:
# Separate the target column ('labels') from the remaining feature columns.
y = df['labels']
x = df.drop(columns=['labels'])

In [None]:
# Show (rows, columns) of the feature frame, i.e. df without the 'labels' column.
x.shape

In [None]:
import tensorflow as tf
# Display the installed TensorFlow version for reference.
tf.__version__

In [None]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [None]:
# Passed to one_hot() as the hashing range and to the Embedding layer as its first argument.
voc_size=5000 # Vocabulary size

### One-hot Representation

In [None]:
# Work on a copy so the feature frame `x` itself is not modified by later cleaning.
messages = x.copy()
# Peek at the title stored under index label 1.
messages.loc[1, 'title']

In [None]:
import nltk
import re
from nltk.corpus import stopwords

In [None]:
# Dataset Preprocessing
# Turns every title into a cleaned, stemmed, stopword-free string and collects
# the results (in row order) in `corpus`.
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

# Fetch the stopword list once and keep it as a set: calling
# stopwords.words('english') for every token repeats the lookup and does a
# linear list scan each time, which dominates the runtime on large datasets.
stop_words = set(stopwords.words('english'))

# convert all titles to string
# NOTE(review): missing titles become the literal string 'nan' here — confirm
# the dataset has no NaN titles or that this is acceptable.
messages['title'] = messages['title'].astype(str)

corpus = []

# Iterate over the values themselves so rows are visited by position; indexing
# with messages['title'][i] is label-based and fails when the index is not 0..n-1.
for title in messages['title']:
    # remove anything not a letter
    review = re.sub('[^a-zA-Z]', ' ', title)

    review = review.lower().split()

    # stem + remove stopwords
    review = [ps.stem(word) for word in review if word not in stop_words]

    corpus.append(' '.join(review))

In [None]:
# Preview only the first few cleaned titles; displaying the whole list floods
# the notebook output and bloats the saved file.
corpus[:5]

In [None]:
# Encode every cleaned title as a list of integer word indices.
# one_hot() hashes words with Python's built-in hash(), which is salted per
# process, so the same word gets a different index after a kernel restart.
# hashing_trick() with the md5 hash is the same encoding with a stable hash,
# which keeps the features (and any saved model) reproducible across runs.
from tensorflow.keras.preprocessing.text import hashing_trick

onehot_repr = [hashing_trick(words, voc_size, hash_function='md5') for words in corpus]

# Preview only the first few encodings instead of dumping the whole list.
onehot_repr[:5]

In [None]:
# Show one cleaned title followed by its encoded form, one per line.
print(corpus[1], onehot_repr[1], sep='\n')

### Embedding Representation

In [None]:
# Bring every encoded title to the same fixed length; padding='post' pads at the end.
sentence_length = 20
embedded_docs = pad_sequences(
    onehot_repr,
    maxlen=sentence_length,
    padding='post',
)
print(embedded_docs)

In [None]:
# Inspect the padded sequence produced for the first title.
embedded_docs[0]

### Creating the model

In [None]:
from tensorflow.keras.layers import Bidirectional

# Second argument of the Embedding layer (size of each word vector).
embedding_vector_features = 40

# Embedding -> bidirectional LSTM (128 units) -> single sigmoid output unit.
model = Sequential([
    Embedding(voc_size, embedding_vector_features, input_shape=(sentence_length,)),
    Bidirectional(LSTM(128)),
    Dense(1, activation='sigmoid'),
])
model.summary()


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# Materialise the padded sequences and the labels as NumPy arrays.
x_final = np.array(embedded_docs)
y_final = np.array(y)

# Hold out 33% of the rows for evaluation; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_final,
    y_final,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Model training
# The metric name must be the lowercase string 'accuracy': Keras special-cases
# it and selects binary accuracy to match the loss and output. The capitalised
# 'Accuracy' resolves to the exact-match Accuracy metric, which compares the
# raw sigmoid probabilities with the 0/1 labels and therefore reports ~0.
model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])

# NOTE(review): the test split doubles as validation data here, so the
# per-epoch validation numbers are not an independent estimate.
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=20, batch_size=64)

In [None]:
# Threshold the model outputs at 0.5 to obtain hard 0/1 predictions.
y_pred = (model.predict(x_test) > 0.5).astype(int)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sns
from matplotlib import pyplot as plt

# Overall accuracy on the held-out split.
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy : {test_accuracy}")

# Per-class report from sklearn's classification_report.
test_report = classification_report(y_test, y_pred)
print(f"Classification report : {test_report}")

In [None]:
# Draw the confusion matrix as an annotated heatmap.
# NOTE(review): the tick labels assume label 0 means 'Fake' and 1 means 'Real' — confirm against the dataset.
class_names = ['Fake','Real']
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots()
sns.heatmap(
    cm,
    annot=True,
    cmap="Blues",
    fmt="d",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()