Reference: https://github.com/krishnaik06/Fake-New-LSTM/tree/master

In [None]:
import numpy as np
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split

In [None]:
# Fetch the NLTK stop-word corpus; the text-cleaning cell below reads the English list from it.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Load train.csv into a DataFrame.
df = pd.read_csv(filepath_or_buffer='/content/train.csv')

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# Print each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


In [None]:
# Count the missing entries in each column

df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [None]:
# Discard every row that has at least one missing value

df = df.dropna(axis=0, how='any')

In [None]:
# Independent features: every column except the target

x = df.drop(columns=['label'])

In [None]:
# Dependent feature: the target column on its own

y = df.loc[:, 'label']

In [None]:
# (rows, columns) of the feature frame after dropping rows with nulls.
x.shape

(18285, 4)

In [None]:
# Length of the label series; its row count should match x.
y.shape

(18285,)

In [None]:
# Work on an independent copy so the feature frame x stays untouched.
messages = x.copy(deep=True)

In [None]:
# dropna() left gaps in the row labels; renumber them 0..n-1 so positional loops line up.
# drop=True avoids adding a stray 'index' column, and reassigning (instead of
# inplace=True) keeps the cell safe to re-run.
messages = messages.reset_index(drop=True)

In [None]:
# Join title and author with a space; without the separator the last title word and
# the first author word would fuse into a single token during cleaning.
messages['content'] = messages['title'] + ' ' + messages['author']

In [None]:
# Clean every content string: keep letters only, lowercase, drop English stop words
# and stem the remaining tokens. The result is one space-joined string per row.
ps = PorterStemmer()
# Build the stop-word set once; evaluating stopwords.words('english') for every token
# rebuilds the list and scans it linearly each time.
english_stopwords = set(stopwords.words('english'))
corpus = []
for content in messages['content']:
  tokens = re.sub('[^a-zA-Z]', ' ', content).lower().split()
  stems = [ps.stem(token) for token in tokens if token not in english_stopwords]
  corpus.append(' '.join(stems))

In [None]:
import tensorflow as tf
from tensorflow import keras

In [None]:
from keras.layers import Embedding, LSTM, Dense, Dropout, Flatten, Normalization
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.preprocessing.text import one_hot

In [None]:
# Vocabulary size

# Collect the distinct tokens across all documents. set(corpus) would count distinct
# documents instead, which says nothing about how many different words there are.
common = {word for document in corpus for word in document.split()}
# +1 because one_hot() produces indices in [1, voc_size) and 0 is used for padding.
voc_size = len(common) + 1

In [None]:
# One-hot representation

# Hash each cleaned document into a list of integer word indices.
onehot_reps = []
for document in corpus:
  onehot_reps.append(one_hot(document, voc_size))
# Length of the longest encoded document.
max_len = max(map(len, onehot_reps))
max_len

50

In [None]:
# Left-pad every encoded document up to max_len so all rows share one length

embedded_docs = pad_sequences(
    onehot_reps,
    maxlen=max_len,
    padding='pre',
)
embedded_docs

array([[    0,     0,     0, ...,   889,  6485,  6187],
       [    0,     0,     0, ...,  6906,  1724,  3933],
       [    0,     0,     0, ..., 12618, 14162,  7873],
       ...,
       [    0,     0,     0, ...,  1294,  7206,  4191],
       [    0,     0,     0, ..., 13956, 11601,  7464],
       [    0,     0,     0, ...,  1571, 14580,  2816]], dtype=int32)

In [None]:
# Create model

# Normalization is a preprocessing layer that must be adapt()-ed to data; left
# un-adapted between hidden layers it is effectively an identity transform.
# BatchNormalization learns its statistics during training.
from keras.layers import BatchNormalization

embedding_vector_feature = 40  # dimensionality of each learned word vector

model = Sequential()

model.add(Embedding(voc_size, embedding_vector_feature, input_length=max_len))
model.add(LSTM(100))
model.add(BatchNormalization())
model.add(Dropout(0.25))
model.add(Dense(units=6, activation='relu'))
model.add(BatchNormalization())
# No Flatten here: the Dense output is already 2-D (batch, 6), so it was a no-op.
model.add(Dense(units=1, activation='sigmoid'))

model.summary()

Model: "sequential_18"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_18 (Embedding)    (None, 50, 40)            719360    
                                                                 
 lstm_25 (LSTM)              (None, 100)               56400     
                                                                 
 normalization_9 (Normalizat  (None, 100)              201       
 ion)                                                            
                                                                 
 dropout_15 (Dropout)        (None, 100)               0         
                                                                 
 dense_17 (Dense)            (None, 6)                 606       
                                                                 
 normalization_10 (Normaliza  (None, 6)                13        
 tion)                                               

In [None]:
# Binary cross-entropy pairs with the single sigmoid output unit.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Materialise the features and labels as NumPy arrays for splitting and training.
x_final, y_final = np.array(embedded_docs), np.array(y)

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_final,
    y_final,
    random_state=42,
    test_size=0.3,
)

In [None]:
# Train for 10 epochs in mini-batches of 32, scoring on the held-out split after each epoch

history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_data=(x_test, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10

KeyboardInterrupt: ignored

In [None]:
# The network ends in a single sigmoid unit, so predict() yields one probability per
# row in an (n, 1) array. argmax over axis 1 of that array is always 0, so threshold
# the probabilities at 0.5 and flatten to a 1-D label vector instead.
y_pred_probs = model.predict(x_test)
y_pred = (y_pred_probs > 0.5).astype(int).ravel()



In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Only the last expression of a cell is displayed, so print the confusion matrix
# explicitly; otherwise it is computed and thrown away.
print(confusion_matrix(y_test, y_pred))
accuracy_score(y_test, y_pred)