In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# from wordcloud import WordCloud
import re
import nltk
import warnings
%matplotlib inline

warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv(r'C:\Projects\ml_experiments\Datasets\fake_news\train.csv')
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \r\nAn Iranian woman has been sentenced ...,1


In [4]:
df = df.drop(columns=['id', 'title', 'author'], axis=1)

In [5]:
df = df.dropna(axis=0)

In [6]:
df['clean_news'] = df['text'].str.lower()
df['clean_news']

0        house dem aide: we didn’t even see comey’s let...
1        ever get the feeling your life circles the rou...
2        why the truth might get you fired october 29, ...
3        videos 15 civilians killed in single us airstr...
4        print \r\nan iranian woman has been sentenced ...
                               ...                        
20795    rapper t. i. unloaded on black celebrities who...
20796    when the green bay packers lost to the washing...
20797    the macy’s of today grew from the union of sev...
20798    nato, russia to hold parallel exercises in bal...
20799      david swanson is an author, activist, journa...
Name: clean_news, Length: 20761, dtype: object

In [25]:
import string
df['clean_news'] = df['clean_news'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
df['clean_news'] = df['clean_news'].str.replace('\n', '')
df['clean_news'] = df['clean_news'].str.replace('\s+', ' ')
df['clean_news']

0        house dem aide didn’t even see comey’s letter ...
1        ever get feeling life circles roundabout rathe...
2        truth might get fired october 29 2016 tension ...
3        videos 15 civilians killed single us airstrike...
4        print iranian woman sentenced six years prison...
                               ...                        
20795    rapper t i unloaded black celebrities met dona...
20796    green bay packers lost washington redskins wee...
20797    macy’s today grew union several great names am...
20798    nato russia hold parallel exercises balkans 11...
20799    david swanson author activist journalist radio...
Name: clean_news, Length: 20761, dtype: object

In [26]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
df['clean_news'] = df['clean_news'].apply(lambda x: " ".join([word for word in x.split() if word not in stop]))
df.head()

Unnamed: 0,text,label,clean_news
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1,house dem aide didn’t even see comey’s letter ...
1,Ever get the feeling your life circles the rou...,0,ever get feeling life circles roundabout rathe...
2,"Why the Truth Might Get You Fired October 29, ...",1,truth might get fired october 29 2016 tension ...
3,Videos 15 Civilians Killed In Single US Airstr...,1,videos 15 civilians killed single us airstrike...
4,Print \r\nAn Iranian woman has been sentenced ...,1,print iranian woman sentenced six years prison...


In [27]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [28]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['clean_news'])
word_index = tokenizer.word_index
vocab_size = len(word_index)
vocab_size

267302

In [29]:
sequences = tokenizer.texts_to_sequences(df['clean_news'])
padded_seq = pad_sequences(sequences, maxlen=500, padding='post', truncating='post')

In [30]:
padded_seq[1]

array([   260,     31,   1595,    105,   5044,  29795,    411,   2356,
         2132,    386,    537,   1562,   5505,     32,     14,    796,
          186,    380,   1969,  18343,    884,  21420,   3013,   2790,
          672,    749,    253,   2050,   3204,  11300,    132,     42,
         2186,  98424,   3398,   3644,   3626,   3204,    830,    166,
         3563,     32,  10022,  61171,   3755,     20,    224,     23,
         3260,  21420,  10412,    649,     13,    488,   2539,   2426,
         8296,    475,    706,   1570,    620,    152,     16,  23882,
          898,   7820,    327,     10,    168,   3730,     64,  11862,
          885,   2738,     18,    878,   5058,    181,     49,   5463,
         2186,   4536,     43,    263,    830,    214,     12,      4,
         2409,   3342,   2186,  10022,     59,   8719,  11478, 135235,
        22326,   3786, 135236,  98425, 135237,   5531,  34434,  28868,
         8336,  20233,  80112,   4508,      4,    169,    258,  25628,
      

In [31]:
embedding_index = {}
with open(r'C:\Projects\ml_experiments\Datasets\fake_news\glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

In [32]:
embedding_matrix = np.zeros((vocab_size+1, 100))
for word, i in word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [33]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(padded_seq, df['label'], test_size=0.20, random_state=42, stratify=df['label'])

In [34]:
from keras.layers import LSTM, Dropout, Dense, Embedding
from keras import Sequential

# model = Sequential([
#     Embedding(vocab_size+1, 100, weights=[embedding_matrix], trainable=False),
#     Dropout(0.2),
#     LSTM(128, return_sequences=True),
#     LSTM(128),
#     Dropout(0.2),
#     Dense(512),
#     Dropout(0.2),
#     Dense(256),
#     Dense(1, activation='sigmoid')
# ])

model = Sequential([
    Embedding(vocab_size+1, 100, weights=[embedding_matrix], trainable=False),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(256),
    Dense(1, activation='sigmoid')
])

In [35]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [36]:
history = model.fit(x_train, y_train, epochs=10, batch_size=256, validation_data=(x_test, y_test))

Epoch 1/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 1s/step - accuracy: 0.6225 - loss: 0.6486 - val_accuracy: 0.6836 - val_loss: 0.5760
Epoch 2/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 2s/step - accuracy: 0.6576 - loss: 0.6026 - val_accuracy: 0.5536 - val_loss: 0.6989
Epoch 3/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 1s/step - accuracy: 0.6067 - loss: 0.6483 - val_accuracy: 0.6785 - val_loss: 0.5829
Epoch 4/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 2s/step - accuracy: 0.6683 - loss: 0.5901 - val_accuracy: 0.6732 - val_loss: 0.6042
Epoch 5/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 2s/step - accuracy: 0.6531 - loss: 0.6162 - val_accuracy: 0.5480 - val_loss: 0.6977
Epoch 6/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m149s[0m 2s/step - accuracy: 0.6253 - loss: 0.6424 - val_accuracy: 0.6769 - val_loss: 0.5760
Epoch 7/10
[1m65/65[0m [32m━━━━━━━