In [1]:
import numpy as np
import pandas as pd
import gensim
from ast import literal_eval

from keras.optimizers import RMSprop
from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding, Input, CuDNNLSTM, LSTM

from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split

In [2]:
# Load the preprocessed corpus; the first CSV column is consumed as the index
# and immediately replaced by a fresh 0..n-1 range index.
dataset = (
    pd.read_csv("../data/FakeNews_Preprocessed/data.csv", index_col=0)
    .reset_index(drop=True)
)

In [3]:
# Preview the loaded frame (pandas elides the middle rows in the rendered output).
dataset

Unnamed: 0,text,label
0,"['house', 'dem', 'aide', 'comey', 'letter', 'j...",1
1,"['feeling', 'life', 'circle', 'roundabout', 'h...",0
2,"['truth', 'fire', 'october', '29', '2016', 'te...",1
3,"['video', '15', 'civilian', 'kill', 'single', ...",1
4,"['print', 'iranian', 'woman', 'sentence', 'yea...",1
...,...,...
18280,"['rapper', 'unload', 'black', 'celebrity', 'me...",0
18281,"['green', 'bay', 'packer', 'lose', 'washington...",0
18282,"['macy', 'today', 'grow', 'union', 'great', 'a...",0
18283,"['nato', 'russia', 'hold', 'parallel', 'exerci...",1


In [4]:
# Each 'text' cell stores the string form of a token list; literal_eval turns
# it back into an actual Python list without executing arbitrary code.
articles = dataset['text'].map(literal_eval)
articles

0        [house, dem, aide, comey, letter, jason, chaff...
1        [feeling, life, circle, roundabout, head, stra...
2        [truth, fire, october, 29, 2016, tension, inte...
3        [video, 15, civilian, kill, single, airstrike,...
4        [print, iranian, woman, sentence, year, prison...
                               ...                        
18280    [rapper, unload, black, celebrity, meet, donal...
18281    [green, bay, packer, lose, washington, redskin...
18282    [macy, today, grow, union, great, american, re...
18283    [nato, russia, hold, parallel, exercise, balka...
18284    [david, swanson, author, activist, journalist,...
Name: text, Length: 18285, dtype: object

In [5]:
# Token count per article; only articles with fewer than 1000 tokens are kept.
lengths = np.array(list(map(len, articles)))
keep = lengths < 1000

# Apply the same boolean mask to both containers so rows stay aligned, then
# renumber from 0 so positional and label-based indexing agree again.
dataset = dataset[keep].reset_index(drop = True)
articles = articles[keep].reset_index(drop = True)

In [6]:
# Longest remaining article, in tokens; used as the padded sequence length.
article_length = max(len(tokens) for tokens in articles)
article_length

999

In [7]:
# Dimensionality of the learned word vectors (reused as the Embedding output size).
vec_size = 100

# Passing the corpus to the constructor already builds the vocabulary and
# trains the model, so the 10 passes are requested here. Calling .train()
# afterwards would stack 10 further epochs on top of the constructor's
# default ones instead of replacing them.
word_model = gensim.models.Word2Vec(
    articles,
    vector_size = vec_size,
    window = 5,
    workers = 12,
    epochs = 10,
)

# Only the keyed vectors are needed downstream to fill the embedding matrix.
wv = word_model.wv

In [8]:
# Build the token -> integer-id mapping from the corpus and encode every
# article as a list of those ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(articles)
encoded_articles = tokenizer.texts_to_sequences(articles)

# word_index ids start at 1 (0 is left for padding), hence one extra slot.
vocabulary_size = 1 + len(tokenizer.word_index)

In [9]:
# Right-pad every encoded article to the length of the longest one so the
# result is a single rectangular integer array.
padded_articles = pad_sequences(
    sequences = encoded_articles,
    maxlen = article_length,
    padding = 'post',
)

In [10]:
# Sanity check: (number of articles kept, padded sequence length).
padded_articles.shape

(17475, 999)

In [11]:
# One row per tokenizer id; rows stay all-zero for ids that have no Word2Vec
# vector (including row 0, which word_index never assigns).
emb_matrix = np.zeros((vocabulary_size, vec_size))
for token, row in tokenizer.word_index.items():
    if wv.has_index_for(token):
        emb_matrix[row] = wv.get_vector(token)

In [21]:
# Split first, then inspect: x_train only exists once train_test_split has
# run, so checking its shape ahead of the split fails on a fresh kernel.
# random_state pins the shuffle so the 75/25 partition is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    padded_articles,
    dataset['label'],
    test_size = 0.25,
    random_state = 42,
)
x_train.shape

(13106, 999)

In [13]:
from keras.initializers import Constant
from keras.layers import Dropout, ReLU

# Baseline classifier: look up a vec_size-dimensional vector for every token
# position, flatten the whole (article_length x vec_size) grid, and feed it to
# a single sigmoid unit for the binary label.
model = Sequential([
    Embedding(
        input_dim = vocabulary_size,
        output_dim = vec_size,
        input_length = article_length,
        # Start from the Word2Vec-derived matrix rather than random weights.
        embeddings_initializer = Constant(emb_matrix),
    ),
    Flatten(),
    Dense(1, activation = 'sigmoid'),
])

In [14]:
# Print the layer stack with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 999, 100)          13487300  
                                                                 
 flatten (Flatten)           (None, 99900)             0         
                                                                 
 dense (Dense)               (None, 1)                 99901     
                                                                 
Total params: 13,587,201
Trainable params: 13,587,201
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# as the reporting metric. RMSprop runs with a learning rate of 1e-5.
model.compile(
    optimizer = RMSprop(learning_rate = 1e-5),
    loss = 'binary_crossentropy',
    metrics = ['accuracy'],
)

In [16]:
# Training schedule: number of passes over x_train and samples per gradient step.
epochs, batch_size = 20, 64

In [17]:
# Train on the 75% split; the held-out 25% is evaluated after every epoch.
model.fit(
    x_train,
    y_train,
    epochs = epochs,
    batch_size = batch_size,
    validation_data = (x_test, y_test),
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1538f6ec100>