# Fake News Classification Task
Each article is encoded as a sequence of integer indices, where a word's index is its frequency rank in the corpus vocabulary (the vocabulary is built with gensim's Word2Vec from the combined training and test text). A bidirectional LSTM is then trained on the training data, and the model with the highest validation accuracy is saved. Predictions on the test data are made with this saved model.

# Step 1)  Import packages and set parameters


In [None]:
import keras
import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint


### Setting parameters

In [None]:
# Tunable hyper-parameters shared by the encoding, padding and training cells below.
max_features = 40000 # Vocabulary cap: the encoding cell keeps a word only if its 1-based frequency rank is below max_features - 1; also the Embedding layer's input size.
maxlen = 500  # Fixed sequence length passed to pad_sequences (called without padding/truncating arguments, so the Keras defaults apply).
batch_size = 32  # Mini-batch size used by model.fit and model.evaluate.

In [None]:
import re  # For preprocessing
from time import time  # To time our operations
from collections import defaultdict  # For word frequency
import spacy  # For preprocessing

# Step 2) Unzip and prepare data

In [None]:
import zipfile

# Unpack the training archive into the current working directory.
with zipfile.ZipFile("train.csv.zip", mode="r") as train_archive:
    train_archive.extractall(path="./")

In [None]:
import zipfile

# Unpack the test archive into the current working directory.
with zipfile.ZipFile("test.csv.zip", mode="r") as test_archive:
    test_archive.extractall(path="./")

In [None]:
import pandas as pd

# Load the labelled training set, then report its column names and its (rows, columns) shape.
df = pd.read_csv("./train.csv", header=0)
for summary in (df.columns, df.shape):
    print(summary)

Index(['id', 'title', 'text', 'date', 'is_fake'], dtype='object')
(35395, 5)


In [None]:
# Number of training rows; used later to split the combined corpus back into train and test.
num_of_train = len(df)
num_of_train

35395

In [None]:
# Load the unlabelled test set as df_test.
df_test = pd.read_csv("./test.csv", header=0)

In [None]:
#names of columns of training data:
df.columns

Index(['id', 'title', 'text', 'date', 'is_fake'], dtype='object')

In [None]:
df.isnull().sum() #no missing data

id         0
title      0
text       0
date       0
is_fake    0
dtype: int64

In [None]:
# Keep only the article body ("text" column) of the training and test frames,
# then print how many articles each one holds.
text = df["text"]
text_test = df_test["text"]
for column in (text, text_test):
    print(column.shape)

(35395,)
(8849,)


Combine the training text and the test text into a single corpus, so that the bigram statistics and the vocabulary are built from all available articles:

In [None]:
# Training articles first, test articles second: the later train/test split relies on this order.
text_combine = pd.concat(objs=[text, text_test], axis=0)
text_combine.shape[0]

44244

In [None]:
text_combine

0       21st Century Wire says WikiLeaks has released ...
1       British Prime Minister Theresa May said on Wed...
2       Well, get busy people! With the cyber attacks ...
3       U.S. President Donald Trump will announce a ne...
4       U.S. President Donald Trump gave his  in princ...
                              ...                        
8844    Johnny Carson must be rolling over in his grav...
8845    Special Counsel Robert Muellers office has int...
8846    Uzbek police released dissident writer Nurullo...
8847    Liberals would like you to believe they re mor...
8848    LIBERAL COMPASSION: This story is a perfect ex...
Name: text, Length: 44244, dtype: object

In [None]:
# Training labels, taken from the is_fake column.
y_train_fake = df["is_fake"]

Bigrams:
We are using Gensim Phrases package to automatically detect common phrases (bigrams) from a list of sentences.

In [None]:
from gensim.models.phrases import Phrases, Phraser

In [None]:
# Phrases() takes a list of token lists as input, so split every article on whitespace first.
sent = [row.split() for row in text_combine]
# Learn which adjacent token pairs co-occur often enough (at least min_count times)
# to be merged into a single "first_second" token.
phrases = Phrases(sent, min_count=20, progress_per=10000)
# Phraser() cuts down the memory consumption of Phrases() by discarding model state
# that is not strictly needed for bigram detection. The original cell described this
# step but never applied it; the detected bigrams are the same either way.
bigram = Phraser(phrases)
# Corpus with the detected bigrams merged.
sentences = bigram[sent]




In [None]:
# `sent` holds one list of whitespace-split tokens per article; show the first 10 tokens of the first article.
sent[0][:10]

['21st',
 'Century',
 'Wire',
 'says',
 'WikiLeaks',
 'has',
 'released',
 'its',
 'largest',
 'ever']

In [None]:
# `sentences` is the same corpus (training + test articles as token lists) after bigram merging; show the first 10 tokens of the first article.

sentences[0][:10]



['21st_Century',
 'Wire_says',
 'WikiLeaks',
 'has',
 'released',
 'its',
 'largest',
 'ever',
 'publication',
 'of']

Most Frequent Words:
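Mainly a sanity check of the tokenization and of the addition of bigrams. Note that no lemmatization or stop-word removal is applied in this notebook, so the most frequent tokens are expected to be common function words.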

In [None]:
# Count how often every token (merged bigrams included) occurs in the whole corpus.
# The loop variables are deliberately NOT called `sent`: that name holds the tokenised
# corpus built earlier, and rebinding it here would break any re-run of the cells
# that still read it.
word_freq = defaultdict(int)
for tokens in sentences:
    for token in tokens:
        word_freq[token] += 1
# Number of distinct tokens.
len(word_freq)



385126

In [None]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['the', 'to', 'of', 'and', 'a', 'in', 'that', 's', 'is', 'for']

In [None]:
import multiprocessing

from gensim.models import Word2Vec

In [None]:
cores = multiprocessing.cpu_count() # Number of CPU cores reported for this machine; used to size the Word2Vec worker pool.
cores

2

Preprocessing using word2vec:

Training the model
Gensim Word2Vec Implementation:

In [None]:
# Configure (but do not yet fit) the Word2Vec model.
#   min_count - ignore tokens that occur fewer than 20 times
#   window    - context window of 2 tokens on each side
#   size      - dimensionality of the word vectors
#   sample    - down-sampling threshold for very frequent tokens
#   alpha / min_alpha - initial and final learning rate
#   negative  - number of negative samples per positive example
#   workers   - leave one core free, but never ask for fewer than one worker
#               (cores - 1 would be 0 on a single-core machine)
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     size=300,
                     sample=6e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     workers=max(1, cores - 1))

Building the Vocabulary Table:
Word2Vec requires us to build the vocabulary table (simply digesting all the words and filtering out the unique words, and doing some basic counts on them):

In [None]:
# Scan the corpus once to build the vocabulary table, and report how long it took.
vocab_start = time()
w2v_model.build_vocab(sentences, progress_per=10000)
vocab_minutes = round((time() - vocab_start) / 60, 2)
print('Time to build vocab: {} mins'.format(vocab_minutes))



Time to build vocab: 1.3 mins


In [None]:
# build_vocab() only creates the vocabulary and randomly initialised weights, so the
# vectors must be trained before similarity queries mean anything. Training has to
# happen before init_sims(replace=True), after which the model cannot be trained.
# Vocabulary order, and therefore the index encoding used further down, is unaffected.
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=w2v_model.epochs)
# L2-normalise the vectors in place to save memory.
w2v_model.init_sims(replace=True)

In [None]:
# Tokens whose vectors are most similar to the vector of "us".
w2v_model.wv.most_similar(positive=["us"])

  if np.issubdtype(vec.dtype, np.int):


[('2017@IvankaTrump', 0.23851890861988068),
 ('casting_ballots', 0.22953510284423828),
 ('similar', 0.2292918562889099),
 ('seedy', 0.22645053267478943),
 ('Wednesday_evening', 0.22404953837394714),
 ('interest_rate', 0.22162044048309326),
 ('media_outlets', 0.21597148478031158),
 ('thru', 0.21518103778362274),
 ('renewed', 0.21356606483459473),
 ('delos', 0.2133629322052002)]

In [None]:
# Persist the Word2Vec model to disk under this file name so it can be reloaded below.
model_name = "news_content_clean"
w2v_model.save(model_name)

In [None]:
# Reload the Word2Vec model saved above.
# NOTE: the name `model` is rebound to the Keras Sequential network in the model-definition
# cell further down, so the Word2Vec cells that read `model` must be run before that cell.
model = Word2Vec.load("news_content_clean")

Train a full model, then access its model.wv property, which holds the standalone keyed vectors

In [None]:
# Raw word-vector matrix, one row per vocabulary token. These vectors act as a
# 'projection layer' that converts a one-hot encoding of a word into a dense
# embedding vector of the right dimensionality.
# `wv.vectors` replaces `wv.syn0`, which is deprecated and triggers a warning.
word_vectors = model.wv.vectors
word_vectors

  """Entry point for launching an IPython kernel.


array([[-0.03872449,  0.02400741, -0.00940735, ..., -0.02864116,
         0.08158486,  0.0011667 ],
       [ 0.03810813, -0.03264292,  0.01746346, ...,  0.08693753,
         0.08579368, -0.03509155],
       [-0.00315781, -0.02290377, -0.09503513, ...,  0.0131291 ,
         0.00495498,  0.0652029 ],
       ...,
       [ 0.02651773, -0.06607205,  0.09553516, ...,  0.0919237 ,
        -0.07512816, -0.01320431],
       [-0.01420887, -0.05685772, -0.01885691, ...,  0.04244019,
         0.08878259, -0.07486628],
       [ 0.01612616, -0.10234423, -0.09523982, ...,  0.05854018,
        -0.01232063,  0.07443156]], dtype=float32)

In [None]:
# word2index maps every vocabulary token to its position in model.wv.index2word.
# The value is a rank, not a count; the outputs in this notebook show that list
# starting with the most frequent tokens.
vocab_tokens = model.wv.index2word
word2index = dict(zip(vocab_tokens, range(len(vocab_tokens))))
"US" in word2index

True

In [None]:
len(word2index)

46218

In [None]:
# First ten vocabulary tokens, followed by the vocabulary index of two sample words.
print(model.wv.index2word[:10])
word = "country"
for probe in (word, 'language'):
    print(model.wv.vocab.get(probe).index)


['the', 'to', 'of', 'and', 'a', 'in', 'that', 's', 'is', 'for']
131
1682


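`sentences_index` is a list of lists: for every article in the combined training and test data, it holds the frequency-rank index (shifted by 1, so that 0 stays free for padding) of each word that is kept in the vocabulary: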

In [None]:
# Encode every article as a list of integers. A token is kept only if it is in the
# Word2Vec vocabulary and its shifted index (vocabulary position + 1) is below
# max_features - 1, which excludes the rarest words. The +1 shift means index 0 is
# never produced for a real token.
sentences_index = [
    [
        word2index[token] + 1
        for token in article
        if token in word2index and word2index[token] + 1 < max_features - 1
    ]
    for article in sentences
]




The dataset is now tokenised, grouped into bigrams and encoded as word frequency ranks over the corpus vocabulary; this new dataset is called `sentences_index`:

In [None]:
len(sentences_index)

44244

In [None]:
sentences_index[1][:10]

[4760, 5353, 404, 16, 285, 48, 3573, 928, 41, 25]

Split into training data and test data from sentences_index:

In [None]:
# The combined corpus was built as training articles followed by test articles,
# so the first num_of_train encoded rows are the training set and the rest the test set.
x_train_fake = sentences_index[:num_of_train]
x_test_fake = sentences_index[num_of_train:]
# Row counts: labels, training inputs, test inputs.
for part in (y_train_fake, x_train_fake, x_test_fake):
    print(len(part))

35395
35395
8849


In [None]:
# Hold out the last 7,000 training samples (inputs and labels alike) for validation;
# everything before them is used for fitting.
val_size = 7000
x_train, x_val = x_train_fake[:-val_size], x_train_fake[-val_size:]
y_train, y_val = y_train_fake[:-val_size], y_train_fake[-val_size:]

In [None]:
# Callback that writes the model to best_model.h5 whenever validation accuracy
# reaches a new maximum during training.
checkpoint = ModelCheckpoint(
    'best_model.h5',
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)


In [None]:
len(x_val)

7000

In [None]:
len(x_train[0])

3441

# Model training:

### Pad sequences
Pad the word sequences in each sample.

In [None]:
# Bring every training and validation sequence to exactly maxlen entries.
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_val = sequence.pad_sequences(x_val, maxlen=maxlen)
print('x_train shape:', x_train.shape)
# This is the validation split, so label it as such (it was printed as 'x_test shape:').
print('x_val shape:', x_val.shape)

Pad sequences (samples x time)
x_train shape: (28395, 500)
x_test shape: (7000, 500)


In [None]:
# Bring every test sequence to exactly maxlen entries, the same way as the training data.
print('Pad sequences (samples x time)')
x_test_fake = sequence.pad_sequences(x_test_fake, maxlen=maxlen)

Pad sequences (samples x time)


In [None]:
print('x_test shape:', x_test_fake.shape)

x_test shape: (8849, 500)


# Step 3) Model Definition
<b>Embedding layer:</b> Turns positive integers (indexes) into dense vectors of fixed size. eg. [[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]]<sup><a href="#R4" target="_blank">[4]</a></sup> This layer can only be used as the first layer in a model.

# Bidirectional LSTM:

In [None]:
from keras.layers import Bidirectional


In [None]:
# Network: embedding -> two stacked bidirectional LSTMs -> one sigmoid unit.
# NOTE: this rebinds `model`, which earlier held the reloaded Word2Vec model.
print('Build model...')
model = Sequential([
    # 10-dimensional embeddings for indices below max_features, on sequences of length maxlen.
    Embedding(max_features, 10, input_length=maxlen),
    # The first recurrent layer returns the full sequence so a second one can be stacked on it.
    Bidirectional(LSTM(10, return_sequences=True)),
    Bidirectional(LSTM(10)),
    # Single sigmoid output for the binary is_fake label.
    Dense(1, activation='sigmoid'),
])




Build model...


In [None]:
model.summary()

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 500, 10)           400000    
_________________________________________________________________
bidirectional_24 (Bidirectio (None, 500, 20)           1680      
_________________________________________________________________
bidirectional_25 (Bidirectio (None, 20)                2480      
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 21        
Total params: 404,181
Trainable params: 404,181
Non-trainable params: 0
_________________________________________________________________


# Step 4) Compiling model

In [None]:
# Binary cross-entropy loss with the RMSprop optimizer, tracking accuracy.
# Other optimizers and optimizer configurations can be tried here.
compile_options = dict(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)
model.compile(**compile_options)

# Step 5) Learning model and fit it on training data

In [None]:
# Fit for 15 epochs; the checkpoint callback saves the model each time validation accuracy improves.
print('Train...')
model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=15,
    callbacks=[checkpoint],
    validation_data=(x_val, y_val),
)

Train...
Epoch 1/15
Epoch 00001: val_accuracy improved from -inf to 0.98543, saving model to best_model.h5
Epoch 2/15
Epoch 00002: val_accuracy improved from 0.98543 to 0.99271, saving model to best_model.h5
Epoch 3/15
Epoch 00003: val_accuracy did not improve from 0.99271
Epoch 4/15
Epoch 00004: val_accuracy improved from 0.99271 to 0.99400, saving model to best_model.h5
Epoch 5/15
Epoch 00005: val_accuracy did not improve from 0.99400
Epoch 6/15
Epoch 00006: val_accuracy did not improve from 0.99400
Epoch 7/15
Epoch 00007: val_accuracy improved from 0.99400 to 0.99543, saving model to best_model.h5
Epoch 8/15
Epoch 00008: val_accuracy improved from 0.99543 to 0.99614, saving model to best_model.h5
Epoch 9/15
Epoch 00009: val_accuracy did not improve from 0.99614
Epoch 10/15
Epoch 00010: val_accuracy did not improve from 0.99614
Epoch 11/15
Epoch 00011: val_accuracy did not improve from 0.99614
Epoch 12/15
Epoch 00012: val_accuracy did not improve from 0.99614
Epoch 13/15
Epoch 00013:

<tensorflow.python.keras.callbacks.History at 0x7fd2d3ceffd0>

In [None]:
# Score the model as it stands after the last training epoch on the held-out
# validation split. The labels say "Validation" because x_val / y_val are what is
# evaluated here; the test set has no labels to score against.
print('Evaluating model...')
score, acc = model.evaluate(x_val, y_val,
                            batch_size=batch_size)
print('\n\nValidation loss:', score)
print('Validation accuracy:', acc)

Evaluating model...


Test score: 0.046872563660144806
Test accuracy: 0.9941428303718567


In [None]:
# Reload the checkpointed model (the one with the best validation accuracy) and score
# it on the validation split. This figure is optimistic, because the checkpoint was
# selected on this same split.
saved_model = keras.models.load_model('best_model.h5')
scores = saved_model.evaluate(x_val, y_val, verbose=1)
print('Validation accuracy:', scores[1])

Test accuracy: 0.9962857365608215


In [None]:
np.array(x_test_fake)

array([[    0,     0,     0, ..., 25400,  3737,  2334],
       [    0,     0,     0, ..., 33714,  7295, 17087],
       [    0,     0,     0, ...,     1,  3243, 18504],
       ...,
       [    0,     0,     0, ...,   130, 33453,  1168],
       [    0,     0,     0, ...,     3, 25181,   323],
       [    0,     0,     0, ...,     6,    33,    51]], dtype=int32)

In [None]:
import torch
# Float copy of the padded test index matrix as a torch tensor.
# NOTE(review): the Keras model accepts the NumPy array directly, so this conversion
# appears unnecessary — confirm before removing.
x_test_fake_tensor = torch.FloatTensor(x_test_fake)


## Evaluate and predict labels of test data based on model corresponding to best validation accuracy

In [None]:
# Predict with `saved_model`, the checkpoint reloaded from best_model.h5, rather than
# `model`, which holds the weights of the last training epoch. The padded integer
# matrix is passed directly; no torch round-trip is needed.
y_prob = saved_model.predict(x_test_fake)
print(y_prob)
# Threshold the sigmoid outputs at 0.5 to get hard 0/1 labels (a plain list of ints).
y_prd = [1 if p > 0.5 else 0 for p in y_prob.ravel()]
# The test set has no true labels, so only predictions are shown.
print('First ten predicted labels of test data')
print(np.array(y_prd[0:10]))
len(y_prd)

[[1.0000000e+00]
 [9.9999988e-01]
 [1.0000000e+00]
 ...
 [4.7378212e-09]
 [9.9999988e-01]
 [9.9999988e-01]]
First ten predicted label and true label of test data
[1 1 1 1 0 1 1 0 1 0]


8849

# Predict on test data and write out to csv file:

In [None]:
df_test.id

0       35395
1       35396
2       35397
3       35398
4       35399
        ...  
8844    44239
8845    44240
8846    44241
8847    44242
8848    44243
Name: id, Length: 8849, dtype: int64

In [None]:
# Pair every test id with its predicted label: one row per article, columns (id, is_fake).
predict = np.column_stack((df_test.id, y_prd))
predict.shape

(8849, 2)

In [None]:
import csv

# Write the submission file: a header row followed by one (id, is_fake) row per test article.
# newline='' is what the csv module requires for files it writes to; without it,
# platforms that translate line endings produce a blank line after every row.
with open('Annetta Qi.csv', mode='w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['id','is_fake'])
    writer.writerows(predict)