## Loading Libraries and preprocessing data

In [2]:
!unzip -a '/content/drive/My Drive/NLP/input/fake-news.zip'

Archive:  /content/drive/My Drive/NLP/input/fake-news.zip
  inflating: submit.csv              [binary]
  inflating: test.csv                [binary]
  inflating: train.csv               [binary]


In [4]:
!pip install texthero

Collecting texthero
  Downloading https://files.pythonhosted.org/packages/1f/5a/a9d33b799fe53011de79d140ad6d86c440a2da1ae8a7b24e851ee2f8bde8/texthero-1.0.9-py3-none-any.whl
Collecting unidecode>=1.1.1
[?25l  Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaaedd0b0ad964/Unidecode-1.1.1-py2.py3-none-any.whl (238kB)
[K     |████████████████████████████████| 245kB 14.0MB/s 
Collecting nltk>=3.3
[?25l  Downloading https://files.pythonhosted.org/packages/92/75/ce35194d8e3022203cca0d2f896dbb88689f9b3fce8e9f9cff942913519d/nltk-3.5.zip (1.4MB)
[K     |████████████████████████████████| 1.4MB 38.4MB/s 
Building wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Created wheel for nltk: filename=nltk-3.5-cp36-none-any.whl size=1434674 sha256=84cba267fc0bd6ffe92c80128c9afec9c3ef3e128d2d99faa7565a6469545037
  Stored in directory: /root/.cache/pip/wheels/ae/8c/3f/b1fe0ba04555b08b57ab52ab7f86023639

In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
import gensim.downloader as api
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import nltk
import warnings
from tensorflow.keras.utils import plot_model
from tensorflow.keras import Sequential, layers, losses, callbacks, metrics, Model
from tensorflow.keras.layers import Input, LSTM, Flatten, BatchNormalization, Dense, Activation, Concatenate
import tensorflow_hub as hub
import tensorflow.keras.backend as K
import texthero as hero
from tqdm.notebook import tqdm
import operator 
# Silence all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')
# Register tqdm's pandas integration (progress_apply and friends).
tqdm.pandas()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# Load the training split extracted from the archive and preview the first two rows.
df = pd.read_csv('train.csv')
df.head(n=2)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0


### Cleaning text

In [5]:
# Leftover interactive help lookup (IPython `?`); it only displays documentation.
?BatchNormalization

In [3]:
# Leftover interactive help lookup (IPython `?`); it only displays documentation.
?hero.clean

In [4]:
# Run texthero's default cleaning pipeline over the article bodies and store the
# result back in the same column (the raw text is overwritten).
df['text'] = df['text'].pipe(hero.clean)
df.head(n=2)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,house dem aide even see comey letter jason cha...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,ever get feeling life circles roundabout rathe...,0


In [1]:
# Mount Google Drive in Colab. The unzip cell reads the dataset archive from
# /content/drive, so on a fresh runtime this cell has to be executed before it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Tokenization

In [None]:
# Split every cleaned article into a list of tokens and preview the first ten rows.
text = df['text'].pipe(hero.tokenize)
text.head(10)

0    [house, dem, aide, even, see, comey, letter, j...
1    [ever, get, feeling, life, circles, roundabout...
2    [truth, might, get, fired, october, tension, i...
3    [videos, civilians, killed, single, us, airstr...
4    [print, iranian, woman, sentenced, six, years,...
5    [trying, times, jackie, mason, voice, reason, ...
6    [ever, wonder, britain, iconic, pop, pianist, ...
7    [paris, france, chose, idealistic, traditional...
8    [donald, j, trump, scheduled, make, highly, an...
9    [week, michael, flynn, resigned, national, sec...
Name: text, dtype: object

### Embeddings

In [None]:
# List the names of the pre-trained embedding models offered by gensim's downloader.
api.info()['models'].keys()

dict_keys(['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis'])

In [None]:
# Fetch the 'glove-wiki-gigaword-100' vectors through gensim's downloader.
glove_model = api.load('glove-wiki-gigaword-100')



In [None]:
def build_vocab(sentences, verbose=True):
    """
    Count how often every word occurs across all tokenized sentences.

    :param sentences: iterable of sentences, each an iterable of words
    :param verbose: when False the tqdm progress bar is disabled
    :return: dict mapping each word to its number of occurrences
    """
    counts = {}
    progress = tqdm(sentences, disable=(not verbose))
    for tokens in progress:
        for token in tokens:
            # First sighting starts from 0, later sightings increment the stored count.
            counts[token] = counts.get(token, 0) + 1
    return counts

In [None]:
def check_coverage(vocab,embeddings_index):
    """
    Report how much of a vocabulary is covered by an embedding lookup.

    Prints the share of distinct words and the share of all word occurrences
    for which ``embeddings_index[word]`` succeeds. An empty vocabulary is
    reported as 0.00% coverage instead of raising ZeroDivisionError.

    :param vocab: dict mapping word -> occurrence count
    :param embeddings_index: object supporting ``embeddings_index[word]`` that
        raises KeyError for unknown words; any other exception propagates
    :return: list of (word, count) pairs for the words without an embedding,
        highest count first
    """
    oov = {}
    known_words = 0   # distinct words that have an embedding
    known_count = 0   # occurrences of words that have an embedding
    oov_count = 0     # occurrences of words without an embedding
    for word in tqdm(vocab):
        try:
            # Only the lookup itself matters; the vector is not kept.
            embeddings_index[word]
        except KeyError:
            oov[word] = vocab[word]
            oov_count += vocab[word]
        else:
            known_words += 1
            known_count += vocab[word]

    total_count = known_count + oov_count
    vocab_share = known_words / len(vocab) if len(vocab) else 0.0
    text_share = known_count / total_count if total_count else 0.0

    print('Found embeddings for {:.2%} of vocab'.format(vocab_share))
    print('Found embeddings for  {:.2%} of all text'.format(text_share))

    # Ascending stable sort followed by a reversal, so ties keep the original ordering.
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

In [None]:
def create_embeddings(model, data):
    """
    Embed every tokenized document as the sum of its known word vectors.

    :param model: embedding lookup exposing ``vector_size`` and supporting
        ``word in ...`` / ``...[word]`` either directly (e.g. gensim KeyedVectors)
        or through a ``.wv`` attribute (e.g. a full gensim model)
    :param data: sized sequence of tokenized documents (each an iterable of words)
    :return: float array of shape (len(data), model.vector_size); a document
        with no known word stays an all-zero row
    """
    # Use `.wv` when the object has one, otherwise use the object itself, so
    # plain KeyedVectors without that attribute are supported too.
    keyed_vectors = getattr(model, 'wv', model)

    # Width comes from the model rather than a hard-coded 100.
    embedded_text = np.zeros((len(data), model.vector_size))

    for index, sent in tqdm(enumerate(data), total=len(data)):
        for word in sent:
            if word in keyed_vectors:
                # Unknown words are skipped; known vectors are summed (not averaged).
                embedded_text[index] += keyed_vectors[word]

    return embedded_text

In [None]:
# Count token frequencies over the whole corpus, then report how much of that
# vocabulary GloVe covers; `oov` holds the uncovered words, most frequent first.
vocab = build_vocab(text)
oov = check_coverage(vocab, glove_model)

HBox(children=(FloatProgress(value=0.0, max=20800.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=176203.0), HTML(value='')))


Found embeddings for 57.40% of vocab
Found embeddings for  97.93% of all text


In [None]:
# One summed GloVe vector per article (rows follow the order of `text`).
e_text = create_embeddings(glove_model, text)

HBox(children=(FloatProgress(value=0.0, max=20800.0), HTML(value='')))




In [None]:
# First 17000 rows train, the remainder validates. A trailing axis is appended so each
# row becomes a (vector_size, 1) array; row count and width are taken from e_text
# itself instead of the hard-coded 3800 and 100.
e_text_train = e_text[:17000, :, np.newaxis]
e_text_valid = e_text[17000:, :, np.newaxis]

In [None]:
# Target column; sliced with the same 17000-row boundary as the features in the fit cells.
label = df['label']

## SimpleRNN Model

In [None]:
# SimpleRNN over inputs of shape (100, 1), followed by a sigmoid output unit.
rnn_model = Sequential()
rnn_model.add(layers.SimpleRNN(128, input_shape=(100,1)))
rnn_model.add(layers.Dense(1, activation='sigmoid'))

In [None]:
# Binary classification setup: cross-entropy loss, Adam, accuracy metric.
rnn_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train on the first 17000 rows, validate on the rest.
rnn_model.fit(
    x=e_text_train,
    y=label[:17000],
    epochs=3,
    validation_data=(e_text_valid, label[17000:]),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f7f57c99550>

## LSTM Model

In [None]:
# Same layout as the SimpleRNN baseline with the recurrent layer swapped for an LSTM.
lstm_model = Sequential()
lstm_model.add(layers.LSTM(128, input_shape=(100,1)))
lstm_model.add(layers.Dense(1, activation='sigmoid'))

In [None]:
# Binary classification setup: cross-entropy loss, Adam, accuracy metric.
lstm_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train on the first 17000 rows, validate on the rest.
lstm_model.fit(
    x=e_text_train,
    y=label[:17000],
    epochs=3,
    validation_data=(e_text_valid, label[17000:]),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f7f54258e48>

## GRU model

In [None]:
# Same layout as the SimpleRNN baseline with the recurrent layer swapped for a GRU.
gru_model = Sequential()
gru_model.add(layers.GRU(128, input_shape=(100,1)))
gru_model.add(layers.Dense(1, activation='sigmoid'))

In [None]:
# Binary classification setup: cross-entropy loss, Adam, accuracy metric.
gru_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train on the first 17000 rows, validate on the rest.
gru_model.fit(
    x=e_text_train,
    y=label[:17000],
    epochs=3,
    validation_data=(e_text_valid, label[17000:]),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f7f52c5f860>

## Bidirectional LSTM

In [None]:
# Bidirectional LSTM over inputs of shape (100, 1), followed by a sigmoid output unit.
bilstm_model = Sequential()
bilstm_model.add(layers.Input(shape=(100,1)))
bilstm_model.add(layers.Bidirectional(layers.LSTM(128)))
bilstm_model.add(layers.Dense(1, activation='sigmoid'))

In [None]:
# Binary classification setup: cross-entropy loss, Adam, accuracy metric.
bilstm_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train on the first 17000 rows, validate on the rest.
bilstm_model.fit(
    x=e_text_train,
    y=label[:17000],
    epochs=3,
    validation_data=(e_text_valid, label[17000:]),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f7f414ff6d8>

## Bidirectional GRU

In [None]:
# Bidirectional GRU over inputs of shape (100, 1), followed by a sigmoid output unit.
bigru_model = Sequential()
bigru_model.add(layers.Input(shape=(100,1)))
bigru_model.add(layers.Bidirectional(layers.GRU(128)))
bigru_model.add(layers.Dense(1, activation='sigmoid'))

In [None]:
# Binary classification setup: cross-entropy loss, Adam, accuracy metric.
bigru_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train on the first 17000 rows, validate on the rest.
bigru_model.fit(
    x=e_text_train,
    y=label[:17000],
    epochs=3,
    validation_data=(e_text_valid, label[17000:]),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f7f3fa53588>

## CNN along with LSTM

In [None]:
# LSTM emitting its full output sequence, a width-3 Conv1D over that sequence,
# then flatten into a single sigmoid unit.
combined_model = Sequential()
combined_model.add(layers.Input(shape=(100,1)))
combined_model.add(layers.LSTM(128, return_sequences=True))
combined_model.add(layers.Conv1D(64, 3))
combined_model.add(layers.Flatten())
combined_model.add(layers.Dense(1, activation='sigmoid'))

In [None]:
# Binary classification setup: cross-entropy loss, Adam, accuracy metric.
combined_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train on the first 17000 rows, validate on the rest.
combined_model.fit(
    x=e_text_train,
    y=label[:17000],
    epochs=3,
    validation_data=(e_text_valid, label[17000:]),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f7d883ddfd0>

## CNN with GRU

In [None]:
# GRU emitting its full output sequence, a width-3 Conv1D over that sequence,
# then flatten into a single sigmoid unit.
combined_model = Sequential()
combined_model.add(layers.Input(shape=(100,1)))
combined_model.add(layers.GRU(128, return_sequences=True))
combined_model.add(layers.Conv1D(64, 3))
combined_model.add(layers.Flatten())
combined_model.add(layers.Dense(1, activation='sigmoid'))

In [None]:
# Binary classification setup: cross-entropy loss, Adam, accuracy metric.
combined_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train on the first 17000 rows, validate on the rest.
combined_model.fit(
    x=e_text_train,
    y=label[:17000],
    epochs=3,
    validation_data=(e_text_valid, label[17000:]),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f7f3cb60c50>

## CNN with Bidirectional LSTM

In [None]:
# Bidirectional LSTM emitting its full output sequence, a width-3 Conv1D over that
# sequence, then flatten into a single sigmoid unit.
combined_model = Sequential()
combined_model.add(layers.Input(shape=(100,1)))
combined_model.add(layers.Bidirectional(layers.LSTM(128, return_sequences=True)))
combined_model.add(layers.Conv1D(64, 3))
combined_model.add(layers.Flatten())
combined_model.add(layers.Dense(1, activation='sigmoid'))

In [None]:
# Binary classification setup: cross-entropy loss, Adam, accuracy metric.
combined_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train on the first 17000 rows, validate on the rest.
combined_model.fit(
    x=e_text_train,
    y=label[:17000],
    epochs=3,
    validation_data=(e_text_valid, label[17000:]),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f7f3b472cc0>

## CNN with Bidirectional GRU

In [None]:
# Bidirectional GRU emitting its full output sequence, a width-3 Conv1D over that
# sequence, then flatten into a single sigmoid unit.
combined_model = Sequential()
combined_model.add(layers.Input(shape=(100,1)))
combined_model.add(layers.Bidirectional(layers.GRU(128, return_sequences=True)))
combined_model.add(layers.Conv1D(64, 3))
combined_model.add(layers.Flatten())
combined_model.add(layers.Dense(1, activation='sigmoid'))

In [None]:
# Binary classification setup: cross-entropy loss, Adam, accuracy metric.
combined_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train on the first 17000 rows, validate on the rest.
combined_model.fit(
    x=e_text_train,
    y=label[:17000],
    epochs=3,
    validation_data=(e_text_valid, label[17000:]),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f7f3a104828>

## Using custom embeddings

In [None]:
df.head(2)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,house dem aide even see comey letter jason cha...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,ever get feeling life circles roundabout rathe...,0


In [None]:
# Fit a Keras tokenizer on the token lists (10000-word cap, '<OOV>' placeholder),
# then encode every article as integer ids padded/truncated to 2000 positions.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(text)
sequence = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(text),
    maxlen=2000,
)

In [None]:
# Size of the id space the Embedding layers must cover. word_index ids start at 1
# (0 is the padding id), and when the tokenizer has a num_words cap it only emits
# ids below that cap, so the embedding table does not need a row for every word
# seen during fitting.
full_vocab_size = len(tokenizer.word_index) + 1
vocab_size = min(tokenizer.num_words or full_vocab_size, full_vocab_size)
# Padded sequence length (all rows share it after pad_sequences).
max_len = len(sequence[0])

In [None]:
# Same 17000-row boundary as the label slices used in the fit calls.
train_seq, valid_seq = sequence[:17000], sequence[17000:]

### RNN model

In [None]:
# Trainable 20-dimensional embedding feeding a SimpleRNN and a sigmoid output unit.
rnn_model = Sequential()
rnn_model.add(layers.Embedding(vocab_size, 20))
rnn_model.add(layers.SimpleRNN(128))
rnn_model.add(layers.Dense(1, activation='sigmoid'))

In [None]:
# Binary classification setup: cross-entropy loss, Adam, accuracy metric.
rnn_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train on the first 17000 padded sequences, validate on the rest.
rnn_model.fit(
    x=train_seq,
    y=label[:17000],
    epochs=3,
    validation_data=(valid_seq, label[17000:]),
)

### LSTM model

In [None]:
# Trainable 20-dimensional embedding feeding an LSTM and a sigmoid output unit.
lstm_model = Sequential()
lstm_model.add(layers.Embedding(vocab_size, 20))
lstm_model.add(layers.LSTM(128))
lstm_model.add(layers.Dense(1, activation='sigmoid'))

lstm_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train on the first 17000 padded sequences, validate on the rest.
lstm_model.fit(
    x=train_seq,
    y=label[:17000],
    epochs=3,
    validation_data=(valid_seq, label[17000:]),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f7db66ee6a0>

### Bidirectional LSTM

In [None]:
# Trainable 20-dimensional embedding feeding a bidirectional LSTM and a sigmoid output unit.
bilstm_model = Sequential()
bilstm_model.add(layers.Embedding(vocab_size, 20))
bilstm_model.add(layers.Bidirectional(layers.LSTM(128)))
bilstm_model.add(layers.Dense(1, activation='sigmoid'))

bilstm_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train on the first 17000 padded sequences, validate on the rest.
bilstm_model.fit(
    x=train_seq,
    y=label[:17000],
    epochs=3,
    validation_data=(valid_seq, label[17000:]),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f7db45285c0>

### CNN with LSTM

In [None]:
# Embedding -> width-3 Conv1D -> LSTM -> sigmoid unit. The LSTM returns only its
# final output here, so the Flatten layer receives an already flat tensor.
combined_model = Sequential()
combined_model.add(layers.Embedding(vocab_size, 20))
combined_model.add(layers.Conv1D(64, 3))
combined_model.add(layers.LSTM(128))
combined_model.add(layers.Flatten())
combined_model.add(layers.Dense(1, activation='sigmoid'))

combined_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train on the first 17000 padded sequences, validate on the rest.
combined_model.fit(
    x=train_seq,
    y=label[:17000],
    epochs=3,
    validation_data=(valid_seq, label[17000:]),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f7d85d2afd0>

### CNN with Bidirectional LSTM

In [None]:
# Embedding -> width-3 Conv1D -> bidirectional LSTM -> sigmoid unit. The LSTM returns
# only its final output here, so the Flatten layer receives an already flat tensor.
combined_model = Sequential()
combined_model.add(layers.Embedding(vocab_size, 20))
combined_model.add(layers.Conv1D(64, 3))
combined_model.add(layers.Bidirectional(layers.LSTM(128)))
combined_model.add(layers.Flatten())
combined_model.add(layers.Dense(1, activation='sigmoid'))

combined_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train on the first 17000 padded sequences, validate on the rest.
combined_model.fit(
    x=train_seq,
    y=label[:17000],
    epochs=3,
    validation_data=(valid_seq, label[17000:]),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f7d8258c5f8>

## Using Encoder Based Embeddings

In [None]:
# TF-Hub handle of the Universal Sentence Encoder (version 4).
# NOTE(review): the name `model` is rebound to a Keras model in a later cell, so
# re-running the KerasLayer cell after that point would pass that object instead of this URL.
model = "https://tfhub.dev/google/universal-sentence-encoder/4"

In [None]:
# Wrap the encoder as a Keras layer taking scalar strings; trainable=True means its
# weights are updated during fit as well.
hub_layer = hub.KerasLayer(
    model,
    input_shape=[],
    dtype=tf.string,
    trainable=True,
)
# Smoke test on the first three entries.
hub_layer(text[:3])













<tf.Tensor: shape=(3, 512), dtype=float32, numpy=
array([[ 0.04574309, -0.0457449 ,  0.04199997, ..., -0.04574504,
        -0.04574428,  0.04574506],
       [ 0.04512052, -0.04613996, -0.04611556, ..., -0.04613769,
        -0.04613996,  0.04613031],
       [ 0.04513958, -0.04513403,  0.04513819, ..., -0.0451396 ,
        -0.0451396 ,  0.0451396 ]], dtype=float32)>

In [None]:
# Sentence encoder -> 16-unit ReLU layer -> single output unit without activation
# (the output is a logit).
model = tf.keras.Sequential([
    hub_layer,
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1),
])

model.summary()

Model: "sequential_64"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer_20 (KerasLayer)  (None, 512)               256797824 
_________________________________________________________________
dense_68 (Dense)             (None, 16)                8208      
_________________________________________________________________
dense_69 (Dense)             (None, 1)                 17        
Total params: 256,806,049
Trainable params: 256,806,049
Non-trainable params: 0
_________________________________________________________________


In [None]:
# The loss consumes raw logits (from_logits=True), so accuracy thresholds them at 0.0.
model.compile(
    loss=tf.losses.BinaryCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=[tf.metrics.BinaryAccuracy(threshold=0.0, name='accuracy')],
)

In [None]:
# Use the same split as every other section of the notebook: the first 17000 rows
# train and the remaining rows validate. (The slices were previously swapped, which
# trained on the small tail and validated on the 17000-row head.)
# NOTE(review): `text` holds the token lists produced by hero.tokenize — confirm this
# is the input form the sentence encoder is meant to receive.
partial_x_train = text[:17000]
x_val = text[17000:]

partial_y_train = label[:17000]
y_val = label[17000:]

In [None]:
# Three epochs on the training slice, evaluated on the validation slice after each one.
history = model.fit(
    x=partial_x_train,
    y=partial_y_train,
    validation_data=(x_val, y_val),
    epochs=3,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
# Same hub_layer object as the previous model (its weights are shared), with the
# 512-dimensional sentence vector reshaped to (512, 1) and fed through a 16-unit LSTM
# before the single logit output.
model = tf.keras.Sequential([
    hub_layer,
    tf.keras.layers.Reshape((512, 1)),
    tf.keras.layers.LSTM(16),
    tf.keras.layers.Dense(1),
])

model.summary()

Model: "sequential_68"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer_20 (KerasLayer)  (None, 512)               256797824 
_________________________________________________________________
reshape_3 (Reshape)          (None, 512, 1)            0         
_________________________________________________________________
lstm_54 (LSTM)               (None, 16)                1152      
_________________________________________________________________
dense_70 (Dense)             (None, 1)                 17        
Total params: 256,798,993
Trainable params: 256,798,993
Non-trainable params: 0
_________________________________________________________________


In [None]:
# The loss consumes raw logits (from_logits=True), so accuracy thresholds them at 0.0.
model.compile(
    loss=tf.losses.BinaryCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=[tf.metrics.BinaryAccuracy(threshold=0.0, name='accuracy')],
)

In [None]:
# Three epochs on the training slice, evaluated on the validation slice after each one.
history = model.fit(
    x=partial_x_train,
    y=partial_y_train,
    validation_data=(x_val, y_val),
    epochs=3,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


**Using TF-Hub gave great results, even without any cleaning of the text. It did show overfitting, but that could be handled.**