In [24]:
#!unzip -a '/content/drive/My Drive/NLP/input/fake-news.zip'

In [25]:
!pip install texthero

In [26]:
import pandas as pd
import tensorflow as tf
import numpy as np
import tensorflow_hub as hub
import gensim
from gensim import downloader as api
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from tensorflow.keras import Sequential, layers
from tqdm.notebook import tqdm
import texthero as hero
import warnings
warnings.filterwarnings('ignore')

## Loading data and creating folds

In [27]:
# Read the training set from train.csv in the working directory and preview
# the first two rows.
data = pd.read_csv('./train.csv')
data.head(2)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0


In [28]:
data.shape

(20800, 5)

In [29]:
# Keep only the article body and its label. Take an explicit copy: later cells
# add and overwrite columns on `df`, and doing that on a selection of `data`
# is a chained assignment that pandas warns about (SettingWithCopyWarning).
df = data[['text', 'label']].copy()

In [30]:
def create_folds(df, n_splits=10, label_col='label'):
  """Assign every row of ``df`` to a stratified cross-validation fold.

  Args:
    df: DataFrame that contains the class labels in ``label_col``.
    n_splits: Number of folds handed to ``StratifiedKFold``.
    label_col: Name of the column to stratify on.

  Returns:
    A copy of ``df`` with an extra integer ``kfold`` column. Each row holds
    the number of the fold in which that row belongs to the validation split.
    The frame passed in is left unmodified.
  """
  splitter = StratifiedKFold(n_splits=n_splits)

  # split() yields *positional* row indices. Collect the assignment in a
  # positional array instead of writing through label-based .loc, which is
  # only equivalent when the frame has a default 0..n-1 index.
  fold_ids = np.full(len(df), -1, dtype=np.int64)
  for fold, (_, val_positions) in enumerate(splitter.split(df, df[label_col])):
    fold_ids[val_positions] = fold

  # Attach the column to a copy so the caller's frame (possibly a selection of
  # another frame) is never written to.
  folded = df.copy()
  folded['kfold'] = fold_ids
  return folded

In [31]:
# Rebind `df` to the frame returned by create_folds, which carries the
# per-row `kfold` assignment.
df = create_folds(df)

In [32]:
df.head(2)

Unnamed: 0,text,label,kfold
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1,0
1,Ever get the feeling your life circles the rou...,0,0


## Preprocessing the data

In [33]:
df.head(2)

Unnamed: 0,text,label,kfold
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1,0
1,Ever get the feeling your life circles the rou...,0,0


In [34]:
# Overwrite the raw article text with texthero's cleaned version and preview
# the result (the preview shows lower-cased text without punctuation).
# NOTE(review): this replaces the column in place, so the raw text is only
# recoverable from `data` after this cell has run.
df['text'] = hero.clean(df['text'])
df.head(2)

Unnamed: 0,text,label,kfold
0,house dem aide even see comey letter jason cha...,1,0
1,ever get feeling life circles roundabout rathe...,0,0


In [35]:
# Learn a word -> integer-id vocabulary from the cleaned articles.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(df['text'])

# Encode every article as a list of word ids and pad them into one 2-D array.
# No maxlen is given, so every row is padded to the longest article.
text_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(df['text'])
)

# Rows = number of articles, columns = padded sequence length.
num_record, max_seqlen = text_sequences.shape

In [36]:
# One-hot encode the `label` column into NUM_CLASSES columns.
NUM_CLASSES = 2
labels = tf.keras.utils.to_categorical(df['label'], NUM_CLASSES)

In [37]:
# Build the lookup tables from a *copy* of the tokenizer's vocabulary, so that
# registering the padding entry does not modify tokenizer.word_index itself.
# Id 0 is reserved for padding (pad_sequences pads with 0 by default).
word2idx = {**tokenizer.word_index, 'PAD': 0}
idx2word = {idx: word for word, idx in word2idx.items()}
vocab_size = len(word2idx)

In [38]:
BATCH_SIZE = 128

# Pair each padded sequence with its one-hot label and group the pairs into
# fixed-size batches; a final batch smaller than BATCH_SIZE is dropped.
dataset = (
    tf.data.Dataset
    .from_tensor_slices((text_sequences, labels))
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [39]:
def build_embedding_matrix(sequences, word2idx, embedding_dim, word_vectors):
  """Build an embedding matrix whose row ``i`` is the pretrained vector of word ``i``.

  Args:
    sequences: Unused; kept in the signature so existing callers keep working.
    word2idx: Mapping from word to its integer row index.
    embedding_dim: Length of each word vector.
    word_vectors: Source of pretrained vectors. Either an object that exposes
      them through a ``wv`` attribute, or an object that is indexed directly
      by word. Looking up an unknown word must raise ``KeyError``.

  Returns:
    ``np.ndarray`` of shape ``(largest index + 1, embedding_dim)``. Rows of
    words that have no pretrained vector are left as zeros.
  """
  # A full gensim model keeps its vectors under `.wv`; the KeyedVectors object
  # returned by gensim's downloader is indexed directly (gensim 4 removed its
  # `.wv` alias). Accept both instead of failing with AttributeError.
  vectors = getattr(word_vectors, 'wv', word_vectors)

  # Size the matrix from the mapping that was passed in rather than from a
  # notebook-level global, so every index in word2idx is a valid row.
  num_rows = max(word2idx.values(), default=-1) + 1
  E = np.zeros((num_rows, embedding_dim))

  for word, idx in word2idx.items():
    try:
      E[idx] = vectors[word]
    except KeyError:
      # Word is not in the pretrained vocabulary: keep its all-zero row.
      continue

  return E

In [40]:
api.info()['models'].keys()

dict_keys(['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis'])

In [41]:
# Load the 'glove-wiki-gigaword-100' entry through gensim's downloader API
# (one of the model names listed by api.info() above).
word_vectors = api.load('glove-wiki-gigaword-100')

In [42]:
# Pretrained embedding matrix: one row per entry of word2idx, 100 columns.
E = build_embedding_matrix(
    sequences=text_sequences,
    word2idx=word2idx,
    embedding_dim=100,
    word_vectors=word_vectors,
)

## Building model

In [43]:
class Fake_News_Detection(tf.keras.Model):
  """Embedding -> LSTM -> Dense classifier over padded token-id sequences.

  Args:
    vocab_size: Number of rows of the embedding table.
    embed_size: Length of each embedding vector.
    input_length: Length of the input sequences; forwarded to the Embedding
      layer.
    lstm_nodes: Number of units of the LSTM layer.
    output_size: Number of units of the final sigmoid Dense layer.
    run_mode: How the embedding table is set up:
      ``'scratch'``    - default initialisation, trainable;
      ``'pretrained'`` - initialised from ``embedding_weights``, frozen;
      ``'finetune'``   - initialised from ``embedding_weights``, trainable.
    embedding_weights: Array of shape ``(vocab_size, embed_size)``. Ignored
      when ``run_mode`` is ``'scratch'``.
    **kwargs: Forwarded to ``tf.keras.Model``.

  Raises:
    ValueError: If ``run_mode`` is not one of the three modes above, or if a
      mode that needs pretrained vectors is given ``embedding_weights=None``.
  """

  def __init__(self, vocab_size, embed_size, input_length, lstm_nodes, output_size, run_mode, embedding_weights, **kwargs):

    super(Fake_News_Detection, self).__init__(**kwargs)

    # Resolve the mode up front. An unknown mode used to leave self.embedding
    # undefined and only failed later, inside call().
    if run_mode == 'scratch':
      use_pretrained, trainable = False, True
    elif run_mode == 'pretrained':
      use_pretrained, trainable = True, False
    elif run_mode == 'finetune':
      use_pretrained, trainable = True, True
    else:
      raise ValueError(
          "run_mode must be 'scratch', 'pretrained' or 'finetune', got %r" % (run_mode,))

    embedding_kwargs = {}
    if use_pretrained:
      if embedding_weights is None:
        raise ValueError(
            "embedding_weights is required when run_mode is %r" % (run_mode,))
      # Seed the table through an initializer: the legacy `weights=[...]`
      # constructor keyword is not accepted by every Keras version, while a
      # Constant initializer produces the same starting values.
      embedding_kwargs['embeddings_initializer'] = tf.keras.initializers.Constant(embedding_weights)

    self.embedding = tf.keras.layers.Embedding(vocab_size,
                                               embed_size,
                                               input_length=input_length,
                                               trainable=trainable,
                                               **embedding_kwargs)

    self.lstm = tf.keras.layers.LSTM(lstm_nodes)
    self.dense = tf.keras.layers.Dense(output_size, activation='sigmoid')

  def call(self, x):
    """Run a batch of token-id sequences through embedding, LSTM and Dense."""
    x = self.embedding(x)
    x = self.lstm(x)
    x = self.dense(x)
    return x

In [None]:
# Instantiate the classifier in 'finetune' mode: the embedding table starts
# from E and stays trainable.
model = Fake_News_Detection(
    vocab_size=vocab_size,
    embed_size=100,
    input_length=max_seqlen,
    lstm_nodes=128,
    output_size=NUM_CLASSES,
    run_mode='finetune',
    embedding_weights=E,
)

In [None]:
# Build the model for inputs of max_seqlen token ids per example; the batch
# dimension is left unspecified (None).
model.build(input_shape=(None, max_seqlen))

In [None]:
# Adam optimizer with binary cross-entropy, tracking accuracy.
# NOTE(review): the output layer is a NUM_CLASSES-unit sigmoid and the labels
# are one-hot, so this loss is applied to each output unit separately; confirm
# that is intended rather than softmax + categorical cross-entropy.
model.compile('adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train for three epochs over the batched dataset.
# NOTE(review): in the cells shown here the dataset is built from all rows and
# the `kfold` column is not used to hold out a validation split.
model.fit(dataset, epochs=3)