<a href="https://colab.research.google.com/github/Ducksss/FakeNews/blob/main/FakeNewsModel_Train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

> Note: Make sure to use a high-RAM runtime; otherwise the session may crash because of the model's size.

In [8]:
# Install the tensorflow_text package into the notebook runtime.
# NOTE(review): the version is unpinned (the recorded run resolved 2.6.0) and no
# cell visible in this notebook imports tensorflow_text -- confirm it is needed.
!pip install tensorflow_text

Collecting tensorflow_text
  Downloading tensorflow_text-2.6.0-cp37-cp37m-manylinux1_x86_64.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 7.1 MB/s 
Installing collected packages: tensorflow-text
Successfully installed tensorflow-text-2.6.0


In [64]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.layers as tfl
import zipfile
from tensorflow.keras import Sequential, Input
from tensorflow.keras.utils import get_file
from sklearn.model_selection import train_test_split
# Rows per batch: passed to make_csv_dataset and used as the default
# `batch_size` argument of train_model.
BATCH_SIZE = 64

In [67]:
# Location of the labelled headlines CSV (presumably on a mounted Google Drive;
# no visible cell performs the mount).
# NOTE: despite its name, `dataset_dir` is a file path, not a directory.
dataset_dir = "/content/drive/MyDrive/Data/NTU MLDA Hackathon 2021/fakeNews.csv"
# Load the whole CSV into pandas; used below for dataset statistics only.
df = pd.read_csv(dataset_dir)
df.head()

Unnamed: 0,title,isFakeNews,src
0,Coronavirus was created in a government lab as...,1.0,COVID-19-rumor-dataset
1,The lie that coronavirus came from a bat or a ...,1.0,COVID-19-rumor-dataset
2,The health experts had predicted the virus cou...,1.0,COVID-19-rumor-dataset
3,A video clip supposedly showed that the expone...,1.0,COVID-19-rumor-dataset
4,Almost 200 people in Italy died from the coron...,0.0,COVID-19-rumor-dataset


In [68]:
# Longest title in the CSV, counted in whitespace-separated tokens; later used
# as the fixed output length of the text vectoriser.
max_seqlen = df["title"].map(lambda title: len(title.split())).max()
# Upper bound on the vocabulary size (and on the number of embedding rows).
max_words = 100_000

In [69]:
# Total number of rows in the CSV; the split sizes are derived from it.
dataset_len = df.shape[0]
dataset_len

73653

In [70]:
def train_test_split(dataset, dataset_len, val_split=0.2, shuffle=True, shuffle_size=50000):
    """Split a ``tf.data.Dataset`` into a leading and a trailing partition.

    NOTE: this function shadows ``sklearn.model_selection.train_test_split``,
    which is imported at the top of the notebook.

    Args:
        dataset: Source dataset. If its elements are dicts containing the
            ``"title"`` and ``"isFakeNews"`` entries they are converted to
            ``(title, label)`` tuples; any other element structure (e.g. the
            tuples returned by a previous call) is passed through unchanged.
        dataset_len: Number of elements in ``dataset``. ``take``/``skip``
            count dataset elements, so this must be expressed in whatever
            unit the dataset yields (rows for an unbatched dataset, batches
            for a batched one).
        val_split: Fraction of ``dataset_len`` assigned to the second
            partition.
        shuffle: Whether to shuffle before splitting.
        shuffle_size: Size of the shuffle buffer.

    Returns:
        ``(train_ds, val_ds)``; both are cached and prefetched.
    """
    if shuffle:
        # Freeze the shuffled order. With the default (reshuffle on every
        # iteration) the two partitions are drawn from different
        # permutations each time they are iterated, so take()/skip() would
        # not produce disjoint subsets.
        dataset = dataset.shuffle(shuffle_size, seed=42, reshuffle_each_iteration=False)

    train_size = int((1 - val_split) * dataset_len)
    val_size = int(val_split * dataset_len)

    train_ds = dataset.take(train_size)
    val_ds = dataset.skip(train_size).take(val_size)

    # Decide from the element structure whether the dict -> tuple conversion
    # applies, instead of attempting it and swallowing every exception.
    spec = dataset.element_spec
    if isinstance(spec, dict) and "title" in spec and "isFakeNews" in spec:
        def to_pair(row):
            return row["title"], row["isFakeNews"]

        train_ds = train_ds.map(to_pair)
        val_ds = val_ds.map(to_pair)

    train_ds = train_ds.cache().prefetch(buffer_size=tf.data.AUTOTUNE)
    val_ds = val_ds.cache().prefetch(buffer_size=tf.data.AUTOTUNE)
    return train_ds, val_ds

In [71]:
# Read the CSV exactly once and in file order, then flatten the reader's
# batches back into single rows. By default make_csv_dataset repeats forever
# and shuffles internally, and it always yields batches -- but the split sizes
# passed to train_test_split are row counts, so the dataset has to yield one
# finite pass of individual rows for take()/skip() to mean what is intended.
ds = tf.data.experimental.make_csv_dataset(
    dataset_dir,
    select_columns=["title", "isFakeNews"],
    batch_size=BATCH_SIZE,
    num_epochs=1,
    shuffle=False,
).unbatch()

# 80 % training rows, then the remaining 20 % is halved into validation / test.
train_ds, val_ds = train_test_split(ds, dataset_len)
val_ds, test_ds = train_test_split(val_ds, int(dataset_len * 0.2), val_split=0.5)

# Batch only after splitting so that the sizes above are counted in rows.
train_ds = train_ds.batch(BATCH_SIZE)
val_ds = val_ds.batch(BATCH_SIZE)
test_ds = test_ds.batch(BATCH_SIZE)

## FakeNewsNet

In [73]:
def text_preprocessor(text):
  """Standardise raw text for the vectoriser.

  Steps, in order: lower-case, strip surrounding whitespace, drop HTML tags,
  drop ASCII punctuation, drop English stop words. Whitespace left behind by
  removed words is not collapsed.

  Args:
    text: A string or string tensor accepted by ``tf.strings`` ops.

  Returns:
    The cleaned string tensor.
  """
  punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
  stopwords = {'whom', 'all', 'shouldn', 'wouldn', 'how', 's', 'they', 'were', 'mustn', 'after', 'who', 'its', 'our', 't', 'a', 'very', 'an', 'do', 'be', 'to', 'can', 'had', 'i', 'these', 'himself', 'up', 'just', 'them', 'now', 'has', 'too', 'below', 'did', 'shan', 'until', 'during', 'him', 'into', 'have', "you'd", 'haven', 'theirs', 'ourselves', 'once', "isn't", 'than', "it's", 'wasn', 'yours', "mightn't", 'here', 'ours', 'her', 'doing', 'd', 'yourself', 'y', 'before', 'does', 'then', 'between', 'some', 'with', "needn't", 'but', 'didn', "shouldn't", 'that', "weren't", 'which', 'or', "hasn't", 'own', 'about', 'what', "aren't", 'couldn', 'doesn', 'as', "wouldn't", 'hasn', 'no', 'm', 'hers', 'hadn', 'aren', 'while', 'will', "don't", "shan't", 'why', 'at', 'mightn', 'themselves', 'weren', "that'll", 'isn', 'only', 'the', 'been', "couldn't", 'don', 'should', 'same', 'both', 'where', 'was', 'me', 'through', "hadn't", 've', 'against', 'if', 'under', 'such', 'is', 'll', "haven't", 'ain', 're', "didn't", 'nor', 'not', 'being', 'are', 'your', 'over', 'off', 'having', 'by', "won't", 'myself', 'out', 'more', "wasn't", "doesn't", 'won', 'this', 'my', 'again', 'ma', 'his', 'when', 'you', 'there', 'herself', 'yourselves', 'itself', 'of', "she's", 'needn', 'we', "mustn't", 'above', "you're", 'so', 'it', "should've", 'am', 'he', 'those', 'further', 'she', 'down', 'on', "you'll", 'for', 'other', 'any', 'their', 'from', 'each', 'most', 'because', 'and', 'few', 'in', "you've", 'o'}

  # Punctuation is stripped from the text *before* stop words are removed, so
  # an entry such as "don't" can only ever match in its stripped form "dont".
  # Normalise the stop words the same way the text is normalised.
  strip_table = str.maketrans("", "", punctuation)
  normalized_stopwords = {word.translate(strip_table) for word in stopwords}
  normalized_stopwords.discard("")
  # One alternation (sorted for a reproducible graph) replaces one regex op
  # per stop word.
  stopword_pattern = r"\b(?:%s)\b" % "|".join(sorted(normalized_stopwords))

  text = tf.strings.lower(text)
  text = tf.strings.strip(text)
  text = tf.strings.regex_replace(text, "<[^>]+>", "") # remove html tags
  text = tf.strings.regex_replace(text, '[%s]' % punctuation, "")
  text = tf.strings.regex_replace(text, stopword_pattern, "")

  return text

# Smoke test: run the preprocessor directly on an HTML-wrapped sentence and
# print the resulting string tensor.
sample_text = "<p>I'm very <span class='bol'>mad</span> about the results of the election!!! Who agrees with me?</p>"
print(text_preprocessor(sample_text))

tf.Tensor(b'im  mad   results   election  agrees  ', shape=(), dtype=string)


In [74]:
def create_tokenizer(train_ds, max_words, max_seqlen, output_mode = "int", standardize = "lower_and_strip_punctuation"):
  """Create a ``TextVectorization`` layer and fit it on the training text.

  Args:
    train_ds: Dataset of ``(text, label)`` pairs; only the text is used.
    max_words: Passed to the layer as ``max_tokens``.
    max_seqlen: Passed to the layer as ``output_sequence_length``.
    output_mode: Passed through to the layer.
    standardize: Passed through to the layer (a name or a callable).

  Returns:
    The adapted ``TextVectorization`` layer.
  """
  def features_only(features, _label):
    return features

  vectorizer_options = dict(
      standardize=standardize,
      max_tokens=max_words,
      output_sequence_length=max_seqlen,
      output_mode=output_mode,
  )
  vectorizer = tfl.TextVectorization(**vectorizer_options)
  vectorizer.adapt(train_ds.map(features_only))
  return vectorizer

In [77]:
def create_tokenizer_v2(train_ds, max_words, max_seqlen=None, output_mode = "int", standardize = "lower_and_strip_punctuation"):
  """Create a ``TextVectorization`` layer with an optional fixed output length.

  Unlike the previous revision, ``max_seqlen`` is no longer silently ignored:
  it is forwarded as ``output_sequence_length``. Leaving it at ``None`` keeps
  the earlier behaviour (no fixed output length is requested).

  Args:
    train_ds: Dataset of ``(text, label)`` pairs; only the text is used.
    max_words: Passed to the layer as ``max_tokens``.
    max_seqlen: Optional value for ``output_sequence_length``.
    output_mode: Passed through to the layer.
    standardize: Passed through to the layer (a name or a callable).

  Returns:
    The adapted ``TextVectorization`` layer.
  """
  train_text = train_ds.map(lambda x, y : x)
  tokenizer = tfl.TextVectorization(
      standardize=standardize,
      max_tokens=max_words,
      output_sequence_length=max_seqlen,
      output_mode=output_mode
  )
  tokenizer.adapt(train_text)
  return tokenizer

In [82]:
# Fit the vectoriser on the training titles, using the custom
# text_preprocessor as the standardisation step.
tokenizer = create_tokenizer(train_ds, max_words, max_seqlen, standardize=text_preprocessor)

In [11]:
def load_pretrained_embeddings_v1(url, output_file, embedding_file, embedding_dim, vocabulary, max_words, max_seqlen):
  """Build a frozen ``Embedding`` layer initialised from pretrained word vectors.

  The archive at ``url`` is fetched with ``get_file``, ``embedding_file`` is
  extracted into ``/content/`` and parsed as one ``word v1 ... vN`` record per
  line. Rows of the embedding matrix whose vocabulary word has no pretrained
  vector stay all-zero.

  Args:
    url: Location of the zip archive holding the vectors.
    output_file: File name handed to ``get_file`` for the download.
    embedding_file: Name of the vector file inside the archive.
    embedding_dim: Number of components per vector.
    vocabulary: Sequence of words; a word's position is its token index.
    max_words: Number of rows of the embedding matrix; vocabulary entries at
      or beyond this index are ignored.
    max_seqlen: Forwarded to the layer as ``input_length``.

  Returns:
    A non-trainable ``Embedding`` layer created with ``mask_zero=True``.
  """
  extract_dir = "/content/"
  # Resolve the vector file against the extraction directory rather than the
  # current working directory (an absolute embedding_file is left as is).
  embedding_path = os.path.join(extract_dir, embedding_file)
  word_idx = {word: idx for idx, word in enumerate(vocabulary)}

  file_dir = get_file(output_file, url)
  with zipfile.ZipFile(file_dir, "r") as archive:
    if embedding_file in archive.namelist():
      # Only unpack the file that is actually read.
      archive.extract(embedding_file, extract_dir)
    else:
      archive.extractall(extract_dir)

  embedding_matrix = np.zeros((max_words, embedding_dim))

  # Fill the matrix while streaming the file, so vectors for words outside
  # the vocabulary are never kept in memory.
  with open(embedding_path, "r", encoding="utf-8") as f:
    for line in f:
      # Split from the right: the last `embedding_dim` fields are the vector,
      # whatever precedes them is the token.
      values = line.rstrip().rsplit(" ", embedding_dim)
      if len(values) != embedding_dim + 1:
        continue  # header or malformed line
      idx = word_idx.get(values[0])
      if idx is not None and idx < max_words:
        embedding_matrix[idx] = np.asarray(values[1:], dtype='float32')

  embedding = tfl.Embedding(max_words, embedding_dim, embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix), mask_zero=True, input_length=max_seqlen, trainable=False)
  return embedding

In [10]:
def load_pretrained_embeddings_v2(url, output_file, embedding_file, embedding_dim, vocabulary, max_words, max_seqlen=None):
  """Build a frozen ``Embedding`` layer from pretrained vectors (no masking).

  The archive at ``url`` is fetched with ``get_file``, ``embedding_file`` is
  extracted into ``/content/`` and parsed as one ``word v1 ... vN`` record per
  line. Rows of the embedding matrix whose vocabulary word has no pretrained
  vector stay all-zero.

  The previous revision passed ``input_dim`` to ``Embedding`` both
  positionally and by keyword, which raises ``TypeError`` on every call. The
  layer is now created with ``input_dim=max_words`` only, which is the row
  count of the initialiser matrix.

  Args:
    url: Location of the zip archive holding the vectors.
    output_file: File name handed to ``get_file`` for the download.
    embedding_file: Name of the vector file inside the archive.
    embedding_dim: Number of components per vector.
    vocabulary: Sequence of words; a word's position is its token index.
    max_words: Number of rows of the embedding matrix; vocabulary entries at
      or beyond this index are ignored.
    max_seqlen: Optional; when given it is forwarded as ``input_length``.

  Returns:
    A non-trainable ``Embedding`` layer.
  """
  extract_dir = "/content/"
  # Resolve the vector file against the extraction directory rather than the
  # current working directory (an absolute embedding_file is left as is).
  embedding_path = os.path.join(extract_dir, embedding_file)
  word_idx = {word: idx for idx, word in enumerate(vocabulary)}

  file_dir = get_file(output_file, url)
  with zipfile.ZipFile(file_dir, "r") as archive:
    if embedding_file in archive.namelist():
      # Only unpack the file that is actually read.
      archive.extract(embedding_file, extract_dir)
    else:
      archive.extractall(extract_dir)

  embedding_matrix = np.zeros((max_words, embedding_dim))

  # Fill the matrix while streaming the file, so vectors for words outside
  # the vocabulary are never kept in memory.
  with open(embedding_path, "r", encoding="utf-8") as f:
    for line in f:
      # Split from the right: the last `embedding_dim` fields are the vector,
      # whatever precedes them is the token.
      values = line.rstrip().rsplit(" ", embedding_dim)
      if len(values) != embedding_dim + 1:
        continue  # header or malformed line
      idx = word_idx.get(values[0])
      if idx is not None and idx < max_words:
        embedding_matrix[idx] = np.asarray(values[1:], dtype='float32')

  layer_kwargs = dict(
      embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
      trainable=False,
  )
  if max_seqlen is not None:
    layer_kwargs["input_length"] = max_seqlen
  embedding = tfl.Embedding(max_words, embedding_dim, **layer_kwargs)
  return embedding

In [83]:
# Vocabulary learned by the vectoriser; a word's list position is its token index.
vocabulary = tokenizer.get_vocabulary()
# Build the frozen embedding layer from the 100-dimensional GloVe 6B vectors.
embedding = load_pretrained_embeddings_v1("https://nlp.stanford.edu/data/glove.6B.zip", "glove.6B.zip", "glove.6B.100d.txt", 100, vocabulary=vocabulary, max_words=max_words, max_seqlen=max_seqlen)

In [59]:
def create_fakenewsnet_v1(tokenizer, embedding_layer, max_words, max_seqlen, optimizer='adam'):
  """Assemble and compile the stacked bidirectional-LSTM classifier.

  Layer order: tokenizer -> embedding -> BiLSTM(128, sequences) ->
  BiLSTM(128, last state) -> Dropout(0.2) -> Dense(1, sigmoid).

  Args:
    tokenizer: Text vectorisation layer used as the first layer.
    embedding_layer: Embedding layer applied to the token ids.
    max_words: Unused; kept so existing callers keep working.
    max_seqlen: Unused; kept so existing callers keep working.
    optimizer: Optimizer passed to ``compile``.

  Returns:
    The compiled ``Sequential`` model. Its summary is printed as a side effect.
  """
  # The first LSTM previously carried input_shape=(max_words, max_seqlen).
  # That argument sat on an inner layer in the middle of the stack and did not
  # describe the tensor the layer receives (the embedding output), so it has
  # been dropped; the shape is inferred from the preceding layer.
  model = Sequential(
      [
      tokenizer,
      embedding_layer,
      tfl.Bidirectional(tfl.LSTM(128, return_sequences=True)),
      tfl.Bidirectional(tfl.LSTM(128, return_sequences=False)),
      tfl.Dropout(0.2),
      tfl.Dense(1, activation='sigmoid')
      ]
  )
  model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics = ['accuracy'])

  model.summary()
  return model

In [84]:
model = create_fakenewsnet_v1(tokenizer, embedding, max_words, max_seqlen)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization_5 (TextVe (None, 143)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 143, 100)          10000000  
_________________________________________________________________
bidirectional_4 (Bidirection (None, 143, 256)          234496    
_________________________________________________________________
bidirectional_5 (Bidirection (None, 256)               394240    
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 257       
Total params: 10,628,993
Trainable params: 628,993
Non-trainable params: 10,000,000
____________________________________

In [61]:
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, ReduceLROnPlateau, TerminateOnNaN, EarlyStopping
# Directory that ModelCheckpoint writes to (the training log shows assets being
# written under this path).
checkpoint_path = "/content/drive/MyDrive/Data/NTU MLDA Hackathon 2021/Model Checkpoints"
# Callbacks used for training, all with default settings except
# EarlyStopping(patience=2).
# NOTE: this list object is bound as the default `callbacks` argument of
# train_model at the moment that function is defined.
callbacks = [
             TensorBoard(),
             ModelCheckpoint(checkpoint_path),
             ReduceLROnPlateau(),
             TerminateOnNaN(),
             EarlyStopping(patience=2)
]

In [62]:
def train_model(model, training_ds, validation_ds = None, val_split = 0.2, batch_size = BATCH_SIZE, epochs=5, callbacks=callbacks):
  """Fit ``model`` and return the Keras ``History`` object.

  Args:
    model: Compiled Keras model.
    training_ds: Training input handed to ``model.fit``.
    validation_ds: Optional validation data. When ``None``, ``val_split`` is
      forwarded as ``validation_split`` instead (Keras documents that option
      as unsupported for dataset inputs).
    val_split: Validation fraction used only when ``validation_ds`` is None.
    batch_size: Batch size; forwarded only when ``training_ds`` is not a
      ``tf.data.Dataset``.
    epochs: Number of epochs.
    callbacks: Keras callbacks to run during training.

  Returns:
    The value returned by ``model.fit``.
  """
  fit_kwargs = dict(epochs=epochs, callbacks=callbacks)

  # A tf.data.Dataset produces its own batches; Keras documents that
  # batch_size must not be specified for such inputs.
  if not isinstance(training_ds, tf.data.Dataset):
    fit_kwargs["batch_size"] = batch_size

  if validation_ds is None:
    fit_kwargs["validation_split"] = val_split
  else:
    fit_kwargs["validation_data"] = validation_ds

  history = model.fit(training_ds, **fit_kwargs)
  return history

In [85]:
# Train for at most 5 epochs with val_ds as validation data.
# NOTE: the recorded run was interrupted manually (KeyboardInterrupt) during epoch 3.
history = train_model(model, train_ds, val_ds, epochs=5)

Epoch 1/5




INFO:tensorflow:Assets written to: /content/drive/MyDrive/Data/NTU MLDA Hackathon 2021/Model Checkpoints/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Data/NTU MLDA Hackathon 2021/Model Checkpoints/assets


Epoch 2/5




INFO:tensorflow:Assets written to: /content/drive/MyDrive/Data/NTU MLDA Hackathon 2021/Model Checkpoints/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Data/NTU MLDA Hackathon 2021/Model Checkpoints/assets


Epoch 3/5


Exception ignored in: <function ScopedTFGraph.__del__ at 0x7f0bd3967200>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/c_api_util.py", line 58, in __del__
    self.deleter(self.graph)
KeyboardInterrupt


KeyboardInterrupt: ignored

In [None]:
import tensorflowjs as tfjs
def model_to_tfjs(model, output_dir):
  """Write ``model`` to ``output_dir`` using the tensorflowjs Keras converter.

  Args:
    model: Keras model to export.
    output_dir: Target directory passed to ``save_keras_model``.
  """
  tfjs.converters.save_keras_model(model, output_dir)

# Backward-compatible alias: the helper was first defined under this
# misspelled name.
model_to_tfhs = model_to_tfjs

In [86]:
# Persist the trained model under the "Modelv2" directory (the log shows an
# `assets` folder being written there).
model.save("/content/drive/MyDrive/Data/NTU MLDA Hackathon 2021/Modelv2")



INFO:tensorflow:Assets written to: /content/drive/MyDrive/Data/NTU MLDA Hackathon 2021/Modelv2/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Data/NTU MLDA Hackathon 2021/Modelv2/assets


In [None]:
# Drop the in-memory model.
# NOTE(review): a later cell still calls model.predict(...); on a top-to-bottom
# run that call raises NameError unless `model` is re-created first.
del model

In [None]:
from tensorflow.keras.models import load_model
# Reload a saved model from disk into `clone`.
# NOTE(review): this reads ".../Modelv1" while the save cell in this notebook
# writes ".../Modelv2" -- confirm which directory is intended.
clone = load_model('/content/drive/MyDrive/Data/NTU MLDA Hackathon 2021/Modelv1')

In [None]:
# Move ./saved_model into the project folder.
# NOTE(review): no visible cell creates ./saved_model -- confirm where it comes from.
!mv saved_model '/content/drive/MyDrive/Data/NTU MLDA Hackathon 2021/Saved'

In [95]:
# Sanity check on a single headline (presumably 1.0 = fake, matching the
# isFakeNews column -- confirm).
# NOTE(review): `model` is deleted by an earlier cell; the recorded execution
# counts show this cell was run out of order.
model.predict(["Essential Oils cure Covid"])

array([[1.]], dtype=float32)

In [None]:
import tensorflow as tf

# Display the TensorFlow version of the current runtime.
tf.__version__