<a href="https://colab.research.google.com/github/Ducksss/FakeNews/blob/main/FakeNewsModel_Train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tensorflowjs



In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.layers as tfl
import zipfile
from tensorflow.keras import Sequential, Input
from tensorflow.keras.utils import get_file
from sklearn.model_selection import train_test_split
BATCH_SIZE = 64

In [None]:
# Path to the labelled headline CSV (despite its name, `dataset_dir` is a file path).
# NOTE(review): this assumes Google Drive is already mounted at /content/drive;
# no mount cell is visible in this notebook - confirm.
dataset_dir = "/content/drive/MyDrive/Data/NTU MLDA Hackathon 2021/fakeNews.csv"
# The first CSV column is used as the DataFrame index.
df = pd.read_csv(dataset_dir, index_col=0)
# Preview the first rows.
df.head()

Unnamed: 0,title,isFakeNews,src
0,Coronavirus was created in a government lab as...,1.0,COVID-19-rumor-dataset
1,The lie that coronavirus came from a bat or a ...,1.0,COVID-19-rumor-dataset
2,The health experts had predicted the virus cou...,1.0,COVID-19-rumor-dataset
3,A video clip supposedly showed that the expone...,1.0,COVID-19-rumor-dataset
4,Almost 200 people in Italy died from the coron...,0.0,COVID-19-rumor-dataset


In [None]:
# Longest title, measured in whitespace-separated tokens; used as the padded
# sequence length of the text vectorizer and the embedding layer.
# The `.str` accessors yield NaN for missing titles instead of raising, `.max()`
# skips those NaNs, and `int()` turns the numpy scalar into a plain Python int.
max_seqlen = int(df["title"].str.split().str.len().max())
# Upper bound on the vocabulary size (number of rows in the embedding matrix).
max_words = 100000

In [None]:
# Number of rows in the DataFrame; used later to size the train/validation split.
dataset_len = len(df)
dataset_len

28755

In [None]:
def train_test_split(dataset, dataset_len, val_split=0.2, shuffle=True, shuffle_size=50000, batch_size=None):
    """Split a ``tf.data.Dataset`` of feature dicts into train and validation sets.

    Note: this definition shadows ``sklearn.model_selection.train_test_split``
    imported at the top of the notebook.

    Args:
        dataset: Dataset whose elements are dicts with "title" and "isFakeNews" keys.
        dataset_len: Number of elements in ``dataset``. When ``batch_size`` is
            given, this is instead the number of rows and is converted to a
            number of batches.
        val_split: Fraction of the elements assigned to the validation set.
        shuffle: Whether to shuffle once (fixed seed) before splitting.
        shuffle_size: Shuffle buffer size.
        batch_size: Optional batch size of an already-batched ``dataset``.

    Returns:
        ``(train_ds, val_ds)``: datasets of ``(title, isFakeNews)`` tuples,
        cached and prefetched.
    """
    if batch_size is not None:
        # take()/skip() count dataset elements, which are batches here;
        # round up so a final partial batch is counted too.
        dataset_len = -(-dataset_len // batch_size)

    if shuffle:
        # Freeze the shuffled order. With the default reshuffle_each_iteration=True
        # the train and validation pipelines would each see a different order,
        # so take()/skip() would produce overlapping subsets.
        dataset = dataset.shuffle(shuffle_size, seed=42, reshuffle_each_iteration=False)

    train_size = int((1 - val_split) * dataset_len)
    # Everything that is not training data, so float truncation cannot drop an element.
    val_size = dataset_len - train_size

    def _to_pair(x):
        # Turn the feature dict into the (inputs, labels) pair Keras expects.
        return x["title"], x["isFakeNews"]

    train_ds = dataset.take(train_size).map(_to_pair)
    val_ds = dataset.skip(train_size).take(val_size).map(_to_pair)

    train_ds = train_ds.cache().prefetch(buffer_size=tf.data.AUTOTUNE)
    val_ds = val_ds.cache().prefetch(buffer_size=tf.data.AUTOTUNE)
    return train_ds, val_ds

In [None]:
# Stream the CSV as batches of feature dicts.
# num_epochs=1: the default repeats the file forever, so take()/skip() would
#   slice an endless stream that contains every row many times over.
# shuffle=False: the default reshuffles on every pass; ordering is left to
#   train_test_split, which shuffles with a fixed seed.
ds = tf.data.experimental.make_csv_dataset(
    dataset_dir,
    select_columns=["title", "isFakeNews"],
    batch_size=BATCH_SIZE,
    num_epochs=1,
    shuffle=False,
)

# Elements of `ds` are batches, so the split must be sized in batches
# (the last batch may be partial, hence the ceil).
num_batches = int(np.ceil(dataset_len / BATCH_SIZE))
train_ds, val_ds = train_test_split(ds, num_batches)

## FakeNewsNet

In [None]:
def create_tokenizer(train_ds, max_words, max_seqlen, output_mode = "int", standardize = "lower_and_strip_punctuation"):
  """Create a TextVectorization layer and adapt it to the training text.

  Args:
    train_ds: Dataset of (text, label) pairs; only the text is used.
    max_words: Passed to the layer as `max_tokens`.
    max_seqlen: Passed to the layer as `output_sequence_length`.
    output_mode: Passed to the layer as `output_mode`.
    standardize: Passed to the layer as `standardize`.

  Returns:
    The adapted TextVectorization layer.
  """
  def _text_only(text, _label):
    # Drop the label so the vocabulary is learned from the text alone.
    return text

  layer_options = {
      "standardize": standardize,
      "max_tokens": max_words,
      "output_sequence_length": max_seqlen,
      "output_mode": output_mode,
  }
  vectorizer = tfl.TextVectorization(**layer_options)
  vectorizer.adapt(train_ds.map(_text_only))
  return vectorizer

In [None]:
# Build the text vectorizer and fit its vocabulary on the training split only.
tokenizer = create_tokenizer(train_ds, max_words, max_seqlen)

In [None]:
def load_pretrained_embeddings_v1(url, output_file, embedding_file, embedding_dim, vocabulary, max_words, max_seqlen):
  """Build a frozen Embedding layer initialised from pretrained word vectors.

  The archive at `url` is downloaded and unpacked into /content/ only when the
  embedding file is not already there. The file is then streamed line by line
  and only vectors for words in `vocabulary` are kept, so the full pretrained
  table is never held in memory.

  Args:
    url: Download URL of the zip archive containing the embedding file.
    output_file: File name under which the download is cached.
    embedding_file: Text file with one `word v1 v2 ...` entry per line. A
      relative path is resolved against the extraction directory.
    embedding_dim: Number of values per word vector.
    vocabulary: Sequence of words; a word's position is its embedding row.
    max_words: Number of rows in the embedding matrix; words at positions
      >= max_words are ignored.
    max_seqlen: Passed to the layer as `input_length`.

  Returns:
    A non-trainable Embedding layer. Rows of words without a pretrained
    vector are all zeros.

  Raises:
    ValueError: If a vocabulary word's vector does not have `embedding_dim` values.
  """
  extract_dir = "/content/"
  # Resolve against the extraction directory rather than the current working
  # directory, so the lookup does not depend on where the kernel was started.
  if os.path.isabs(embedding_file):
    embedding_path = embedding_file
  else:
    embedding_path = os.path.join(extract_dir, embedding_file)

  # Download and unpack only when the vectors are not already on disk.
  if not os.path.exists(embedding_path):
    file_dir = get_file(output_file, url)
    with zipfile.ZipFile(file_dir, "r") as archive:
      archive.extractall(extract_dir)

  word_idx = {word: idx for idx, word in enumerate(vocabulary)}
  embedding_matrix = np.zeros((max_words, embedding_dim))

  with open(embedding_path, "r", encoding="utf-8") as f:
    for line in f:
      values = line.split()
      if not values:
        continue  # blank line
      idx = word_idx.get(values[0])
      if idx is None or idx >= max_words:
        continue  # not in the vocabulary, or beyond the matrix
      if len(values) - 1 != embedding_dim:
        raise ValueError(
            "Expected %d values for %r in %s, found %d"
            % (embedding_dim, values[0], embedding_path, len(values) - 1))
      embedding_matrix[idx] = np.asarray(values[1:], dtype='float32')

  embedding = tfl.Embedding(max_words, embedding_dim, embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix), input_length=max_seqlen, trainable=False)
  return embedding

In [None]:
# Words known to the adapted text vectorizer; a word's position is its integer id.
vocabulary = tokenizer.get_vocabulary()
# Frozen embedding layer initialised from the 100-dimensional GloVe 6B vectors.
embedding = load_pretrained_embeddings_v1(
    "https://nlp.stanford.edu/data/glove.6B.zip",
    "glove.6B.zip",
    "glove.6B.100d.txt",
    100,
    vocabulary=vocabulary,
    max_words=max_words,
    max_seqlen=max_seqlen,
)

In [None]:
def create_fakenewsnet(tokenizer, embedding_layer, max_words, max_seqlen, optimizer='adam'):
  """Assemble, compile and summarise the bidirectional-LSTM classifier.

  Layer stack: tokenizer -> embedding_layer -> BiLSTM(128, full sequence) ->
  BiLSTM(128, last state) -> Dropout(0.2) -> Dense(1, sigmoid).

  Args:
    tokenizer: First layer of the model; turns raw text into token ids.
    embedding_layer: Second layer; maps token ids to vectors.
    max_words: Unused; kept so existing calls keep working.
    max_seqlen: Unused; kept so existing calls keep working.
    optimizer: Optimizer passed to `model.compile`.

  Returns:
    The compiled Sequential model (its summary is printed as a side effect).
  """
  model = Sequential(
      [
      tokenizer,
      embedding_layer,
      # No input_shape here: this is not the first layer, and its input is
      # whatever the embedding layer outputs, not (max_words, max_seqlen).
      tfl.Bidirectional(tfl.LSTM(128, return_sequences=True)),
      tfl.Bidirectional(tfl.LSTM(128, return_sequences=False)),
      tfl.Dropout(0.2),
      tfl.Dense(1, activation='sigmoid')
      ]
  )
  model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics = ['accuracy'])
  model.summary()
  return model

In [None]:
# Build and compile the classifier; the call also prints the model summary.
model = create_fakenewsnet(tokenizer, embedding, max_words, max_seqlen)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization (TextVect (None, 143)               0         
_________________________________________________________________
embedding (Embedding)        (None, 143, 100)          10000000  
_________________________________________________________________
bidirectional (Bidirectional (None, 143, 256)          234496    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               394240    
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 257       
Total params: 10,628,993
Trainable params: 628,993
Non-trainable params: 10,000,000
______________________________________

In [None]:
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, ReduceLROnPlateau, TerminateOnNaN, EarlyStopping
# Directory on Drive that receives the model checkpoint.
checkpoint_path = "/content/drive/MyDrive/Data/NTU MLDA Hackathon 2021/Model Checkpoints"
callbacks = [
             TensorBoard(),
             # Keep the best epoch; without save_best_only a later, worse epoch
             # overwrites the checkpoint.
             ModelCheckpoint(checkpoint_path, save_best_only=True),
             # The default patience of 10 could never trigger before
             # EarlyStopping(patience=2) ends training; react after one flat epoch.
             ReduceLROnPlateau(patience=1),
             TerminateOnNaN(),
             # Leave the model with the weights of its best epoch, not its last.
             EarlyStopping(patience=2, restore_best_weights=True)
]

In [None]:
def train_model(model, training_ds, validation_ds = None, val_split = 0.2, batch_size = BATCH_SIZE, epochs=5, callbacks=callbacks):
  """Fit `model` and return the Keras History object.

  Args:
    model: Compiled Keras model.
    training_ds: Training data, either a `tf.data.Dataset` or array-like inputs.
    validation_ds: Optional validation data. Required when `training_ds` is
      a `tf.data.Dataset`.
    val_split: Fraction held out for validation when `validation_ds` is None
      (array-like inputs only).
    batch_size: Batch size for array-like inputs. Not passed on for datasets,
      which carry their own batching.
    epochs: Number of epochs.
    callbacks: Keras callbacks passed to `fit`.

  Returns:
    The History object returned by `model.fit`.

  Raises:
    ValueError: If `training_ds` is a `tf.data.Dataset` and `validation_ds`
      is None, because `validation_split` is not supported for datasets.
  """
  fit_kwargs = dict(epochs=epochs, callbacks=callbacks)

  if isinstance(training_ds, tf.data.Dataset):
    if validation_ds is None:
      raise ValueError(
          "validation_split is not supported for tf.data.Dataset inputs; "
          "pass validation_ds explicitly.")
    # A dataset is already batched; Keras documents that batch_size must not
    # be specified for dataset inputs, so it is left out here.
  else:
    fit_kwargs["batch_size"] = batch_size

  if validation_ds is None:
    fit_kwargs["validation_split"] = val_split
  else:
    fit_kwargs["validation_data"] = validation_ds

  history = model.fit(training_ds, **fit_kwargs)
  return history

In [None]:
# Train for up to 5 epochs, validating on the held-out split.
history = train_model(model, train_ds, val_ds, epochs=5)

Epoch 1/5




INFO:tensorflow:Assets written to: /content/drive/MyDrive/Data/NTU MLDA Hackathon 2021/Model Checkpoints/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Data/NTU MLDA Hackathon 2021/Model Checkpoints/assets


Epoch 2/5




INFO:tensorflow:Assets written to: /content/drive/MyDrive/Data/NTU MLDA Hackathon 2021/Model Checkpoints/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Data/NTU MLDA Hackathon 2021/Model Checkpoints/assets


Epoch 3/5




INFO:tensorflow:Assets written to: /content/drive/MyDrive/Data/NTU MLDA Hackathon 2021/Model Checkpoints/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Data/NTU MLDA Hackathon 2021/Model Checkpoints/assets


Epoch 4/5
  770/23004 [>.............................] - ETA: 16:38 - loss: 0.0120 - accuracy: 0.9939

KeyboardInterrupt: ignored

In [None]:
import tensorflowjs as tfjs
def model_to_tfhs(model, output_dir):
  """Export a Keras model with the TensorFlow.js converter.

  Args:
    model: Keras model to export.
    output_dir: Directory the converter writes its artifacts to.

  NOTE(review): the model built in this notebook starts with a text
  vectorization layer; confirm the converter supports it before relying on
  this export.
  """
  tfjs.converters.save_keras_model(model, output_dir)

# Correctly spelled alias; the original (misspelled) name is kept for compatibility.
model_to_tfjs = model_to_tfhs

In [None]:
# Persist the trained model to Drive.
model.save("/content/drive/MyDrive/Data/NTU MLDA Hackathon 2021/Modelv1")



INFO:tensorflow:Assets written to: /content/drive/MyDrive/Data/NTU MLDA Hackathon 2021/Modelv1/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Data/NTU MLDA Hackathon 2021/Modelv1/assets


In [None]:
# Drop the in-memory model; cells below must not reference `model` any more.
del model

In [None]:
from tensorflow.keras.models import load_model
# Reload from the directory written by the model.save call above, so this cell
# works on a fresh Restart & Run All instead of depending on a "Saved" folder
# that nothing has created at this point in the notebook.
clone = load_model('/content/drive/MyDrive/Data/NTU MLDA Hackathon 2021/Modelv1')

In [None]:
# Move a local `saved_model` directory to Drive.
# NOTE(review): no cell visible in this notebook creates `saved_model`;
# confirm where it comes from, otherwise this fails on a fresh run.
!mv saved_model '/content/drive/MyDrive/Data/NTU MLDA Hackathon 2021/Saved'

In [None]:
# tensorflow is already imported as `tf` in the imports cell at the top; this repeats it.
import tensorflow as tf

# Show the TensorFlow version used for this run.
tf.__version__