<a href="https://colab.research.google.com/github/Ducksss/Project-Cactus/blob/main/Project_Cactus_(Training).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<div id="top"></div>

<!-- PROJECT SHIELDS -->
<!--
*** I'm using markdown "reference style" links for readability.
*** Reference links are enclosed in brackets [ ] instead of parentheses ( ).
*** See the bottom of this document for the declaration of the reference variables
*** for contributors-url, forks-url, etc. This is an optional, concise syntax you may use.
*** https://www.markdownguide.org/basic-syntax/#reference-style-links
-->
[![Contributors](https://img.shields.io/github/contributors/Ducksss/Project-Cactus.svg)][contributors-url]
[![Forks](https://img.shields.io/github/forks/Ducksss/Project-Cactus.svg)][forks-url]
[![Stargazers](https://img.shields.io/github/stars/Ducksss/Project-Cactus.svg)][stars-url]
[![MIT License](https://img.shields.io/github/license/Ducksss/Project-Cactus.svg)][license-url]
[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1NBbmGYUZbKq0fjkI2OJIJx_3PV_Rdf7Z?usp=sharing)


<!-- PROJECT LOGO -->
<br />
<div align="center">
  <a href="https://github.com/Ducksss/Project-Cactus">
    <img src="assets/cactus-bg.png" alt="Logo" width="80" height="80">
  </a>

<h3 align="center">Project Cactus</h3>

  <p align="center">
    A cross-platform AI Fake News Detector
    <br />
    <br />
    <a href="https://project-cactus-c9549.web.app/">Web Application</a>
    ·
    <a href="#browser-extension">Browser Extension</a>
    ·
    <a href="https://github.com/Ducksss/Project-Cactus/issues">Report Bugs</a>
    ·
    <a href="https://github.com/Ducksss/Project-Cactus/issues">Request Features</a>
  </p>
</div>



> Note: Make sure to use a high-ram runtime, else the model may crash due to it's size

## Download the Data Set

In [None]:
!git clone https://github.com/Ducksss/FakeNews.git 

## Library Imports

In [1]:
import pandas as pd
import tensorflow as tf
import tensorflow.keras.layers as tfl
import zipfile
from tensorflow.keras import Sequential, Input
from tensorflow.keras.utils import get_file
BATCH_SIZE = 64

### Settings

In [None]:
#@title Define Hyperparameters
BATCH_SIZE = 64 #@param {type:"integer"}
max_words = 1000000 #@param {type:"integer"}
checkpoint_path = "/tmp/checkpoints" #@param {type:"string"}
save_dir = "saved_model" #@param {type:"string"}

## Data Ingestion

In [2]:
dataset_dir = "data/fakeNews.csv"
df = pd.read_csv(dataset_dir)
df.head()

Unnamed: 0,title,isFakeNews,src
0,Coronavirus was created in a government lab as...,1.0,COVID-19-rumor-dataset
1,The lie that coronavirus came from a bat or a ...,1.0,COVID-19-rumor-dataset
2,The health experts had predicted the virus cou...,1.0,COVID-19-rumor-dataset
3,A video clip supposedly showed that the expone...,1.0,COVID-19-rumor-dataset
4,Almost 200 people in Italy died from the coron...,0.0,COVID-19-rumor-dataset


In [15]:
max_seqlen = df["title"].apply(lambda x : len(x.split())).max()
max_seqlen

In [6]:
dataset_len = len(df)
dataset_len

78578

In [7]:
def train_test_split(dataset, dataset_len, val_split=0.2, shuffle=True, shuffle_size=50000):
    if shuffle:
        dataset = dataset.shuffle(shuffle_size, seed=42)
    train_size = int((1-val_split) * dataset_len)
    val_size = int(val_split * dataset_len)
    try:
      train_ds = dataset.take(train_size).map(lambda x : (x["title"], x["isFakeNews"]))
      val_ds = dataset.skip(train_size).take(val_size).map(lambda x : (x["title"], x["isFakeNews"]))
    except:
      train_ds = dataset.take(train_size)
      val_ds = dataset.skip(train_size).take(val_size)
    train_ds = train_ds.cache().prefetch(buffer_size=tf.data.AUTOTUNE)
    val_ds = val_ds.cache().prefetch(buffer_size=tf.data.AUTOTUNE)
    return train_ds, val_ds

In [8]:
ds = tf.data.experimental.make_csv_dataset(dataset_dir, select_columns=[
    "title",
    "isFakeNews"                                                                    
], batch_size=BATCH_SIZE)

train_ds, val_ds = train_test_split(ds, dataset_len)
val_ds, test_ds = train_test_split(val_ds, int(dataset_len * 0.2), val_split=0.5)

## CactusNet

In [9]:
@tf.keras.utils.register_keras_serializable() # Decorator to allow us to save the TextVectorizer layer
def text_preprocessor(text):
  punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
  stopwords = {'whom', 'all', 'shouldn', 'wouldn', 'how', 's', 'they', 'were', 'mustn', 'after', 'who', 'its', 'our', 't', 'a', 'very', 'an', 'do', 'be', 'to', 'can', 'had', 'i', 'these', 'himself', 'up', 'just', 'them', 'now', 'has', 'too', 'below', 'did', 'shan', 'until', 'during', 'him', 'into', 'have', "you'd", 'haven', 'theirs', 'ourselves', 'once', "isn't", 'than', "it's", 'wasn', 'yours', "mightn't", 'here', 'ours', 'her', 'doing', 'd', 'yourself', 'y', 'before', 'does', 'then', 'between', 'some', 'with', "needn't", 'but', 'didn', "shouldn't", 'that', "weren't", 'which', 'or', "hasn't", 'own', 'about', 'what', "aren't", 'couldn', 'doesn', 'as', "wouldn't", 'hasn', 'no', 'm', 'hers', 'hadn', 'aren', 'while', 'will', "don't", "shan't", 'why', 'at', 'mightn', 'themselves', 'weren', "that'll", 'isn', 'only', 'the', 'been', "couldn't", 'don', 'should', 'same', 'both', 'where', 'was', 'me', 'through', "hadn't", 've', 'against', 'if', 'under', 'such', 'is', 'll', "haven't", 'ain', 're', "didn't", 'nor', 'not', 'being', 'are', 'your', 'over', 'off', 'having', 'by', "won't", 'myself', 'out', 'more', "wasn't", "doesn't", 'won', 'this', 'my', 'again', 'ma', 'his', 'when', 'you', 'there', 'herself', 'yourselves', 'itself', 'of', "she's", 'needn', 'we', "mustn't", 'above', "you're", 'so', 'it', "should've", 'am', 'he', 'those', 'further', 'she', 'down', 'on', "you'll", 'for', 'other', 'any', 'their', 'from', 'each', 'most', 'because', 'and', 'few', 'in', "you've", 'o'}
  text = tf.strings.lower(text)
  text = tf.strings.strip(text)
  text = tf.strings.regex_replace(text, "<[^>]+>", "") # remove html tags
  text = tf.strings.regex_replace(text, '[%s]' % punctuation, "") # remove punctuation
  for stopword in stopwords:
    text = tf.strings.regex_replace(text, r"\b%s\b" % stopword, "") # remove stopwards
  
  return text

sample_text = "<p>I'm very <span class='bold'>mad</span> about the results of the election!!! Who agrees with me?</p>"
print(text_preprocessor(sample_text))

tf.Tensor(b'im  mad   results   election  agrees  ', shape=(), dtype=string)


In [10]:
def create_tokenizer(train_ds, max_words, max_seqlen, output_mode = "int", standardize = "lower_and_strip_punctuation"):
  train_text = train_ds.map(lambda x, y : x)
  tokenizer = tfl.TextVectorization(
      standardize=standardize,
      max_tokens=max_words,
      output_sequence_length=max_seqlen,
      output_mode=output_mode
  )
  tokenizer.adapt(train_text)
  return tokenizer

In [16]:
tokenizer = create_tokenizer(train_ds, max_words, max_seqlen, standardize=text_preprocessor)

In [12]:
def load_pretrained_embeddings_v1(url, output_file, embedding_file, embedding_dim, vocabulary, max_words, max_seqlen):
  embedding_vecs = dict()
  word_idx = dict(zip(vocabulary, range(len(vocabulary))))
  file_dir = get_file(output_file, url)

  with zipfile.ZipFile(file_dir, "r") as f:
    f.extractall("/content/")

  with open(embedding_file, "r") as f:
    for line in f:
      values = line.split()
      word = values[0]
      embedding_vec = np.asarray(values[1:], dtype='float32')
      embedding_vecs[word] = embedding_vec

  embedding_matrix = np.zeros((max_words, embedding_dim))
  
  for word, idx in word_idx.items():
    if idx < max_words:
      embedding_vec = embedding_vecs.get(word)
      if embedding_vec is not None:
        embedding_matrix[idx] = embedding_vec
  
  embedding = tfl.Embedding(max_words, embedding_dim, embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix), mask_zero=False, input_length=max_seqlen, trainable=False)
  return embedding

In [17]:
vocabulary = tokenizer.get_vocabulary()
embedding = load_pretrained_embeddings_v1("https://nlp.stanford.edu/data/glove.twitter.27B.zip", "glove.twitter.27B.zip", "glove.twitter.27B.100d.txt", 100, vocabulary=vocabulary, max_words=max_words, max_seqlen=max_seqlen)

In [18]:
def create_cactusnet_v2(tokenizer, embedding_layer, max_words, max_seqlen, optimizer='adam'):
  model = Sequential(
      [
      tokenizer,
      embedding_layer,
      tfl.Bidirectional(tfl.LSTM(128, return_sequences=True, input_shape=(max_words, max_seqlen))),
      tfl.Bidirectional(tfl.LSTM(128, return_sequences=False)),
      tfl.Dropout(0.2),
      tfl.Dense(1, activation='sigmoid')
      ]
  )
  model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics = ['accuracy'])

  model.summary()
  return model

In [19]:
model = create_cactusnet_v2(tokenizer, embedding, max_words, max_seqlen)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization_1 (TextVe (None, 143)               0         
_________________________________________________________________
embedding (Embedding)        (None, 143, 100)          100000000 
_________________________________________________________________
bidirectional (Bidirectional (None, 143, 256)          234496    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               394240    
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 257       
Total params: 100,628,993
Trainable params: 628,993
Non-trainable params: 100,000,000
____________________________________

### Model Training

In [20]:
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, TerminateOnNaN, EarlyStopping

callbacks = [
             ModelCheckpoint(checkpoint_path),
             ReduceLROnPlateau(),
             TerminateOnNaN(),
             EarlyStopping(patience=2)
]

In [21]:
def train_model(model, training_ds, validation_ds = None, val_split = 0.2, batch_size = BATCH_SIZE, epochs=5, callbacks=callbacks):
  if validation_ds is None:
    history = model.fit(training_ds, validation_split=val_split, batch_size=batch_size, epochs=epochs, callbacks=callbacks)
  else:
    history = model.fit(training_ds, validation_data=validation_ds, batch_size=batch_size, epochs=epochs, callbacks=callbacks)
  return history

In [22]:
history = train_model(model, train_ds, val_ds, epochs=1)





INFO:tensorflow:Assets written to: /content/drive/MyDrive/Data/NTU MLDA Hackathon 2021/Model Checkpoints/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Data/NTU MLDA Hackathon 2021/Model Checkpoints/assets


In [23]:
model.evaluate(test_ds)



[0.010271494276821613, 0.9957382678985596]

In [24]:
model.save(save_dir)



INFO:tensorflow:Assets written to: /content/drive/MyDrive/Data/NTU MLDA Hackathon 2021/Modelv2_3/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Data/NTU MLDA Hackathon 2021/Modelv2_3/assets
