# Sarcasm Article prediction

## 1. Importing modules

In [None]:
# Resource url:
# https://www.youtube.com/watch?v=6lMQnaFS3Rc
# https://www.youtube.com/watch?v=Y_hzMnRXjhI&list=PLQY2H8rRoyvzDbLUZkbudP-MFQZwNmU4S&index=3
# https://colab.research.google.com/github/lmoroney/dlaicourse/blob/master/TensorFlow%20In%20Practice/Course%203%20-%20NLP/Course%203%20-%20Week%202%20-%20Lesson%202.ipynb#scrollTo=BQVuQrZNkPn9
# wget --no-check-certificate https://storage.googleapis.com/learning-datasets/sarcasm.json -O ./tmp/sarcasm.json
# https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html

: 

In [None]:

import tensorflow as tf
import matplotlib.pyplot as plt
import json
import math

: 

## 2. Loading data from json file

In [None]:

# Load the sarcasm headline dataset from disk.
# JSON text is UTF-8 by specification, so the encoding is passed explicitly
# instead of relying on the platform default (which is not UTF-8 everywhere).
with open("../data/sarcasm.json", "r", encoding="utf-8") as file:
    datastore = json.load(file)

# Parallel lists: sentences[i] is a record's "headline" and labels[i] is the
# same record's "is_sarcastic" value.
sentences = [item["headline"] for item in datastore]
labels = [item["is_sarcastic"] for item in datastore]

: 

In [None]:
# --- Text preprocessing settings ---
# vocab_size is given to the Tokenizer as num_words and to the Embedding
# layer as its first argument.
vocab_size = 10000
# Token the Tokenizer uses for out-of-vocabulary words.
oov_tok = "<OOV>"
# maxlen / padding / truncating arguments handed to pad_sequences.
max_length = 100
padding_type = "post"
trunc_type = "post"

# --- Model and data-split settings ---
# Second argument of the Embedding layer.
embedding_dim = 16
# Number of leading examples used for training; the rest are held out.
training_size = 20000

# Original note: "sentence len 26709" -- presumably len(sentences) for the
# full dataset; confirm against the data file.

: 

## 3. Split sentences and labels into training and testing sets

In [None]:
# The first `training_size` examples are used for training and everything
# after them for testing. Sentences and labels are cut at the same index so
# that the two lists stay aligned.
training_sentences, testing_sentences = (
    sentences[:training_size],
    sentences[training_size:],
)
training_labels, testing_labels = (
    labels[:training_size],
    labels[training_size:],
)

: 

## 4. Tokenization

In [None]:
# Fit the tokenizer on the training sentences only; the testing sentences are
# encoded with the same tokenizer further down.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=vocab_size,
    oov_token=oov_tok,
)
# training_sentences was already cut to at most `training_size` items by the
# split above, so it is passed as-is.
tokenizer.fit_on_texts(training_sentences)

# Mapping from word to integer index learned by the tokenizer.
word_index = tokenizer.word_index

# Both splits are padded/truncated with identical options.
_pad_kwargs = {
    "maxlen": max_length,
    "truncating": trunc_type,
    "padding": padding_type,
}

# Training text -> integer sequences -> fixed-length padded sequences.
training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = tf.keras.preprocessing.sequence.pad_sequences(
    training_sequences, **_pad_kwargs
)

# Same pipeline for the held-out text.
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = tf.keras.preprocessing.sequence.pad_sequences(
    testing_sequences, **_pad_kwargs
)

: 

In [None]:
# Per the original note, this conversion is needed for the notebook to work
# with TensorFlow 2.x: the padded sequences and the labels are turned into
# NumPy ndarrays before being passed to the model.
import numpy as np

training_padded, training_labels = (
    np.array(training_padded),
    np.array(training_labels),
)
testing_padded, testing_labels = (
    np.array(testing_padded),
    np.array(testing_labels),
)

: 

## 5. Training the model

In [None]:
# Build the classifier layer by layer:
#   Embedding -> GlobalAveragePooling1D -> Dense(24, relu) -> Dense(1, sigmoid)
model = tf.keras.Sequential()
model.add(
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length)
)
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(24, activation="relu"))
# Single sigmoid unit, trained with binary cross-entropy below.
model.add(tf.keras.layers.Dense(1, activation="sigmoid"))

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
model.summary()

: 

In [None]:
# Train for a fixed number of epochs, evaluating on the held-out split after
# each one. The returned object is kept in `history` for the plots below.
num_epochs = 30
history = model.fit(
    x=training_padded,
    y=training_labels,
    epochs=num_epochs,
    verbose=2,
    validation_data=(testing_padded, testing_labels),
)

: 

## 6. Plot the training curves and inspect the learned embeddings

In [None]:
import matplotlib.pyplot as plt

def plot_graphs(history, str):
    """Plot a training metric together with its validation counterpart.

    Args:
        history: Object whose ``history`` attribute maps metric names to the
            recorded values (here, the value returned by ``model.fit``).
        str: Name of the metric to plot, e.g. "accuracy" or "loss". The
            validation series is looked up under ``"val_" + str``.
            (The parameter name shadows the builtin ``str``; it is kept so
            existing callers are unaffected.)
    """
    recorded = history.history
    plt.plot(recorded[str])
    validation_key = "val_" + str
    plt.plot(recorded[validation_key])
    plt.xlabel("Epochs")
    plt.ylabel(str)
    plt.legend([str, validation_key])
    plt.show()

# One figure per metric: accuracy first, then loss.
for metric_name in ("accuracy", "loss"):
    plot_graphs(history, metric_name)

# Invert word_index (word -> index) so that integer tokens can be mapped
# back to the original words.
reverse_word_index = {index: word for word, index in word_index.items()}

def decode_sentences(text):
    """Convert a sequence of token indices back into a readable string.

    Each index is looked up in ``reverse_word_index``; indices that have no
    entry there are rendered as "?" (presumably this includes the padding
    positions of a padded sequence -- confirm). Words are joined with single
    spaces.
    """
    words = (reverse_word_index.get(token, "?") for token in text)
    return " ".join(words)

# Sanity check on the first training example: the decoded token sequence,
# the raw headline and its label, each printed under its own caption line.
print("DECODE sentence", decode_sentences(training_padded[0]), sep="\n")
print("Original sentence", training_sentences[0], sep="\n")
print("Is_sarcasm:", labels[0], sep="\n")

# The first layer of the model is the Embedding layer; the first array
# returned by get_weights() is printed below.
e = model.layers[0]
weights = e.get_weights()[0]
print(weights)  # shape: (vocab_size, embedding_dim)

: 

## 7. Saving the model

In [None]:
# 1) The whole model, written to a single HDF5 file.
model.save("nn.h5")

# 2) The weights only.
model.save_weights("nn_weight.h5")

# 3) The architecture only, serialized as a JSON string and written to disk.
json_str = model.to_json()
with open("nn_model.json", "w") as arch_file:
    arch_file.write(json_str)

: 

In [None]:
# Two hand-written headlines to score with the model trained above.
sentence = [
    "granny starting to fear spiders in the garden might be real",
    "game of thrones season finale showing this sunday night",
]

# Encode and pad them exactly like the training data before predicting.
sequences = tokenizer.texts_to_sequences(sentence)
padded = tf.keras.preprocessing.sequence.pad_sequences(
    sequences,
    maxlen=max_length,
    truncating=trunc_type,
    padding=padding_type,
)
print(model.predict(padded))

: 

## 8. Predict with the saved model reloaded from disk

In [None]:

# Headlines to score.
# NOTE(review): this rebinds `sentences`, which earlier in the notebook holds
# the full list of dataset headlines. Re-running the train/test split cell
# after this one would therefore operate on these two strings only.
sentences = [
    "granny starting to fear spiders in the garden might be real",
    "game of thrones season finale showing this sunday night",
]

# Encode and pad the headlines with the tokenizer fitted on the training data.
sequences = tokenizer.texts_to_sequences(sentences)
padded = tf.keras.preprocessing.sequence.pad_sequences(
    sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# Reload the full model from the "nn.h5" file and use it for prediction.
pretrain_model = tf.keras.models.load_model("nn.h5")
pretrain_model.summary()
print(pretrain_model.predict(padded))

: 