In [73]:
import io
import re
import string
from tqdm import tqdm
import json
import numpy as np

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Embedding, Lambda, Dot, Flatten, Input
from tensorflow.keras.preprocessing.sequence import pad_sequences, skipgrams

AUTOTUNE = tf.data.AUTOTUNE

In [74]:
### load the datasets
# save the sentences in here
sentences = []

# the paths to the datasets
paths = [
    'data/sarcasm_headlines/Sarcasm_Headlines_Dataset.json',
    'data/sarcasm_headlines/Sarcasm_Headlines_Dataset_v2.json'
]

for path in paths:
    with open(path, 'r') as file:
        for line in file:
            # load the json
            json_line = json.loads(line)

            # only use sarcastic headlines
            # if not json_line['is_sarcastic']:
            #     continue
            
            # append the headline to the sentences
            sentences.append(json_line['headline'])

In [75]:
# Tokenize the text corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
word2id = tokenizer.word_index

# Convert text to a sequence of integers
sequences = tokenizer.texts_to_sequences(sentences)

In [76]:
# Generate Skip-gram pairs for all sequences
skip_grams = [skipgrams(seq, vocabulary_size=len(word2id)+1, window_size=2, negative_samples=1) for seq in sequences]

# Unpack skip-grams
pairs = []
labels = []
for sg in skip_grams:
    pairs.extend(sg[0])
    labels.extend(sg[1])

In [77]:
# translate the pairs to words
for i in range(20):
    pair = pairs[i]
    label = labels[i]
    print(tokenizer.index_word[pair[0]], tokenizer.index_word[pair[1]], label)

clerk geist 0
sues clerk 1
clerk fashionistas 0
clerk naacp 0
minority shoppers 1
store 'flush 0
'black amelia 0
secret tribes 0
secret sues 1
versace former 1
sues store 1
'black counted 0
code' 'black 1
'black secret 1
for 'worked 0
minority squandering 0
secret contradict 0
secret 'perfect' 0
sues choose 0
secret code' 1


In [78]:
targets, contexts = zip(*pairs)
targets = np.array(targets, dtype="int32")
contexts = np.array(contexts, dtype="int32")

# Convert labels to numpy array
labels = np.array(labels, dtype="int32")

In [82]:
# Define the model
embedding_dim = 128
vocab_size = len(word2id) + 1  # Add 1 because of reserved 0 index

# Define the input layers
input_target = Input((1,))
input_context = Input((1,))

# Define the embedding layers
target_embedding = Embedding(vocab_size, embedding_dim, input_length=1, name="w2v_embedding")(input_target)
context_embedding = Embedding(vocab_size, embedding_dim, input_length=1)(input_context)

# Merge layers with dot product along the embedding dimension
dot_product = Dot(axes=2)([target_embedding, context_embedding])

# Flatten the output
output = Flatten()(dot_product)

# Define the model
model = Model(inputs=[input_target, input_context], outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Summary of the model
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 1)]                  0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 1)]                  0         []                            
                                                                                                  
 w2v_embedding (Embedding)   (None, 1, 128)               3953280   ['input_1[0][0]']             
                                                                                                  
 embedding_3 (Embedding)     (None, 1, 128)               3953280   ['input_2[0][0]']             
                                                                                              

In [83]:
# Create a dataset
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(buffer_size=1024).batch(32).cache().prefetch(buffer_size=tf.data.AUTOTUNE)

# Train the model
model.fit(dataset, epochs=20)

Epoch 1/20


   613/119803 [..............................] - ETA: 2:00:52 - loss: 5.0931 - accuracy: 0.5002

KeyboardInterrupt: 