In [25]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras import layers, initializers
import numpy as np
import pandas as pd

In [5]:
# Seed NumPy's and TensorFlow's global random generators with the same fixed
# value (67) before any dataset or model is created further down.
from numpy.random import seed
seed(67)
from tensorflow import random
# NOTE: this binds the name `random` to tensorflow.random for the rest of the notebook.
random.set_seed(67)

In [6]:
# Consts
# Vocabulary size cap; passed as max_tokens to TextVectorization.
max_features = 20000
# Width of each embedding vector; used for the Embedding layer and for the
# embedding matrix rows (presumably must equal the vector size in the GloVe file).
embedding_dim = 100
# Token length of every vectorized text; passed as output_sequence_length.
sequence_length = 500
# Location of the GloVe vectors text file read when building embeddings_index.
PATH_TO_GLOVE = "./glove.6B.100d.txt"

In [7]:
# Build the {word: vector} lookup from the GloVe text file.
# Each line holds a word followed by its space-separated coefficients.
embeddings_index = {}
# Open explicitly as UTF-8: without it the platform default encoding is used,
# which can fail or mis-decode on systems where that default is not UTF-8.
with open(PATH_TO_GLOVE, encoding="utf-8") as f:
    for line in f:
        # Split only once: the first token is the word, the rest are the numbers.
        word, coefs = line.split(maxsplit=1)
        # "f" is NumPy's single-precision float code; sep=" " selects text parsing.
        embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")

In [8]:
def load_dataset(folder_name, batch_size):
	"""Load the raw training, validation and test text datasets of one corpus.

	The training and validation sets are both read from ``<folder_name>/train``
	using an 80/20 split; the test set is read from ``<folder_name>/test``.

	Args:
		folder_name: Root folder of the corpus (contains ``train`` and ``test``).
		batch_size: Batch size applied to all three datasets. It is honoured
			as given (it used to be overwritten with a hard-coded 5).

	Returns:
		Tuple ``(raw_train_ds, raw_val_ds, raw_test_ds)`` of un-vectorized datasets.
	"""
	loader = tf.keras.preprocessing.text_dataset_from_directory
	train_dir = "{}/train".format(folder_name)
	test_dir = "{}/test".format(folder_name)

	# Both subsets must share the split fraction and the seed so that they are
	# two complementary parts of the same shuffled file list.
	split_options = dict(batch_size=batch_size, validation_split=0.2, seed=1337)

	raw_train_ds = loader(train_dir, subset="training", **split_options)
	raw_val_ds = loader(train_dir, subset="validation", **split_options)
	raw_test_ds = loader(test_dir, batch_size=batch_size)

	return raw_train_ds, raw_val_ds, raw_test_ds

In [9]:
def get_vectorize_layer(raw_train_ds):
	"""Create a TextVectorization layer and fit its vocabulary on the training texts.

	Args:
		raw_train_ds: Dataset yielding ``(text, label)`` pairs.

	Returns:
		The adapted TextVectorization layer (integer output, capped at
		``max_features`` tokens, every text output at ``sequence_length`` ids).
	"""
	# Only the texts are needed to learn the vocabulary; drop the labels.
	texts_only = raw_train_ds.map(lambda text, label: text)

	vectorizer = TextVectorization(
		max_tokens=max_features,
		output_sequence_length=sequence_length,
		output_mode="int",
		standardize='lower_and_strip_punctuation',
	)
	vectorizer.adapt(texts_only)
	return vectorizer

In [10]:
def get_vectorize_text_map(vectorize_layer):
	"""Build a ``(text, label) -> (token_ids, label)`` function for ``Dataset.map``.

	Args:
		vectorize_layer: Callable applied to the expanded text tensor.

	Returns:
		A function that vectorizes the text and passes the label through untouched.
	"""
	def map_text(text, label):
		# Append a trailing axis to the text tensor before handing it to the layer.
		expanded = tf.expand_dims(text, axis=-1)
		token_ids = vectorize_layer(expanded)
		return token_ids, label

	return map_text

def create_final_datasets(raw_train_ds, raw_val_ds, raw_test_ds, vectorize_layer):
	"""Vectorize the three raw datasets and add caching and prefetching.

	Args:
		raw_train_ds: Raw training dataset of ``(text, label)`` batches.
		raw_val_ds: Raw validation dataset.
		raw_test_ds: Raw test dataset.
		vectorize_layer: Adapted vectorization layer used to tokenize the texts.

	Returns:
		Tuple ``(train_ds, val_ds, test_ds)`` of vectorized datasets.
	"""
	to_token_ids = get_vectorize_text_map(vectorize_layer)

	# First pass: tokenize every split with the same mapping function.
	vectorized = [
		raw_ds.map(to_token_ids)
		for raw_ds in (raw_train_ds, raw_val_ds, raw_test_ds)
	]
	# Second pass: cache each split and prefetch up to 10 elements.
	train_ds, val_ds, test_ds = [
		ds.cache().prefetch(buffer_size=10) for ds in vectorized
	]

	return train_ds, val_ds, test_ds

In [11]:
def create_model(embedding_matrix, num_tokens):
	"""Build and compile the convolutional text classifier.

	The network is: frozen embedding -> dropout -> two strided 1D convolutions
	-> global max pooling -> dense hidden layer -> dropout -> one sigmoid unit.

	Args:
		embedding_matrix: Initial weights for the embedding layer.
		num_tokens: Number of rows (input dimension) of the embedding layer.

	Returns:
		A compiled ``tf.keras.Model`` taking int64 token-id sequences.
	"""
	token_ids = tf.keras.Input(shape=(None,), dtype="int64")

	# The embedding starts from the supplied matrix and is not updated by training.
	frozen_embedding = layers.Embedding(
		num_tokens,
		embedding_dim,
		embeddings_initializer=initializers.Constant(embedding_matrix),
		trainable=False,
	)
	hidden = frozen_embedding(token_ids)
	hidden = layers.Dropout(0.5)(hidden)

	# Two identical convolutions, then collapse the sequence axis.
	for _ in range(2):
		hidden = layers.Conv1D(100, 7, padding="valid", activation="relu", strides=3)(hidden)
	hidden = layers.GlobalMaxPooling1D()(hidden)

	hidden = layers.Dense(100, activation="relu")(hidden)
	hidden = layers.Dropout(0.5)(hidden)

	# Single sigmoid unit: binary classification output.
	output = layers.Dense(1, activation="sigmoid", name="predictions")(hidden)

	classifier = tf.keras.Model(token_ids, output)
	classifier.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
	return classifier

In [12]:
def create_e2e_model(model, vectorize_layer):
	"""Wrap a token-id model so that it accepts raw strings.

	Args:
		model: Model consuming the output of ``vectorize_layer``.
		vectorize_layer: Layer turning strings into token ids.

	Returns:
		A compiled ``tf.keras.Model`` with a string input of shape ``(1,)`` that
		runs vectorization followed by ``model``.
	"""
	raw_text = tf.keras.Input(shape=(1,), dtype="string")
	# Chain: strings -> token ids -> model output.
	scores = model(vectorize_layer(raw_text))

	wrapped = tf.keras.Model(raw_text, scores)
	# Compiled with the same loss, optimizer and metric as the inner classifier.
	wrapped.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
	return wrapped

In [13]:
def train_model(model, train_ds, val_ds, epochs = 7):
	"""Fit ``model`` on ``train_ds``, validating on ``val_ds``, and return it.

	Args:
		model: Object exposing a Keras-style ``fit`` method.
		train_ds: Training data passed as the first argument to ``fit``.
		val_ds: Validation data passed as ``validation_data``.
		epochs: Number of epochs (defaults to 7).

	Returns:
		The same ``model`` object, after ``fit`` has been called on it.
	"""
	fit_options = {"validation_data": val_ds, "epochs": epochs}
	model.fit(train_ds, **fit_options)
	return model

In [14]:
def create_embedding_matrix(vectorize_layer):
	"""Build the embedding matrix for the vectorizer's vocabulary.

	Row ``i`` holds the ``embeddings_index`` vector of the vocabulary word with
	index ``i``; words without a vector keep an all-zero row. The number of
	words found and missed is printed (the counters used to be computed and
	then discarded).

	Args:
		vectorize_layer: Adapted layer exposing ``get_vocabulary()``.

	Returns:
		Tuple ``(embedding_matrix, num_tokens)`` where the matrix has shape
		``(num_tokens, embedding_dim)`` and ``num_tokens == len(vocabulary) + 2``.
	"""
	voc = vectorize_layer.get_vocabulary()
	word_index = {word: index for index, word in enumerate(voc)}
	# NOTE(review): the +2 adds two rows beyond the vocabulary that always stay
	# zero; kept because the matrix shape must match the model's Embedding layer.
	num_tokens = len(voc) + 2

	embedding_matrix = np.zeros((num_tokens, embedding_dim))
	hits = 0
	misses = 0
	for word, i in word_index.items():
		embedding_vector = embeddings_index.get(word)
		if embedding_vector is None:
			# No pretrained vector for this word: its row stays all zeros.
			misses += 1
			continue
		embedding_matrix[i] = embedding_vector
		hits += 1

	# Report vocabulary coverage, matching the notebook's recorded output format.
	print("Converted %d words (%d misses)" % (hits, misses))

	return embedding_matrix, num_tokens

In [28]:
def test_dataset(dataset_name, epochs):
	raw_train_ds, raw_val_ds, raw_test_ds = load_dataset(dataset_name, 5)
	vectorize_layer = get_vectorize_layer(raw_train_ds)

	train_ds, val_ds, test_ds = create_final_datasets(raw_train_ds, raw_val_ds, raw_test_ds, vectorize_layer)
	matrix, tokens = create_embedding_matrix(vectorize_layer)
	model = create_model(matrix, tokens)
	model = train_model(model, train_ds, val_ds, epochs)
	model.evaluate(test_ds)
	e2e_model = create_e2e_model(model, vectorize_layer)
	return e2e_model

In [29]:
# Train and evaluate on the "hyperboleset" folder for 8 epochs; keep the returned
# end-to-end (raw-string input) model for the caption evaluation further down.
hyperbole_model = test_dataset("hyperboleset", 8)

Found 80 files belonging to 2 classes.
Using 64 files for training.
Found 80 files belonging to 2 classes.
Using 16 files for validation.
Found 21 files belonging to 2 classes.
Number of batches in raw_train_ds: 13
Number of batches in raw_val_ds: 4
Number of batches in raw_test_ds: 5
Converted 359 words (26 misses)
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [32]:
# Train and evaluate on the "incertezaset" folder for 8 epochs; keep the returned
# end-to-end (raw-string input) model for the caption evaluation further down.
uncertainty_model = test_dataset("incertezaset", 8)

Found 63 files belonging to 2 classes.
Using 51 files for training.
Found 63 files belonging to 2 classes.
Using 12 files for validation.
Found 16 files belonging to 2 classes.
Number of batches in raw_train_ds: 11
Number of batches in raw_val_ds: 3
Number of batches in raw_test_ds: 4
Converted 354 words (25 misses)
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [34]:
# Train and evaluate on the "metonymyset" folder for 10 epochs; keep the returned
# end-to-end (raw-string input) model for the caption evaluation further down.
metonymy_model = test_dataset("metonymyset", 10)

Found 43 files belonging to 2 classes.
Using 35 files for training.
Found 43 files belonging to 2 classes.
Using 8 files for validation.
Found 11 files belonging to 2 classes.
Number of batches in raw_train_ds: 7
Number of batches in raw_val_ds: 2
Number of batches in raw_test_ds: 3
Converted 194 words (11 misses)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [35]:
# Train and evaluate on the "antithesis_set" folder for 10 epochs; keep the returned
# end-to-end (raw-string input) model for the caption evaluation further down.
antithesis_model = test_dataset("antithesis_set", 10)

Found 55 files belonging to 2 classes.
Using 44 files for training.
Found 55 files belonging to 2 classes.
Using 11 files for validation.
Found 14 files belonging to 2 classes.
Number of batches in raw_train_ds: 9
Number of batches in raw_val_ds: 3
Number of batches in raw_test_ds: 3
Converted 245 words (10 misses)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [26]:
# Load the semicolon-separated captions file. The cells below read its `text`
# column plus one label column per model: hyperbole, uncertainty, metonymy, antithesis.
captions = pd.read_csv("captions.csv", sep=";")

In [31]:
# Score the hyperbole model on the caption texts against the `hyperbole` labels.
# The model is compiled with binary cross-entropy and an accuracy metric, so the
# displayed list is presumably [loss, accuracy].
hyperbole_model.evaluate(captions.text, captions.hyperbole)



[0.05150493606925011, 1.0]

In [33]:
# Score the uncertainty model on the caption texts against the `uncertainty` labels.
# The model is compiled with binary cross-entropy and an accuracy metric, so the
# displayed list is presumably [loss, accuracy].
uncertainty_model.evaluate(captions.text, captions.uncertainty)



[0.9790690541267395, 0.5714285969734192]

In [36]:
# Score the metonymy model on the caption texts against the `metonymy` labels.
# The model is compiled with binary cross-entropy and an accuracy metric, so the
# displayed list is presumably [loss, accuracy].
metonymy_model.evaluate(captions.text, captions.metonymy)



[1.1553064584732056, 0.4285714328289032]

In [37]:
# Score the antithesis model on the caption texts against the `antithesis` labels.
# The model is compiled with binary cross-entropy and an accuracy metric, so the
# displayed list is presumably [loss, accuracy].
antithesis_model.evaluate(captions.text, captions.antithesis)



[2.146885633468628, 0.5714285969734192]