In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras import layers

In [2]:
# Fix the NumPy and TensorFlow global seeds to one shared value so that
# repeated runs of the notebook start from the same random state.
from numpy.random import seed
from tensorflow import random

SEED = 67
seed(SEED)
random.set_seed(SEED)

In [3]:
# Shared hyper-parameters for the text pipeline and the classifier.
max_features = 20_000  # max_tokens of the TextVectorization layer and input size of the Embedding
embedding_dim = 128  # output size of the Embedding layer
sequence_length = 500  # output_sequence_length of the TextVectorization layer

In [4]:
def load_dataset(folder_name, batch_size):
	"""Load the train, validation and test text datasets found under ``folder_name``.

	The training and validation datasets are both read from
	``<folder_name>/train``. The same ``validation_split`` (0.2) and ``seed``
	(1337) are passed to both calls, with only ``subset`` differing, so the two
	subsets are taken from the same 80/20 partition. The test dataset is read
	from ``<folder_name>/test``.

	Args:
		folder_name: Directory that contains the ``train`` and ``test`` folders.
		batch_size: Batch size forwarded unchanged to every loader call.

	Returns:
		A ``(raw_train_ds, raw_val_ds, raw_test_ds)`` tuple holding the objects
		returned by ``text_dataset_from_directory``.
	"""
	train_dir = "{}/train".format(folder_name)
	test_dir = "{}/test".format(folder_name)

	# Identical split settings for both subsets keep them disjoint.
	raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
		train_dir,
		batch_size=batch_size,
		validation_split=0.2,
		subset="training",
		seed=1337,
	)
	raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
		train_dir,
		batch_size=batch_size,
		validation_split=0.2,
		subset="validation",
		seed=1337,
	)
	raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
		test_dir, batch_size=batch_size
	)

	return raw_train_ds, raw_val_ds, raw_test_ds

In [5]:
def get_vectorize_layer(raw_train_ds):
	"""Create a ``TextVectorization`` layer and adapt it on the training texts.

	The layer is configured with the module-level ``max_features`` (as
	``max_tokens``) and ``sequence_length`` (as ``output_sequence_length``).

	Args:
		raw_train_ds: Dataset yielding ``(text, label)`` pairs; only the text
			component is passed to ``adapt``.

	Returns:
		The adapted ``TextVectorization`` layer.
	"""
	def keep_text_only(text, label):
		# Labels are irrelevant for adapting the vectorizer.
		return text

	layer = TextVectorization(
		output_sequence_length=sequence_length,
		output_mode="int",
		max_tokens=max_features,
		standardize='lower_and_strip_punctuation',
	)
	layer.adapt(raw_train_ds.map(keep_text_only))
	return layer

In [6]:
def get_vectorize_text_map(vectorize_layer):
	"""Return a ``(text, label)`` mapping function bound to ``vectorize_layer``.

	Args:
		vectorize_layer: Callable applied to the text after a trailing axis
			has been added to it.

	Returns:
		A function ``f(text, label)`` that returns
		``(vectorize_layer(expanded_text), label)``; the label is passed
		through untouched.
	"""
	def _vectorize_pair(text, label):
		# Add a trailing axis before handing the text to the layer.
		return vectorize_layer(tf.expand_dims(text, -1)), label

	return _vectorize_pair

def create_final_datasets(raw_train_ds, raw_val_ds, raw_test_ds, vectorize_layer):
	"""Vectorize the three raw datasets and attach caching and prefetching.

	Each dataset goes through the same pipeline: map with the function
	returned by ``get_vectorize_text_map``, then ``cache()``, then
	``prefetch(buffer_size=10)``.

	Args:
		raw_train_ds: Raw training dataset of ``(text, label)`` pairs.
		raw_val_ds: Raw validation dataset of ``(text, label)`` pairs.
		raw_test_ds: Raw test dataset of ``(text, label)`` pairs.
		vectorize_layer: Layer used to turn text into token indices.

	Returns:
		A ``(train_ds, val_ds, test_ds)`` tuple of the processed datasets.
	"""
	to_tokens = get_vectorize_text_map(vectorize_layer)

	def _prepare(raw_ds):
		# One shared pipeline keeps the three splits consistent.
		return raw_ds.map(to_tokens).cache().prefetch(buffer_size=10)

	return _prepare(raw_train_ds), _prepare(raw_val_ds), _prepare(raw_test_ds)

In [7]:
def create_model():
	"""Build and compile the 1D-convolutional binary text classifier.

	The network takes variable-length int64 token indices and stacks, in order:
	an embedding (``max_features`` x ``embedding_dim``), dropout, two strided
	``Conv1D`` layers, global max pooling, a dense ReLU layer, dropout, and a
	single sigmoid output named ``"predictions"``.

	Returns:
		The model, compiled with binary cross-entropy loss, the Adam optimizer
		and the accuracy metric.
	"""
	token_ids = tf.keras.Input(shape=(None,), dtype="int64")

	# Embed the token indices, then regularize.
	features = layers.Embedding(max_features, embedding_dim)(token_ids)
	features = layers.Dropout(0.5)(features)

	# Two identical strided convolutions followed by pooling over the sequence axis.
	conv_options = dict(padding="valid", activation="relu", strides=3)
	for _ in range(2):
		features = layers.Conv1D(128, 7, **conv_options)(features)
	pooled = layers.GlobalMaxPooling1D()(features)

	# Fully connected head.
	hidden = layers.Dense(128, activation="relu")(pooled)
	hidden = layers.Dropout(0.5)(hidden)
	predictions = layers.Dense(1, activation="sigmoid", name="predictions")(hidden)

	classifier = tf.keras.Model(token_ids, predictions)
	classifier.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
	return classifier

In [8]:
def train_model(model, train_ds, val_ds, epochs=7):
	"""Fit ``model`` on ``train_ds`` using ``val_ds`` as validation data.

	Args:
		model: Object exposing a Keras-style ``fit`` method.
		train_ds: Training data passed positionally to ``fit``.
		val_ds: Data passed to ``fit`` as ``validation_data``.
		epochs: Number of epochs passed to ``fit`` (default 7).

	Returns:
		The same ``model`` object; the value returned by ``fit`` is discarded.
	"""
	fit_options = {"validation_data": val_ds, "epochs": epochs}
	model.fit(train_ds, **fit_options)
	return model

In [9]:
def create_e2e_model(model, vectorize_layer):
	"""Wrap ``model`` so that it accepts raw strings instead of token indices.

	A string input of shape ``(1,)`` is passed through ``vectorize_layer`` and
	the result is fed to ``model``.

	Args:
		model: Classifier that consumes the output of ``vectorize_layer``.
		vectorize_layer: Layer converting strings into token indices.

	Returns:
		A new model from raw strings to the output of ``model``, compiled with
		binary cross-entropy loss, the Adam optimizer and the accuracy metric.
	"""
	raw_text = tf.keras.Input(shape=(1,), dtype="string")
	scores = model(vectorize_layer(raw_text))

	wrapped = tf.keras.Model(raw_text, scores)
	wrapped.compile(
		loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"]
	)
	return wrapped

In [10]:
def test_dataset(dataset_name, epochs):
	raw_train_ds, raw_val_ds, raw_test_ds = load_dataset(dataset_name, 5)
	vectorize_layer = get_vectorize_layer(raw_train_ds)
	train_ds, val_ds, test_ds = create_final_datasets(raw_train_ds, raw_val_ds, raw_test_ds, vectorize_layer)
	model = create_model()
	model = train_model(model, train_ds, val_ds, epochs)
	model.evaluate(test_ds)
	e2e_model = create_e2e_model(model, vectorize_layer)
	return e2e_model

In [11]:
# Train and evaluate the hyperbole classifier; the result accepts raw strings.
hyperbole_model = test_dataset(dataset_name="hyperboleset", epochs=8)

Found 80 files belonging to 2 classes.
Using 64 files for training.
Found 80 files belonging to 2 classes.
Using 16 files for validation.
Found 21 files belonging to 2 classes.
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [12]:
# Train and evaluate the uncertainty classifier; the result accepts raw strings.
uncertainty_model = test_dataset(dataset_name="incertezaset", epochs=8)

Found 63 files belonging to 2 classes.
Using 51 files for training.
Found 63 files belonging to 2 classes.
Using 12 files for validation.
Found 16 files belonging to 2 classes.
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [13]:
# Train and evaluate the metonymy classifier; the result accepts raw strings.
metonymy_model = test_dataset(dataset_name="metonymyset", epochs=10)

Found 43 files belonging to 2 classes.
Using 35 files for training.
Found 43 files belonging to 2 classes.
Using 8 files for validation.
Found 11 files belonging to 2 classes.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [14]:
# Train and evaluate the antithesis classifier; the result accepts raw strings.
antithesis_model = test_dataset(dataset_name="antithesis_set", epochs=10)

Found 55 files belonging to 2 classes.
Using 44 files for training.
Found 55 files belonging to 2 classes.
Using 11 files for validation.
Found 14 files belonging to 2 classes.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [15]:
# Semicolon-separated table of captions; later cells read its `text` column
# and one label column per model.
captions = pd.read_csv("captions.csv", delimiter=";")

In [26]:
# Evaluate on the captions (return value not displayed), then show the
# per-caption sigmoid scores as the cell output.
hyperbole_model.evaluate(captions["text"], captions["hyperbole"])
hyperbole_model.predict(captions["text"])



array([[0.98555505],
       [0.99903405],
       [0.9945671 ],
       [0.93080693],
       [0.9831228 ],
       [0.998544  ],
       [0.9729843 ]], dtype=float32)

In [25]:
# Evaluate on the captions (return value not displayed), then show the
# per-caption sigmoid scores as the cell output.
uncertainty_model.evaluate(captions["text"], captions["uncertainty"])
uncertainty_model.predict(captions["text"])



array([[0.92356837],
       [0.8670976 ],
       [0.6801777 ],
       [0.823809  ],
       [0.8568609 ],
       [0.84368724],
       [0.47467005]], dtype=float32)

In [24]:
# Evaluate on the captions (return value not displayed), then show the
# per-caption sigmoid scores as the cell output.
metonymy_model.evaluate(captions["text"], captions["metonymy"])
metonymy_model.predict(captions["text"])



array([[0.3946709 ],
       [0.4261344 ],
       [0.41406006],
       [0.40117604],
       [0.4079222 ],
       [0.39703694],
       [0.42626292]], dtype=float32)

In [23]:
# Evaluate on the captions (return value not displayed), then show the
# per-caption sigmoid scores as the cell output.
antithesis_model.evaluate(captions["text"], captions["antithesis"])
antithesis_model.predict(captions["text"])



array([[0.6579479 ],
       [0.89892447],
       [0.9297848 ],
       [0.6341864 ],
       [0.97060925],
       [0.83124745],
       [0.63600916]], dtype=float32)