In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, LSTM, TextVectorization, Embedding, Dropout, Bidirectional, GRU
from tensorflow.keras.models import Sequential
from keras.losses import BinaryCrossentropy, CategoricalCrossentropy, SparseCategoricalCrossentropy
from keras.optimizers import Adam, RMSprop
import numpy as np
import pandas as pd

In [49]:
path = 'SemEval2024-Task8\\SubtaskA\\subtaskA_dev_monolingual.jsonl'

def load_data(path):
    data = pd.read_json(path, lines=True)
    data = data[['text', 'label']]    
    return data

In [50]:
data = load_data(path)

In [51]:
MAX_TOKENS = 5000
MAX_OUT = 500

In [52]:
vectorize_layer = TextVectorization(
    max_tokens=MAX_TOKENS,
    output_mode="int",
    output_sequence_length=MAX_OUT,
)

In [53]:
def vectorize_text(text):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text)

In [54]:
def create_model():
    model = Sequential()
    model.add(Embedding(MAX_TOKENS, 64, input_length=MAX_OUT))
    model.add(Bidirectional(GRU(64, return_sequences=False)))
    model.add(Dropout(0.3))
    model.add(Dense(64, activation="tanh"))
    model.add(Dense(64, activation="tanh"))
    model.add(Dense(64, activation="tanh"))
    model.add(Dense(32, activation="tanh"))
    model.add(Dense(1, activation="sigmoid"))
    model.compile(
        loss=BinaryCrossentropy(),
        optimizer=RMSprop(learning_rate=0.005),
        metrics=["accuracy"],
    )
    model.summary()
    return model

In [55]:
model = create_model()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 500, 64)           320000    
                                                                 
 bidirectional_4 (Bidirectio  (None, 128)              49920     
 nal)                                                            
                                                                 
 dropout_4 (Dropout)         (None, 128)               0         
                                                                 
 dense_20 (Dense)            (None, 64)                8256      
                                                                 
 dense_21 (Dense)            (None, 64)                4160      
                                                                 
 dense_22 (Dense)            (None, 64)                4160      
                                                      

In [60]:
# shuffle data
data = data.sample(frac=1)

# split data into train and test
texts = data['text'].to_numpy()
labels = data['label'].to_numpy()

vectorize_layer.adapt(texts)

In [61]:
train_size = int(0.8 * len(data))
test_size = int(0.1 * len(data))
val_size = int(0.1 * len(data))

# bring labels into correct format
train_labels = labels[:train_size].reshape(-1, 1)
test_labels = labels[train_size:train_size+test_size].reshape(-1, 1)
val_labels = labels[train_size+test_size:].reshape(-1, 1)

# map texts to vectors
train_dataset = tf.data.Dataset.from_tensor_slices((texts[:train_size], train_labels))
test_dataset = tf.data.Dataset.from_tensor_slices((texts[train_size:train_size+test_size], test_labels))
val_dataset = tf.data.Dataset.from_tensor_slices((texts[train_size+test_size:], val_labels))

# vectorize texts
train_dataset = train_dataset.map(lambda text, label: (vectorize_text(text), label))
test_dataset = test_dataset.map(lambda text, label: (vectorize_text(text), label))
val_dataset = val_dataset.map(lambda text, label: (vectorize_text(text), label))

# configure dataset for performance
AUTOTUNE = tf.data.AUTOTUNE
train_dataset = train_dataset.cache().prefetch(buffer_size=AUTOTUNE)
test_dataset = test_dataset.cache().prefetch(buffer_size=AUTOTUNE)
val_dataset = val_dataset.cache().prefetch(buffer_size=AUTOTUNE)

In [62]:
BATCH_SIZE=32

In [63]:
model.fit(train_dataset, epochs=10, validation_data=val_dataset, steps_per_epoch=len(train_dataset)//BATCH_SIZE, validation_steps=len(val_dataset)//BATCH_SIZE)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1f5b1b1a4c8>

In [64]:
scores = model.evaluate(test_dataset, verbose=1)
print("Accuracy: %.2f%%" % (scores[1]*100))


Accuracy: 48.80%


In [65]:
model.save('model.h5')

In [69]:
submit_path = 'data.jsonl'
new_data = load_data(submit_path)

In [70]:
new_texts = new_data['text'].to_numpy()
new_labels = new_data['label'].to_numpy()

new_dataset = tf.data.Dataset.from_tensor_slices((new_texts, new_labels))
new_dataset = new_dataset.map(lambda text, label: (vectorize_text(text), label))
new_dataset = new_dataset.cache().prefetch(buffer_size=AUTOTUNE)

In [71]:
# get labels for submission
labels = model.predict(new_dataset)
        



KeyError: 'id'