In [2]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, LSTM, TextVectorization, Embedding, Dropout, Bidirectional, GRU
from tensorflow.keras.models import Sequential
from keras.losses import BinaryCrossentropy, CategoricalCrossentropy, SparseCategoricalCrossentropy
from keras.optimizers import Adam, RMSprop
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

2024-01-16 21:24:42.473083: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-16 21:24:42.804282: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-16 21:24:42.804386: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-16 21:24:42.847856: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-16 21:24:42.940942: I tensorflow/core/platform/cpu_feature_guar

In [3]:
def get_test_data(test_path):
    test_df = pd.read_json(test_path, lines=True)
    test_df = test_df[['text', 'label']]
    return test_df

def get_data(train_path):
   train_df = pd.read_json(train_path, lines=True)
   train_df = train_df[['text', 'label']]
   train_df, validation_df = train_test_split(train_df, test_size=0.1, stratify=train_df['label'])
   return train_df, validation_df

In [4]:
file_train_path = 'data/subtaskA_train_monolingual.jsonl'
train_df, validation_df = get_data(file_train_path)

In [5]:
file_test_path = 'data/subtaskA_dev_monolingual.jsonl'
test_df = get_test_data(file_test_path)

In [6]:
MAX_TOKENS = 5000
MAX_OUT = 500

In [7]:
vectorize_layer = TextVectorization(
    max_tokens=MAX_TOKENS,
    output_mode="int",
    output_sequence_length=MAX_OUT,
)

2024-01-16 21:24:51.009950: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-01-16 21:24:51.153586: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-01-16 21:24:51.154603: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-01-16 21:24:51.159447: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-01-16 21:24:51.160568: I external/local_xla/xla/stream_executor

In [8]:
def vectorize_text(text):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text)

In [9]:
def create_model():
    model = Sequential()
    model.add(Embedding(MAX_TOKENS, 64, input_length=MAX_OUT))
    model.add(Bidirectional(GRU(64, return_sequences=False)))
    model.add(Dropout(0.3))
    model.add(Dense(64, activation="tanh"))
    model.add(Dense(64, activation="tanh"))
    model.add(Dense(64, activation="tanh"))
    model.add(Dense(32, activation="tanh"))
    model.add(Dense(1, activation="sigmoid"))
    model.compile(
        loss=BinaryCrossentropy(),
        optimizer=RMSprop(learning_rate=0.005),
        metrics=["accuracy"],
    )
    model.summary()
    return model

In [10]:
model = create_model()

2024-01-16 21:25:23.694496: I external/local_tsl/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 64)           320000    
                                                                 
 bidirectional (Bidirection  (None, 128)               49920     
 al)                                                             
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dense_1 (Dense)             (None, 64)                4160      
                                                                 
 dense_2 (Dense)             (None, 64)                4160      
                                                        

In [11]:
# shuffle train data
train_df = train_df.sample(frac=1)

train_texts = train_df['text'].to_numpy()
train_labels = train_df['label'].to_numpy()

validation_texts = validation_df['text'].to_numpy()
validation_labels = validation_df['label'].to_numpy()

test_texts = test_df['text'].to_numpy()
test_labels = test_df['label'].to_numpy()

vectorize_layer.adapt(train_texts)
vectorize_layer.adapt(validation_texts)
vectorize_layer.adapt(test_texts)

In [12]:
# bring labels into correct format
train_labels = train_labels.reshape(-1, 1)
test_labels = test_labels.reshape(-1, 1)
val_labels = validation_labels.reshape(-1, 1)

# map texts to vectors
train_dataset = tf.data.Dataset.from_tensor_slices((train_texts, train_labels))
test_dataset = tf.data.Dataset.from_tensor_slices((test_texts, test_labels))
val_dataset = tf.data.Dataset.from_tensor_slices((validation_texts, val_labels))

# vectorize texts
train_dataset = train_dataset.map(lambda text, label: (vectorize_text(text), label))
test_dataset = test_dataset.map(lambda text, label: (vectorize_text(text), label))
val_dataset = val_dataset.map(lambda text, label: (vectorize_text(text), label))

# configure dataset for performance
AUTOTUNE = tf.data.AUTOTUNE
train_dataset = train_dataset.cache().prefetch(buffer_size=AUTOTUNE)
test_dataset = test_dataset.cache().prefetch(buffer_size=AUTOTUNE)
val_dataset = val_dataset.cache().prefetch(buffer_size=AUTOTUNE)

In [13]:
print(train_labels)

[[0]
 [0]
 [1]
 ...
 [0]
 [0]
 [1]]


In [14]:
BATCH_SIZE=32

In [15]:
model.fit(train_dataset, epochs=10, validation_data=val_dataset, steps_per_epoch=len(train_dataset)//BATCH_SIZE, validation_steps=len(val_dataset)//BATCH_SIZE)

Epoch 1/10


2024-01-16 21:26:28.996522: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8904
2024-01-16 21:26:29.869581: I external/local_xla/xla/service/service.cc:168] XLA service 0x7efa4fa84a20 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-01-16 21:26:29.869619: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3050 Ti Laptop GPU, Compute Capability 8.6
2024-01-16 21:26:29.881490: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1705433189.946562    7384 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.




In [64]:
scores = model.evaluate(test_dataset, verbose=1)
print("Accuracy: %.2f%%" % (scores[1]*100))


Accuracy: 48.80%


In [65]:
model.save('model.h5')

In [69]:
submit_path = 'data.jsonl'
new_data = load_data(submit_path)

In [70]:
new_texts = new_data['text'].to_numpy()
new_labels = new_data['label'].to_numpy()

new_dataset = tf.data.Dataset.from_tensor_slices((new_texts, new_labels))
new_dataset = new_dataset.map(lambda text, label: (vectorize_text(text), label))
new_dataset = new_dataset.cache().prefetch(buffer_size=AUTOTUNE)

In [71]:
# get labels for submission
labels = model.predict(new_dataset)
        



KeyError: 'id'