### This code was run on Deepnote with a lot of RAM and computational resources.

In [None]:
import tensorflow as tf
from transformers import TFAutoModel, AutoTokenizer
import datasets
import pandas as pd
import json
import numpy as np

2023-12-19 17:56:46.767524: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the tokenizer and the TensorFlow BERT encoder from one checkpoint name,
# so the two can never drift apart.
checkpoint_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = TFAutoModel.from_pretrained(checkpoint_name)

2023-12-19 17:56:50.909230: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from

In [None]:
# Everything a reader might want to tune for this cell lives in one place.
TRAIN_PATH = 'subtaskA_train_monolingual.jsonl'
TEST_PATH = 'subtaskA_dev_monolingual.jsonl'
N_TRAIN_SAMPLES = 10000  # training is slow, so only a subset of the training set is used
SHUFFLE_SEED = 42

train_df = pd.read_json(TRAIN_PATH, lines=True)
test_df = pd.read_json(TEST_PATH, lines=True)

# Shuffle all rows with a fixed seed so the subset below is reproducible.
shuffled_train = train_df.sample(frac=1.0, random_state=SHUFFLE_SEED)
shuffled_test = test_df.sample(frac=1.0, random_state=SHUFFLE_SEED)

# `from_pandas` is the supported way to turn a DataFrame into a Dataset;
# `preserve_index=False` keeps the shuffled pandas index out of the columns.
train_dataset = datasets.Dataset.from_pandas(
    shuffled_train.head(N_TRAIN_SAMPLES), preserve_index=False
)
test_dataset = datasets.Dataset.from_pandas(shuffled_test, preserve_index=False)
complete_dict = datasets.DatasetDict({"train": train_dataset, "test": test_dataset})

In [None]:
def tokenize(batch):
    """Tokenize the "text" field of a batch with the module-level tokenizer.

    Sequences are padded to the longest one in the batch and truncated to the
    tokenizer's maximum length.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded

In [None]:
# Tokenize every split. `batch_size=None` hands each split to `tokenize` as a
# single batch, so padding is computed over the whole split at once.
complete_encoded = complete_dict.map(
    tokenize,
    batched=True,
    batch_size=None,
)

Map: 100%|██████████| 10000/10000 [00:03<00:00, 3117.12 examples/s]
Map: 100%|██████████| 5000/5000 [00:01<00:00, 3794.73 examples/s]


In [None]:
# Build a lookup from the token ids of each test example to that example's id,
# so a prediction can later be matched back to the row it belongs to.
test_split = complete_encoded["test"]
input_ids = test_split["input_ids"]
ids = test_split["id"]

# Lists are unhashable; tuples can serve as dictionary keys.
input_ids_tuples = list(map(tuple, input_ids))

id_map = {key: example_id for key, example_id in zip(input_ids_tuples, ids)}

In [None]:
# Adapted from Kaggle: https://www.kaggle.com/code/pritishmishra/fine-tune-bert-for-text-classification/notebook?scriptVersionId=116951029
# Return only the model inputs and the label, formatted as TensorFlow tensors.
tf_columns = ['input_ids', 'attention_mask', 'token_type_ids', 'label']
complete_encoded.set_format(type='tf', columns=tf_columns)

# Number of examples per batch.
BATCH_SIZE = 64

def order(inp):
    '''
    Split one dataset element into (model inputs, label).

    Args:
        inp: Mapping with the keys 'input_ids', 'attention_mask',
            'token_type_ids' and 'label'.

    Returns:
        A tuple (features, label) where features is a dict holding the three
        BERT inputs under their own names and label is inp['label'].
    '''
    # Look every field up by name instead of by position in inp.values():
    # positional indexing silently depends on the dataset's column order and
    # can hand the attention mask and the token type ids to the wrong input.
    features = {
        'input_ids': inp['input_ids'],
        'attention_mask': inp['attention_mask'],
        'token_type_ids': inp['token_type_ids'],
    }
    return features, inp['label']
    

def _make_tf_dataset(split_name, shuffle=False):
    """Build a batched tf.data pipeline for one split of `complete_encoded`.

    Args:
        split_name: Key of the split inside `complete_encoded`
            ('train' or 'test').
        shuffle: When True, individual examples are shuffled before batching.

    Returns:
        A tf.data.Dataset that yields the output of `order` for each batch of
        BATCH_SIZE examples.
    """
    split = complete_encoded[split_name]
    dataset = tf.data.Dataset.from_tensor_slices(split[:])
    if shuffle:
        # Shuffle before batching so that batches are composed of different
        # examples each pass, instead of only reordering fixed batches.
        dataset = dataset.shuffle(buffer_size=len(split))
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(order, num_parallel_calls=tf.data.AUTOTUNE)
    # Overlap input preparation with model execution.
    return dataset.prefetch(tf.data.AUTOTUNE)


train_dataset = _make_tf_dataset('train', shuffle=True)

# The test pipeline is never shuffled: predictions must come back in the same
# order as the test rows.
test_dataset = _make_tf_dataset('test')

In [None]:
# Adapted from Kaggle: https://www.kaggle.com/code/pritishmishra/fine-tune-bert-for-text-classification/notebook?scriptVersionId=116951029

class BERTForClassification(tf.keras.Model):
    """BERT encoder followed by a softmax dense layer.

    Args:
        bert_model: Encoder that is called on the input dict.
        num_classes: Number of output units of the dense layer.
    """

    def __init__(self, bert_model, num_classes):
        super().__init__()
        self.bert = bert_model
        self.fc = tf.keras.layers.Dense(units=num_classes, activation='softmax')

    def call(self, inputs):
        """Run the encoder and return the softmax output of the dense layer."""
        bert_outputs = self.bert(inputs)
        # Element 1 of the encoder output feeds the classifier head
        # (presumably the pooled [CLS] representation - TODO confirm).
        pooled = bert_outputs[1]
        return self.fc(pooled)

In [None]:
# Adapted from Kaggle: https://www.kaggle.com/code/pritishmishra/fine-tune-bert-for-text-classification/notebook?scriptVersionId=116951029

classifier = BERTForClassification(model, num_classes=2)

# The head already applies softmax, so the loss is left at its default of
# expecting probabilities rather than logits.
adam = tf.keras.optimizers.Adam(learning_rate=1e-5)
sparse_cce = tf.keras.losses.SparseCategoricalCrossentropy()

classifier.compile(optimizer=adam, loss=sparse_cce, metrics=['accuracy'])

In [None]:
# Fine-tune for a single pass over the training pipeline.
history = classifier.fit(train_dataset, epochs=1)



In [None]:
# Recover the id of every test example, in prediction order.
#
# The test pipeline is batched but never shuffled, so predictions come back in
# the same order as the rows of `shuffled_test`. Taking the ids positionally
# avoids a reverse lookup by token ids, which maps duplicated (or identically
# truncated) texts to the same id and produces a "Not Found" placeholder that
# cannot be converted to int when the predictions are written out.
ids = shuffled_test["id"].tolist()

In [None]:
# Predict on the test set and write one JSON object per line, each holding
# the example id and its predicted label.
output_file = 'predictions.jsonl'

predictions = classifier.predict(test_dataset)

# Pick the highest-scoring class for every row.
predicted_labels = tf.argmax(predictions, axis=1).numpy()

# Cast to plain ints so json can serialize the values.
results = [
    {"id": int(i), "label": int(label)}
    for i, label in zip(ids, predicted_labels)
]

with open(output_file, 'w', encoding='utf-8') as f:
    f.writelines(json.dumps(result) + '\n' for result in results)

print(f'Predictions saved to {output_file}')

Predictions saved to predictions.jsonl


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=4fbf66f6-adcb-47c6-bdee-342fcaac18fa' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>