In [92]:
import os
os.environ["KERAS_BACKEND"] = "tensorflow"

import tensorflow as tf
import pandas as pd
import numpy as np
import keras
from keras import layers, models
from transformers import BertTokenizer, TFBertForSequenceClassification
from tqdm import tqdm
from keras.optimizers import Adam
from keras.losses import SparseCategoricalCrossentropy
from keras.metrics import SparseCategoricalAccuracy

import string
import re

In [65]:
skip_training = False

In [66]:
# Load training and validation datasets
train_data = pd.read_csv("./train_2024.csv")
valid_data = pd.read_csv("./dev_2024.csv")

X_train = tf.convert_to_tensor(train_data["text"])
y_train = tf.convert_to_tensor(train_data["label"])
X_val = tf.convert_to_tensor(valid_data["text"])
y_val = tf.convert_to_tensor(valid_data["label"])

# Unsqueeze the X tensors (x,) -> (x,1)
X_train = tf.reshape(X_train, (len(X_train), 1))
X_val = tf.reshape(X_val, (len(X_val), 1))

In [67]:
print(f"Example of training input:\n{X_train[0][0].numpy()} maps to label {y_train[0].numpy()}")
print(f"Example of validation input:\n{X_val[0][0].numpy()} maps to label {y_val[0].numpy()}")
print(f"Training dataset size is {len(y_train)} and validation dataset size is {len(y_val)}")
tf.config.list_physical_devices()

Example of training input:
b'Except that Desmond played first base last night. Tapia was in LF  and Reynolds had a night off.' maps to label 0
Example of validation input:
b'He was older  and was carrying a small bucket.' maps to label 0
Training dataset size is 98629 and validation dataset size is 10980


[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [83]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
MAX_LENGTH = 50
LR = 2e-5
def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
  return {
      "input_ids": input_ids,
      "token_type_ids": token_type_ids,
      "attention_mask": attention_masks,
  }, label

def encode(X, y):
    input_ids_list = []
    token_type_ids_list = []
    attention_mask_list = []
    label_list = []

      for i in tqdm(range(len(X))):
            bert_input = tokenizer.encode_plus(
                            str(X[i][0].numpy().decode('utf-8')),
                            add_special_tokens=True,
                            max_length = MAX_LENGTH, # max length of the text that can go to BERT
                            pad_to_max_length = True, # add [PAD] tokens
                            return_attention_mask = True, # add attention mask to not focus on pad tokens
                            truncation=True
              )
    input_ids_list.append(bert_input["input_ids"])
    token_type_ids_list.append(bert_input["token_type_ids"])
    attention_mask_list.append(bert_input["attention_mask"])
    label_list.append(y[i])

      return np.array(input_ids_list), np.array(attention_mask_list), np.array(label_list)
  #return tf.data.Dataset.from_tensor_slices((input_ids_list, attention_mask_list, token_type_ids_list, label_list)).map(map_example_to_dict)

In [84]:
train_ids, train_attention_masks, train_labels = encode(X_train, y_train)
val_ids, val_attention_masks, val_labels = encode(X_val, y_val)

#train_dataset = encode(X_train, y_train)
#val_dataset = encode(X_val, y_val)
#print(train_dataset)

100%|██████████| 98629/98629 [03:45<00:00, 436.65it/s]
100%|██████████| 10980/10980 [00:24<00:00, 457.02it/s]


In [108]:
print(train_ids[0], train_attention_masks[0], train_labels[0])
#train_dataset = train_dataset.batch(32)
#val_dataset = val_dataset.batch(32)

[  101  2205  2919 21591  2187  3358  8351  2066  2017  1999  1996  3915
  2024 26476  2098  2296  2154  2011  1996  4507  1997  3974   999   999
   999   999  2009  2097  2022  2488  2449  2084  1996  3915  2038  2464
  1999  5109 10916   102     0     0     0     0     0     0     0     0
     0     0] [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 0 0 0 0 0 0 0 0 0 0] 1


In [93]:
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
print(model.summary())
opt = "adam" # Adam(learning_rate=2e-5,epsilon=1e-08) # Fucks sake, wont interpret optimizer adam.Adam -object, lr=0.001 and high epsilon fuck up finetune
loss = SparseCategoricalCrossentropy(from_logits=True)
metric = SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=opt, loss=loss, metrics=[metric])


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification_38"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_1481 (Dropout)      multiple                  0 (unused)
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109483778 (417.65 MB)
Trainable params: 109483778 (417.65 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [94]:
if not skip_training:
    checkpoint_filepath = './checkpoint.keras'
    model_checkpoint_callback = keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        monitor='val_accuracy',
        mode='max',
        save_best_only=True,
        save_weights_only=True
    )

    model.fit([train_ids, train_attention_masks], train_labels,
              epochs=1,
              batch_size=32,
              validation_data=([val_ids, val_attention_masks], val_labels),
              callbacks=[model_checkpoint_callback],
              )



In [95]:
def save_model(path: str, model):
    model.save_weights(path)

def load_model(path: str, model):
    model.load_weights(path)

In [96]:
if not skip_training:
    # path must en in .weights.h5 as in checkpoint format
    model.load_weights("checkpoint.keras")
    save_model("./model_best2.weights.h5", model)



In [None]:
if skip_training:
    load_model("./model_best2.weights.h5", model)

In [97]:
# Run the tests with trained model

test_data = pd.read_csv("./test_2024.csv", quoting=3)
X_test = tf.convert_to_tensor(test_data["text"])
X_test = tf.reshape(X_test, (len(X_test), 1))

input_ids_list = []
token_type_ids_list = []
attention_mask_list = []

for i in range(len(X_test)):
    bert_input = tokenizer.encode_plus(
                        str(X_test[i][0].numpy().decode('utf-8')),
                        max_length = MAX_LENGTH,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        truncation=True
              )
    input_ids_list.append(bert_input["input_ids"])
    token_type_ids_list.append(bert_input["token_type_ids"])
    attention_mask_list.append(bert_input["attention_mask"])

test_ids = np.array(input_ids_list)
test_attention_masks = np.array(attention_mask_list)



In [105]:
y_preds = model.predict([test_ids, test_attention_masks])



In [107]:
y_preds_clean = np.argmax(y_preds.logits, axis=-1)
print(y_preds_clean)
print(f"Predictions are of shape {y_preds_clean.shape} and head of predictions is:")
#rounded = np.where(y_preds.numpy() > 0.5, 1, 0)
ans_df = pd.DataFrame(y_preds_clean)
print(ans_df.head)

final_df = pd.concat([test_data["text"], ans_df], axis=1)

final_df.to_csv("./bert_preds.csv", index=False)

[1 1 1 ... 1 1 1]
Predictions are of shape (12001,) and head of predictions is:
<bound method NDFrame.head of        0
0      1
1      1
2      1
3      1
4      1
...   ..
11996  1
11997  1
11998  1
11999  1
12000  1

[12001 rows x 1 columns]>
