In [122]:
from datasets import load_dataset

paradigms = load_dataset("debate-land/2023-paradigms")["train"].train_test_split(test_size=0.3)



  0%|          | 0/1 [00:00<?, ?it/s]

In [123]:
paradigms['train'][0]

{'text': 'I debated', 'label': 0}

In [124]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [125]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)


In [126]:
tokenized_paradigms = paradigms.map(preprocess_function, batched=True)

Map:   0%|          | 0/471 [00:00<?, ? examples/s]

Map:   0%|          | 0/202 [00:00<?, ? examples/s]

In [127]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [128]:
import evaluate

accuracy = evaluate.load("accuracy")

In [129]:
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [130]:
id2label = {
    0: "FLOW",
    1: "LAY"
}

label2id = {
    "FLOW": 0,
    "LAY": 1
}

In [131]:
from transformers import create_optimizer
import tensorflow as tf

batch_size = 16
num_epochs = 5
batches_per_epoch = len(tokenized_paradigms["train"]) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)
#optimizer = tf.keras.optimizers.legacy.Adam()
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)



In [132]:
from transformers import TFAutoModelForSequenceClassification

model = TFAutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [133]:
tf_train_set: tf.data.Dataset = model.prepare_tf_dataset(
    tokenized_paradigms["train"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_paradigms["test"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [134]:
from transformers.keras_callbacks import KerasMetricCallback

metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)

In [135]:
callbacks = [metric_callback]

In [136]:
import tensorflow as tf

model.compile(optimizer=optimizer)  # No loss argument!



In [137]:
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=callbacks)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x2e783d510>

In [138]:
# model.export(filepath='model')

In [139]:
import math

def logit_to_prob(logit: float):
    odds = math.pow(math.e, logit)
    return odds / (1 + odds)

In [166]:
text = """

I am a parent judge.
no theory, no K's, no complicated phil, no tricks
Speed:
DO NOT SPREAD, please speak clearly


"""
inputs = tokenizer(text, return_tensors="tf")

logits = model(**inputs).logits

for i, logit in enumerate(logits.numpy()[0]):
    print(f'{id2label[i]}: {round(logit_to_prob(logit) * 100, 1)}%')
predicted_class_id = int(tf.math.argmax(logits, axis=-1)[0])
model.config.id2label[predicted_class_id]

FLOW: 22.8%
LAY: 73.6%


'LAY'

In [174]:
# #model.save_pretrained(save_directory='model')
# tf.keras.saving.save_model(
#     model,
#     "model2",
# )

model.save_pretrained('model')
tokenizer.save_pretrained('model')

Configuration saved in paradigm-model/config.json
Model weights saved in paradigm-model/tf_model.h5
tokenizer config file saved in paradigm-model/tokenizer_config.json
Special tokens file saved in paradigm-model/special_tokens_map.json


('paradigm-model/tokenizer_config.json',
 'paradigm-model/special_tokens_map.json',
 'paradigm-model/vocab.txt',
 'paradigm-model/added_tokens.json',
 'paradigm-model/tokenizer.json')