In [21]:
import os

import pandas as pd
import numpy as np

import tensorflow as tf

import evaluate
from datasets import Dataset
from transformers import AutoTokenizer, DataCollatorWithPadding, create_optimizer, TFAutoModelForSequenceClassification
from transformers.keras_callbacks import KerasMetricCallback

DATA_DIR = os.path.join('data', '.')

In [22]:
def read_data(path: str) -> pd.DataFrame:
    df = pd.read_csv(path, names=['label', 'title', 'text'])

    # Add title to the text
    df['text'] = df['title'] + '\n' + df['text']

    # Drop title as it's not gonna be used
    df = df.drop('title', axis=1)

    # Initially labels start from 1, many models work only when labels start from 0
    df['label'] = df['label'] - 1

    return df

# Read data
train_df = read_data(os.path.join(DATA_DIR, 'train.csv'))
test_df = read_data(os.path.join(DATA_DIR, 'test.csv'))

# Initialize huggingface Datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [23]:
# Example of data
train_dataset[0]

{'label': 2,
 'text': "Wall St. Bears Claw Back Into the Black (Reuters)\nReuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again."}

In [24]:
# Load pretrained tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/MiniLM-L12-H384-uncased")

In [25]:
def preprocess_function(examples, tokenizer=tokenizer):
    """Apply tokenizer to text."""
    return tokenizer(examples["text"], truncation=True)

# Apply tokenizer
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [26]:
# Initialize DataCollatorWithPadding to pad the sentences
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [27]:
# Define huggingface metric
accuracy = evaluate.load("accuracy")

In [28]:
def compute_metrics(eval_pred):
    """Callback function to compute metrics."""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [29]:
# Create map of ids to labels, for this part we use range of labels 0-3 as mentioned in `read_data`
id2label = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}
label2id = {value: key for key, value in id2label.items()}

In [30]:
# Define model training params
batch_size = 16
num_epochs = 3
batches_per_epoch = len(train_dataset) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)

optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

In [31]:
# Load pretrained model
model = TFAutoModelForSequenceClassification.from_pretrained(
    "microsoft/MiniLM-L12-H384-uncased", num_labels=4, id2label=id2label, label2id=label2id
)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
# Create train and validation sets
tf_train_set = model.prepare_tf_dataset(
    tokenized_train_dataset,
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_test_dataset,
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator,
)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [34]:
# Fit model
model.compile(optimizer=optimizer, metrics=['accuracy'])

# Create metric callback
metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)

model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=num_epochs, callbacks=[metric_callback])

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f686e7e9ac0>

In [35]:
model.save_pretrained('pretrained_MiniLM')