# core

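> Label a sample of a dataset with an LLM, train a small classifier on those labels, and use that classifier to filter the full dataset.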

In [None]:
# | default_exp core

In [None]:
# | export
import evaluate
import time

import numpy as np

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
)

In [None]:
# | hide
from nbdev.showdoc import *

In [None]:
# | export
def classify(
    x, labels, llm_labeler, max_failures=5, default_label=0, retry_delay=1
):
    """
    Classifies a single example with a labeler, retrying when the labeler fails.

    Args:
        x (Any): Input passed to `llm_labeler`
        labels (List[str]): Known labels; the position of the predicted label in this list is returned
        llm_labeler (Callable): Called as `llm_labeler(x)`; the first element of its result is taken as the predicted label
        max_failures (int): Number of failed attempts after which to give up
        default_label (int): Value returned once every attempt has failed
        retry_delay (float): Seconds to wait between a failed attempt and the next one

    Returns:
        int: Index of the predicted label in `labels`, or `default_label` if no attempt succeeded
    """
    for attempt in range(1, max_failures + 1):
        try:
            return labels.index(llm_labeler(x)[0])
        except Exception as e:
            # Best effort: any error (labeler failure, empty result, or a
            # predicted label missing from `labels`) counts as a failed attempt.
            print(e)
            # Only wait when another attempt will actually follow.
            if attempt < max_failures:
                time.sleep(retry_delay)
    # Reached when every attempt failed, or when max_failures <= 0.
    return default_label

In [None]:
# | export
def label_dataset(
    dataset, text_column, labeler_model, labels, sample=0.1, num_workers=4
):
    """
    Labels a random sample of a dataset using a labeler model.

    Args:
        dataset (datasets.Dataset): Dataset to draw the sample from
        text_column (str): Name of the column containing the text to classify
        labeler_model (Any): Labeler passed to `classify` for every example
        labels (List[str]): List of labels
        sample (float): The fraction of the dataset to label, between 0 and 1
        num_workers (int): Number of processes used for labeling

    Returns:
        The sampled subset with an added "label" column holding the value
        returned by `classify` for each example

    Raises:
        ValueError: If `sample` is not between 0 and 1
    """
    if not 0 <= sample <= 1:
        raise ValueError(f"sample must be between 0 and 1, got {sample}")

    # Shuffle with a fixed seed so the same subset is drawn on every run
    n_examples = int(len(dataset) * sample)
    subset = dataset.shuffle(seed=115).select(range(n_examples))

    def add_label(example):
        return {"label": classify(example[text_column], labels, labeler_model)}

    # Label the subset one example at a time
    return subset.map(add_label, batched=False, num_proc=num_workers)

In [None]:
from functools import partial
from datasets import load_dataset


def mock_labeler(x, labels):
    """Stand-in labeler: ignores `x` and draws one label at random (25% / 75%)."""
    sampled = np.random.choice(labels, p=[0.25, 0.75])
    return [sampled]


# Names defined here (`labels`, `ds`, `subset`) are reused by later cells.
labels = ["positive", "negative"]
# Bind `labels` so the labeler can be called with the text alone, as `classify` does.
labeler = partial(mock_labeler, labels=labels)
ds = load_dataset("bigcode/the-stack-smol", data_dir="data/python")["train"]

# Label a 10% sample of the dataset with the mock labeler.
subset = label_dataset(ds, "content", labeler, labels, sample=0.1)

assert "label" in subset.column_names

Using custom data configuration bigcode--the-stack-smol-8f8055c3a4e4b4e3
Found cached dataset json (/home/nathan/.cache/huggingface/datasets/bigcode___json/bigcode--the-stack-smol-8f8055c3a4e4b4e3/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached shuffled indices for dataset at /home/nathan/.cache/huggingface/datasets/bigcode___json/bigcode--the-stack-smol-8f8055c3a4e4b4e3/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-feaf44b92e145e5a.arrow


  0%|          | 0/1000 [00:00<?, ?ex/s]

In [None]:
# | export
def train_labeler(
    dataset,
    text_column,
    base_model_name,
    n_labels,
    training_args,
    num_workers=4,
    max_length=512,
    push_to_hub=True,
):
    """
    Trains a labeler model on a labeled dataset.

    Args:
        dataset (datasets.Dataset): Labeled dataset to train on
        text_column (str): Name of the text column
        base_model_name (str): Name of the base model to use
        n_labels (int): Number of labels
        training_args (transformers.TrainingArguments): Passed to the `Trainer` as `args`
        num_workers (int): Number of processes used for tokenization
        max_length (int): Maximum length of the input
        push_to_hub (bool): Whether to call `trainer.push_to_hub()` after training

    Returns:
        tuple: The trained model and its tokenizer
    """
    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(base_model_name, max_length=max_length)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Load the model
    model = AutoModelForSequenceClassification.from_pretrained(
        base_model_name, num_labels=n_labels, max_length=max_length
    )
    model.config.id2label = {i: i for i in range(n_labels)}
    # Keep the model's padding id in sync with the tokenizer: when the pad
    # token had to be borrowed from EOS above, the model config would
    # otherwise still have no padding id for batched inputs.
    if model.config.pad_token_id is None:
        model.config.pad_token_id = tokenizer.pad_token_id

    # Preprocess the dataset
    dataset = dataset.map(
        lambda batch: tokenizer(
            batch[text_column],
            padding="max_length",
            truncation=True,
            max_length=max_length,
        ),
        batched=True,
        num_proc=num_workers,
    )

    # Split the dataset
    dataset = dataset.train_test_split(test_size=0.1, seed=42)

    # Get the data collator
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Loaded on first evaluation and reused afterwards, instead of being
    # reloaded on every evaluation call.
    metric_cache = []

    def compute_metrics(eval_preds):
        if not metric_cache:
            metric_cache.append(evaluate.load("glue", "mrpc"))
        logits, references = eval_preds
        predictions = np.argmax(logits, axis=-1)
        return metric_cache[0].compute(predictions=predictions, references=references)

    # Get the trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Train the model
    trainer.train()

    # Push the model to the hub
    if push_to_hub:
        trainer.push_to_hub()

    # Return the model
    return model, tokenizer

In [None]:
from transformers import TrainingArguments, pipeline

base_model_name = "prajjwal1/bert-small"
# `train_labeler` takes its training hyperparameters through `training_args`;
# it has no `epochs` or `batch_size` parameters.
training_args = TrainingArguments(
    output_dir="labeler_model",
    num_train_epochs=1,
    per_device_train_batch_size=4,
)
model, tokenizer = train_labeler(
    subset,
    "content",
    base_model_name,
    n_labels=len(labels),
    training_args=training_args,
    num_workers=4,
    push_to_hub=False,  # do not upload anything when running the notebook as a test
)
# The Auto class returns a concrete model class, so check the classification
# head size rather than comparing against `AutoModelForSequenceClassification`.
assert model.config.num_labels == len(labels)

Some weights of the model checkpoint at prajjwal1/bert-small were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initi

  0%|          | 0/1 [00:00<?, ?ba/s]



  0%|          | 0/225 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.7437, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.04}
{'loss': 0.711, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.09}
{'loss': 0.6896, 'learning_rate': 3e-06, 'epoch': 0.13}
{'loss': 0.6414, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.18}
{'loss': 0.6547, 'learning_rate': 5e-06, 'epoch': 0.22}
{'loss': 0.5845, 'learning_rate': 6e-06, 'epoch': 0.27}
{'loss': 0.5528, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.31}
{'loss': 0.6287, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.36}
{'loss': 0.6309, 'learning_rate': 9e-06, 'epoch': 0.4}
{'loss': 0.6, 'learning_rate': 1e-05, 'epoch': 0.44}
{'loss': 0.6651, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.49}
{'loss': 0.5361, 'learning_rate': 1.2e-05, 'epoch': 0.53}
{'loss': 0.674, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.58}
{'loss': 0.6853, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.62}
{'loss': 0.6342, 'learning_rate': 1.5e-05, 'epoch': 0.67}
{'loss': 0.6266, 'learning_ra

  0%|          | 0/25 [00:00<?, ?it/s]

{'eval_loss': 0.5282490253448486, 'eval_accuracy': 0.8, 'eval_f1': 0.888888888888889, 'eval_runtime': 0.933, 'eval_samples_per_second': 107.178, 'eval_steps_per_second': 26.794, 'epoch': 1.0}


Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


{'train_runtime': 19.5343, 'train_samples_per_second': 46.073, 'train_steps_per_second': 11.518, 'train_loss': 0.6103349855211047, 'epoch': 1.0}


In [None]:
# | export
def filter_dataset(
    dataset,
    text_column,
    labeler_model,
    labels_to_keep,
    batch_size=32,
    num_workers=4,
    max_length=512,
):
    """
    Filters a dataset using a labeler model.

    Args:
        dataset (datasets.Dataset): Dataset to filter
        text_column (str): Name of the text column
        labeler_model (transformers.pipelines.TextClassificationPipeline): Model to use for labeling
        labels_to_keep (list): List of labels to keep
        batch_size (int): Batch size for labeling
        num_workers (int): Number of workers for labeling
        max_length (int): Maximum input length passed to the labeler; longer texts are truncated

    Returns:
        The dataset with added "label" and "score" columns, restricted to the
        examples whose predicted label is in `labels_to_keep`
    """

    def label(texts):
        predicted = labeler_model(
            texts, padding=True, truncation=True, max_length=max_length
        )
        return {
            "label": [prediction["label"] for prediction in predicted],
            "score": [prediction["score"] for prediction in predicted],
        }

    # Label the dataset
    labeled = dataset.map(
        lambda batch: label(batch[text_column]),
        batched=True,
        batch_size=batch_size,
        num_proc=num_workers,
    )

    # Filter the dataset
    return labeled.filter(lambda example: example["label"] in labels_to_keep)

In [None]:
# Wrap the trained model in a text-classification pipeline on the model's device.
pipe = pipeline(
    "text-classification", model=model, tokenizer=tokenizer, device=model.device
)
# `train_labeler` sets `id2label` to integer ids, so the label to keep is given as 0.
filtered_ds = filter_dataset(ds, "content", pipe, [0])

# Filtering should have removed at least one example.
assert len(filtered_ds) < len(ds)

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

In [None]:
# | hide
import nbdev

# Run nbdev's export step for this notebook (cells marked `# | export`).
nbdev.nbdev_export()