In [None]:
# | hide
from treasure_trove.core import *
from squeakily.helpers import LLMLabeler

# treasure_trove

> Find the treasure in your trove of data

In [None]:
#| eval: false
from datasets import load_dataset
from squeakily.helpers import LLMLabeler
from transformers import pipeline, TrainingArguments
from treasure_trove.core import filter_dataset, label_dataset, train_labeler

# Prompt handed to the LLM labeler; it describes the two classes that the
# `labels` list below names.
instruction = """Please label the following code as either educational or non-educational.
Educational code is code that is well written, follows best practices, has documentation such that it might be found in a textbook.
Non-educational code is code that is poorly written, lacks documentation, contain bugs, or is not idiomatic.
Labels:
"""
# Label names; their list positions are reused later (`len(labels)`,
# `labels.index("educational")`), so the order matters.
labels = ["educational", "non-educational"]
# Placeholder value -- replace with a real API key before running this cell.
api_key = "<api_key>"
# LLM-backed labeler built from the prompt and label set above, using "gpt-4".
labeler = LLMLabeler(instruction, labels, model_name="gpt-4", api_key=api_key)

# Load the "train" split of the `data/python` directory of the-stack-smol.
ds = load_dataset("bigcode/the-stack-smol", data_dir="data/python")["train"]

# Build the training arguments used to fine-tune the labeler model.
# NOTE: `batch_size` must be a plain int. A trailing comma after the value
# would bind the tuple `(4,)` instead, and that tuple would then be passed as
# the per-device batch sizes below.
batch_size = 4
training_args = TrainingArguments(
    output_dir="./code_edu",
    num_train_epochs=1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    # Evaluation and checkpointing use the same "epoch" schedule, alongside
    # `load_best_model_at_end=True` below.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Best checkpoint is chosen by "accuracy", where higher is better.
    # NOTE(review): presumably the trainer's metric function reports an
    # "accuracy" key -- confirm against `train_labeler`.
    metric_for_best_model="accuracy",
    greater_is_better=True,
    seed=42,
    push_to_hub=True,
)

In [None]:
#| eval: false
# Label part of the dataset's "content" column with the LLM labeler.
# NOTE(review): `sample=0.001` presumably selects a 0.1% fraction of `ds` --
# confirm against `label_dataset`.
subset = label_dataset(ds, "content", labeler, labels, sample=0.001)
# Base checkpoint that is fine-tuned into the classifier.
base_model_name = "EleutherAI/pythia-70m-deduped"
# Train a classifier with one output per label on the labeled subset;
# `train_labeler` hands back the model together with its tokenizer.
model, tokenizer = train_labeler(
    subset,
    "content",
    base_model_name,
    n_labels=len(labels),
    training_args=training_args,
    num_workers=4,
    max_length=512,
    push_to_hub=True,
)
# Wrap the trained model and tokenizer in a text-classification pipeline placed
# on the same device as the model.
# NOTE(review): `pipe` is not used by any statement below; `filter_dataset` is
# given `model` instead. Confirm which object `filter_dataset` expects.
pipe = pipeline(
    "text-classification", model=model, tokenizer=tokenizer, device=model.device
)
# Filter the full dataset, passing the index of the "educational" label.
filtered_ds = filter_dataset(ds, "content", model, labels.index("educational"))
# Publish the filtered dataset to the Hugging Face Hub.
filtered_ds.push_to_hub("ncoop57/code_edu")