In [38]:
from datasets import load_dataset
from evaluate import load
import numpy as np
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

from src.config import MODELS_DIR, PROCESSED_DATA_DIR, HF_TOKEN

In [39]:
# Authenticate against the Hugging Face Hub with the token exposed by src.config.
# NOTE(review): the cell output reports that the HF_TOKEN environment variable is
# already the active token, and training is configured with push_to_hub=False —
# confirm whether this explicit login is still required.
from huggingface_hub import login

login(token=HF_TOKEN)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [40]:
# Locate the processed parquet splits, then load them together so they can be
# addressed as ds["train"] and ds["validation"].
train_parquet = PROCESSED_DATA_DIR / "train.parquet"
validation_parquet = PROCESSED_DATA_DIR / "validation.parquet"

split_files = {
    "train": str(train_parquet),
    "validation": str(validation_parquet),
}
ds = load_dataset("parquet", data_files=split_files)

In [41]:
# Inspect the training split's schema (column names and dtypes) before tokenizing.
ds["train"].features

{'text': Value(dtype='large_string', id=None),
 'labels': Value(dtype='int64', id=None)}

In [42]:
# Single source of truth for the pretrained checkpoint, so the tokenizer and the
# model are always loaded from the same weights/vocabulary.
CHECKPOINT = "distilbert/distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
# The sequence-classification head is not part of the base checkpoint, so it is
# freshly initialised here and must be trained before use.
# NOTE(review): num_labels is left at the library default — confirm it matches
# the number of distinct values in the `labels` column.
model = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def tokenize(examples):
    """Tokenize the ``text`` field of a batch with truncation enabled.

    Args:
        examples: Mapping that holds the raw inputs under the ``"text"`` key.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those inputs.
    """
    return tokenizer(examples["text"], truncation=True)


# Apply `tokenize` to the loaded dataset; batched=True hands the function whole
# batches of examples instead of one example at a time.
tokenized_ds = ds.map(tokenize, batched=True)

Map:   0%|          | 0/9916 [00:00<?, ? examples/s]

In [44]:
# Load the "accuracy" metric via `evaluate.load`; `compute_metrics` reads this
# module-level object.
metric = load("accuracy")

def compute_metrics(eval_preds):
    """Score evaluation predictions with the module-level ``metric``.

    Args:
        eval_preds: Pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max class of each row of
        logits compared against ``labels``.
    """
    scores, references = eval_preds
    # The predicted class is the index of the largest score on the last axis.
    return metric.compute(
        predictions=np.argmax(scores, axis=-1),
        references=references,
    )

In [None]:
# Training configuration: one epoch, batch size 16 for both training and
# evaluation, with evaluation run at the end of every epoch.
training_args = TrainingArguments(
    # `output_dir` is documented as a string path, so convert the path object
    # explicitly — the same way paths are handed to `load_dataset`.
    output_dir=str(MODELS_DIR / "distilbert-imdb-checkpoint"),
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    push_to_hub=False,  # keep checkpoints local; nothing is uploaded to the Hub
)

In [46]:
# Collator built around the tokenizer; handed to the Trainer to assemble batches.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [47]:
# Wire the model, run configuration, data splits and batching/metric hooks
# into a single Trainer.
trainer = Trainer(
    # What to train and how.
    model=model,
    args=training_args,
    # Tokenized splits used for fitting and for evaluation.
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    # Batch assembly and tokenizer handling.
    data_collator=data_collator,
    processing_class=tokenizer,
    # Metric callback invoked on evaluation predictions.
    compute_metrics=compute_metrics,
)

In [48]:
# Run fine-tuning; the returned TrainOutput (step count, loss, runtime metrics)
# is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2967,0.273993,0.884127


TrainOutput(global_step=2170, training_loss=0.33893170510568926, metrics={'train_runtime': 239.6391, 'train_samples_per_second': 144.83, 'train_steps_per_second': 9.055, 'total_flos': 1149386501288448.0, 'train_loss': 0.33893170510568926, 'epoch': 1.0})

In [50]:
# Persist the fine-tuned model. The target directory is passed as a string,
# consistent with how file paths are passed to `load_dataset`.
trainer.save_model(str(MODELS_DIR / "distilbert-imdb"))