In [3]:
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)
import evaluate


MODEL_NAME = "distilbert-base-uncased"
NUM_LABELS = 2         
MAX_LENGTH = 256
SEED = 12

# Load paper dataset
dataset = load_dataset("imdb")


# Rename split for consistency
dataset = {
    "train": dataset["train"],
    "validation": dataset["test"],  # paper reports test accuracy
}

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_fn(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        max_length=MAX_LENGTH,
    )

tokenized = {
    "train": dataset["train"].map(tokenize_fn, batched=True),
    "validation": dataset["validation"].map(tokenize_fn, batched=True),
}

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Remove unused columns
keep_cols = ["input_ids", "attention_mask", "label"]
tokenized["train"] = tokenized["train"].remove_columns(
    [c for c in tokenized["train"].column_names if c not in keep_cols]
)
tokenized["validation"] = tokenized["validation"].remove_columns(
    [c for c in tokenized["validation"].column_names if c not in keep_cols]
)

# Rename labels
tokenized["train"] = tokenized["train"].rename_column("label", "labels")
tokenized["validation"] = tokenized["validation"].rename_column("label", "labels")


# Model
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
)


# Metrics
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy.compute(
            predictions=preds,
            references=labels
        )["accuracy"]
    }


# Training Arguments
args = TrainingArguments( 
    output_dir="distilbert-imdb",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=200,
    seed=SEED,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train & Evaluate
trainer.train()
print(trainer.evaluate())


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
200,0.4104
400,0.2935
600,0.2884
800,0.2787
1000,0.2708
1200,0.2682
1400,0.2807
1600,0.2355
1800,0.1762
2000,0.1633




{'eval_loss': 0.3485184907913208, 'eval_accuracy': 0.914, 'eval_runtime': 843.8547, 'eval_samples_per_second': 29.626, 'eval_steps_per_second': 0.927, 'epoch': 3.0}


In [4]:
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)
import evaluate


MODEL_NAME = "bert-base-uncased"
NUM_LABELS = 2         
MAX_LENGTH = 256
SEED = 12

# Load paper dataset
dataset = load_dataset("imdb")


# Rename split for consistency
dataset = {
    "train": dataset["train"],
    "validation": dataset["test"],  # paper reports test accuracy
}

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_fn(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        max_length=MAX_LENGTH,
    )

tokenized = {
    "train": dataset["train"].map(tokenize_fn, batched=True),
    "validation": dataset["validation"].map(tokenize_fn, batched=True),
}

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Remove unused columns
keep_cols = ["input_ids", "attention_mask", "label"]
tokenized["train"] = tokenized["train"].remove_columns(
    [c for c in tokenized["train"].column_names if c not in keep_cols]
)
tokenized["validation"] = tokenized["validation"].remove_columns(
    [c for c in tokenized["validation"].column_names if c not in keep_cols]
)

# Rename labels
tokenized["train"] = tokenized["train"].rename_column("label", "labels")
tokenized["validation"] = tokenized["validation"].rename_column("label", "labels")


# Model
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
)


# Metrics
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy.compute(
            predictions=preds,
            references=labels
        )["accuracy"]
    }


# TrainingArguments
args = TrainingArguments(
    output_dir="distilbert-imdb",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=200,
    seed=SEED,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train & Evaluate
trainer.train()
print(trainer.evaluate())

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
200,0.3821
400,0.2849
600,0.2623
800,0.2625
1000,0.2512
1200,0.2455
1400,0.2547
1600,0.2189
1800,0.1569
2000,0.1366




{'eval_loss': 0.34386318922042847, 'eval_accuracy': 0.92264, 'eval_runtime': 1740.2365, 'eval_samples_per_second': 14.366, 'eval_steps_per_second': 0.449, 'epoch': 3.0}
