In [1]:
from datasets import load_dataset

# Load the IMDb movie-review dataset by name. The captured output of this cell
# shows three splits being generated: train, test and unsupervised.
imdb = load_dataset(path="imdb")

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 100%|██████████| 25000/25000 [00:00<00:00, 213526.65 examples/s]
Generating test split: 100%|██████████| 25000/25000 [00:00<00:00, 221237.70 examples/s]
Generating unsupervised split: 100%|██████████| 50000/50000 [00:00<00:00, 213745.59 examples/s]


In [3]:
from transformers import AutoTokenizer

# Tokenizer for the same checkpoint that is later loaded as the classification
# model, so the vocabulary matches the model's embeddings.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="distilbert/distilbert-base-uncased"
)

In [4]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of ``examples``.

    Passes ``examples["text"]`` to the module-level ``tokenizer`` with
    ``truncation=True`` and returns the tokenizer's output unchanged. No
    padding is requested here.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

In [5]:
# Tokenize every split. `batched=True` hands `preprocess_function` a batch of
# examples per call instead of one example at a time.
tokenized_imdb = imdb.map(function=preprocess_function, batched=True)

Map: 100%|██████████| 25000/25000 [00:03<00:00, 7633.10 examples/s]
Map: 100%|██████████| 25000/25000 [00:03<00:00, 7842.98 examples/s]
Map: 100%|██████████| 50000/50000 [00:06<00:00, 7669.46 examples/s]


In [6]:
from transformers import DataCollatorWithPadding

# Batch collator built around the same tokenizer; it is handed to the Trainer
# below. `preprocess_function` only truncates, so padding is left to this
# collator (presumably per batch, to the longest sequence — confirm against the
# transformers docs for the installed version).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [7]:
import evaluate

# Fetch the "accuracy" metric; `compute_metrics` calls `accuracy.compute(...)`.
accuracy = evaluate.load(path="accuracy")

Downloading builder script: 100%|██████████| 4.20k/4.20k [00:00<00:00, 15.3MB/s]


In [8]:
import numpy as np


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    ``eval_pred`` is unpacked into two items: the model outputs and the
    reference labels. The predicted class for each row is the index of the
    largest value along axis 1 of the outputs. Returns the result of
    ``accuracy.compute`` for those predictions against the references.
    """
    logits, references = eval_pred
    predicted_classes = np.argmax(logits, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

In [9]:
# Class index -> sentiment name, and the inverse lookup built from it.
id2label = dict(enumerate(("NEGATIVE", "POSITIVE")))
label2id = dict(zip(id2label.values(), id2label.keys()))

In [10]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, set_seed

# Seed the RNGs *before* instantiating the model. The checkpoint has no
# classification head, so `classifier.*` and `pre_classifier.*` are newly
# initialized (see the warning this cell prints). Without a seed here those
# weights, and therefore the fine-tuning result, differ on every run.
# 42 matches the default `seed` of TrainingArguments (per the transformers docs).
set_seed(42)

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    # Derive the class count from the label map so the two cannot drift apart.
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
training_args = TrainingArguments(
    # Checkpoints and the final model are written under this directory.
    output_dir="my_awesome_model",
    # Schedule and batch sizes.
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Optimizer settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch. Both strategies are kept equal
    # because `load_best_model_at_end` reloads one of the saved checkpoints.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Wire model, data, batching and metrics into a single Trainer.
# NOTE(review): the IMDb *test* split is used as the evaluation set while
# `load_best_model_at_end=True` is set in `training_args`, so checkpoint
# selection sees test data. Consider a held-out validation split instead.
trainer = Trainer(
    args=training_args,
    model=model,
    processing_class=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_imdb["test"],
    compute_metrics=compute_metrics,
)

In [12]:
# Run fine-tuning with the configuration held by `trainer`.
# NOTE(review): the captured output below reaches the last step (3126/3126) and
# then ends in KeyboardInterrupt, with no evaluation record for the second
# epoch. The run was stopped before finishing cleanly, presumably during the
# final evaluation. Re-run to completion before relying on the saved model.
trainer.train()

 16%|█▌        | 500/3126 [05:34<29:24,  1.49it/s]

{'loss': 0.3162, 'grad_norm': 8.666570663452148, 'learning_rate': 1.6801023672424827e-05, 'epoch': 0.32}


 32%|███▏      | 1000/3126 [11:08<23:15,  1.52it/s]

{'loss': 0.2525, 'grad_norm': 5.8583526611328125, 'learning_rate': 1.3602047344849649e-05, 'epoch': 0.64}


 48%|████▊     | 1500/3126 [16:44<18:00,  1.51it/s]

{'loss': 0.2239, 'grad_norm': 8.28808879852295, 'learning_rate': 1.0403071017274472e-05, 'epoch': 0.96}


                                                   
 50%|█████     | 1563/3126 [23:11<15:03,  1.73it/s]

{'eval_loss': 0.2164992243051529, 'eval_accuracy': 0.91552, 'eval_runtime': 344.8879, 'eval_samples_per_second': 72.487, 'eval_steps_per_second': 4.532, 'epoch': 1.0}


 64%|██████▍   | 2000/3126 [28:07<12:46,  1.47it/s]    

{'loss': 0.1623, 'grad_norm': 10.18687915802002, 'learning_rate': 7.204094689699297e-06, 'epoch': 1.28}


 80%|███████▉  | 2500/3126 [33:41<07:04,  1.47it/s]

{'loss': 0.1445, 'grad_norm': 10.377318382263184, 'learning_rate': 4.005118362124121e-06, 'epoch': 1.6}


 96%|█████████▌| 3000/3126 [39:15<01:25,  1.48it/s]

{'loss': 0.1473, 'grad_norm': 8.502082824707031, 'learning_rate': 8.061420345489445e-07, 'epoch': 1.92}


100%|██████████| 3126/3126 [40:40<00:00,  1.72it/s]

KeyboardInterrupt: 

In [15]:
# Persist the trainer's current model. No path is passed; the inference cell
# loads from "./my_awesome_model", the same directory as
# `training_args.output_dir`, so the save is expected to land there (confirm).
# NOTE(review): this cell's execution count (15) is out of order with the cells
# after it (13, 21). Verify the notebook with Restart Kernel -> Run All.
trainer.save_model()

In [13]:
# Sample review fed to the fine-tuned classifier in the next cell.
text = (
    "This was a masterpiece. "
    "Not completely faithful to the books, but enthralling from beginning to end. "
    "Might be my favorite of the three."
)

In [21]:
from transformers import pipeline
import torch 
# Use the first CUDA device when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Build an inference pipeline from the model saved in ./my_awesome_model and
# classify the sample review; the last expression is displayed as cell output.
classifier = pipeline(task="sentiment-analysis", model="./my_awesome_model", device=device)
classifier(text)

[{'label': 'POSITIVE', 'score': 0.997658371925354}]