In [1]:
! pip install transformers datasets evaluate
! pip install accelerate -U

Looking in indexes: https://****:****@tratonregistry.jfrog.io/artifactory/api/pypi/ats-pypi-virtual/simple
Collecting datasets
  Downloading https://tratonregistry.jfrog.io/artifactory/api/pypi/ats-pypi-virtual/packages/packages/ed/a5/33cf000137545a08b0a3a6ea76c8ccbd87917f78bb5d737f9f56f3b11ef6/datasets-3.1.0-py3-none-any.whl (480 kB)
[K     |████████████████████████████████| 480 kB 1.3 MB/s eta 0:00:01
[?25hCollecting evaluate
  Using cached https://tratonregistry.jfrog.io/artifactory/api/pypi/ats-pypi-virtual/packages/packages/a2/e7/cbca9e2d2590eb9b5aa8f7ebabe1beb1498f9462d2ecede5c9fd9735faaf/evaluate-0.4.3-py3-none-any.whl (84 kB)
Collecting multiprocess<0.70.17
  Using cached https://tratonregistry.jfrog.io/artifactory/api/pypi/ats-pypi-virtual/packages/packages/ea/89/38df130f2c799090c978b366cfdf5b96d08de5b29a4a293df7f7429fa50b/multiprocess-0.70.16-py38-none-any.whl (132 kB)
Collecting aiohttp
  Downloading https://tratonregistry.jfrog.io/artifactory/api/pypi/ats-pypi-virtual/p

In [2]:
from datasets import load_dataset
# Load the 'imdb' dataset; the result is indexed by split name below.
imdb = load_dataset('imdb')
# Show the first record of the 'test' split as a quick look at the data layout.
imdb['test'][0]

{'text': 'I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn\'t match the background, and painfully one-dimensional characters cannot be overcome with a \'sci-fi\' setting. (I\'m sure there are those of you out there who think Babylon 5 is good sci-fi TV. It\'s not. It\'s clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It\'s really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it\'s rubbish as 

In [4]:
from transformers import AutoTokenizer
# Tokenizer for 'distilbert-base-uncased', the same checkpoint name the model
# is later initialised from.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [5]:
def preprocess_function(examples):
    """Tokenize the 'text' field of `examples` with the notebook-level tokenizer.

    Truncation is enabled, so over-long inputs are cut rather than passed through
    at full length. Returns whatever the tokenizer call returns.
    """
    review_texts = examples['text']
    return tokenizer(review_texts, truncation=True)

In [6]:
# Apply preprocess_function to the whole dataset object. With batched=True the
# function is handed batches of examples per call (see the `map` documentation).
tokenized_imdb = imdb.map(preprocess_function, batched=True)

Map: 100%|██████████| 25000/25000 [00:03<00:00, 7262.86 examples/s]
Map: 100%|██████████| 25000/25000 [00:03<00:00, 7744.49 examples/s]
Map: 100%|██████████| 50000/50000 [00:06<00:00, 7520.02 examples/s]


In [7]:
from transformers import DataCollatorWithPadding
# Collator built around the tokenizer; it is handed to the Trainer as
# `data_collator` to assemble batches (padding per batch, per the class name).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [8]:
# Class index -> human-readable sentiment label.
id2label = dict(enumerate(('NEGATIVE', 'POSITIVE')))
# Inverse lookup, derived from id2label so the two mappings cannot drift apart.
label2id = {label_name: class_id for class_id, label_name in id2label.items()}

In [10]:
import evaluate
# Metric object loaded by name; compute_metrics calls its `.compute(...)` method.
accuracy = evaluate.load('accuracy')

In [11]:
import numpy as np

In [12]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the notebook-level `accuracy` metric.

    `eval_pred` must unpack into two items: per-class scores and the reference
    labels. The scores are reduced to a class index by taking the argmax along
    axis 1 (assumes one row per example -- TODO confirm against the Trainer docs).

    Returns the result of `accuracy.compute(...)` unchanged.
    """
    class_scores, reference_labels = eval_pred
    predicted_ids = np.argmax(class_scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=reference_labels)

In [13]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Build the classifier from the pretrained 'distilbert-base-uncased' weights
# with two output labels; the label maps are passed along so class ids can be
# reported by name via model.config.id2label.
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Hyperparameters for the fine-tuning run. Evaluation and checkpointing both
# happen once per epoch, which load_best_model_at_end relies on to pair every
# saved checkpoint with an evaluation result.
training_args = TrainingArguments(
    output_dir='finetuned_distilbert',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    # `evaluation_strategy` was renamed to `eval_strategy`; the old keyword is
    # deprecated and not accepted by newer transformers releases.
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [15]:
# Wire the model, data, collator and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_imdb['train'],
    # NOTE(review): the 'test' split doubles as the evaluation set that
    # load_best_model_at_end selects on, so the reported accuracy is not a
    # clean held-out estimate. Consider carving a validation split from 'train'.
    eval_dataset=tokenized_imdb['test'],
    # `tokenizer=` is deprecated on Trainer; `processing_class=` is the
    # replacement keyword for passing the tokenizer.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [16]:
# Run fine-tuning. The recorded run took roughly 21 minutes
# (train_runtime of about 1276 s for 3126 steps).
trainer.train()

 16%|█▌        | 500/3126 [02:27<13:07,  3.33it/s]

{'loss': 0.318, 'grad_norm': 15.825604438781738, 'learning_rate': 1.6801023672424827e-05, 'epoch': 0.32}


 32%|███▏      | 1000/3126 [04:58<10:28,  3.38it/s]

{'loss': 0.2519, 'grad_norm': 8.544846534729004, 'learning_rate': 1.3602047344849649e-05, 'epoch': 0.64}


 48%|████▊     | 1500/3126 [07:30<08:04,  3.36it/s]

{'loss': 0.2234, 'grad_norm': 9.381420135498047, 'learning_rate': 1.0403071017274472e-05, 'epoch': 0.96}


                                                   
 50%|█████     | 1563/3126 [10:36<06:48,  3.83it/s]

{'eval_loss': 0.2050236165523529, 'eval_accuracy': 0.92124, 'eval_runtime': 166.8268, 'eval_samples_per_second': 149.856, 'eval_steps_per_second': 9.369, 'epoch': 1.0}


 64%|██████▍   | 2000/3126 [12:49<05:42,  3.29it/s]   

{'loss': 0.1651, 'grad_norm': 7.215705394744873, 'learning_rate': 7.204094689699297e-06, 'epoch': 1.28}


 80%|███████▉  | 2500/3126 [15:19<03:11,  3.27it/s]

{'loss': 0.145, 'grad_norm': 18.691476821899414, 'learning_rate': 4.005118362124121e-06, 'epoch': 1.6}


 96%|█████████▌| 3000/3126 [17:49<00:37,  3.32it/s]

{'loss': 0.1447, 'grad_norm': 15.025524139404297, 'learning_rate': 8.061420345489445e-07, 'epoch': 1.92}


                                                   
100%|██████████| 3126/3126 [21:14<00:00,  3.84it/s]

{'eval_loss': 0.2374497801065445, 'eval_accuracy': 0.9316, 'eval_runtime': 166.2849, 'eval_samples_per_second': 150.344, 'eval_steps_per_second': 9.4, 'epoch': 2.0}


100%|██████████| 3126/3126 [21:15<00:00,  2.45it/s]

{'train_runtime': 1275.8198, 'train_samples_per_second': 39.19, 'train_steps_per_second': 2.45, 'train_loss': 0.20561727864270934, 'epoch': 2.0}





TrainOutput(global_step=3126, training_loss=0.20561727864270934, metrics={'train_runtime': 1275.8198, 'train_samples_per_second': 39.19, 'train_steps_per_second': 2.45, 'total_flos': 6556904415524352.0, 'train_loss': 0.20561727864270934, 'epoch': 2.0})

In [17]:
# Sample review used for the single-example inference check that follows.
text = 'This was a masterpiece!'

In [18]:
# Encode the sample with return_tensors='pt' so the result can be unpacked
# straight into the model call as keyword arguments.
inputs = tokenizer(text, return_tensors='pt')

In [19]:
from transformers import AutoModelForSequenceClassification
import torch

# Ask the trainer which checkpoint it selected instead of hard-coding a step
# number: the step count depends on dataset size, batch size and epoch count,
# so a literal 'checkpoint-1563' silently goes stale when any of those change.
# In the recorded run the lowest eval_loss was logged at step 1563.
best_checkpoint = trainer.state.best_model_checkpoint
assert best_checkpoint is not None, 'No best checkpoint recorded - run trainer.train() first.'

# Rebinds `model` to a copy loaded from disk; local_files_only=True prevents
# any attempt to resolve the path against the remote hub.
model = AutoModelForSequenceClassification.from_pretrained(best_checkpoint, local_files_only=True)

# Inference only, so gradient tracking is switched off for the forward pass.
with torch.no_grad():
    logits = model(**inputs).logits

In [21]:
# Reduce over the class axis explicitly. A bare `argmax()` flattens the tensor,
# which only yields a valid class id when exactly one example was scored; with
# dim=-1 a batched input fails loudly at `.item()` instead of returning a
# flattened index.
predicted_class_id = logits.argmax(dim=-1).item()
# Translate the class id into its label name using the mapping stored on the model config.
model.config.id2label[predicted_class_id]

'POSITIVE'