In [1]:
!pip install datasets
!pip install transformers[torch]
!pip install evaluate
!pip install -U accelerate
!pip install rouge_score



In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import numpy as np
import evaluate

In [3]:
ds = load_dataset('imdb')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
ds['train'][100]

{'text': "Terrible movie. Nuff Said.<br /><br />These Lines are Just Filler. The movie was bad. Why I have to expand on that I don't know. This is already a waste of my time. I just wanted to warn others. Avoid this movie. The acting sucks and the writing is just moronic. Bad in every way. The only nice thing about the movie are Deniz Akkaya's breasts. Even that was ruined though by a terrible and unneeded rape scene. The movie is a poorly contrived and totally unbelievable piece of garbage.<br /><br />OK now I am just going to rag on IMDb for this stupid rule of 10 lines of text minimum. First I waste my time watching this offal. Then feeling compelled to warn others I create an account with IMDb only to discover that I have to write a friggen essay on the film just to express how bad I think it is. Totally unnecessary.",
 'label': 0}

In [5]:
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

### Preprocess text

In [6]:
# use the default preprocessor
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(examples):
  return tokenizer(examples['text'], padding='max_length', truncation=True)

#apply to all datasets with .map(). Built-in function of the HF datasets class
tokenized_datasets = ds.map(tokenize_function, batched=True)



Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [7]:
small_train_dataset = tokenized_datasets['train'].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets['test'].shuffle(seed=42).select(range(1000))

### loading the distilbert model

In [8]:
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
training_args = TrainingArguments(
    output_dir='test_trainer',
    evaluation_strategy='epoch',
    num_train_epochs=3)




In [11]:
!pip install --upgrade torch torchvision transformers



In [12]:
metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
  logits, labels = eval_pred #raw outputs, actual labels
  predictions = np.argmax(logits, axis=-1) #prediction is the highest output probability
  return metric.compute(predictions=predictions, references=labels) # accuracy computation

### Build the actual Trainer object

In [23]:
trainer_small = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics

)

In [24]:
trainer_small.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.71694,0.863
2,No log,0.736941,0.864
3,No log,0.879622,0.86


TrainOutput(global_step=375, training_loss=0.08846072387695313, metrics={'train_runtime': 197.2641, 'train_samples_per_second': 15.208, 'train_steps_per_second': 1.901, 'total_flos': 397402195968000.0, 'train_loss': 0.08846072387695313, 'epoch': 3.0})

In [25]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics
)

In [26]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2734,0.269264,0.91092
2,0.1465,0.330812,0.91944
3,0.0637,0.373322,0.92828


TrainOutput(global_step=9375, training_loss=0.17750357198079428, metrics={'train_runtime': 4924.5335, 'train_samples_per_second': 15.23, 'train_steps_per_second': 1.904, 'total_flos': 9935054899200000.0, 'train_loss': 0.17750357198079428, 'epoch': 3.0})