In [1]:
!pip install -q transformers==4.28.0
!pip install -q datasets

## Using HuggingFace Dataset class for loading data

In [2]:
from datasets import load_dataset, load_from_disk

# Read the IMDB reviews CSV; asking for split='train' returns a single Dataset.
imdb_csv_path = '/content/IMDB Dataset.csv'
dataset = load_dataset('csv', data_files=imdb_csv_path, split='train')
dataset



Dataset({
    features: ['review', 'sentiment'],
    num_rows: 50000
})

In [3]:
# Rename both CSV columns in one call: 'text' is the field the tokenization
# step reads, and 'labels' is the column that is later class-encoded.
dataset = dataset.rename_columns({'review': 'text', 'sentiment': 'labels'})

In [4]:
# Display the dataset summary to confirm the columns are now 'text' and 'labels'.
dataset

Dataset({
    features: ['text', 'labels'],
    num_rows: 50000
})

## Using AutoTokenizer for tokenization

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google/mobilebert-uncased")


def encoding_fn(x):
    """Map a sentiment string to an integer label: 1 for 'positive', else 0.

    Not called in this cell; the label column is encoded with
    ``class_encode_column`` below instead.
    """
    return 1 if x == 'positive' else 0


def tokenize_function(examples, max_length=128):
    """Tokenize the ``text`` field of a batch of examples.

    Every sequence is truncated and padded to exactly ``max_length`` tokens.
    ``padding=True`` would pad only to the longest sequence inside each
    ``map`` batch, so rows coming from different batches could end up with
    different lengths and fail to stack into one tensor at collation time.

    Args:
        examples: batch dict containing a ``"text"`` list of strings.
        max_length: fixed token length of every encoded sequence.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )


tokenized_datasets = dataset.map(tokenize_function, batched=True)
# Turn the string sentiment column into an integer-coded class label column.
tokenized_datasets = tokenized_datasets.class_encode_column("labels")



In [6]:
# Hold out 20% of the rows for evaluation. The fixed seed makes the shuffle
# (and therefore every reported metric) reproducible across kernel restarts.
split = tokenized_datasets.train_test_split(test_size=0.2, seed=42)
train_dataset, test_dataset = split['train'], split['test']

In [7]:
# Display the training split (80% of the rows) with the tokenizer columns added.
train_dataset

Dataset({
    features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 40000
})

In [8]:
# Display the held-out split (20% of the rows) used for evaluation.
test_dataset

Dataset({
    features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 10000
})

## Using AutoModelForSequenceClassification for classifying text

In [10]:
import torch
from transformers import AutoModelForSequenceClassification

checkpoint = "google/mobilebert-uncased"

# Load the pretrained checkpoint with a 2-class sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Assign the result so the cell does not print the full module repr.
model = model.to(device)

# NOTE: the previous `if device == 'cude': model = model.half()` branch could
# never run (misspelled string), so the model has always been trained in full
# precision. It is removed rather than "fixed": casting the weights to fp16
# before training is not how mixed precision is enabled with Trainer; request
# it through TrainingArguments(fp16=True) if it is wanted.
# The `ctc_zero_infinity` config flag was dropped as well: it is a CTC-loss
# option and is not read by a sequence-classification model.

Some weights of the model checkpoint at google/mobilebert-uncased were not used when initializing MobileBertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing MobileBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MobileBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some 

In [12]:
!pip install -q --upgrade accelerate

from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score

training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="steps",
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    num_train_epochs=3,
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model='f1'  # This will keep the best model measured by eval f1 score
)
 
def compute_metrics(pred):
    """Score one evaluation pass with accuracy and weighted F1.

    Args:
        pred: object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the arg-max over the last
            axis is taken as the predicted class).

    Returns:
        dict with the keys ``'accuracy'`` and ``'f1'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, average='weighted'),
    }

# Wire the model, data splits and metric function into a Trainer.
# Passing the tokenizer makes Trainer save it next to the model weights, so the
# saved directory can be reloaded on its own (model + tokenizer) for inference.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1
100,0.2966,0.362025,0.8548,0.853896
200,0.2176,0.305292,0.8739,0.873897
300,0.2916,0.286531,0.8741,0.873874
400,0.2269,0.293293,0.8786,0.878468
500,0.2184,0.281442,0.883,0.882937
600,0.2111,0.274056,0.8863,0.88629
700,0.1785,0.292068,0.8856,0.885522
800,0.1666,0.290442,0.8873,0.88726
900,0.1664,0.296549,0.8866,0.886534


TrainOutput(global_step=939, training_loss=0.21714167955457467, metrics={'train_runtime': 1274.1069, 'train_samples_per_second': 94.184, 'train_steps_per_second': 0.737, 'total_flos': 1881255628800000.0, 'train_loss': 0.21714167955457467, 'epoch': 3.0})

In [13]:
# Write the trainer's current model to ./best-f1-imdb.
# NOTE(review): with load_best_model_at_end=True in the training arguments this
# is presumably the best saved checkpoint by eval F1 — confirm.
trainer.save_model('./best-f1-imdb')

In [14]:
# Mount Google Drive at /content/drive (Colab only) so files can be copied to it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [15]:
import shutil

In [18]:
# Copy the saved model directory to Google Drive so it survives the Colab session.
# dirs_exist_ok=True lets the cell be re-run after a previous copy instead of
# raising FileExistsError (existing files with the same name are overwritten).
shutil.copytree(
    '/content/best-f1-imdb',
    '/content/drive/MyDrive/best-f1-imdb/best',
    dirs_exist_ok=True,
)

'/content/drive/MyDrive/best-f1-imdb/best'