### HuggingFace Transformers

In [3]:
from transformers import BertTokenizer

In [4]:
import torch
print("GPU available:", torch.cuda.is_available())
print("GPU name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU detected")

# Make sure the model and data are sent to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

GPU available: True
GPU name: Tesla T4


In [5]:
# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# See how many tokens are in the vocabulary
tokenizer.vocab_size

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


28996

In [6]:
# Tokenize the sentence
tokens = tokenizer.tokenize("My heart is Generative")

# Print the tokens
print(tokens)

# Show the token ids assigned to each token
print(tokenizer.convert_tokens_to_ids(tokens))

['My', 'heart', 'is', 'Gene', '##rative']
[1422, 1762, 1110, 9066, 15306]


In [7]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch

In [8]:
# Load a pre-trained sentiment analysis model
model_name = "textattack/bert-base-uncased-imdb"
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

In [9]:
# Tokenize the input sequence
tokenizer = BertTokenizer.from_pretrained(model_name)
inputs = tokenizer("I am Generative AI", return_tensors="pt")

# Make prediction
with torch.no_grad():
    outputs = model(**inputs).logits
    probabilities = torch.nn.functional.softmax(outputs, dim=1)
    predicted_class = torch.argmax(probabilities)

# Display sentiment result
if predicted_class == 1:
    print(f"Sentiment: Positive ({probabilities[0][1] * 100:.2f}%)")
else:
    print(f"Sentiment: Negative ({probabilities[0][0] * 100:.2f}%)")
# Sentiment: Positive (88.68%)

Sentiment: Positive (70.16%)


### HuggingFace Datasets library

In [10]:
%pip install datasets



In [11]:
from datasets import load_dataset
from IPython.display import HTML, display

In [12]:
# Load the IMDB dataset, which contains movie reviews
# and sentiment labels (positive or negative)
dataset = load_dataset("imdb")

In [13]:
# Fetch a revie from the training set
review_number = 42
sample_review = dataset["train"][review_number]

In [14]:
sample_review

 'label': 0}

In [15]:
display(HTML(sample_review["text"][:450] + "..."))

In [16]:
if sample_review["label"] == 1:
    print("Sentiment: Positive")
else:
    print("Sentiment: Negative")

Sentiment: Negative


### Hugging Face trainers

In [17]:
from transformers import (DistilBertForSequenceClassification,
    DistilBertTokenizer,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset

In [18]:
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2
)
model.to(device)
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
def tokenize_function(data):
    return tokenizer(data["text"], padding="max_length", truncation=True)

In [21]:
dataset = load_dataset("imdb")
tokenized_datasets = dataset.map(tokenize_function, batched=True)

training_args = TrainingArguments(
    per_device_train_batch_size=64,
    output_dir="./results",
    learning_rate=2e-5,
    num_train_epochs=3,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

In [22]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
500,0.252
1000,0.1401


TrainOutput(global_step=1173, training_loss=0.1840189667918798, metrics={'train_runtime': 3340.7474, 'train_samples_per_second': 22.45, 'train_steps_per_second': 0.351, 'total_flos': 9935054899200000.0, 'train_loss': 0.1840189667918798, 'epoch': 3.0})