In [1]:
import os

import pytorch_lightning as pl
import torch
from datasets import load_dataset
from sklearn.metrics import accuracy_score, classification_report
from transformers import AutoModel, AutoTokenizer

In [2]:
# Load the Amazon Reviews dataset
dataset = load_dataset("amazon_reviews_multi", "en")
dataset = dataset.rename_column("stars", "labels")

Downloading builder script:   0%|          | 0.00/7.16k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/37.4k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/15.0k [00:00<?, ?B/s]

Downloading and preparing dataset amazon_reviews_multi/en to /home/ubuntu/.cache/huggingface/datasets/amazon_reviews_multi/en/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/82.0M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.06M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/200000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5000 [00:00<?, ? examples/s]

Dataset amazon_reviews_multi downloaded and prepared to /home/ubuntu/.cache/huggingface/datasets/amazon_reviews_multi/en/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [15]:
# Split the dataset into train and test sets
train_data = dataset["train"]
test_data = dataset["test"]

In [16]:
def tokenize_function(batch):
    """Tokenize."""
    return tokenizer(batch["review_body"], padding=True, truncation=True, return_tensors="pt")


tokenized_train_data = train_data.map(tokenize_function, batched=True)
tokenized_train_data = tokenized_train_data.map(lambda batch: {"labels": batch["labels"] - 1})

loader_columns = [
    "input_ids",
    "attention_mask",
    "labels",
]
tokenized_train_data.set_format(type="torch", columns=loader_columns)


Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/amazon_reviews_multi/en/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609/cache-056357a8158c731c.arrow
Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/amazon_reviews_multi/en/1.0.0/724e94f4b0c6c405ce7e476a6c5ef4f87db30799ad49f765094cf9770e0f7609/cache-26da38fdc9a7d3f7.arrow


In [19]:
tokenized_train_data[0].keys()

dict_keys(['labels', 'input_ids', 'attention_mask'])

In [10]:
tokenizer("This is a test")

{'input_ids': [0, 713, 16, 10, 1296, 2], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [20]:
# Create a dataloader and print a batch
train_dataloader = torch.utils.data.DataLoader(tokenized_train_data, batch_size=2)

for batch in train_dataloader:
    break

# Create a model
model = AutoModel.from_pretrained("roberta-base")

# Pass the input to the model
output = model(batch["input_ids"], batch["attention_mask"])


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [26]:
type(batch)

dict

In [21]:
output

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-8.0747e-02,  1.1227e-01, -1.2671e-02,  ..., -7.1223e-02,
          -5.7412e-02, -2.5465e-02],
         [-2.3012e-02,  1.3107e-01, -4.0970e-02,  ..., -4.1194e-01,
           1.0667e-01,  1.6875e-01],
         [-4.1873e-02,  1.6118e-01,  8.0111e-02,  ..., -4.8685e-01,
          -8.3322e-02,  1.8065e-01],
         ...,
         [-6.1766e-02,  1.8361e-01, -9.7400e-03,  ...,  4.3802e-02,
           2.5736e-02,  4.4247e-02],
         [-6.1766e-02,  1.8361e-01, -9.7400e-03,  ...,  4.3802e-02,
           2.5736e-02,  4.4247e-02],
         [-6.1766e-02,  1.8361e-01, -9.7400e-03,  ...,  4.3802e-02,
           2.5736e-02,  4.4247e-02]],

        [[-4.3226e-02,  5.0265e-02, -2.5575e-02,  ..., -1.2738e-01,
          -6.5216e-02,  6.1907e-03],
         [-6.7968e-02, -4.9521e-02, -8.1624e-02,  ..., -4.6546e-01,
           2.8303e-01, -1.1778e-01],
         [-2.3176e-02,  6.5568e-02,  1.4714e-04,  ..., -3.1120e-01,
           7.

In [25]:
output.last_hidden_state.shape

torch.Size([2, 505, 768])