# Dataset preparation

1. Importing dataset

In [1]:
from datasets import load_dataset
dataset = load_dataset("yelp_review_full")

Found cached dataset yelp_review_full (/home/arjun/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf)


  0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})

In [3]:
dataset["train"][:3]

{'label': [4, 1, 3],
 'text': ["dr. goldberg offers everything i look for in a general practitioner.  he's nice and easy to talk to without being patronizing; he's always on time in seeing his patients; he's affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first.  really, what more do you need?  i'm sitting here trying to think of any complaints i have about him, but i'm really drawing a blank.",
  "Unfortunately, the frustration of being Dr. Goldberg's patient is a repeat of the experience I've had with so many other doctors in NYC -- good doctor, terrible staff.  It seems that his staff simply never answers the phone.  It usually takes 2 hours of repeated calling to get an answer.  Who has time for that or wants to deal with it?  I have run into this problem with many other doctors and I just don't get it.  You have o

2. Creating tokenised dataset

In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Loading cached processed dataset at /home/arjun/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf/cache-abfb413a29e68f59.arrow
Loading cached processed dataset at /home/arjun/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf/cache-fb3047aca4907541.arrow


In [5]:
tokenized_datasets['train']

Dataset({
    features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 650000
})

In [6]:
tokenize_function(dataset["train"][3])

{'input_ids': [101, 7348, 170, 2998, 1107, 1103, 6346, 1314, 1989, 1115, 1163, 1987, 119, 20029, 1110, 2232, 1106, 4565, 1106, 1321, 170, 1207, 1700, 1175, 1107, 1340, 119, 1124, 1209, 1129, 4007, 1304, 1277, 119, 165, 183, 165, 183, 2240, 1341, 4006, 170, 1207, 3995, 1107, 17520, 1115, 1128, 2140, 1176, 1547, 1593, 1129, 1112, 9684, 1112, 1774, 1106, 1525, 170, 2236, 106, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [7]:
tokenized_datasets['train'].features

{'label': ClassLabel(names=['1 star', '2 star', '3 stars', '4 stars', '5 stars'], id=None),
 'text': Value(dtype='string', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [8]:
print(tokenized_datasets['train'][0])

{'label': 4, 'text': "dr. goldberg offers everything i look for in a general practitioner.  he's nice and easy to talk to without being patronizing; he's always on time in seeing his patients; he's affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first.  really, what more do you need?  i'm sitting here trying to think of any complaints i have about him, but i'm really drawing a blank.", 'input_ids': [101, 173, 1197, 119, 2284, 2953, 3272, 1917, 178, 1440, 1111, 1107, 170, 1704, 22351, 119, 1119, 112, 188, 3505, 1105, 3123, 1106, 2037, 1106, 1443, 1217, 10063, 4404, 132, 1119, 112, 188, 1579, 1113, 1159, 1107, 3195, 1117, 4420, 132, 1119, 112, 188, 6559, 1114, 170, 1499, 118, 23555, 2704, 113, 183, 9379, 114, 1134, 1139, 2153, 1138, 3716, 1106, 1143, 1110, 1304, 1696, 1107, 1692, 1380, 5940, 1105, 1128, 1444, 6059, 132, 

Processing tokenized_dataset

In [9]:
tokenized_datasets = tokenized_datasets.remove_columns("text")
# We cannot use text data for training

tokenized_datasets = tokenized_datasets.rename_column("label", "labels") 
# Rename the label column to labels because the model expects the argument to be named labels :O

tokenized_datasets.set_format("torch")

In [10]:
tokenized_datasets['train'].features

{'labels': ClassLabel(names=['1 star', '2 star', '3 stars', '4 stars', '5 stars'], id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

Reduced dataset size for faster training

In [11]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

In [12]:
d = small_train_dataset
d

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1000
})

In [13]:
len(d[0])

4

3. Defining DataLoader

In [14]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8)
eval_dataloader = DataLoader(small_eval_dataset, batch_size=8)

In [15]:
print(len(train_dataloader)) # we have 8 batches

125


4. Loading model

In [16]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
                                            # num_layers refers to the five classes of output


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/arjun/NewPytorchEnv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so
/home/arjun/NewPytorchEnv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32
ERROR: /home/arjun/NewPytorchEnv/bin/python3.10: undefined symbol: cudaRuntimeGetVersion
CUDA SETUP: libcudart.so path is None
CUDA SETUP: Is seems that your cuda installation is not in your path. See https://github.com/TimDettmers/bitsandbytes/issues/85 for more information.
CUDA SETUP: CUDA version lower than 11 are currently not supported for LLM.int8(). You will be only to use 8-bit optimizers and quantization routines!!
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 00
CUDA SETUP: Loading binary /home/arjun/NewPytorchE

  warn("The installed version of bitsandbytes was compiled without GPU support. "
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


5. Optimizer and learning rate scheduler

In [17]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

While optimizers like Adam have adaptive learning rates, they might still require some fine-tuning or annealing during training to achieve better convergence. The scheduler can decrease the learning rate over time, which helps the model to stabilize and find better minima in the loss landscape.

In [18]:
from transformers import get_scheduler

num_epochs = 1
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

6. GPU code

In [19]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
print(device)

cuda


7. Training Loop

In [20]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps),desc='Training', unit='steps')

model.train()   # Some layers behave differently to training and inference. This sets all those 
                 # layers into training mode
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # print(batch)
        # break
        batch = {x: y.to(device) for x, y in batch.items()} # Put tokenised text to GPU
        outputs = model(**batch) # **batch allows you to unpack the key-value pairs from 
                                  # dictionary batch into the correct parameter space of model()
        loss = outputs.loss      # compute loss 
        loss.backward()          # computes gradients
        optimizer.step()         # optimises
        lr_scheduler.step()      # updates lr according to schedule. Improves performance
        optimizer.zero_grad()    # resets the gradients
        progress_bar.update(1)   # updates progress bar by 1

Training:   0%|          | 0/125 [00:00<?, ?steps/s]

8. Evaluation

In [21]:
import evaluate

bar = tqdm(range(len(eval_dataloader)))
metric = evaluate.load("accuracy")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    # This line calculates the model's predictions by taking the index of the maximum value 
     # along the last dimension of the logits tensor. It is a common way to obtain the class 
      # predictions from the logits. The resulting predictions tensor contains the predicted 
       # class labels for each example in the batch.

    metric.add_batch(predictions=predictions, references=batch["labels"])
    # This line adds the current batch's predictions and the corresponding reference 
     # (ground truth) labels to the metric object. The specific implementation of the 
      # add_batch() method in the evaluate module will store these predictions and labels to
       # compute the evaluation metric later.

    bar.update(1)

metric.compute()

  0%|          | 0/125 [00:00<?, ?it/s]

{'accuracy': 0.525}

9. Inference

In [25]:
input_text = 'The worst hotel ever. Never going in again. But customer service was okay'

inputs = tokenizer(input_text, return_tensors='pt').to(device)

with torch.no_grad():
    outputs = model(**inputs)

logits = outputs.logits
predicted_class = torch.argmax(logits, dim=1).item()

print("Rating:", predicted_class)

Rating: 0
