In [1]:
import torch
import numpy as np
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer, DefaultDataCollator, TrainingArguments, Trainer
from datasets import load_dataset, load_metric

In [2]:
# Pick the best available accelerator: a CUDA GPU first, then the Apple-silicon
# MPS backend, and finally the CPU.
# The previous one-liner returned the "mps" device when *CUDA* was available, so
# Macs always ended up on the CPU and CUDA machines were handed an MPS device
# they cannot use.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Accuracy metric; compute_metrics calls metric.compute(...) on it.
metric = load_metric('accuracy')

  metric = load_metric('accuracy')


# **Fine-Tuning Of Pre-Trained LLM (BERT)**

In [3]:
# Dataset loaded by its `yelp_review_full` identifier.
yelp_review_dataset = load_dataset("yelp_review_full")

# Tokenizer loaded from the `bert-base-cased` checkpoint.
yelp_review_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [4]:
def tokenize_function(examples):
    """Tokenize the ``'text'`` entry of ``examples`` with ``yelp_review_tokenizer``.

    The tokenizer is asked to pad with the ``'max_length'`` strategy and to
    truncate over-long inputs.

    Args:
        examples: Mapping with a ``'text'`` entry holding the raw text to encode.

    Returns:
        Whatever ``yelp_review_tokenizer`` returns for that text.
    """
    review_texts = examples['text']
    tokenizer_options = {'padding': 'max_length', 'truncation': True}
    return yelp_review_tokenizer(review_texts, **tokenizer_options)

In [5]:
def compute_metrics(evaluate_predictions):
    """Score predictions against their labels with the module-level ``metric``.

    Args:
        evaluate_predictions: Pair ``(predictions, labels)``. The predictions
            are reduced with an argmax over axis 1 (presumably one score per
            class -- confirm against the caller).

    Returns:
        The result of ``metric.compute`` for the argmax predictions and labels.
    """
    scores, references = evaluate_predictions
    predicted_labels = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_labels, references=references)

In [6]:
# Tokenize every split in batches.
tokenized_dataset = yelp_review_dataset.map(tokenize_function, batched=True)

# From each split, take the first 1000 rows of a fixed-seed (42) shuffle:
# 'train' feeds training, 'test' feeds evaluation.
training_dataset, evaluation_dataset = (
    tokenized_dataset[split_name].shuffle(seed=42).select(range(1000))
    for split_name in ('train', 'test')
)

Map:   0%|          | 0/650000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [7]:
data_collator = DefaultDataCollator(return_tensors='pt')

# Wrap the tokenized subsets in PyTorch DataLoaders (batches of 8, collated by
# data_collator); only the training loader shuffles its rows.
pytorch_training_dataset = torch.utils.data.DataLoader(
    training_dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)
pytorch_validation_dataset = torch.utils.data.DataLoader(
    evaluation_dataset,
    batch_size=8,
    shuffle=False,
    collate_fn=data_collator,
)

In [8]:
# Sequence-classification model built from the bert-base-cased checkpoint with
# five output labels, moved to the selected device.
fine_tuned_model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-cased',
    num_labels=5,
)
fine_tuned_model.to(device)

# Three epochs, batches of 8 for both training and evaluation, a log entry every
# 10 steps, progress bars left enabled.
training_args = TrainingArguments(
    output_dir='checkpoints',
    logging_dir='logs',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps=10,
    disable_tqdm=False,
)

model_trainer = Trainer(
    model=fine_tuned_model,
    args=training_args,
    train_dataset=training_dataset,
    eval_dataset=evaluation_dataset,
    compute_metrics=compute_metrics,
)

# Last expression of the cell, so the returned training output is displayed.
model_trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33medgoh95[0m. Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss
10,1.684
20,1.5795
30,1.6293
40,1.6223
50,1.5708
60,1.356
70,1.37
80,1.4244
90,1.2761
100,1.2553


TrainOutput(global_step=375, training_loss=0.98653231493632, metrics={'train_runtime': 618.9005, 'train_samples_per_second': 4.847, 'train_steps_per_second': 0.606, 'total_flos': 789354427392000.0, 'train_loss': 0.98653231493632, 'epoch': 3.0})