In [1]:
import torch
# Sanity-check the GPU setup before any heavy work starts.
print(torch.cuda.is_available())   # True when PyTorch can use a CUDA device
if torch.cuda.is_available():
    # Only query device details when CUDA is usable: these calls raise on a
    # CPU-only machine, which the rest of the notebook otherwise tolerates.
    print(torch.cuda.get_device_name(0))  # e.g. "NVIDIA GeForce RTX 4050 Laptop GPU"
    print(torch.cuda.current_device())  # index of the active device (0 on a single-GPU machine)

True
NVIDIA GeForce RTX 4050 Laptop GPU
0


In [2]:
# Permit TF32 math in cuDNN convolutions and in CUDA matrix multiplications.
# TF32 trades a small amount of float32 precision for speed on GPUs that
# support it.
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_tf32 = True

In [3]:
# Select the "high" float32 matmul precision mode, which (per the PyTorch docs)
# lets float32 matrix multiplications use faster reduced-precision kernels.
torch.set_float32_matmul_precision("high")

In [4]:
from datasets import load_dataset

# Load the "train" split of the `yelp_polarity` dataset.
dataset = load_dataset("yelp_polarity", split="train")
# Show the first record to inspect the schema (a `text` string and a `label`).
dataset[0]

  from .autonotebook import tqdm as notebook_tqdm


{'text': "Unfortunately, the frustration of being Dr. Goldberg's patient is a repeat of the experience I've had with so many other doctors in NYC -- good doctor, terrible staff.  It seems that his staff simply never answers the phone.  It usually takes 2 hours of repeated calling to get an answer.  Who has time for that or wants to deal with it?  I have run into this problem with many other doctors and I just don't get it.  You have office workers, you have patients with medical needs, why isn't anyone answering the phone?  It's incomprehensible and not work the aggravation.  It's with regret that I feel that I have to give Dr. Goldberg 2 stars.",
 'label': 0}

In [5]:
# Lower-case every review, one batch at a time; the other columns are untouched.
# NOTE(review): roberta-base uses a case-sensitive vocabulary, so lower-casing
# may discard useful signal -- confirm this preprocessing step is intended.
lowercased_dataset = dataset.map(
    lambda batch: {"text": list(map(str.lower, batch["text"]))},
    batched=True,
)

In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"

# Pretrained RoBERTa tokenizer and a two-label sequence-classification model.
# The classification head is newly initialised (see the warning printed by
# this cell), so the model has to be fine-tuned before it is useful.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
model = model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
def tokenize_function(example):
  """Tokenize the ``text`` field of a (batched) example.

  Sequences longer than 128 tokens are truncated. No padding is applied here.

  Args:
    example: Mapping with a ``"text"`` entry; it is forwarded to the
      module-level ``tokenizer`` unchanged.

  Returns:
    Whatever the tokenizer returns for those texts.
  """
  texts = example["text"]
  return tokenizer(texts, truncation=True, max_length=128)

# Tokenize the whole lower-cased corpus in batches; this adds the `input_ids`
# and `attention_mask` columns next to `text` and `label`.
tokenized_dataset = lowercased_dataset.map(tokenize_function, batched=True)

# Preview only: the result of this 80/20 split is displayed but not stored.
# NOTE(review): the split actually used for training is computed again in the
# next cell, so this call does redundant work and shows a different random
# partition than the one that is trained on.
tokenized_dataset.train_test_split(test_size=0.2)

Map: 100%|██████████| 560000/560000 [01:55<00:00, 4829.28 examples/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 448000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 112000
    })
})

In [8]:
# Hold out 20% of the tokenized examples for evaluation. The fixed seed makes
# the partition (and therefore the reported metrics) reproducible across
# kernel restarts. From here on `tokenized_dataset` holds a "train" and a
# "test" split.
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)

In [9]:
# Drop the raw `text` column from both splits; the token ids are kept.
tokenized_dataset = tokenized_dataset.remove_columns(
    column_names=["text"],
)

In [10]:
# Rename the target column from `label` to `labels` in both splits.
tokenized_dataset = tokenized_dataset.rename_column(
    original_column_name="label",
    new_column_name="labels",
)

In [11]:
from transformers import DataCollatorWithPadding

# Collator that pads the examples of each batch with the tokenizer's padding
# settings when the batch is assembled (tokenization above applied no padding).
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
)

In [12]:
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch
import numpy as np

def compute_metrics(eval_pred):
  """Compute accuracy, precision, recall and F1 for one evaluation pass.

  Args:
    eval_pred: Pair ``(logits, labels)``. ``logits`` is a NumPy array whose
      last axis holds the per-class scores; ``labels`` are the true class ids.

  Returns:
    dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
    ``"f1"``. Precision, recall and F1 use ``average='binary'``.
  """
  logits, labels = eval_pred
  # The logits already arrive as a NumPy array, so take the argmax directly
  # instead of converting to a torch tensor and back.
  predictions = np.argmax(logits, axis=-1)

  accuracy = accuracy_score(labels, predictions)
  precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')

  return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

# Fine-tuning configuration: evaluate and checkpoint once per epoch, then
# reload the checkpoint with the best accuracy when training finishes.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    eval_strategy="epoch",
    save_strategy="epoch",  # kept equal to eval_strategy so the best checkpoint can be reloaded
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True,  # mixed-precision training
    num_train_epochs=5,
    weight_decay=0.01
)

train_dataset = tokenized_dataset["train"]
test_dataset = tokenized_dataset["test"]

# NOTE(review): the "test" split is used both for per-epoch evaluation and for
# picking the best checkpoint, so the reported accuracy is not a clean held-out
# estimate -- consider a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # `processing_class` replaces the deprecated `tokenizer` argument, which
    # triggered the warning this cell printed at the Trainer(...) call.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

trainer.train()




  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1224,0.114444,0.959509,0.962517,0.956316,0.959406
2,0.0946,0.120373,0.961321,0.960543,0.962223,0.961382
3,0.0735,0.14197,0.961839,0.958625,0.965399,0.962
4,0.0514,0.178333,0.962045,0.95754,0.967023,0.962258
5,0.0353,0.201197,0.962268,0.960716,0.964007,0.962359


TrainOutput(global_step=70000, training_loss=0.08039951117379325, metrics={'train_runtime': 17879.1541, 'train_samples_per_second': 125.286, 'train_steps_per_second': 3.915, 'total_flos': 1.473421910016e+17, 'train_loss': 0.08039951117379325, 'epoch': 5.0})