In [14]:
from transformers import pipeline
import torch
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
import datasets
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from transformers import TrainingArguments, Trainer

In [15]:
sentiment_pipeline = pipeline('sentiment-analysis',model="cardiffnlp/twitter-roberta-base-sentiment",device='cuda')
data = ['I love you', 'I hate you']
sentiment_pipeline(data)

[{'label': 'LABEL_2', 'score': 0.9557049870491028},
 {'label': 'LABEL_0', 'score': 0.9654269218444824}]

In [16]:
from datasets import load_dataset
imdb = load_dataset('imdb')

In [17]:
small_train_dataset = imdb["train"].shuffle(seed=42).select([i for i in list(range(3000))])
small_test_dataset = imdb["test"].shuffle(seed=42).select([i for i in list(range(300))])

In [18]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [19]:
def preprocess_function(examples):
   return tokenizer(examples["text"], truncation=True)
 
tokenized_train = small_train_dataset.map(preprocess_function, batched=True)
tokenized_test = small_test_dataset.map(preprocess_function, batched=True)


In [20]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [21]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
def compute_metrics(eval_pred):  
   logits, labels = eval_pred
   predictions = np.argmax(logits, axis=-1)
   accuracy = accuracy_score(labels, predictions)
   f1 = f1_score(labels, predictions)
   return {"accuracy": accuracy, "f1": f1}


In [23]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [24]:
repo_name = "finetuning-sentiment-model-3000-samples"
 
training_args = TrainingArguments(
   output_dir=repo_name,
   learning_rate=2e-5,
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=2,
   weight_decay=0.01,
   save_strategy="epoch",
   push_to_hub=True,
)
 
trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_train,
   eval_dataset=tokenized_test,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [25]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=376, training_loss=0.30192427939557015, metrics={'train_runtime': 61.8065, 'train_samples_per_second': 97.077, 'train_steps_per_second': 6.084, 'total_flos': 782725021021056.0, 'train_loss': 0.30192427939557015, 'epoch': 2.0})

In [26]:
trainer.evaluate()

{'eval_loss': 0.34990736842155457,
 'eval_accuracy': 0.8633333333333333,
 'eval_f1': 0.8637873754152824,
 'eval_runtime': 0.9554,
 'eval_samples_per_second': 314.015,
 'eval_steps_per_second': 19.888,
 'epoch': 2.0}

In [None]:
#NLP method did not work...