<a href="https://colab.research.google.com/github/CuriousAboutData/portfolio-projects/blob/main/Fine_tune_a_pre_trained_model_for_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Activate GPU and Install Dependencies

In [None]:
# Activate a GPU for faster training: click 'Runtime' > 'Change runtime type' and select GPU as the hardware accelerator.
# Then check that PyTorch can see it; the cell output is True when a CUDA device is available.
import torch
torch.cuda.is_available()

True

In [None]:
# Install required libraries
!pip install datasets transformers huggingface_hub
!apt-get install git-lfs
!pip install --upgrade accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (2.9.2-1).
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# 2. Preprocess data

In [None]:
# Load the IMDB dataset; its "train" and "test" splits are used in the following cells
from datasets import load_dataset
imdb = load_dataset("imdb")



  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Create smaller train/test subsets for faster training times.
# Each split is shuffled with a fixed seed so the same rows are drawn on every
# run, then the first 5000 (train) / 500 (test) rows are kept. `select` takes
# any iterable of indices, so a plain range avoids building throwaway lists.
small_train_dataset = imdb["train"].shuffle(seed=42).select(range(5000))
small_test_dataset = imdb["test"].shuffle(seed=42).select(range(500))
print(small_train_dataset[0])
print(small_test_dataset[0])



{'text': 'There is no relation at all between Fortier and Profiler but the fact that both are police series about violent crimes. Profiler looks crispy, Fortier looks classic. Profiler plots are quite simple. Fortier\'s plot are far more complicated... Fortier looks more like Prime Suspect, if we have to spot similarities... The main character is weak and weirdo, but have "clairvoyance". People like to compare, to judge, to evaluate. How about just enjoying? Funny thing too, people writing Fortier looks American but, on the other hand, arguing they prefer American series (!!!). Maybe it\'s the language, or the spirit, but I think this series is more English than American. By the way, the actors are really good and funny. The acting is not superficial at all...', 'label': 1}
{'text': "<br /><br />When I unsuspectedly rented A Thousand Acres, I thought I was in for an entertaining King Lear story and of course Michelle Pfeiffer was in it, so what could go wrong?<br /><br />Very quickly, 

In [None]:
# Load the tokenizer of the "distilbert-base-uncased" checkpoint (the same checkpoint the model is built from later)
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
# Prepare the text inputs for the model
def preprocess_function(examples):
    """Tokenize the "text" field of `examples` with truncation enabled.

    Uses the notebook-level `tokenizer` and returns its output unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

# Tokenize both subsets; batched=True hands preprocess_function groups of examples instead of one at a time
tokenized_train = small_train_dataset.map(preprocess_function, batched=True)
tokenized_test = small_test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [None]:
# Use a data collator to convert our samples to PyTorch tensors and batch them with the correct amount of padding
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 3. Training the model

In [None]:
# Define DistilBERT as our base model, with a sequence-classification head for 2 labels.
# The warning printed below reports that the classifier weights are newly initialized.
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifi

In [None]:
# Define the evaluation metrics 
import numpy as np
from datasets import load_metric

def compute_metrics(eval_pred):
    """Compute accuracy and binary F1 for one evaluation pass.

    The scores are calculated directly with NumPy. This avoids re-loading two
    metric scripts on every evaluation call and does not rely on
    ``datasets.load_metric``, which is deprecated in favour of the separate
    ``evaluate`` library.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` holds one row of
            class scores per example; ``labels`` holds the true class ids
            (0 or 1, matching ``num_labels=2``).

    Returns:
        dict: ``{"accuracy": float, "f1": float}``. F1 treats label 1 as the
        positive class and is 0.0 when there are neither positive predictions
        nor positive labels.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    accuracy = float(np.mean(predictions == labels))

    # Binary F1 from the confusion counts: F1 = 2*TP / (2*TP + FP + FN).
    predicted_pos = predictions == 1
    actual_pos = labels == 1
    true_pos = int(np.sum(predicted_pos & actual_pos))
    false_pos = int(np.sum(predicted_pos & ~actual_pos))
    false_neg = int(np.sum(~predicted_pos & actual_pos))
    denominator = 2 * true_pos + false_pos + false_neg
    f1 = 2 * true_pos / denominator if denominator else 0.0

    return {"accuracy": accuracy, "f1": f1}

In [None]:
# Log in to your Hugging Face account (the training arguments below set push_to_hub=True)
# Get your API token here: https://huggingface.co/settings/tokens
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Define a new Trainer with all the objects we constructed so far
from transformers import TrainingArguments, Trainer

# Name of the local output directory; per the log printed below, it is a clone of the Hub repository with the same name
repo_name = "finetuning-sentiment-model-5000-samples"

# Hyperparameters for fine-tuning
training_args = TrainingArguments(
    output_dir=repo_name,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    save_strategy="epoch",  # save a checkpoint once per epoch
    push_to_hub=True,  # upload to the Hugging Face Hub (requires the login above)
)

# Bundle the model, datasets, tokenizer, collator and metric function into one Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

/content/finetuning-sentiment-model-5000-samples is already a clone of https://huggingface.co/imrelori/finetuning-sentiment-model-5000-samples. Make sure you pull the latest changes with `repo.git_pull()`.


In [None]:
# Train the model on the tokenized training subset using the settings in training_args
trainer.train()



Step,Training Loss
500,0.2703


Several commits (5) will be pushed upstream.
Several commits (6) will be pushed upstream.


TrainOutput(global_step=626, training_loss=0.25319388575447255, metrics={'train_runtime': 505.0889, 'train_samples_per_second': 19.798, 'train_steps_per_second': 1.239, 'total_flos': 1305886383784992.0, 'train_loss': 0.25319388575447255, 'epoch': 2.0})

In [None]:
# Compute the evaluation metrics (loss, accuracy, F1) on the tokenized test subset
trainer.evaluate()

{'eval_loss': 0.27538156509399414,
 'eval_accuracy': 0.906,
 'eval_f1': 0.9065606361829026,
 'eval_runtime': 8.8972,
 'eval_samples_per_second': 56.197,
 'eval_steps_per_second': 3.597,
 'epoch': 2.0}

# 4. Analyzing new data with the model

In [None]:
# NOTE(review): notebook_login() already ran in an earlier cell, so this CLI login looks redundant — confirm before removing
!huggingface-cli login

In [None]:
# Upload the fine-tuned model to the Hub repository with this name
model.push_to_hub("finetuning-sentiment-model-5000-samples")

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/imrelori/finetuning-sentiment-model-5000-samples/commit/f8f44a4318f3e502f4b36bcacf08987a322425d0', commit_message='Upload DistilBertForSequenceClassification', commit_description='', oid='f8f44a4318f3e502f4b36bcacf08987a322425d0', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Upload the tokenizer to the Hub repository with this name
tokenizer.push_to_hub("finetuning-sentiment-model-5000-samples")

CommitInfo(commit_url='https://huggingface.co/imrelori/finetuning-sentiment-model-5000-samples/commit/7cee109878173e2a54d79ee58b32ffe0f57abadb', commit_message='Upload tokenizer', commit_description='', oid='7cee109878173e2a54d79ee58b32ffe0f57abadb', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Upload the model to the Hub.
# Trainer.push_to_hub takes the commit message as its first argument (the
# target repository comes from training_args), so pass a descriptive message
# here instead of the repository name.
trainer.push_to_hub(commit_message="End of training")

In [None]:
# Run inference with your new model using a pipeline
from transformers import pipeline

# Loads the fine-tuned model from the Hub; "imrelori" is the account namespace shown in the push logs above
sentiment_model = pipeline(model="imrelori/finetuning-sentiment-model-5000-samples")

# Each result carries LABEL_0 or LABEL_1 plus a score; presumably LABEL_1 = positive and LABEL_0 = negative — confirm against the IMDB label names
sentiment_model(["It was great!", "I think this move was fine, but some the first version was better.", "This movie sucks!"])

[{'label': 'LABEL_1', 'score': 0.9835646152496338},
 {'label': 'LABEL_1', 'score': 0.6649791598320007},
 {'label': 'LABEL_0', 'score': 0.9709738492965698}]