# Fine-tuning a Pretrained Model for sentiment analysis

## Importing necessary libraries and data


In [4]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install ipywidgets
!pip install torch
!pip install transformers[torch]
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install scikit-learn

Looking in indexes: https://download.pytorch.org/whl/cu118


In [28]:
!nvidia-smi
import torch

print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"Device: {torch.cuda.get_device_name(0)}")
else:
    print("WARNING: Training on CPU will be very slow!")

Fri Nov 14 19:39:53 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 581.42                 Driver Version: 581.42         CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4070      WDDM  |   00000000:01:00.0  On |                  N/A |
|  0%   43C    P5             25W /  215W |    4740MiB /  12282MiB |     21%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+----------------------------------------------

In [63]:
from pathlib import Path
from urllib.request import urlretrieve
import zipfile
import pandas as pd

# Download the Amazon "Unlocked Mobile" reviews archive, extract it and load
# the CSV into `df`. Every path is derived from the notebook's working
# directory so the download target and the extraction source are the same file.
base_dir = Path().resolve()

amazon_mobile_reviews_url = "https://eduds.blob.core.windows.net/nlp/Amazon_Unlocked_Mobile.csv.zip"
# Kept for backward compatibility; the absolute `zip_path` below is what is used.
filename = "data/Amazon_Unlocked_Mobile.csv.zip"
data_dir = base_dir / "data"

zip_path = data_dir / "Amazon_Unlocked_Mobile.csv.zip"
csv_path = data_dir / "Amazon_Unlocked_Mobile.csv"

# urlretrieve does not create missing parent directories, so a fresh checkout
# without a data/ folder would fail before anything is downloaded.
data_dir.mkdir(parents=True, exist_ok=True)

# Only touch the network / unzip when the result is not already on disk, so
# "Restart & Run All" stays cheap.
if not csv_path.exists():
    if not zip_path.exists():
        # Download to a temporary name first so an interrupted transfer never
        # leaves a truncated archive that later runs would mistake for a good one.
        partial_path = zip_path.with_name(zip_path.name + ".part")
        urlretrieve(amazon_mobile_reviews_url, partial_path)
        partial_path.replace(zip_path)

    with zipfile.ZipFile(zip_path) as zfile:
        zfile.extractall(data_dir)

df = pd.read_csv(csv_path)

In [64]:
# Quick look at the raw data: schema, summary statistics, first rows.
# A cell renders only its final expression, so the describe() table has to be
# displayed explicitly; otherwise it is computed and silently discarded.
# `display` is provided by the IPython kernel, no import needed in a notebook.
df.info()
display(df.describe())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 413840 entries, 0 to 413839
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Product Name  413840 non-null  object 
 1   Brand Name    348669 non-null  object 
 2   Price         407907 non-null  float64
 3   Rating        413840 non-null  int64  
 4   Reviews       413770 non-null  object 
 5   Review Votes  401544 non-null  float64
dtypes: float64(2), int64(1), object(3)
memory usage: 18.9+ MB


Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0


## CONFIGURATION


In [45]:
import os

# --- Data -------------------------------------------------------------------
DATA_URL = "https://eduds.blob.core.windows.net/nlp/Amazon_Unlocked_Mobile.csv.zip"
DATA_FILE = "data/Amazon_Unlocked_Mobile.csv"

# --- Model / Hub ------------------------------------------------------------
MODEL_CHECKPOINT = "distilbert-base-uncased"
# Used both as the local output directory and as the Hub repository name.
REPO_NAME = "Floressek/sentiment_classification_from_distillbert"
# Never hard-code a token in the notebook. Export HF_TOKEN before starting the
# kernel; when it is unset this is an empty string and the login cell falls
# back to the interactive prompt.
HUGGING_FACE_TOKEN = os.environ.get("HF_TOKEN", "")

# --- Preprocessing ----------------------------------------------------------
MAX_REVIEW_LENGTH = 128   # upper bound on whitespace-separated words per review
TEST_SIZE = 0.3           # fraction of examples held out by train_test_split
BATCH_SIZE_TOKEN = 1000   # batch size for the batched dataset.map tokenization

# --- Training ---------------------------------------------------------------
BATCH_SIZE = 48           # per-device batch size for both training and evaluation
LEARNING_RATE = 2e-5
NUM_EPOCHS = 2
WEIGHT_DECAY = 0.01
SEED = 100                # seed for the train/test split

## Data cleaning


In [46]:
# Keep only the two columns the classifier needs. Selecting them (instead of
# dropping the others) makes the cell idempotent: re-running it no longer
# raises KeyError because the dropped columns are already gone.
df = df[["Rating", "Reviews"]].copy()
df.head()

Unnamed: 0,Rating,Reviews
0,5,I feel so LUCKY to have found this used (phone...
1,4,"nice phone, nice up grade from my pantach revu..."
2,5,Very pleased
3,4,It works good but it goes slow sometimes but i...
4,4,Great phone to replace my lost phone. The only...


In [47]:
from datasets import Dataset

def _is_short_extreme_review(row) -> bool:
    """Return True for rows that should be kept for binary classification.

    A row is kept when it has review text, the text has fewer than
    MAX_REVIEW_LENGTH whitespace-separated words, and the rating is one of
    the two extremes (1 or 5 stars).
    """
    text = row["Reviews"]
    if text is None:
        return False
    if len(text.split()) >= MAX_REVIEW_LENGTH:
        return False
    # Only the extremes are used, so the task becomes binary classification.
    return row["Rating"] in (1, 5)


dataset = Dataset.from_pandas(df).filter(_is_short_extreme_review)

dataset_split = dataset.train_test_split(test_size=TEST_SIZE, seed=SEED)

Filter:   0%|          | 0/413840 [00:00<?, ? examples/s]

### Tokenization

In [48]:
from datasets import DatasetDict
from typing import Any
from transformers import AutoTokenizer

# Load the tokenizer published for MODEL_CHECKPOINT; it is reused for
# preprocessing and later handed to the Trainer.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)


def tokenize_and_label(dataset: DatasetDict, tokenizer: Any, padding=False):
    """Tokenize the reviews and attach a binary sentiment label.

    After this step the dataset keeps only the tokenizer outputs (input ids
    and the attention mask that separates content from padding) together
    with a ``label`` column; the raw ``Reviews`` and ``Rating`` columns are
    removed.

    Args:
        dataset: Splits containing a ``Reviews`` text column and a ``Rating``
            column.
        tokenizer: Callable tokenizer applied to a batch of review strings.
        padding: Forwarded to the tokenizer. The default ``False`` leaves the
            sequences unpadded so they can be padded per batch at training
            time (the Trainer in this notebook receives the tokenizer, which
            enables dynamic padding). Pass ``"max_length"`` to restore the
            previous behaviour of padding every example to the model maximum.

    Returns:
        The tokenized dataset with the tokenizer outputs and ``label``
        (0 for a 1-star rating, 1 otherwise).
    """

    def encode_batch(batch):
        # One pass does both jobs: tokenization and label creation.
        encoded = dict(tokenizer(batch["Reviews"], padding=padding, truncation=True))
        encoded["label"] = [0 if rating == 1 else 1 for rating in batch["Rating"]]
        return encoded

    return dataset.map(
        encode_batch,
        batched=True,
        batch_size=BATCH_SIZE_TOKEN,
        remove_columns=["Reviews", "Rating"],
    )

In [50]:
import numpy as np
from evaluate import load


# Metric objects are loaded once and reused; loading them inside
# compute_metrics repeated the lookup on every evaluation pass.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the `evaluate` metric called `name`, loading it on first use."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred) -> dict:
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)`` as supplied by the Trainer.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit.
    predictions = np.argmax(logits, axis=-1)

    accuracy = _get_metric("accuracy").compute(predictions=predictions, references=labels)["accuracy"]
    f1 = _get_metric("f1").compute(predictions=predictions, references=labels)["f1"]

    return {"accuracy": accuracy, "f1": f1}

## Hugging Face - logging in and model loading


In [51]:
from huggingface_hub import login

token = HUGGING_FACE_TOKEN

# A blank value or the untouched configuration placeholder is not a token.
# Passing either to login() would just fail, so fall back to the interactive
# prompt in that case.
has_real_token = bool(token and token.strip()) and token != "here input your token"

if has_real_token:
    login(token=token)
else:
    login()

## Training and fine-tuning the model


In [52]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer


def train_model(tokenized_datasets: DatasetDict, tokenizer) -> Trainer:
    """Fine-tune MODEL_CHECKPOINT as a two-class sequence classifier.

    Args:
        tokenized_datasets: Dataset with ``'train'`` and ``'test'`` splits
            that are already tokenized and labelled.
        tokenizer: Tokenizer handed to the Trainer as its processing class.

    Returns:
        The Trainer after ``train()`` has finished, so callers can run
        predictions or further evaluation with it.
    """
    # Imported locally so the function does not depend on an earlier cell
    # having imported torch.
    import torch

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_CHECKPOINT,
        num_labels=2,
    )

    # fp16 mixed precision is a GPU feature; requesting it unconditionally
    # breaks the CPU-only run that the notebook explicitly allows for.
    use_fp16 = torch.cuda.is_available()

    training_args = TrainingArguments(
        output_dir=REPO_NAME,
        learning_rate=LEARNING_RATE,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=NUM_EPOCHS,
        weight_decay=WEIGHT_DECAY,
        # Checkpoint and evaluate once per epoch.
        save_strategy="epoch",
        eval_strategy="epoch",
        push_to_hub=True,
        remove_unused_columns=False,
        logging_steps=100,
        fp16=use_fp16,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['test'],
        processing_class=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()

    return trainer


# Report how many examples ended up in each split before the expensive steps.
train_split = dataset_split['train']
test_split = dataset_split['test']
print(f"Train size: {len(train_split)}")
print(f"Test size: {len(test_split)}")

# Tokenize and label both splits, then fine-tune; `trainer` is reused by the
# evaluation cell further down.
tokenized_datasets = tokenize_and_label(dataset_split, tokenizer)

trainer = train_model(tokenized_datasets, tokenizer)

Train size: 196375
Test size: 84162


Map:   0%|          | 0/196375 [00:00<?, ? examples/s]

Map:   0%|          | 0/84162 [00:00<?, ? examples/s]

Map:   0%|          | 0/196375 [00:00<?, ? examples/s]

Map:   0%|          | 0/84162 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0432,0.040534,0.989152,0.992839
2,0.0299,0.035284,0.991718,0.994545


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

## Testing the newly fine-tuned model


In [53]:
from transformers import pipeline


def create_classifier(model_path=f"./{REPO_NAME}"):
    """Build a text-classification pipeline from a saved model directory.

    Args:
        model_path: Location to load the tokenizer and model from. Defaults
            to the local directory named after REPO_NAME.

    Returns:
        A ``text-classification`` pipeline wrapping the loaded model and
        tokenizer.
    """
    # Keyword arguments are evaluated left to right, so the tokenizer is
    # still loaded before the model.
    return pipeline(
        "text-classification",
        tokenizer=AutoTokenizer.from_pretrained(model_path),
        model=AutoModelForSequenceClassification.from_pretrained(model_path),
    )


def test_classifier(classifier) -> None:
    examples = [
        "Shame. I wish I hadn't buy it.",
        "Great handset!",
        "Terrible product, waste of money",
        "Best phone ever, highly recommend!"
    ]

    for text in examples:
        result = classifier(text)
        print(f"Text: {text}")
        print(f"Result: {result}\n")


### Eval metrics

In [62]:
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, precision_recall_curve, auc
import numpy as np
import torch

# Reload the saved model from the local output directory and print its structure.
model_path = f"./{REPO_NAME}"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
print(model)

# Predictions come from the in-memory trainer, not from `model` loaded above.
preds = trainer.predict(tokenized_datasets["test"])
logits = preds.predictions
y_true = preds.label_ids
y_pred = np.argmax(logits, axis=-1)
# Softmax over the class axis; column 1 is the probability of label 1.
probs = torch.softmax(torch.from_numpy(logits), dim=-1).numpy()[:, 1]

# Pin the label set so the matrix is always 2x2. Without it, a split in which
# only one class shows up yields a 1x1 matrix and the four-way unpacking of
# ravel() below raises ValueError.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()
roc_auc = roc_auc_score(y_true, probs)
prec_curve, rec_curve, _ = precision_recall_curve(y_true, probs)
# Area under the precision-recall curve (recall on x, precision on y).
pr_auc = auc(rec_curve, prec_curve)
report = classification_report(y_true, y_pred, digits=4)

print("Confusion matrix:")
print(cm)
print(f"TP={tp} FP={fp} TN={tn} FN={fn}")
print(f"ROC-AUC={roc_auc:.4f} PR-AUC={pr_auc:.4f}")
print(report)


Architecture: DistilBertForSequenceClassification
DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "dtype": "float32",
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.57.1",
  "vocab_size": 30522
}

Total params: 66,955,010
Trainable params: 66,955,010
Confusion matrix:
[[19922   352]
 [  345 63543]]
TP=63543 FP=352 TN=19922 FN=345
ROC-AUC=0.9983 PR-AUC=0.9994
              precision    recall  f1-score   support

           0     0.9830    0.9826    0.9828     20274
           1     0.9945    0.9946    0.9945     63888

    accuracy                         0.99

### Testing the model with text samples

In [58]:
# Skipped because evaluation was already enabled during training.
# eval_results = trainer.evaluate()
# print(f"Evaluation results: {eval_results}")

# Build a pipeline from the saved model and run the hand-written sample reviews.
print("Testing the fine-tuned model:")
classifier = create_classifier()
test_classifier(classifier)

Device set to use cuda:0


Testing the fine-tuned model:
Text: Shame. I wish I hadn't buy it.
Result: [{'label': 'LABEL_0', 'score': 0.9975292086601257}]

Text: Great handset!
Result: [{'label': 'LABEL_1', 'score': 0.9996094107627869}]

Text: Terrible product, waste of money
Result: [{'label': 'LABEL_0', 'score': 0.998723566532135}]

Text: Best phone ever, highly recommend!
Result: [{'label': 'LABEL_1', 'score': 0.9996873140335083}]

