In [1]:
!pip install -q transformers datasets evaluate accelerate



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# ==== CELL 2: imports and setup ====

import numpy as np
import pandas as pd

from datasets import load_dataset
import evaluate

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Dataset identifier handed to `load_dataset`.
dataset_name = "guyhadad01/Hotels_reviews"

# Load the dataset and display the resulting object (splits, columns, row counts).
raw_datasets = load_dataset(path=dataset_name)
raw_datasets


DatasetDict({
    train: Dataset({
        features: ['date', 'rating', 'title', 'text', 'property_dict', 'Name', 'City', 'County'],
        num_rows: 21112546
    })
})

In [4]:
# Display the first record of the train split to see the columns and their raw values.
raw_datasets["train"][0]


{'date': '2019-01-01 00:00:00',
 'rating': 5.0,
 'title': 'Xmas holiday',
 'text': 'We went here with our kids for Xmas holiday and we really liked it. Large options of food for breakfast and lunch , you can really taste the quality of the food in there. The surrounding area is nice and clean. Good experience. Hardly recommended .',
 'property_dict': '{"service": null, "location": null, "sleep quality": null, "rooms": null, "cleanliness": null, "value": null, "check in / front desk": null, "business service (e.g., internet access)": null}',
 'Name': 'Baltic',
 'City': 'Giulianova Lido',
 'County': 'Italy'}

In [5]:
def map_rating_to_label(example):
    """
    Turn the star rating of one example into a 3-class sentiment label.

    The rating is rounded to the nearest integer first, then bucketed:
      0 = Negative (rounded rating of 2 or less)
      1 = Neutral  (rounded rating of exactly 3)
      2 = Positive (rounded rating of 4 or more)

    Returns a dict with the single key "label".
    """
    stars = int(round(example["rating"]))  # rating may arrive as a float, e.g. 5.0

    if stars >= 4:
        return {"label": 2}
    if stars == 3:
        return {"label": 1}
    return {"label": 0}

# Run the rating -> label mapping over every row of the train split; the
# resulting dataset carries an extra "label" column.
labeled_dataset = raw_datasets["train"].map(function=map_rating_to_label)
labeled_dataset


Map: 100%|██████████| 21112546/21112546 [1:45:09<00:00, 3345.92 examples/s]  


Dataset({
    features: ['date', 'rating', 'title', 'text', 'property_dict', 'Name', 'City', 'County', 'label'],
    num_rows: 21112546
})

In [6]:
subset_size = 12000  # total examples used (80% train, 20% val)

# Build the subset from the raw train split rather than reassigning
# `labeled_dataset` from itself. A self-referential reassignment re-shuffles the
# already-subsampled data each time the cell is re-run, which silently changes
# the row order (and therefore the later train/validation split). Starting from
# `raw_datasets["train"]` makes the cell give the same result on every run.
labeled_dataset = (
    raw_datasets["train"]
    .shuffle(seed=42)
    .select(range(subset_size))
    .map(map_rating_to_label)  # only the selected rows need a label
)
len(labeled_dataset)


12000

In [7]:
# Display names keyed by label id. Counts are aligned to these ids explicitly,
# so a class that happens to be absent from the sample shows up as 0 instead of
# breaking (or mislabeling) a positional index assignment.
label_names = {0: "Negative (0)", 1: "Neutral (1)", 2: "Positive (2)"}

label_counts = (
    pd.Series(labeled_dataset["label"])
    .value_counts()
    .reindex(list(label_names), fill_value=0)
    .rename(index=label_names)
)
label_counts


Negative (0)    1291
Neutral (1)     1375
Positive (2)    9334
Name: count, dtype: int64

In [8]:
# Hold out 20% of the labeled subset for validation; the fixed seed keeps the
# split the same across runs.
dataset_split = labeled_dataset.train_test_split(test_size=0.2, seed=42)

train_dataset, val_dataset = dataset_split["train"], dataset_split["test"]

(len(train_dataset), len(val_dataset))


(9600, 2400)

In [9]:
# Checkpoint to fine-tune and the number of output classes.
model_name = "distilbert-base-uncased"
num_labels = 3  # Negative, Neutral, Positive

# Tokenizer and sequence-classification model, both loaded from `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# Display the module structure.
model


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [10]:
max_length = 128  # shorter sequences = faster training


def preprocess_function(examples):
    """Tokenize the "text" field of `examples`.

    Every sequence is truncated and padded to exactly `max_length` tokens.
    Returns whatever the module-level `tokenizer` returns for that input.
    """
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": max_length,
    }
    return tokenizer(examples["text"], **tokenizer_options)

# Tokenize both splits in batches.
encoded_train = train_dataset.map(preprocess_function, batched=True)
encoded_val = val_dataset.map(preprocess_function, batched=True)

# Only these columns are kept; every other column is dropped.
cols_to_keep = ["input_ids", "attention_mask", "label"]

train_extra_cols = [name for name in encoded_train.column_names if name not in cols_to_keep]
encoded_train = encoded_train.remove_columns(train_extra_cols)

val_extra_cols = [name for name in encoded_val.column_names if name not in cols_to_keep]
encoded_val = encoded_val.remove_columns(val_extra_cols)

# Return torch tensors when the datasets are indexed.
for encoded_split in (encoded_train, encoded_val):
    encoded_split.set_format(type="torch")

encoded_train[0]


Map: 100%|██████████| 9600/9600 [00:10<00:00, 925.90 examples/s]
Map: 100%|██████████| 2400/2400 [00:01<00:00, 1315.91 examples/s]


{'label': tensor(2),
 'input_ids': tensor([  101,  1996,  3095,  2182,  2003,  2307,  1998,  2200, 14044,  1012,
          1996,  4734,  2024,  4550,  1998,  2200,  3835,  1012,  2009,  2003,
          3733,  2006,  1998,  2125,  1996,  7553,  1012,  4370,  1019,  6385,
          1998,  2097,  2994,  2182,  2153,  1012,  2065,  2017,  2342,  2505,
          1996,  3095,  2003,  2061, 14044,  1998,  2097,  2393,  2017,  2151,
          2126,  2027,  2064,  1012,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,   

In [11]:
# Accuracy metric object, loaded once and reused by `compute_metrics`.
accuracy_metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for a (logits, labels) pair.

    The predicted class of each row is the argmax over the last axis of the
    logits. Returns the dict produced by `accuracy_metric.compute`,
    i.e. {"accuracy": ...}.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    return accuracy_metric.compute(predictions=predicted_classes, references=labels)


In [12]:
def run_experiment(learning_rate, batch_size, num_epochs, run_name):
    """Fine-tune a fresh classifier with the given hyperparameters and score it.

    A new model is loaded from `model_name` on every call so that runs do not
    share weights and their validation accuracies are comparable.

    Args:
        learning_rate: Passed to `TrainingArguments(learning_rate=...)`.
        batch_size: Per-device batch size used for both training and evaluation.
        num_epochs: Passed to `TrainingArguments(num_train_epochs=...)`.
        run_name: Label for the run; used as the sub-directory of `./results`
            and in the printed summary line.

    Returns:
        The "eval_accuracy" value reported by `trainer.evaluate()` on
        `encoded_val`.
    """
    import inspect  # local import: only needed for the Trainer keyword check below

    # fresh model each time so runs are comparable
    exp_model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels
    )

    training_args = TrainingArguments(
        output_dir=f"./results/{run_name}",
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=learning_rate,
        logging_steps=50,
        seed=42,
    )

    # Recent transformers releases take the tokenizer through `processing_class`
    # and deprecate the old `tokenizer` keyword; older releases only know
    # `tokenizer`. Use whichever keyword the installed Trainer accepts so the
    # call neither warns nor breaks across versions.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    trainer = Trainer(
        model=exp_model,
        args=training_args,
        train_dataset=encoded_train,
        eval_dataset=encoded_val,
        compute_metrics=compute_metrics,
        **{tokenizer_kwarg: tokenizer},
    )

    trainer.train()
    eval_results = trainer.evaluate()
    val_acc = eval_results["eval_accuracy"]
    print(f"{run_name} -> validation accuracy: {val_acc:.4f}")
    return val_acc


In [13]:
# ==== Hyperparameter experiments overview (text) ====
# This would normally be a markdown cell; here the note is simply printed.

print(
    "\n"
    "We now explore:\n"
    "1. Learning rate (batch_size=16, epochs=1)\n"
    "2. Batch size (lr=5e-5, epochs=1)\n"
    "3. Number of epochs (lr=5e-5, batch_size=16)\n"
)



We now explore:
1. Learning rate (batch_size=16, epochs=1)
2. Batch size (lr=5e-5, epochs=1)
3. Number of epochs (lr=5e-5, batch_size=16)



In [None]:


# Sweep the learning rate while batch size (16) and epoch count (1) stay fixed.
learning_rates = [2e-5, 5e-5, 1e-4]
results_lr = []

for lr in learning_rates:
    # Describe the run first, then fill in the accuracy it achieves.
    record = {"type": "learning_rate", "lr": lr, "batch_size": 16, "epochs": 1}
    record["val_accuracy"] = run_experiment(
        learning_rate=lr,
        batch_size=record["batch_size"],
        num_epochs=record["epochs"],  # keep it fast
        run_name=f"lr_{lr}_bs_16_ep_1",
    )
    results_lr.append(record)

df_lr = pd.DataFrame(results_lr)
df_lr


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
50,0.6653
100,0.394
150,0.3543
200,0.4021
250,0.3931
300,0.339
350,0.3874
400,0.313
450,0.3015
500,0.3438




lr_2e-05_bs_16_ep_1 -> validation accuracy: 0.8658


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
50,0.5788
100,0.3808
150,0.348
200,0.3984
250,0.3884
300,0.3475
350,0.3865
400,0.3248
450,0.2956
500,0.3455




lr_5e-05_bs_16_ep_1 -> validation accuracy: 0.8646


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
50,0.6265
100,0.4171
150,0.3752
200,0.4315
250,0.4253
300,0.38
350,0.4174
400,0.3525
450,0.2997
500,0.3637




lr_0.0001_bs_16_ep_1 -> validation accuracy: 0.8638


Unnamed: 0,type,lr,batch_size,epochs,val_accuracy
0,learning_rate,2e-05,16,1,0.865833
1,learning_rate,5e-05,16,1,0.864583
2,learning_rate,0.0001,16,1,0.86375


In [15]:
# Sweep the batch size while learning rate (5e-5) and epoch count (1) stay fixed.
batch_sizes = [8, 16, 32]
results_bs = []

for bs in batch_sizes:
    # Describe the run first, then fill in the accuracy it achieves.
    record = {"type": "batch_size", "lr": 5e-5, "batch_size": bs, "epochs": 1}
    record["val_accuracy"] = run_experiment(
        learning_rate=record["lr"],
        batch_size=bs,
        num_epochs=record["epochs"],  # keep it fast
        run_name=f"lr_5e-5_bs_{bs}_ep_1",
    )
    results_bs.append(record)

df_bs = pd.DataFrame(results_bs)
df_bs


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
50,0.6285
100,0.4888
150,0.5034
200,0.4113
250,0.3683
300,0.3984
350,0.4447
400,0.3543
450,0.4341
500,0.3951




lr_5e-5_bs_8_ep_1 -> validation accuracy: 0.8712


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
50,0.5788
100,0.3808
150,0.348
200,0.3984
250,0.3884
300,0.3475
350,0.3865
400,0.3248
450,0.2956
500,0.3455




lr_5e-5_bs_16_ep_1 -> validation accuracy: 0.8646


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
50,0.5257
100,0.3759
150,0.3609
200,0.3573
250,0.3153
300,0.3366




lr_5e-5_bs_32_ep_1 -> validation accuracy: 0.8642


Unnamed: 0,type,lr,batch_size,epochs,val_accuracy
0,batch_size,5e-05,8,1,0.87125
1,batch_size,5e-05,16,1,0.864583
2,batch_size,5e-05,32,1,0.864167


In [16]:
# Sweep the number of epochs while learning rate (5e-5) and batch size (16) stay fixed.
epoch_values = [1, 2]  # just 1 and 2 to keep runtime reasonable
results_ep = []

for ep in epoch_values:
    # Describe the run first, then fill in the accuracy it achieves.
    record = {"type": "epochs", "lr": 5e-5, "batch_size": 16, "epochs": ep}
    record["val_accuracy"] = run_experiment(
        learning_rate=record["lr"],
        batch_size=record["batch_size"],
        num_epochs=ep,
        run_name=f"lr_5e-5_bs_16_ep_{ep}",
    )
    results_ep.append(record)

df_ep = pd.DataFrame(results_ep)
df_ep


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
50,0.5788
100,0.3808
150,0.348
200,0.3984
250,0.3884
300,0.3475
350,0.3865
400,0.3248
450,0.2956
500,0.3455




lr_5e-5_bs_16_ep_1 -> validation accuracy: 0.8646


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
50,0.5788
100,0.3774
150,0.3363
200,0.3962
250,0.395
300,0.3589
350,0.4023
400,0.3206
450,0.3197
500,0.3924




lr_5e-5_bs_16_ep_2 -> validation accuracy: 0.8671


Unnamed: 0,type,lr,batch_size,epochs,val_accuracy
0,epochs,5e-05,16,1,0.864583
1,epochs,5e-05,16,2,0.867083


In [4]:
import pandas as pd

# Needs df_lr, df_bs and df_ep to exist in the current kernel session: the three
# sweep cells have to be run first, otherwise this cell raises NameError.
sweep_frames = [df_lr, df_bs, df_ep]
all_results = pd.concat(sweep_frames, ignore_index=True)

# Best configurations first.
all_results.sort_values(by="val_accuracy", ascending=False)


NameError: name 'df_lr' is not defined