In [18]:
!pip install -q transformers datasets evaluate accelerate



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [19]:
import numpy as np
import pandas as pd

from datasets import load_dataset
import evaluate

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)


In [20]:
dataset_name = "guyhadad01/Hotels_reviews"

raw_datasets = load_dataset(dataset_name)
raw_datasets


DatasetDict({
    train: Dataset({
        features: ['date', 'rating', 'title', 'text', 'property_dict', 'Name', 'City', 'County'],
        num_rows: 21112546
    })
})

In [21]:
raw_datasets["train"][0]


{'date': '2019-01-01 00:00:00',
 'rating': 5.0,
 'title': 'Xmas holiday',
 'text': 'We went here with our kids for Xmas holiday and we really liked it. Large options of food for breakfast and lunch , you can really taste the quality of the food in there. The surrounding area is nice and clean. Good experience. Hardly recommended .',
 'property_dict': '{"service": null, "location": null, "sleep quality": null, "rooms": null, "cleanliness": null, "value": null, "check in / front desk": null, "business service (e.g., internet access)": null}',
 'Name': 'Baltic',
 'City': 'Giulianova Lido',
 'County': 'Italy'}

In [22]:
def map_rating_to_label(example):
    """
    Map 1-5 star rating to:
    0 = Negative (1-2 stars)
    1 = Neutral  (3 stars)
    2 = Positive (4-5 stars)
    """
    rating = example["rating"]
    # Some datasets store rating as float, some as int
    # Convert to int just in case
    r = int(round(rating))

    if r <= 2:
        label = 0
    elif r == 3:
        label = 1
    else:
        label = 2

    return {"label": label}

labeled_dataset = raw_datasets["train"].map(map_rating_to_label)
labeled_dataset


Dataset({
    features: ['date', 'rating', 'title', 'text', 'property_dict', 'Name', 'City', 'County', 'label'],
    num_rows: 21112546
})

In [23]:
# Shuffle and select a subset for experimentation
subset_size = 100_000  # adjust smaller/bigger based on your GPU/CPU

labeled_dataset = labeled_dataset.shuffle(seed=42).select(range(subset_size))
len(labeled_dataset)


100000

In [24]:
label_counts = pd.Series(labeled_dataset["label"]).value_counts().sort_index()
label_counts.index = ["Negative (0)", "Neutral (1)", "Positive (2)"]
label_counts


Negative (0)    10569
Neutral (1)     11598
Positive (2)    77833
Name: count, dtype: int64

In [25]:
from datasets import ClassLabel

# Define the label names (optional but nice for readability)
label_names = ["negative", "neutral", "positive"]

# Copy existing features and replace "label" with a ClassLabel feature
features = labeled_dataset.features.copy()
features["label"] = ClassLabel(num_classes=3, names=label_names)

# Cast the dataset
labeled_dataset = labeled_dataset.cast(features)

labeled_dataset


Dataset({
    features: ['date', 'rating', 'title', 'text', 'property_dict', 'Name', 'City', 'County', 'label'],
    num_rows: 100000
})

In [26]:
# Now we can do a stratified split by "label"
dataset_split = labeled_dataset.train_test_split(
    test_size=0.2,
    stratify_by_column="label"
)

train_dataset = dataset_split["train"]
val_dataset   = dataset_split["test"]

len(train_dataset), len(val_dataset)


(80000, 20000)

In [27]:
model_name = "distilbert-base-uncased"
num_labels = 3  # Negative, Neutral, Positive

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels
)

model


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [28]:
max_length = 256  # good default for reviews

def preprocess_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

encoded_train = train_dataset.map(preprocess_function, batched=True)
encoded_val   = val_dataset.map(preprocess_function, batched=True)

# Keep only columns needed for training
cols_to_keep = ["input_ids", "attention_mask", "label"]

encoded_train = encoded_train.remove_columns(
    [c for c in encoded_train.column_names if c not in cols_to_keep]
)
encoded_val = encoded_val.remove_columns(
    [c for c in encoded_val.column_names if c not in cols_to_keep]
)

encoded_train.set_format(type="torch")
encoded_val.set_format(type="torch")

encoded_train[0]


Map: 100%|██████████| 80000/80000 [00:15<00:00, 5250.04 examples/s]
Map: 100%|██████████| 20000/20000 [00:03<00:00, 5483.97 examples/s]


{'label': tensor(2),
 'input_ids': tensor([  101,  1996,  3309,  2003, 14057,  2135,  2284,  1999,  1996,  2103,
          2803,  1010,  2485,  2000,  2048,  1057,  1011, 17392,  3703,  1998,
          1037,  3181,  3295,  1006, 26520,  4918,  1007,  2073,  1996,  4068,
          2813,  4055,  2264,  1998,  2225,  4068,  1012,  2045,  2024,  2116,
          2248,  7884,  1999,  1996,  2181,  1012,  2116,  9941,  2024,  2485,
          2011,  1998,  4089,  7801,  2011,  3345,  1012,  2057,  2020, 15936,
          2011,  1996, 22445,  2282,  1998,  2019,  4866,  6350,  3659,  1006,
          2348,  2025,  2443,  1999,  2256,  2282, 23234,  1007,  1012,  1996,
          2392,  4624,  3095,  2001,  2200, 14044,  1999, 14669,  2149,  2005,
          4356,  1011,  3773,  7562,  2030,  2005,  6265,  1998,  4596,  1012,
           102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,   

In [29]:
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    result = accuracy_metric.compute(predictions=preds, references=labels)
    return result  # {"accuracy": ...}


Downloading builder script: 4.20kB [00:00, 440kB/s]


In [30]:
def run_experiment(learning_rate, batch_size, num_epochs, run_name):
    # fresh model each time (so experiments are comparable)
    exp_model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels
    )

    training_args = TrainingArguments(
        output_dir=f"./results/{run_name}",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        greater_is_better=True,
        report_to="none",  # don't send to wandb/tensorboard unless you want to
        run_name=run_name,
    )

    trainer = Trainer(
        model=exp_model,
        args=training_args,
        train_dataset=encoded_train,
        eval_dataset=encoded_val,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    eval_results = trainer.evaluate()
    val_acc = eval_results["eval_accuracy"]
    print(f"{run_name} -> validation accuracy: {val_acc:.4f}")
    return val_acc


In [31]:
learning_rates = [2e-5, 5e-5, 1e-4]
results_lr = []

for lr in learning_rates:
    run_name = f"lr_{lr}_bs_16_ep_3"
    acc = run_experiment(
        learning_rate=lr,
        batch_size=16,
        num_epochs=3,
        run_name=run_name,
    )
    results_lr.append({
        "type": "learning_rate",
        "lr": lr,
        "batch_size": 16,
        "epochs": 3,
        "val_accuracy": acc,
    })

df_lr = pd.DataFrame(results_lr)
df_lr


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [None]:
batch_sizes = [8, 16, 32]
results_bs = []

for bs in batch_sizes:
    run_name = f"lr_5e-5_bs_{bs}_ep_3"
    acc = run_experiment(
        learning_rate=5e-5,
        batch_size=bs,
        num_epochs=3,
        run_name=run_name,
    )
    results_bs.append({
        "type": "batch_size",
        "lr": 5e-5,
        "batch_size": bs,
        "epochs": 3,
        "val_accuracy": acc,
    })

df_bs = pd.DataFrame(results_bs)
df_bs


In [None]:
epoch_values = [2, 3, 4]
results_ep = []

for ep in epoch_values:
    run_name = f"lr_5e-5_bs_16_ep_{ep}"
    acc = run_experiment(
        learning_rate=5e-5,
        batch_size=16,
        num_epochs=ep,
        run_name=run_name,
    )
    results_ep.append({
        "type": "epochs",
        "lr": 5e-5,
        "batch_size": 16,
        "epochs": ep,
        "val_accuracy": acc,
    })

df_ep = pd.DataFrame(results_ep)
df_ep


In [None]:
all_results = pd.concat([df_lr, df_bs, df_ep], ignore_index=True)
all_results


In [None]:
all_results.sort_values("val_accuracy", ascending=False)
