## Project NLP | Business Case: Automated Customer Reviews
Review classification file. Using bert base uncased (bbu)

## Downloads

In [1]:
# The following downloads are needed to run this notebook
# !pip -q install transformers datasets
!pip -q install -U "transformers>=4.30" "accelerate>=0.21" datasets
import transformers, torch
print("transformers:", transformers.__version__, "| torch:", torch.__version__)

transformers: 4.55.4 | torch: 2.8.0+cu126


## Libraries

In [None]:
# The following imports should be done to run this notebook
import os, json, importlib
import pandas as pd
import numpy as np
import torch

from datetime import datetime
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    confusion_matrix,
)

import transformers
import transformers.training_args as ta
importlib.reload(transformers)
importlib.reload(ta)

from transformers.training_args import TrainingArguments
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback,   
)
from datasets import Dataset

os.environ["TOKENIZERS_PARALLELISM"] = "false"

print("transformers:", transformers.__version__)
print("`evaluation_strategy` in TrainingArguments? ",
      "evaluation_strategy" in TrainingArguments.__init__.__code__.co_varnames)


transformers: 4.55.4
`evaluation_strategy` in TrainingArguments?  False


## Loading preprocessed data

In [3]:
# Mounting Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Paths
BASE = "/content/drive/MyDrive/Project_NLP"
PATH_FULL   = os.path.join(BASE, "video_games_preprocessed.parquet")
PATH_SAMPLE = os.path.join(BASE, "video_games_preprocessed_sample1pct.parquet")
PATH_CFG    = os.path.join(BASE, "preprocess_config.json")

# Loading
df        = pd.read_parquet(PATH_FULL)
df_sample = pd.read_parquet(PATH_SAMPLE)
cfg       = json.load(open(PATH_CFG))

df_use = df_sample.copy()

# Checking
print(" - sample: ", len(df_sample), "rows")
print(" - config keys:", list(cfg.keys()))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
 - sample:  46246 rows
 - config keys: ['text_col', 'label_col', 'label2id', 'max_length', 'class_counts', 'class_weights']


# Splitting data set


In [None]:
# Just inspecting + record class stats
label_col = cfg["label_col"]
order = ["negative", "neutral", "positive"]

def class_stats(series, order):
    counts = series.value_counts().reindex(order, fill_value=0)
    weights = (counts.sum() / (len(order) * counts)).astype(float)  # inverse freq
    return counts, weights

counts_use, weights_use = class_stats(df_use[label_col], order)

print("WORKING DF counts:\n", counts_use.to_dict())
print("WORKING DF class_weights:\n", weights_use.to_dict())

WORKING DF counts:
 {'negative': 8394, 'neutral': 3401, 'positive': 34451}
WORKING DF class_weights:
 {'negative': 1.8364704947978714, 'neutral': 4.532588454376164, 'positive': 0.4474567743558484}


In [5]:
# train/val/test split on the sample dataset (70/15/15)
text_col  = cfg["text_col"]
label_col = cfg["label_col"]
order     = ["negative", "neutral", "positive"]

train_df, tmp_df = train_test_split(
    df_use, test_size=0.30, stratify=df_use[label_col], random_state=42
)
val_df, test_df = train_test_split(
    tmp_df, test_size=0.50, stratify=tmp_df[label_col], random_state=42
)

def pct(d):
    c = d[label_col].value_counts().reindex(order, fill_value=0)
    return (c / c.sum() * 100).round(2)

print("Sizes -> train/val/test:", len(train_df), len(val_df), len(test_df))
print("\nClass % in train:\n", pct(train_df))
print("\nClass % in val:\n",   pct(val_df))
print("\nClass % in test:\n",  pct(test_df))

Sizes -> train/val/test: 32372 6937 6937

Class % in train:
 sentiment
negative    18.15
neutral      7.36
positive    74.49
Name: count, dtype: float64

Class % in val:
 sentiment
negative    18.15
neutral      7.35
positive    74.50
Name: count, dtype: float64

Class % in test:
 sentiment
negative    18.15
neutral      7.35
positive    74.50
Name: count, dtype: float64


As seen in preprocessing, the data set was very positive balanced. So we should rebalance the training split before we can do model training.

Negative 18.15%, neutral 7.35%, positive 74.50%

## Tokenizing for bert base uncased

In [6]:
# Setting up config for second model bert base uncased
MODEL_NAME_bbu = "bert-base-uncased"
SHORT_bbu      = "bbu"                     # <- shortname for this model
MAX_LENGTH     = cfg["max_length"]
label2id       = cfg["label2id"]
text_col       = cfg["text_col"]
label_col      = cfg["label_col"]
order          = ["negative","neutral","positive"]

# recompute class weights from the TRAIN split (recommended)
train_counts_bbu = train_df[label_col].value_counts().reindex(order, fill_value=0)
train_class_weights_bbu = (train_counts_bbu.sum() / (len(order) * train_counts_bbu)).astype(float).to_dict()
print("BBU train counts:", train_counts_bbu.to_dict())
print("BBU train class weights:", train_class_weights_bbu)


BBU train counts: {'negative': 5876, 'neutral': 2381, 'positive': 24115}
BBU train class weights: {'negative': 1.8363966417063762, 'neutral': 4.531989360212796, 'positive': 0.4474669984103946}


In [7]:
# Making data frames to keep only text and labels/sentiment
def prep_for_model(df_):
    d = df_[[text_col, label_col]].copy()
    d["labels"] = d[label_col].map(label2id).astype("int64")
    d.rename(columns={text_col: "text"}, inplace=True)
    return d[["text","labels"]]

train_p_bbu = prep_for_model(train_df)
val_p_bbu   = prep_for_model(val_df)
test_p_bbu  = prep_for_model(test_df)

for name, d in [("train_bbu", train_p_bbu), ("val_bbu", val_p_bbu), ("test_bbu", test_p_bbu)]:
    print(name, d.shape)

train_bbu (32372, 2)
val_bbu (6937, 2)
test_bbu (6937, 2)


In [8]:
# Converting to datasets, should be more memory effiecient
ds_train_bbu = Dataset.from_pandas(train_p_bbu, preserve_index=False)
ds_val_bbu   = Dataset.from_pandas(val_p_bbu,   preserve_index=False)
ds_test_bbu  = Dataset.from_pandas(test_p_bbu,  preserve_index=False)

print(ds_train_bbu, ds_val_bbu, ds_test_bbu)

Dataset({
    features: ['text', 'labels'],
    num_rows: 32372
}) Dataset({
    features: ['text', 'labels'],
    num_rows: 6937
}) Dataset({
    features: ['text', 'labels'],
    num_rows: 6937
})


In [9]:
# Load tokenizer and tokenize, see note if ram issues
tokenizer_bbu = AutoTokenizer.from_pretrained(MODEL_NAME_bbu)
def tok_fn_bbu(batch):
    return tokenizer_bbu(
        batch["text"],
        truncation=True,
        max_length=MAX_LENGTH,
        padding=False,                 # dynamic padding via collator
        return_attention_mask=True,
    )

# NOTE: Tokenizing the FULL dataset is heavy in Colab.
# If you hit RAM/time issues, try first with a subset by uncommenting:
# ds_train_bbu = ds_train.select(range(200_000))  # e.g., first 200k for a dry run

ds_train_tok_bbu = ds_train_bbu.map(tok_fn_bbu, batched=True, remove_columns=["text"])
ds_val_tok_bbu   = ds_val_bbu.map(tok_fn_bbu,     batched=True, remove_columns=["text"])
ds_test_tok_bbu  = ds_test_bbu.map(tok_fn_bbu,    batched=True, remove_columns=["text"])

print("Tokenized (bbu):", len(ds_train_tok_bbu), len(ds_val_tok_bbu), len(ds_test_tok_bbu))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/32372 [00:00<?, ? examples/s]

Map:   0%|          | 0/6937 [00:00<?, ? examples/s]

Map:   0%|          | 0/6937 [00:00<?, ? examples/s]

Tokenized (bbu): 32372 6937 6937


In [10]:
# Setting PyTorch format
ds_train_tok_bbu.set_format(type="torch")
ds_val_tok_bbu.set_format(type="torch")
ds_test_tok_bbu.set_format(type="torch")

data_collator_bbu = DataCollatorWithPadding(tokenizer=tokenizer_bbu, padding="longest")

ex = ds_train_tok_bbu[0]
print("BBU example keys:", ex.keys(), "| input_ids length:", len(ex["input_ids"]))

BBU example keys: dict_keys(['labels', 'input_ids', 'token_type_ids', 'attention_mask']) | input_ids length: 3


## Model training w/ bert base uncased

In [11]:
# Label maps
id2label = {0: "negative", 1: "neutral", 2: "positive"}
label2id = {"negative": 0, "neutral": 1, "positive": 2}

# Model
model_bbu = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME_bbu,
    num_labels=3,
    id2label=id2label,
    label2id=label2id,
)

def compute_metrics_bbu(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    p_macro, r_macro, f1_macro, _ = precision_recall_fscore_support(
        labels, preds, average="macro", zero_division=0
    )
    return {"accuracy": acc, "f1_macro": f1_macro, "precision_macro": p_macro, "recall_macro": r_macro}

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Class weighted training to handle that the training test is imbalanced
label_order = ["negative", "neutral", "positive"]

# copy current class weights in the train split
w = {k: train_class_weights_bbu[k] for k in label_order}

# boost neutral
w["neutral"] *= 1.6

# checking weight used
weight_tensor_bbu = torch.tensor([w[k] for k in label_order], dtype=torch.float32)
print("class weights used (neg, neu, pos):", [float(x) for x in weight_tensor_bbu])

class WeightedTrainerBBU(Trainer):
    # accept extra kwargs from new Trainer (e.g., num_items_in_batch) to stay future-proof
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        # make sure labels are correct dtype for CE loss
        if labels.dtype != torch.long:
            labels = labels.long()

        outputs = model(**inputs)
        logits = outputs.logits

        loss_fct = torch.nn.CrossEntropyLoss(
            weight=weight_tensor_bbu.to(logits.device)
        )
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss

class weights used (neg, neu, pos): [1.8363966941833496, 7.251183032989502, 0.44746699929237366]


In [None]:
# Setting training arguments
import importlib, transformers
importlib.reload(transformers)
from transformers.training_args import TrainingArguments

def make_training_args_bbu(output_dir=f"runs_{SHORT_bbu}"):
    """
    Build TrainingArguments with a compatibility shim:
    - Attempt A: modern API (>=3.x): eval/save 'epoch' + best model at end
    - Attempt B: fallback (very old API): no eval during training,
      no 'best model at end' (avoids the mismatch error).
    """
    # ---- Base hyperparams (tuned for the SAMPLE split) ----
    base = dict(
        output_dir=output_dir,

        # Optimization
        learning_rate=1e-5,
        weight_decay=0.01,

        # Batching (effective train batch ≈ 16 * 2 = 32)
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        gradient_accumulation_steps=2,

        # Train length on SAMPLE
        num_train_epochs=3,

        # Logging
        logging_strategy="steps",
        logging_steps=100,

        # Mixed precision
        fp16=True,

        # Keep disk tidy & avoid external loggers by default
        save_total_limit=2,
        report_to="none",
    )

    # ---- Attempt A: modern API (eval/save must MATCH) ----
    try:
        args = TrainingArguments(
            **base,
            eval_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            metric_for_best_model="f1_macro",
            greater_is_better=True,
        )
        print("[TrainingArguments] Using modern API (eval/save='epoch').")
        return args
    except TypeError as e:
        print("[TrainingArguments] Modern API not supported:", e)

    # ---- Attempt B: minimal fallback (NO eval during training) ----
    # Remove modern-only fields and DISABLE 'best model at end'
    base_min = dict(base)
    base_min.pop("logging_strategy", None)   # some very old builds don't have this
    args = TrainingArguments(
        **base_min,
        # No evaluation during training in this fallback
        load_best_model_at_end=False,  # <- avoids the mismatch error
        # Don't set evaluation_strategy/save_strategy here
    )
    print("[TrainingArguments] Using minimal fallback (no eval during training; no best model at end).")
    return args

# Build args and quick sanity check
args_bbu = make_training_args_bbu(f"runs_{SHORT_bbu}")
print("Has 'evaluation_strategy':", hasattr(args_bbu, "evaluation_strategy"))
print("Has 'load_best_model_at_end':", getattr(args_bbu, "load_best_model_at_end", None))


[TrainingArguments] Using modern API (eval/save='epoch').
Has 'evaluation_strategy': False
Has 'load_best_model_at_end': True


In [None]:
# Training
trainer_bbu = WeightedTrainerBBU(
    model=model_bbu,
    args=args_bbu,                       # from builder
    train_dataset=ds_train_tok_bbu,
    eval_dataset=ds_val_tok_bbu,         # optional; used when you call evaluate()
    processing_class=tokenizer_bbu,      # replaces deprecated tokenizer=
    data_collator=data_collator_bbu,
    compute_metrics=compute_metrics_bbu,
)

train_result_bbu = trainer_bbu.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro
1,0.6084,0.584274,0.828744,0.699842,0.689986,0.760734
2,0.5442,0.600956,0.848494,0.715276,0.699518,0.766918
3,0.4672,0.685247,0.861612,0.719295,0.703248,0.755173


In [None]:
# Validation metrics
val_metrics_bbu  = trainer_bbu.evaluate(ds_val_tok_bbu)
print("VAL (bbu):",  val_metrics_bbu)

test_metrics_bbu = trainer_bbu.evaluate(ds_test_tok_bbu, metric_key_prefix="test")
print("TEST (bbu):", test_metrics_bbu)

pred = trainer_bbu.predict(ds_test_tok_bbu)
y_true = pred.label_ids
y_pred = pred.predictions.argmax(axis=-1)

print(classification_report(y_true, y_pred,
                            target_names=["negative","neutral","positive"],
                            digits=3))
print("Confusion matrix (rows=true, cols=pred):\n",
      confusion_matrix(y_true, y_pred, labels=[0,1,2]))


VAL (bbu): {'eval_loss': 0.6852468848228455, 'eval_accuracy': 0.8616116476863197, 'eval_f1_macro': 0.7192947531897212, 'eval_precision_macro': 0.7032482847930593, 'eval_recall_macro': 0.7551734938781332, 'eval_runtime': 22.417, 'eval_samples_per_second': 309.452, 'eval_steps_per_second': 9.68, 'epoch': 3.0}
TEST (bbu): {'test_loss': 0.6905398964881897, 'test_accuracy': 0.8601701023497189, 'test_f1_macro': 0.721122193910008, 'test_precision_macro': 0.7055493892927887, 'test_recall_macro': 0.7619163522466245, 'test_runtime': 22.8372, 'test_samples_per_second': 303.759, 'test_steps_per_second': 9.502, 'epoch': 3.0}
              precision    recall  f1-score   support

    negative      0.821     0.806     0.814      1259
     neutral      0.322     0.578     0.414       510
    positive      0.973     0.901     0.936      5168

    accuracy                          0.860      6937
   macro avg      0.706     0.762     0.721      6937
weighted avg      0.898     0.860     0.875      6937


## Saving model

In [19]:
# saving too google drive
BASE_SAVE = "/content/drive/MyDrive/Project_NLP/models"
RUN_TAG   = "bbu_sample_e3_lr1e-5_wNeutral1p3"
SAVE_DIR  = f"{BASE_SAVE}/{RUN_TAG}"

import os, json, datetime, numpy as np
os.makedirs(SAVE_DIR, exist_ok=True)
print("Saving to:", SAVE_DIR)


Saving to: /content/drive/MyDrive/Project_NLP/models/bbu_sample_e3_lr1e-5_wNeutral1p3


In [20]:
# Model + tokenizer
trainer_bbu.save_model(SAVE_DIR)
tokenizer_bbu.save_pretrained(SAVE_DIR)

('/content/drive/MyDrive/Project_NLP/models/bbu_sample_e3_lr1e-5_wNeutral1p3/tokenizer_config.json',
 '/content/drive/MyDrive/Project_NLP/models/bbu_sample_e3_lr1e-5_wNeutral1p3/special_tokens_map.json',
 '/content/drive/MyDrive/Project_NLP/models/bbu_sample_e3_lr1e-5_wNeutral1p3/vocab.txt',
 '/content/drive/MyDrive/Project_NLP/models/bbu_sample_e3_lr1e-5_wNeutral1p3/added_tokens.json',
 '/content/drive/MyDrive/Project_NLP/models/bbu_sample_e3_lr1e-5_wNeutral1p3/tokenizer.json')

In [None]:
# Puting all diagnostics beside model, but in a separate "runs" area
RUNS_BASE = "/content/drive/MyDrive/Project_NLP/runs"
RUN_DIR    = f"{RUNS_BASE}/{RUN_TAG}"
os.makedirs(RUN_DIR, exist_ok=True)
print("Writing metrics & reports to:", RUN_DIR)

# 1) raw val/test metrics → JSON
with open(f"{RUN_DIR}/metrics.json", "w") as f:
    json.dump(
        {
            "val":  val_metrics_bbu,     # from trainer_bbu.evaluate(ds_val_tok_bbu)
            "test": test_metrics_bbu,    # from trainer_bbu.evaluate(ds_test_tok_bbu, metric_key_prefix="test")
        },
        f,
        indent=2
    )

# 2) classification report → CSV
label_order = ["negative", "neutral", "positive"]
rep_dict = classification_report(
    y_true, y_pred, target_names=label_order, output_dict=True, digits=3
)
pd.DataFrame(rep_dict).T.to_csv(f"{RUN_DIR}/classification_report.csv")

# 3) confusion matrix → CSV
cm = confusion_matrix(y_true, y_pred, labels=[0, 1, 2])
pd.DataFrame(
    cm,
    index=[f"true_{l}" for l in label_order],
    columns=[f"pred_{l}" for l in label_order],
).to_csv(f"{RUN_DIR}/confusion_matrix.csv")

# 4) test-set predictions → CSV
pd.DataFrame({"y_true": y_true, "y_pred": y_pred}).to_csv(
    f"{RUN_DIR}/preds_test.csv", index=False
)

# 5) compact run summary (to remember how it was trained) → JSON
summary = {
    "run_tag": RUN_TAG,
    "model_path": SAVE_DIR,
    "model_name": MODEL_NAME_bbu,            
    "max_length": MAX_LENGTH,
    "epochs": getattr(args_bbu, "num_train_epochs", None),
    "learning_rate": getattr(args_bbu, "learning_rate", None),
    "train_sizes": {
        "train": len(ds_train_tok_bbu),
        "val":   len(ds_val_tok_bbu),
        "test":  len(ds_test_tok_bbu),
    },
    # if you used class weights:
    "class_weights_used": (
        [float(x) for x in weight_tensor_bbu.tolist()]
        if "weight_tensor_bbu" in globals() else None
    ),
    "notes": "BBU on SAMPLE; neutral boosted ×1.3",
}
with open(f"{RUN_DIR}/run_summary.json", "w") as f:
    json.dump(summary, f, indent=2)

print("Saved: metrics.json, classification_report.csv, confusion_matrix.csv, preds_test.csv, run_summary.json")



Writing metrics & reports to: /content/drive/MyDrive/Project_NLP/runs/bbu_sample_e3_lr1e-5_wNeutral1p3
Saved: metrics.json, classification_report.csv, confusion_matrix.csv, preds_test.csv, run_summary.json
