## Project NLP | Business Case: Automated Customer Reviews
Review classification file. Using roberta-base (rb)

## Downloads

In [1]:
# The following downloads are needed to run this notebook
# !pip -q install transformers datasets
!pip -q install -U "transformers>=4.30" "accelerate>=0.21" datasets
import transformers, torch
print("transformers:", transformers.__version__, "| torch:", torch.__version__)

transformers: 4.55.4 | torch: 2.8.0+cu126


## Libraries

In [None]:
# The following imports should be done to run this notebook
import os, json, importlib
import pandas as pd
import numpy as np
import torch

from datetime import datetime
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    confusion_matrix,
)

import transformers
import transformers.training_args as ta
importlib.reload(transformers)
importlib.reload(ta)

from transformers.training_args import TrainingArguments
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback,  
)
from datasets import Dataset

# Turn off parallelism inside the tokenizers library for this session.
os.environ.update(TOKENIZERS_PARALLELISM="false")

# Report the installed transformers version and whether the legacy
# `evaluation_strategy` keyword is still part of TrainingArguments.__init__.
init_arg_names = TrainingArguments.__init__.__code__.co_varnames
has_legacy_eval_kw = "evaluation_strategy" in init_arg_names
print("transformers:", transformers.__version__)
print("`evaluation_strategy` in TrainingArguments? ", has_legacy_eval_kw)


transformers: 4.55.4
`evaluation_strategy` in TrainingArguments?  False


## Loading preprocessed data

In [3]:
# Mounting Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Paths to the artefacts written by the preprocessing notebook
BASE = "/content/drive/MyDrive/Project_NLP"
PATH_FULL   = os.path.join(BASE, "video_games_preprocessed.parquet")
PATH_SAMPLE = os.path.join(BASE, "video_games_preprocessed_sample1pct.parquet")
PATH_CFG    = os.path.join(BASE, "preprocess_config.json")

# Loading
# NOTE(review): `df` (the full dataset) is loaded but only the sample is used
# in the cells visible here — confirm it is still needed before paying the RAM cost.
df        = pd.read_parquet(PATH_FULL)
df_sample = pd.read_parquet(PATH_SAMPLE)
# Use a context manager so the config file handle is closed deterministically
# (the previous `json.load(open(...))` left it open until garbage collection).
with open(PATH_CFG, encoding="utf-8") as cfg_file:
    cfg = json.load(cfg_file)

# Work on a copy of the sample so the loaded frame stays untouched
df_use = df_sample.copy()

# Checking
print(" - sample: ", len(df_sample), "rows")
print(" - config keys:", list(cfg.keys()))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
 - sample:  46246 rows
 - config keys: ['text_col', 'label_col', 'label2id', 'max_length', 'class_counts', 'class_weights']


## Splitting data set


In [None]:
# Inspection only: record per-class counts and inverse-frequency weights
# for the working DataFrame.
# Name of the label column, as stored in the preprocessing config.
label_col = cfg["label_col"]
# Fixed class order used for every per-class report in this notebook.
order = ["negative", "neutral", "positive"]

def class_stats(series, order):
    """Return per-class counts and inverse-frequency weights.

    Args:
        series: pandas Series of class labels.
        order: list of class names; fixes the index order of both results.
            Classes absent from `series` get a count of 0.

    Returns:
        (counts, weights): two Series indexed by `order`. A class weight is
        total / (number_of_classes * class_count), as float.
    """
    per_class = series.value_counts().reindex(order, fill_value=0)
    n_total = per_class.sum()
    n_classes = len(order)
    inv_freq = n_total / (n_classes * per_class)  # inverse frequency
    return per_class, inv_freq.astype(float)

counts_use, weights_use = class_stats(series=df_use[label_col], order=order)

# Print both statistics as plain dicts, counts first
for heading, stat in (("counts", counts_use), ("class_weights", weights_use)):
    print(f"WORKING DF {heading}:\n", stat.to_dict())

WORKING DF counts:
 {'negative': 8394, 'neutral': 3401, 'positive': 34451}
WORKING DF class_weights:
 {'negative': 1.8364704947978714, 'neutral': 4.532588454376164, 'positive': 0.4474567743558484}


In [5]:
# Stratified 70/15/15 train/val/test split of the sample dataset
text_col, label_col = (cfg[key] for key in ("text_col", "label_col"))
order = ["negative", "neutral", "positive"]

# Step 1: keep 70% for training, hold out the remaining 30%
train_df, tmp_df = train_test_split(
    df_use,
    test_size=0.30,
    random_state=42,
    stratify=df_use[label_col],
)
# Step 2: cut the hold-out in half -> 15% validation, 15% test
val_df, test_df = train_test_split(
    tmp_df,
    test_size=0.50,
    random_state=42,
    stratify=tmp_df[label_col],
)

def pct(d):
    """Return each class's share of `d` in percent, rounded to 2 decimals.

    Reads the notebook-level `label_col` (column holding the class label) and
    `order` (class names fixing the index order; missing classes count as 0).
    """
    class_counts = d[label_col].value_counts().reindex(order, fill_value=0)
    share = class_counts / class_counts.sum()
    return (share * 100).round(2)

# Report split sizes, then the class distribution of each split
split_frames = {"train": train_df, "val": val_df, "test": test_df}
print("Sizes -> train/val/test:", *(len(frame) for frame in split_frames.values()))
for split_name, frame in split_frames.items():
    print(f"\nClass % in {split_name}:\n", pct(frame))

Sizes -> train/val/test: 32372 6937 6937

Class % in train:
 sentiment
negative    18.15
neutral      7.36
positive    74.49
Name: count, dtype: float64

Class % in val:
 sentiment
negative    18.15
neutral      7.35
positive    74.50
Name: count, dtype: float64

Class % in test:
 sentiment
negative    18.15
neutral      7.35
positive    74.50
Name: count, dtype: float64


As seen in preprocessing, the data set is heavily skewed toward the positive class, so we need to compensate for this imbalance in the training split before model training (done below with class weights).

Negative 18.15%, neutral 7.35%, positive 74.50%

## Tokenizing for roberta base

In [6]:
# Configuration for the second model, roberta-base
MODEL_NAME_rb  = "roberta-base"
SHORT_rb       = "rb"                     # <- shortname for this model
MAX_LENGTH     = 192                      # trying 192 instead of the cfg["max_length"] value (256)
label2id       = cfg["label2id"]
text_col       = cfg["text_col"]
label_col      = cfg["label_col"]
order          = ["negative", "neutral", "positive"]

# Recompute counts and inverse-frequency class weights from the TRAIN split only.
# This reuses class_stats() (defined in the inspection cell) instead of repeating
# its formula, so both places stay in agreement.
train_counts_rb, train_weights_series_rb = class_stats(train_df[label_col], order)
train_class_weights_rb = train_weights_series_rb.to_dict()
print("RB train counts:", train_counts_rb.to_dict())
print("RB train class weights:", train_class_weights_rb)


RB train counts: {'negative': 5876, 'neutral': 2381, 'positive': 24115}
RB train class weights: {'negative': 1.8363966417063762, 'neutral': 4.531989360212796, 'positive': 0.4474669984103946}


In [7]:
# Making data frames to keep only text and labels/sentiment
def prep_for_model(df_):
    """Reduce `df_` to the two columns the model needs.

    Reads the notebook-level `text_col`, `label_col` and `label2id`.

    Returns:
        A new DataFrame with columns ["text", "labels"]: the text column renamed
        to "text", and the label column mapped through `label2id` and cast to int64.
    """
    labels_int = df_[label_col].map(label2id).astype("int64")
    return (
        df_[[text_col]]
        .rename(columns={text_col: "text"})
        .assign(labels=labels_int)
    )

# Apply the same preparation to all three splits
train_p_rb, val_p_rb, test_p_rb = map(prep_for_model, (train_df, val_df, test_df))

# Shape check: each frame should have exactly two columns
print("train_rb", train_p_rb.shape)
print("val_rb", val_p_rb.shape)
print("test_rb", test_p_rb.shape)

train_rb (32372, 2)
val_rb (6937, 2)
test_rb (6937, 2)


In [8]:
# Convert the pandas frames to Hugging Face Datasets (should be more memory efficient).
# preserve_index=False keeps the pandas index out of the resulting columns.
ds_train_rb, ds_val_rb, ds_test_rb = (
    Dataset.from_pandas(frame, preserve_index=False)
    for frame in (train_p_rb, val_p_rb, test_p_rb)
)

print(ds_train_rb, ds_val_rb, ds_test_rb)

Dataset({
    features: ['text', 'labels'],
    num_rows: 32372
}) Dataset({
    features: ['text', 'labels'],
    num_rows: 6937
}) Dataset({
    features: ['text', 'labels'],
    num_rows: 6937
})


In [9]:
# Load the tokenizer that matches the model checkpoint (see the note below if RAM is tight)
tokenizer_rb = AutoTokenizer.from_pretrained(MODEL_NAME_rb)

def tok_fn_rb(batch):
    """Tokenize the "text" field of a batch with `tokenizer_rb`.

    Sequences are truncated to MAX_LENGTH tokens and left unpadded;
    padding is applied per batch later by the data collator.
    """
    encode_kwargs = dict(
        truncation=True,
        max_length=MAX_LENGTH,
        padding=False,                 # dynamic padding via collator
        return_attention_mask=True,
    )
    return tokenizer_rb(batch["text"], **encode_kwargs)

# NOTE: Tokenizing the FULL dataset is heavy in Colab.
# If you hit RAM/time issues, try first with a subset by uncommenting:
# ds_train_rb = ds_train_rb.select(range(200_000))  # e.g., first 200k for a dry run

# Tokenize every split in batches; the raw "text" column is dropped afterwards
map_kwargs = dict(batched=True, remove_columns=["text"])
ds_train_tok_rb = ds_train_rb.map(tok_fn_rb, **map_kwargs)
ds_val_tok_rb = ds_val_rb.map(tok_fn_rb, **map_kwargs)
ds_test_tok_rb = ds_test_rb.map(tok_fn_rb, **map_kwargs)

print("Tokenized (rb):", *map(len, (ds_train_tok_rb, ds_val_tok_rb, ds_test_tok_rb)))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/32372 [00:00<?, ? examples/s]

Map:   0%|          | 0/6937 [00:00<?, ? examples/s]

Map:   0%|          | 0/6937 [00:00<?, ? examples/s]

Tokenized (rb): 32372 6937 6937


In [10]:
# Have all three tokenized splits return PyTorch tensors
for tokenized_split in (ds_train_tok_rb, ds_val_tok_rb, ds_test_tok_rb):
    tokenized_split.set_format(type="torch")

# Collator that pads each batch to the length of its longest sequence
data_collator_rb = DataCollatorWithPadding(tokenizer=tokenizer_rb, padding="longest")

# Peek at one training example
ex = ds_train_tok_rb[0]
n_input_ids = len(ex["input_ids"])
print("RB example keys:", ex.keys(), "| input_ids length:", n_input_ids)

RB example keys: dict_keys(['labels', 'input_ids', 'attention_mask']) | input_ids length: 3


## Model training w/ roberta base

In [11]:
# Label maps (fixed order assumed by the class weights and reports below:
# 0=negative, 1=neutral, 2=positive)
id2label = {0: "negative", 1: "neutral", 2: "positive"}
label2id = {"negative": 0, "neutral": 1, "positive": 2}

# The integer labels in the datasets were produced with cfg["label2id"].
# If that mapping ever differs from the hard-coded one, every metric and report
# would be silently mislabeled, so fail loudly instead.
cfg_label2id = {name: int(idx) for name, idx in cfg["label2id"].items()}
if cfg_label2id != label2id:
    raise ValueError(
        f"cfg['label2id'] {cfg_label2id} does not match the label map used for the model {label2id}"
    )

# The classification head is newly (randomly) initialized by from_pretrained,
# so seed torch first to make the run repeatable. 42 matches the random_state
# used for the data splits.
torch.manual_seed(42)

# Model
model_rb = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME_rb,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

def compute_metrics_rb(eval_pred):
    """Compute accuracy and macro-averaged precision/recall/F1.

    Args:
        eval_pred: pair (logits, labels); the predicted class is the argmax
            over the last axis of `logits`.

    Returns:
        dict with keys "accuracy", "f1_macro", "precision_macro", "recall_macro".
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    precision, recall, f1, _support = precision_recall_fscore_support(
        labels,
        predicted,
        average="macro",
        zero_division=0,
    )
    return {
        "accuracy": accuracy_score(labels, predicted),
        "f1_macro": f1,
        "precision_macro": precision,
        "recall_macro": recall,
    }

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Class-weighted training to compensate for the imbalanced training set
label_order = ["negative", "neutral", "positive"]

# Start from the inverse-frequency weights of the train split ...
w = {name: train_class_weights_rb[name] for name in label_order}

# ... and boost the neutral class on top of that
w.update(neutral=w["neutral"] * 1.7)

# Tensor in label-id order (neg, neu, pos), as consumed by the loss
weight_tensor_rb = torch.tensor(list(map(w.get, label_order)), dtype=torch.float32)
print("class weights used (neg, neu, pos):", weight_tensor_rb.tolist())

class WeightedTrainerRB(Trainer):
    """Trainer whose loss is cross-entropy weighted by the notebook-level `weight_tensor_rb`."""

    # **kwargs absorbs extra arguments passed by newer Trainer versions
    # (e.g., num_items_in_batch) so the override keeps working.
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model on `inputs` and return the class-weighted cross-entropy loss.

        "labels" is removed from `inputs` before the forward pass, so the model
        does not compute its own loss. Returns `(loss, outputs)` when
        `return_outputs` is true, otherwise just `loss`.
        """
        # cross-entropy needs integer (long) targets; .long() is a no-op if they already are
        targets = inputs.pop("labels").long()

        outputs = model(**inputs)

        # move the class weights to wherever the logits live
        class_weights = weight_tensor_rb.to(outputs.logits.device)
        loss = torch.nn.functional.cross_entropy(
            outputs.logits, targets, weight=class_weights
        )

        if return_outputs:
            return loss, outputs
        return loss

class weights used (neg, neu, pos): [1.8363966941833496, 7.704381942749023, 0.44746699929237366]


In [13]:
# Setting training arguments
import importlib, transformers
importlib.reload(transformers)
from transformers.training_args import TrainingArguments

def make_training_args_rb(output_dir=f"runs_{SHORT_rb}"):
    """
    Build TrainingArguments with a compatibility shim for the evaluation keyword.

    The per-epoch evaluation setting is tried under both names it has had:
    - Attempt A: ``eval_strategy``       (current keyword)
    - Attempt B: ``evaluation_strategy`` (keyword used by older releases)
    Both attempts evaluate and save every epoch and reload the checkpoint with
    the best ``f1_macro`` at the end of training.
    - Attempt C: if neither keyword is accepted, fall back to a minimal
      configuration with no evaluation during training and no
      'best model at end' (avoids the eval/save mismatch error).

    Args:
        output_dir: directory for checkpoints (default: ``runs_<SHORT_rb>``).

    Returns:
        A ``TrainingArguments`` instance.
    """
    # ---- Base hyperparams (tuned for the SAMPLE split) ----
    base = dict(
        output_dir=output_dir,

        # Optimization
        learning_rate=2e-5,
        weight_decay=0.01,

        # Batching (effective train batch ≈ 16 * 2 = 32)
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        gradient_accumulation_steps=2,

        # Train length on SAMPLE
        num_train_epochs=3,

        # Logging
        logging_strategy="steps",
        logging_steps=100,

        # Mixed precision only when a CUDA GPU is present; requesting fp16
        # without one makes TrainingArguments raise instead of training.
        fp16=torch.cuda.is_available(),

        # Keep disk tidy & avoid external loggers by default
        save_total_limit=2,
        report_to="none",
    )

    # Settings shared by attempts A and B (eval and save strategy must MATCH)
    per_epoch = dict(
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        greater_is_better=True,
    )

    # ---- Attempt A: current keyword ----
    try:
        args = TrainingArguments(**base, eval_strategy="epoch", **per_epoch)
        print("[TrainingArguments] Using modern API (eval/save='epoch').")
        return args
    except TypeError as e:
        print("[TrainingArguments] Modern API not supported:", e)

    # ---- Attempt B: legacy keyword, same behavior ----
    try:
        args = TrainingArguments(**base, evaluation_strategy="epoch", **per_epoch)
        print("[TrainingArguments] Using legacy API (evaluation_strategy/save='epoch').")
        return args
    except TypeError as e:
        print("[TrainingArguments] Legacy API not supported:", e)

    # ---- Attempt C: minimal fallback (NO eval during training) ----
    # Remove modern-only fields and DISABLE 'best model at end'
    base_min = dict(base)
    base_min.pop("logging_strategy", None)   # some very old builds don't have this
    args = TrainingArguments(
        **base_min,
        # No evaluation during training in this fallback
        load_best_model_at_end=False,  # <- avoids the mismatch error
        # Don't set evaluation_strategy/save_strategy here
    )
    print("[TrainingArguments] Using minimal fallback (no eval during training; no best model at end).")
    return args

# Build the arguments, then check which attributes the resulting object exposes
args_rb = make_training_args_rb(output_dir=f"runs_{SHORT_rb}")

has_legacy_eval_attr = hasattr(args_rb, "evaluation_strategy")
best_model_flag = getattr(args_rb, "load_best_model_at_end", None)
print("Has 'evaluation_strategy':", has_legacy_eval_attr)
print("Has 'load_best_model_at_end':", best_model_flag)


[TrainingArguments] Using modern API (eval/save='epoch').
Has 'evaluation_strategy': False
Has 'load_best_model_at_end': True


In [None]:
# Training: assemble the trainer, then fine-tune
trainer_kwargs_rb = dict(
    model=model_rb,
    args=args_rb,                       # from make_training_args_rb()
    train_dataset=ds_train_tok_rb,
    # evaluated after every epoch when args_rb uses eval_strategy="epoch",
    # and whenever evaluate() is called without an explicit dataset
    eval_dataset=ds_val_tok_rb,
    processing_class=tokenizer_rb,      # replaces deprecated tokenizer=
    data_collator=data_collator_rb,
    compute_metrics=compute_metrics_rb,
)
trainer_rb = WeightedTrainerRB(**trainer_kwargs_rb)

train_result_rb = trainer_rb.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro
1,0.5854,0.607373,0.85844,0.721581,0.703093,0.765218
2,0.4995,0.626207,0.866657,0.728978,0.707207,0.771167
3,0.4185,0.719765,0.880208,0.742652,0.726964,0.77299


In [None]:
# Metrics on the validation and test splits
val_metrics_rb = trainer_rb.evaluate(eval_dataset=ds_val_tok_rb)
print("VAL (rb):", val_metrics_rb)

test_metrics_rb = trainer_rb.evaluate(eval_dataset=ds_test_tok_rb, metric_key_prefix="test")
print("TEST (rb):", test_metrics_rb)

# Per-example test predictions for the detailed report
pred = trainer_rb.predict(ds_test_tok_rb)
y_true = pred.label_ids
y_pred = np.argmax(pred.predictions, axis=-1)

sentiment_names = ["negative", "neutral", "positive"]
report_text = classification_report(
    y_true,
    y_pred,
    target_names=sentiment_names,
    digits=3,
)
print(report_text)

conf_mat = confusion_matrix(y_true, y_pred, labels=[0, 1, 2])
print("Confusion matrix (rows=true, cols=pred):\n", conf_mat)


VAL (rb): {'eval_loss': 0.7197648286819458, 'eval_accuracy': 0.8802075825284705, 'eval_f1_macro': 0.7426517164510611, 'eval_precision_macro': 0.7269643251303889, 'eval_recall_macro': 0.7729895765281852, 'eval_runtime': 15.4772, 'eval_samples_per_second': 448.207, 'eval_steps_per_second': 14.021, 'epoch': 3.0}
TEST (rb): {'test_loss': 0.7547275424003601, 'test_accuracy': 0.8784777281245495, 'test_f1_macro': 0.7413278015447077, 'test_precision_macro': 0.7251222581474663, 'test_recall_macro': 0.774336640652387, 'test_runtime': 15.6692, 'test_samples_per_second': 442.717, 'test_steps_per_second': 13.849, 'epoch': 3.0}
              precision    recall  f1-score   support

    negative      0.840     0.828     0.834      1259
     neutral      0.361     0.575     0.443       510
    positive      0.974     0.921     0.947      5168

    accuracy                          0.878      6937
   macro avg      0.725     0.774     0.741      6937
weighted avg      0.905     0.878     0.889      693

## Saving model

In [19]:
# Saving to Google Drive: one folder per run, named after the run tag
BASE_SAVE = "/content/drive/MyDrive/Project_NLP/models"
RUN_TAG   = "rb_sample_e3_lr2e-5_wNeutral1p7_len192"
SAVE_DIR  = "/".join((BASE_SAVE, RUN_TAG))

# NOTE(review): `import datetime` below rebinds the name `datetime` from the class
# (imported via `from datetime import datetime` in the Libraries cell) to the module.
import os, json, datetime, numpy as np
os.makedirs(SAVE_DIR, exist_ok=True)   # no error if the folder already exists
print("Saving to:", SAVE_DIR)


Saving to: /content/drive/MyDrive/Project_NLP/models/rb_sample_e3_lr2e-5_wNeutral1p7_len192


In [20]:
# Model + tokenizer: write both into SAVE_DIR so the folder can be reloaded on its own.
# NOTE(review): with load_best_model_at_end=True in the training args, the saved model
# is presumably the checkpoint with the best f1_macro — confirm.
trainer_rb.save_model(SAVE_DIR)
# Last expression of the cell: its return value (the written tokenizer file paths)
# is shown as the cell output.
tokenizer_rb.save_pretrained(SAVE_DIR)

('/content/drive/MyDrive/Project_NLP/models/rb_sample_e3_lr2e-5_wNeutral1p7_len192/tokenizer_config.json',
 '/content/drive/MyDrive/Project_NLP/models/rb_sample_e3_lr2e-5_wNeutral1p7_len192/special_tokens_map.json',
 '/content/drive/MyDrive/Project_NLP/models/rb_sample_e3_lr2e-5_wNeutral1p7_len192/vocab.json',
 '/content/drive/MyDrive/Project_NLP/models/rb_sample_e3_lr2e-5_wNeutral1p7_len192/merges.txt',
 '/content/drive/MyDrive/Project_NLP/models/rb_sample_e3_lr2e-5_wNeutral1p7_len192/added_tokens.json',
 '/content/drive/MyDrive/Project_NLP/models/rb_sample_e3_lr2e-5_wNeutral1p7_len192/tokenizer.json')

In [None]:
# Putting all diagnostics beside the model, but in a separate "runs" area
RUNS_BASE = "/content/drive/MyDrive/Project_NLP/runs"
RUN_DIR    = "/".join((RUNS_BASE, RUN_TAG))
os.makedirs(RUN_DIR, exist_ok=True)
print("Writing metrics & reports to:", RUN_DIR)

# 1) raw val/test metrics → JSON
metrics_payload = {
    "val":  val_metrics_rb,     # from trainer_rb.evaluate(ds_val_tok_rb)
    "test": test_metrics_rb,    # from trainer_rb.evaluate(ds_test_tok_rb, metric_key_prefix="test")
}
with open(f"{RUN_DIR}/metrics.json", "w") as f:
    json.dump(metrics_payload, f, indent=2)

# 2) classification report → CSV (one row per class / average)
label_order = ["negative", "neutral", "positive"]
rep_dict = classification_report(
    y_true,
    y_pred,
    target_names=label_order,
    output_dict=True,
    digits=3,
)
report_frame = pd.DataFrame(rep_dict).T
report_frame.to_csv(f"{RUN_DIR}/classification_report.csv")

# 3) confusion matrix → CSV (rows = true class, columns = predicted class)
cm = confusion_matrix(y_true, y_pred, labels=[0, 1, 2])
cm_frame = pd.DataFrame(
    cm,
    index=["true_" + label for label in label_order],
    columns=["pred_" + label for label in label_order],
)
cm_frame.to_csv(f"{RUN_DIR}/confusion_matrix.csv")

# 4) test-set predictions → CSV
preds_frame = pd.DataFrame({"y_true": y_true, "y_pred": y_pred})
preds_frame.to_csv(f"{RUN_DIR}/preds_test.csv", index=False)

# 5) compact run summary (to remember how it was trained) → JSON
summary = {
    "run_tag": RUN_TAG,
    "model_path": SAVE_DIR,
    "model_name": MODEL_NAME_rb,
    "max_length": MAX_LENGTH,
    "epochs": getattr(args_rb, "num_train_epochs", None),
    "learning_rate": getattr(args_rb, "learning_rate", None),
    "train_sizes": {
        "train": len(ds_train_tok_rb),
        "val":   len(ds_val_tok_rb),
        "test":  len(ds_test_tok_rb),
    },
    # if you used class weights:
    "class_weights_used": (
        [float(x) for x in weight_tensor_rb.tolist()]
        if "weight_tensor_rb" in globals() else None
    ),
    # The boost factor must match the one applied when building weight_tensor_rb
    # and encoded in RUN_TAG ("wNeutral1p7"); the note previously said ×1.6.
    "notes": "rb on SAMPLE; neutral boosted ×1.7",
}
with open(f"{RUN_DIR}/run_summary.json", "w") as f:
    json.dump(summary, f, indent=2)

print("Saved: metrics.json, classification_report.csv, confusion_matrix.csv, preds_test.csv, run_summary.json")



Writing metrics & reports to: /content/drive/MyDrive/Project_NLP/runs/rb_sample_e3_lr2e-5_wNeutral1p7_len192
Saved: metrics.json, classification_report.csv, confusion_matrix.csv, preds_test.csv, run_summary.json
