# Generate Star Labels (1–5)

This notebook generates star labels using `nlptown/bert-base-multilingual-uncased-sentiment` and writes `data/generate_stars/processed/reviews_with_stars.csv`. Cleaning was assessed as unnecessary for this dataset.

### Step 1 — Environment & imports

Ensure `pandas`, `bs4`, `transformers`, and `torch` are importable. If any import fails, install the missing package into the active `venv` and re-run this cell.

In [9]:
import importlib

# Probe every required package and report each result, so a single missing
# dependency does not hide the status of the others.
mods = ["pandas","bs4","transformers","torch"]
for pkg_name in mods:
    try:
        importlib.import_module(pkg_name)
    except Exception as err:
        print("FAIL", pkg_name, ":", err)
    else:
        print("OK", pkg_name)

from pathlib import Path
import pandas as pd

import sys
import torch
import os

# Summarise the interpreter and the torch/CUDA setup the kernel is running on.
has_cuda = torch.cuda.is_available()
env_report = [
    ("python:", sys.executable),
    ("torch:", torch.__version__),
    ("cuda_available:", has_cuda),
    ("cuda:", torch.version.cuda),
    ("gpu:", torch.cuda.get_device_name(0) if has_cuda else "CPU"),
]
for label, value in env_report:
    print(label, value)

OK pandas
OK bs4
OK transformers
OK torch
python: c:\Users\TARIK\Desktop\Charles Darwin University\4 - Year 1 - Semester 2\IT CODE FAIR\Data Science Challenge\venv\Scripts\python.exe
torch: 2.8.0+cu129
cuda_available: True
cuda: 12.9
gpu: NVIDIA GeForce RTX 5060 Laptop GPU


### Steps 2, 3 and 4 — Config, load data, and generate stars

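Define the model id, batch size, input and output paths, and random seed; then load the unified reviews, score each `comment` with the model, and write the result to CSV.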

In [10]:
import os
from pathlib import Path

# Disable the optional hf_transfer download backend. The variable is assigned
# before the transformers import because huggingface_hub appears to read it
# when it is first imported.
# NOTE(review): if transformers was already imported earlier in this kernel,
# this assignment may come too late to have any effect — confirm.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline

# Resolve project base directory (works from project root or notebooks/)
CWD = Path.cwd()
BASE_DIR = CWD if (CWD / "data" / "generate_stars" / "processed").exists() else CWD.parent
if not (BASE_DIR / "data" / "generate_stars" / "processed").exists():
    raise FileNotFoundError("Could not locate data/generate_stars/processed from current working directory")

class CFG:
    """Run configuration: model id, batch size, input/output paths and seed."""
    MODEL_ID = "nlptown/bert-base-multilingual-uncased-sentiment"
    BATCH_SIZE = 64  # GPU-enabled
    IN_PATH = BASE_DIR / "data" / "generate_stars" / "processed" / "reviews_unified.csv"
    OUT_WITH_STARS = BASE_DIR / "data" / "generate_stars" / "processed" / "reviews_with_stars.csv"
    SEED = 42

# Seed torch so the configured SEED is actually applied.
# NOTE(review): scoring is not expected to sample anything; this is a safeguard.
torch.manual_seed(CFG.SEED)

# Step 3 — Load data

df = pd.read_csv(CFG.IN_PATH)

# Fail fast, before the expensive inference pass, if the input cannot produce
# the columns that are written out at the end of this cell.
required_cols = ["source", "place", "comment"]
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
    raise KeyError(f"{CFG.IN_PATH} is missing required column(s): {missing_cols}")

# The tokenizer only accepts strings; surface missing comments with a clear
# message instead of an error raised from inside the pipeline.
n_missing_comments = int(df["comment"].isna().sum())
if n_missing_comments:
    raise ValueError(
        f"{n_missing_comments} row(s) in {CFG.IN_PATH} have a missing comment; "
        "drop or fill them before generating stars"
    )

# Step 4 — Generate stars (continuous + integer)
model = AutoModelForSequenceClassification.from_pretrained(CFG.MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(CFG.MODEL_ID)

device = 0 if torch.cuda.is_available() else -1
pipeline = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    task="sentiment-analysis",
    truncation=True,
    device=device,
)

label_to_int = {"1 star":1, "2 stars":2, "3 stars":3, "4 stars":4, "5 stars":5}


def _expected_stars(dist):
    """Return the score-weighted mean star value for one review's label scores."""
    return sum(label_to_int[d["label"]] * float(d["score"]) for d in dist)


comments = df["comment"].astype(str)

stars_float = []
stars = []
for i in range(0, len(comments), CFG.BATCH_SIZE):
    texts = comments.iloc[i:i+CFG.BATCH_SIZE].tolist()
    # top_k=None returns the score of every label for each text; it replaces
    # the deprecated return_all_scores=True.
    dists = pipeline(texts, batch_size=CFG.BATCH_SIZE, truncation=True, top_k=None)
    for dist in dists:
        ev = _expected_stars(dist)
        stars_float.append(ev)
        # Nearest whole star, clamped to the 1..5 range.
        stars.append(int(min(5, max(1, round(ev)))))

df_out = df.copy()
# Keep float (rounded to 1 decimal) and an internal integer for training/EDA.
# The Series are built on df's own index so the assignment is positional and
# does not depend on df having a default 0..n-1 index.
df_out["stars_float"] = pd.Series(stars_float, index=df.index, dtype="float64").round(1)
df_out["stars"] = pd.Series(stars, index=df.index, dtype=int)

# Persist only float ratings in the CSV
cols_to_save = ["source", "place", "comment", "stars_float"]
CFG.OUT_WITH_STARS.parent.mkdir(parents=True, exist_ok=True)
df_out[cols_to_save].to_csv(CFG.OUT_WITH_STARS, index=False)
print("Wrote", CFG.OUT_WITH_STARS)


Device set to use cuda:0


Wrote c:\Users\TARIK\Desktop\Charles Darwin University\4 - Year 1 - Semester 2\IT CODE FAIR\Data Science Challenge\data\generate_stars\processed\reviews_with_stars.csv


### Step 5 — Averages per place (descending)

Compute and display the average integer star rating (`stars`) and the review count per `place`, after normalizing variant place names to a single label, ordered high → low.

In [11]:
# Variant spellings of the same location, mapped to one canonical label.
# Places not listed here keep their original name.
PLACE_MAPPING = {
    'Kakadu National Park – Gunlom Falls': 'Kakadu',
    'Kakadu Gunlom Falls': 'Kakadu',
    'Nitmiluk (Katherine Gorge / Nitmiluk National Park)': 'Nitmiluk (Katherine Gorge)',
    'Tjoritja / West MacDonnell National Park': 'West MacDonnell National Park',
    'West MacDonnell – Ormiston Gorge': 'West MacDonnell National Park',
    'West MacDonnell Ormiston': 'West MacDonnell National Park',
}

# Display-only frame: df_out itself is left untouched, the canonical place
# label is added as an extra column on a new frame.
df_display = df_out.assign(
    place_normalized=df_out['place'].replace(PLACE_MAPPING)
)

# Mean integer rating and number of reviews per canonical place.
per_place = df_display.groupby("place_normalized").agg(
    avg_stars=("stars", "mean"),
    num_reviews=("stars", "size"),
)
agg = per_place.reset_index().rename(columns={'place_normalized': 'place'})

# Best-rated first; ties broken by the larger review count.
agg = agg.sort_values(by=["avg_stars", "num_reviews"], ascending=False)
print(agg.head(30))

                           place  avg_stars  num_reviews
5  West MacDonnell National Park   4.446086          677
3     Nitmiluk (Katherine Gorge)   4.361834          807
1   Devils Marbles (Karlu Karlu)   4.257895          380
2                         Kakadu   3.917115          929
0      Alice Springs Desert Park   3.869219         3265
4               Uluru-Kata Tjuta   3.628382         3474
