In [5]:
# Use the %pip magic (not !pip) so the packages are installed into the
# environment of the running kernel rather than whatever `pip` is on PATH.
%pip install -q datasets gensim scikit-learn tqdm


In [8]:
from datasets import load_dataset

# Try the Hub's default loader first. On any failure `ds` is set to None so
# the next cell can tell that it has to fall back to the raw CSV.
try:
    ds = load_dataset("HangenYuu/Steam_Games_Review")
except Exception as load_err:
    ds = None
    print("Direct load failed (known parquet schema issue). Will use raw CSV fallback.")
    print("Error:", type(load_err), load_err)
else:
    print("Loaded full dataset ✅")


Generating train split:   0%|          | 0/2302105 [00:00<?, ? examples/s]

Failed to read file '/root/.cache/huggingface/hub/datasets--HangenYuu--Steam_Games_Review/snapshots/1625b48294ea3b801cbf0472f9b109354ae49583/processed/developer_mapping.parquet' with error <class 'datasets.table.CastError'>: Couldn't cast
game_id: uint32
developer_id: uint32
to
{'developer_id': Value('uint32'), 'developer': Value('large_string')}
because column names don't match
ERROR:datasets.packaged_modules.parquet.parquet:Failed to read file '/root/.cache/huggingface/hub/datasets--HangenYuu--Steam_Games_Review/snapshots/1625b48294ea3b801cbf0472f9b109354ae49583/processed/developer_mapping.parquet' with error <class 'datasets.table.CastError'>: Couldn't cast
game_id: uint32
developer_id: uint32
to
{'developer_id': Value('uint32'), 'developer': Value('large_string')}
because column names don't match


Direct load failed (known parquet schema issue). Will use raw CSV fallback.
Error: <class 'datasets.exceptions.DatasetGenerationError'> An error occurred while generating the dataset


In [9]:
from huggingface_hub import hf_hub_download
from datasets import load_dataset

# Fallback path: only runs when the direct load left `ds` as None.
if ds is None:
    # Download the raw CSV file from the dataset repository.
    local_csv = hf_hub_download(
        repo_type="dataset",
        repo_id="HangenYuu/Steam_Games_Review",
        filename="raw/steam_game_reviews.csv",
    )
    print("Downloaded to:", local_csv)

    # Parse the CSV into a DatasetDict with a single "train" split.
    ds = load_dataset("csv", data_files={"train": local_csv})

# Quick inspection: overall structure, size, schema and the first record.
print(ds)
print("Rows:", ds["train"].num_rows)
print("Columns:", ds["train"].column_names)
ds["train"][0]


raw/steam_game_reviews.csv:   0%|          | 0.00/476M [00:00<?, ?B/s]

Downloaded to: /root/.cache/huggingface/hub/datasets--HangenYuu--Steam_Games_Review/snapshots/1625b48294ea3b801cbf0472f9b109354ae49583/raw/steam_game_reviews.csv


Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['review', 'hours_played', 'helpful', 'funny', 'recommendation', 'date', 'game_name', 'username'],
        num_rows: 992153
    })
})
Rows: 992153
Columns: ['review', 'hours_played', 'helpful', 'funny', 'recommendation', 'date', 'game_name', 'username']


{'review': "The game itself is also super fun. The PvP and the campaign are a joy to play. Your actions feel deliberate, almost as if you're controlling an Angel in a 1 Ton metal suit. I love it.-------------------------------It's sad that I have to say this, but it's wonderfully refreshing to be able to boot up a game, play for an hour or two, and close it again. Rather than boot it up, do my dailies, check my weeklies, check my season pass etc.You boot the game up, and you play it. It doesn't try to open your wallet at any point.While this should not be something to be praised in the year 2024, it is refreshing to see.",
 'hours_played': '39.9',
 'helpful': '1,152',
 'funny': '13',
 'recommendation': 'Recommended',
 'date': '14 September',
 'game_name': 'Warhammer 40,000: Space Marine 2',
 'username': 'Sentinowl\n224 products in account'}

In [12]:
# Cell 4 — Column names for THIS dataset, chosen by hand
text_col, label_col = "review", "recommendation"

print("Using text_col:", text_col)
print("Using label_col:", label_col)

# Sanity check: show the raw label values of the first five rows
print("Example label values:", ds["train"].select(range(5))[label_col])


Using text_col: review
Using label_col: recommendation
Example label values: Column(['Recommended', 'Recommended', 'Recommended', 'Recommended', 'Recommended'])


In [14]:
import random
# Sampling configuration: cap on the working-set size and the RNG seed.
SAMPLE_N, SEED = 60000, 42
random.seed(SEED)

train_ds = ds["train"]

# Start from the full split; replace it with a random subset of SAMPLE_N rows
# only when the split is larger than that.
work_ds = train_ds
if len(train_ds) > SAMPLE_N:
    idx = random.sample(range(len(train_ds)), SAMPLE_N)
    work_ds = train_ds.select(idx)

print("Working rows:", len(work_ds))


Working rows: 60000


In [15]:
# Cell 6 — Clean/tokenize + normalize 'recommendation' to 0/1
import re

def clean_text(t):
    """Lower-case `t` and collapse every run of whitespace into one space.

    `None` yields the empty string; any other value is converted with `str()`
    first. Leading and trailing whitespace is removed from the result.
    """
    if t is None:
        return ""
    lowered = str(t).lower()
    return re.sub(r"\s+", " ", lowered).strip()

def tokenize(t):
    """Split `t` into lower-case tokens.

    The text is first normalized with `clean_text`. Every character other than
    a-z, 0-9, whitespace and the apostrophe is then replaced by a space, and the
    result is split on whitespace. Returns an empty list when nothing is left.
    """
    normalized = clean_text(t)
    # Punctuation and any non-ASCII characters are turned into separators.
    separated = re.sub(r"[^a-z0-9\s']", " ", normalized)
    collapsed = re.sub(r"\s+", " ", separated).strip()
    if not collapsed:
        return []
    return collapsed.split()

def normalize_recommendation(val):
    """
    Converts the dataset's 'recommendation' field into 0/1.
    Common formats in Steam datasets:
      - "Recommended" / "Not Recommended"
      - True / False
      - 1 / 0

    Returns:
        1 for a positive recommendation, 0 for a negative one, and None when
        the value is missing (None or NaN) or in an unrecognized format, so
        that callers can filter such rows out.
    """
    if val is None:
        return None
    # bool is checked before int because bool is a subclass of int.
    if isinstance(val, bool):
        return int(val)
    if isinstance(val, (int, float)):
        # NaN compares False against everything, so `val > 0` would silently
        # label a missing value as 0. `val != val` is only true for NaN.
        if val != val:
            return None
        return int(val > 0)

    s = str(val).strip().lower()
    if s in {"recommended", "recommend", "yes", "true", "1", "thumbs up", "up"}:
        return 1
    if s in {"not recommended", "no", "false", "0", "thumbs down", "down"}:
        return 0

    return None  # unknown format

def extract_row(row):
    """Build the model inputs for one dataset row.

    Reads the text and label columns named by the module-level `text_col` and
    `label_col`, and returns a dict with the tokenized text under "tokens" and
    the normalized 0/1 label (or None when unrecognized) under "y".
    """
    y = normalize_recommendation(row.get(label_col, None))
    return {"tokens": tokenize(row.get(text_col, "")), "y": y}

# Tokenize every row and attach its 0/1 label, then keep only rows that have
# a recognized label and at least one token.
mapped = work_ds.map(extract_row).filter(
    lambda r: not (r["y"] is None or len(r["tokens"]) == 0)
)

print("After filtering:", len(mapped))
print("Label distribution sample:", sum(mapped["y"]) / len(mapped["y"]))
mapped[0]


Map:   0%|          | 0/60000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/60000 [00:00<?, ? examples/s]

After filtering: 59861
Label distribution sample: 0.8114298123987237


{'review': '2020 Early Access Review epic destruction',
 'hours_played': '17.8',
 'helpful': '1',
 'funny': '0',
 'recommendation': 'Recommended',
 'date': 'December 1, 2020',
 'game_name': 'Teardown',
 'username': 'Jimmy\n112 products in account',
 'tokens': ['2020', 'early', 'access', 'review', 'epic', 'destruction'],
 'y': 1}

In [17]:
# Cell 7 — Train/test split (convert HF columns -> plain Python lists first)
from sklearn.model_selection import train_test_split

# Materialize the HF columns as plain Python lists for scikit-learn.
X_tokens = [*mapped["tokens"]]
y = [*mapped["y"]]

# 80/20 split, stratified on the label so both parts keep the same class balance.
X_train_tok, X_test_tok, y_train, y_test = train_test_split(
    X_tokens, y, stratify=y, test_size=0.2, random_state=SEED
)

print("Train size:", len(X_train_tok))
print("Test size :", len(X_test_tok))
print("Positive rate (train):", sum(y_train) / len(y_train))
print("Positive rate (test) :", sum(y_test) / len(y_test))


Train size: 47888
Test size : 11973
Positive rate (train): 0.8114350150350819
Positive rate (test) : 0.811409003591414


In [18]:
from gensim.models import Word2Vec

# Train Word2Vec embeddings (sg=1 selects skip-gram) on the training tokens
# only, so that no test-set text influences the features.
w2v = Word2Vec(
    sentences=X_train_tok,
    vector_size=100,
    window=5,
    min_count=2,   # ignore words seen fewer than 2 times
    workers=4,
    sg=1,
    # Reuse the notebook-wide SEED, as the sampling and split cells do.
    # NOTE: with workers > 1 thread scheduling still introduces run-to-run
    # jitter; set workers=1 (and a fixed PYTHONHASHSEED) if bit-for-bit
    # reproducibility is required.
    seed=SEED,
)

print("Vocab size:", len(w2v.wv))


Vocab size: 29685


In [19]:
import numpy as np

def sentence_vector(tokens):
    """Return the mean Word2Vec vector of `tokens` as a float32 array.

    Tokens that are not in the trained vocabulary of the module-level `w2v`
    model are skipped. When no token is known (or `tokens` is empty), a zero
    vector of length `w2v.vector_size` is returned instead.
    """
    keyed_vectors = w2v.wv
    known = [keyed_vectors[tok] for tok in tokens if tok in keyed_vectors]
    if len(known) == 0:
        return np.zeros(w2v.vector_size, dtype=np.float32)
    return np.mean(known, axis=0).astype(np.float32)

# One averaged embedding per review, stacked row-wise into a 2-D matrix.
X_train = np.vstack(list(map(sentence_vector, X_train_tok)))
X_test = np.vstack(list(map(sentence_vector, X_test_tok)))

X_train.shape, X_test.shape


((47888, 100), (11973, 100))

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fit a logistic-regression classifier on the averaged embeddings
# (fit() returns the estimator itself, so the call can be chained).
clf = LogisticRegression(max_iter=2000).fit(X_train, y_train)

# Score on the held-out split.
pred = clf.predict(X_test)
acc = accuracy_score(y_test, pred)

print("Accuracy:", acc)
print("\nConfusion matrix:\n", confusion_matrix(y_test, pred))
print("\nReport:\n", classification_report(y_test, pred, digits=4))


Accuracy: 0.8751357220412594

Confusion matrix:
 [[1145 1113]
 [ 382 9333]]

Report:
               precision    recall  f1-score   support

           0     0.7498    0.5071    0.6050      2258
           1     0.8935    0.9607    0.9258      9715

    accuracy                         0.8751     11973
   macro avg     0.8216    0.7339    0.7654     11973
weighted avg     0.8664    0.8751    0.8653     11973

