In [1]:
import ast
import gzip
import os

import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Path to the gzipped users/items dump read by the loading cell.
# Set the AUSTRALIAN_USERS_ITEMS_PATH environment variable to point the notebook
# at a copy of the file on another machine; the original location stays the
# default so existing runs behave exactly as before.
DATA_PATH = os.environ.get(
    "AUSTRALIAN_USERS_ITEMS_PATH",
    "/Users/hetvigandhi/Desktop/australian_users_items.json.gz",
)


In [2]:
def readingJsonData(path):
    """Lazily yield one parsed record per non-blank line of a gzipped text file.

    Each line is expected to hold a single Python literal (for example a dict
    written with single-quoted keys, which is why ``json.loads`` is not used).

    Parameters
    ----------
    path : str or path-like
        Location of the gzip-compressed file.

    Yields
    ------
    object
        The value described by each non-blank line, in file order.

    Raises
    ------
    ValueError, SyntaxError
        If a line is not a valid Python literal.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the locale.
    with gzip.open(path, 'rt', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # Skip blank / whitespace-only lines.
                continue
            # SECURITY: the original used eval(), which executes arbitrary code
            # embedded in the data file. literal_eval only accepts literals
            # (dict, list, str, numbers, ...) and rejects everything else.
            yield ast.literal_eval(line)

In [3]:
# One entry per purchased game: X_pairs holds (user_id, item_id) and y holds
# the matching label (1 = playtime_forever is 0, 0 = otherwise).
X_pairs, y = [], []

# Running totals reported below.
num_users = 0
num_items_total = 0

for user_data in readingJsonData(DATA_PATH):
    user_id = user_data["user_id"]
    # Records without an "items" key are treated as having no items.
    items = user_data.get("items", [])

    num_users += 1
    num_items_total += len(items)

    for item in items:
        pair = (user_id, item["item_id"])
        never_played = item["playtime_forever"] == 0

        X_pairs.append(pair)
        y.append(1 if never_played else 0)

print(f"Total purchased games (user-game pairs): {len(X_pairs):,}")
print(f"Number of users (num_users): {num_users:,}")
print(f"Total number of items across all users (num_items_total): {num_items_total:,}")


Total purchased games (user-game pairs): 5,153,209
Number of users (num_users): 88,310
Total number of items across all users (num_items_total): 5,153,209


In [4]:
# Turn the label list into an array so element-wise comparisons are available.
y = np.array(y)

# Class counts: label 1 = bought but never played, label 0 = bought and played.
n_examples = len(y)
never_plays = (y == 1).sum()
plays = (y == 0).sum()

# Report each class as an absolute count and as a fraction of all examples.
print(f"Total examples: {n_examples:,}")
print(f"Buys but never plays: {never_plays:,} ({never_plays / n_examples:.3f})")
print(f"Buys and plays         : {plays:,} ({plays / n_examples:.3f})")

Total examples: 5,153,209
Buys but never plays: 1,867,963 (0.362)
Buys and plays         : 3,285,246 (0.638)


In [5]:
def baselineNeverPlays(pairs):
    """
    Constant baseline: predict 'never plays' (label = 1) for every pair.

    Input:
        pairs: sequence of (user_id, item_id) tuples; only its length is used,
               the contents are never inspected.

    Output:
        1-D NumPy integer array with one entry per pair, every entry equal to 1
    """
    n_pairs = len(pairs)
    return np.full(n_pairs, 1, dtype=int)


In [6]:
# Score the constant "never plays" baseline against every labelled pair.
y_pred_all = baselineNeverPlays(X_pairs)

# Compute all metrics first, then report them together.
# zero_division=0 makes an undefined precision/recall/F1 come out as 0.
baseline_accuracy = accuracy_score(y, y_pred_all)
baseline_precision = precision_score(y, y_pred_all, zero_division=0)
baseline_recall = recall_score(y, y_pred_all, zero_division=0)
baseline_f1 = f1_score(y, y_pred_all, zero_division=0)

print("=== Baseline Performance on All Data ===")
print(f"Accuracy : {baseline_accuracy:.4f}")
print(f"Precision: {baseline_precision:.4f}")
print(f"Recall   : {baseline_recall:.4f}")
print(f"F1-score : {baseline_f1:.4f}")

=== Baseline Performance on All Data ===
Accuracy : 0.3625
Precision: 0.3625
Recall   : 1.0000
F1-score : 0.5321
