In [1]:
import ast
import gzip
from collections import defaultdict

import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

# Path of the gzip-compressed input file that is handed to readingJsonData.
# NOTE(review): this is an absolute, machine-specific location; the notebook
# only runs where the file exists at exactly this path.
DATA_PATH = "/Users/hetvigandhi/Desktop/australian_users_items.json.gz"


In [2]:
def readingJsonData(path):
    """
    Lazily read a gzip-compressed text file holding one Python-literal record per line.

    Each non-blank line is parsed with ``ast.literal_eval`` instead of ``eval``:
    the file is external data, and ``eval`` would execute any expression embedded
    in it. ``literal_eval`` accepts only literals (dicts, lists, strings,
    numbers, ...), so well-formed records parse to the same objects.

    Input:
        path: path to the gzip-compressed text file

    Output:
        generator yielding one parsed object per non-blank line, in file order

    Raises:
        ValueError or SyntaxError if a line is not a valid Python literal
    """
    # Pin the encoding so decoding does not depend on the platform's locale default.
    with gzip.open(path, 'rt', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line carries no record; skip it rather than fail to parse it.
                continue
            yield ast.literal_eval(line)

In [3]:
# Accumulators: one (user_id, item_id) tuple per owned game, plus a parallel label list.
X_pairs, y = [], []
num_users, num_items_total = 0, 0

for record in readingJsonData(DATA_PATH):
    num_users += 1
    uid = record["user_id"]
    owned = record.get("items", [])  # a record without "items" contributes no pairs
    num_items_total += len(owned)

    for entry in owned:
        pair = (uid, entry["item_id"])
        never_played = entry["playtime_forever"] == 0
        # Label 1 marks an owned game with zero recorded playtime, 0 otherwise.
        X_pairs.append(pair)
        y.append(1 if never_played else 0)

len(X_pairs), num_users, num_items_total


(5153209, 88310, 5153209)

In [4]:
# Convert the labels to an array so the class counts can be taken with boolean masks.
y = np.array(y)
total = len(y)

never_plays = (y == 1).sum()
plays = (y == 0).sum()

print("Total (user, game) pairs:", total)
print("Never plays (label 1):", never_plays)
print("Plays (label 0):", plays)
print("Fraction never plays:", never_plays / total)
print("Fraction plays:", plays / total)


Total (user, game) pairs: 5153209
Never plays (label 1): 1867963
Plays (label 0): 3285246
Fraction never plays: 0.36248539502279065
Fraction plays: 0.6375146049772094


In [5]:
def baselineNeverPlays(pairs):
    """
    Trivial baseline: predict 'never plays' (label = 1) for every (user, game) pair.

    Input:
        pairs: sized collection of (user_id, item_id) tuples; only its length
               is used, the contents are never inspected

    Output:
        1-D NumPy integer array of ones with one entry per pair
    """
    n_pairs = len(pairs)
    return np.full(n_pairs, 1, dtype=int)


In [6]:
# Score the always-'never plays' baseline against the labels of the full dataset.
y_pred_all = baselineNeverPlays(X_pairs)

print("Baseline performance on all data")
# Each metric is computed and printed in turn; zero_division=0 is passed to
# precision and recall exactly as before.
for metric_label, metric_fn, metric_kwargs in (
    ("Accuracy :", accuracy_score, {}),
    ("Precision:", precision_score, {"zero_division": 0}),
    ("Recall   :", recall_score, {"zero_division": 0}),
):
    print(metric_label, metric_fn(y, y_pred_all, **metric_kwargs))

Baseline performance on all data
Accuracy : 0.36248539502279065
Precision: 0.36248539502279065
Recall   : 1.0
