In [232]:
import ast

# rows_review_v1 = []
# with open("australian_user_reviews.json") as f:
#     for line in f:
#         line = line.strip()
#         if line:
#             rows_review_v1.append(ast.literal_eval(line))

rows_ui_v1 =[]
with open("australian_users_items.json", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if line:
            rows_ui_v1.append(ast.literal_eval(line))

In [233]:
# len(rows_review_v1), len(rows_ui_v1)

In [234]:
pip install gensim

Note: you may need to restart the kernel to use updated packages.


In [235]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import gc

In [349]:
ui_rows = []
item_name_lookup = {}

for user in rows_ui_v1:
    uid = user["user_id"]

    for item in user["items"]:
        item_id = item["item_id"]
        item_name = item["item_name"]

        ui_rows.append({
            "user_id": uid,
            "item_id": item_id,
            "item_name": item_name,
            "playtime": item["playtime_forever"],
            "purchase": 1
        })

        item_name_lookup[item_id] = item_name

df_ui = pd.DataFrame(ui_rows)

In [350]:
# Label: never played
df_ui["never_play"] = (df_ui["playtime"] == 0).astype(int)

## ** Could probably build some EDA with df_ui later

### ex. avg games each user bought,...

In [352]:
# Build Negative Samples
all_items = set(df_ui["item_id"].unique())

neg_samples = []
random.seed(42)

for user, group in tqdm(df_ui.groupby("user_id")):
  purchased = set(group["item_id"])
  not_bought = list(all_items - purchased)

  # Sample up to 20 negative items per user
  sample_count = min(20, len(not_bought))
  sampled_items = random.sample(not_bought, sample_count)

  for item in sampled_items:
    neg_samples.append({
        "user_id": user,
        "item_id": item,
        "item_name": item_name_lookup[item],
        "playtime": 0,
        "purchase": 0,
        "never_play": 1
    })

df_neg = pd.DataFrame(neg_samples)

df_neg

100%|██████████████████████████████████████████████████████████████████████████| 70912/70912 [00:29<00:00, 2438.78it/s]


Unnamed: 0,user_id,item_id,item_name,playtime,purchase,never_play
0,--000--,331460,Rooms: The Unsolvable Puzzle,0,0,1
1,--000--,214420,Gear Up,0,0,1
2,--000--,351720,Solar Division,0,0,1
3,--000--,296220,Farm Frenzy 4,0,0,1
4,--000--,386750,Cashtronauts,0,0,1
...,...,...,...,...,...,...
1418235,zzzmidmiss,366620,Broken Bots,0,0,1
1418236,zzzmidmiss,389680,Luxury Hotel Emporium,0,0,1
1418237,zzzmidmiss,369310,Monstro: Battle Tactics,0,0,1
1418238,zzzmidmiss,270950,Ski Region Simulator,0,0,1


In [353]:
# Merge pos and neg datasets together
df_full = pd.concat([df_ui, df_neg], ignore_index = True)

df_full["target_buy"] = df_full["purchase"]
df_full["target_never"] = df_full["never_play"]

df_full

Unnamed: 0,user_id,item_id,item_name,playtime,purchase,never_play,target_buy,target_never
0,76561197970982479,10,Counter-Strike,6,1,0,1,0
1,76561197970982479,20,Team Fortress Classic,0,1,1,1,1
2,76561197970982479,30,Day of Defeat,7,1,0,1,0
3,76561197970982479,40,Deathmatch Classic,0,1,1,1,1
4,76561197970982479,50,Half-Life: Opposing Force,0,1,1,1,1
...,...,...,...,...,...,...,...,...
6571444,zzzmidmiss,366620,Broken Bots,0,0,1,0,1
6571445,zzzmidmiss,389680,Luxury Hotel Emporium,0,0,1,0,1
6571446,zzzmidmiss,369310,Monstro: Battle Tactics,0,0,1,0,1
6571447,zzzmidmiss,270950,Ski Region Simulator,0,0,1,0,1


‼ Figure out later why only create sequences for Purchased only

In [355]:
# Build User Sequences for Item2Vec --> For Only Purchased Items

user_sequences = (
    df_ui[df_ui["purchase"] == 1]
    .groupby("user_id")["item_id"]
    .apply(list)
    .tolist()

)

In [356]:
# Train Item2Vec

item2vec = Word2Vec(
    sentences=user_sequences,
    vector_size=64,
    window=10,
    min_count=1,
    workers=4,
    sg=1,
)

# Each item_id now gets a embedding vector of length 64

In [357]:
# how many items got vectors
print(len(item2vec.wv))
print(df_ui['item_id'].nunique())

# should be 64
print(item2vec.wv.vector_size)              
print(item2vec.wv.most_similar(0))          

10978
10978
64
[('730', 0.8291909694671631), ('202970', 0.765162467956543), ('24240', 0.7533947229385376), ('202990', 0.7440153956413269), ('206210', 0.7254487872123718), ('55230', 0.7180371284484863), ('104700', 0.7134971022605896), ('212910', 0.710774302482605), ('200710', 0.6966071128845215), ('42690', 0.6958034634590149)]


In [358]:
item_vecs = {item: item2vec.wv[item] for item in item2vec.wv.index_to_key}

In [359]:
df_full["item_vec"] = df_full["item_id"].map(item_vecs)
df_full = df_full[df_full["item_vec"].notna()]

In [360]:
# Expand embedding columns
for i in range(64):
    df_full[f"v{i}"] = df_full["item_vec"].apply(lambda x: x[i])

In [361]:
%%time

# Add extra features
item_pop = df_ui.groupby("item_id")["playtime"].count()
user_never_rate = df_ui.groupby("user_id")["never_play"].mean()

df_full["item_popularity"] = df_full["item_id"].map(item_pop)
df_full["user_never_rate"] = df_full["user_id"].map(user_never_rate)

# Fill NaNs (should be few)
df_full["item_popularity"] = df_full["item_popularity"].fillna(0)
df_full["user_never_rate"] = df_full["user_never_rate"].fillna(0)

# # fraction of a user's games that have been played in the last 2 week
# user_recent_games_frac = (
#     (df_ui["playtime_2weeks"] > 0)
#         .groupby(df_ui["user_id"])
#         .mean()
# )
# df_full["user_recent_games_frac"] = df_full["user_id"].map(user_recent_games_frac)

CPU times: total: 1.28 s
Wall time: 1.28 s


In [362]:
# # free resources
# del df_neg, item2vec

In [363]:
%%time

# Train Stage 1 Model --> Predicting if user will buy the game or not
features = [f"v{i}" for i in range(64)] + ["item_popularity", "user_never_rate"]

# QUESTION: MODEL DIDN'T USE item_name at all???

X = df_full[features].values
y = df_full["target_buy"].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify = y
)

clf_buy = LogisticRegression(max_iter=2500)
clf_buy.fit(X_train, y_train)

print("\n===== Stage 1: Buy Prediction Model =====")
pred = clf_buy.predict(X_test)
print("Accuracy:", accuracy_score(y_test, pred))
# print("F1:", f1_score(y_test, pred))
# print(confusion_matrix(y_test, pred))
print(f"num iter: {clf_buy.n_iter_}")


===== Stage 1: Buy Prediction Model =====
Accuracy: 0.8899862283057772
num iter: [655]
CPU times: total: 14min 17s
Wall time: 2min 44s


In [364]:
%%time

# Train Stage 2 Model --> Predict if user will never play the game if buy

purchased_df = df_full[df_full["purchase"] == 1]

X2 = purchased_df[features].values
y2 = purchased_df["target_never"].values

# Split inside the purchased buyers only
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2, test_size=0.2, random_state=42, stratify=y2
)

clf_never = LogisticRegression(max_iter=6000)
clf_never.fit(X2_train, y2_train)

print("\n===== Stage 2: NEVER PLAY MODEL =====")
pred2 = clf_never.predict(X2_test)
print("Accuracy:", accuracy_score(y2_test, pred2))
# print("F1:", f1_score(y2_test, pred2))
# print(confusion_matrix(y2_test, pred2))
print(f"num iter: {clf_never.n_iter_}")


===== Stage 2: NEVER PLAY MODEL =====
Accuracy: 0.7384698081390046
num iter: [1923]
CPU times: total: 31min 7s
Wall time: 5min 49s


In [365]:
# Final Prediction Function

def predict_buy_and_never_play(user_id, item_id):
    """
    Returns probability that the user will:
    (1) Buy the item
    (2) Never play it after buying
    """
    if item_id not in item_vecs:
        return None  # unknown item

    vec = item_vecs[item_id]
    pop = item_pop.get(item_id, 0)
    unr = user_never_rate.get(user_id, 0)

    feats = np.array(list(vec) + [pop, unr]).reshape(1, -1)

    p_buy = clf_buy.predict_proba(feats)[0, 1]
    p_never = clf_never.predict_proba(feats)[0, 1]

    return p_buy * p_never

print("\nExample prediction:")
print(predict_buy_and_never_play(df_ui.iloc[0]['user_id'],
                                 df_ui.iloc[0]['item_id']))


Example prediction:
0.25856653063788826


In [366]:
test_data = df_full[["user_id", "item_id", "target_buy", "target_never"]].copy()

test_data["buy_and_never"] = test_data["target_buy"] * test_data["target_never"]


In [367]:
# # Step 1 — take only 20% of test_data
# test_sample = test_data.sample(frac=0.10, random_state=42)

# # Step 2 — shuffle the sampled rows
# test_sample = test_sample.sample(frac=1, random_state=1337).reset_index(drop=True)

# test_sample

In [368]:
%%time
# Pre-extract numpy arrays from test_sample
uids = test_data["user_id"].values
iids = test_data["item_id"].values

probs = np.empty(len(test_data), dtype=float)

for i in range(len(test_data)):
    p = predict_buy_and_never_play(uids[i], iids[i])
    probs[i] = np.nan if p is None else p

test_data["prob"] = probs
test_eval = test_data.dropna(subset=["prob"]).reset_index(drop=True)
test_eval

CPU times: total: 19min 55s
Wall time: 21min 7s


Unnamed: 0,user_id,item_id,target_buy,target_never,buy_and_never,prob
0,76561197970982479,10,1,0,0,0.258567
1,76561197970982479,20,1,1,1,0.522394
2,76561197970982479,30,1,0,0,0.463800
3,76561197970982479,40,1,1,1,0.478147
4,76561197970982479,50,1,1,1,0.633845
...,...,...,...,...,...,...
6571444,zzzmidmiss,366620,0,1,0,0.059315
6571445,zzzmidmiss,389680,0,1,0,0.063329
6571446,zzzmidmiss,369310,0,1,0,0.015494
6571447,zzzmidmiss,270950,0,1,0,0.132024


In [369]:
%%time
from sklearn.metrics import accuracy_score, f1_score

# thresholds = np.linspace(0, 1, 101)   # 0.00, 0.01, ..., 1.00
thresholds = [0.49]

best_acc = -1
best_thr_acc = None

best_f1 = -1
best_thr_f1 = None

results = []   # optional: store all results

y_true = test_eval["buy_and_never"].values
probs   = test_eval["prob"].values

for thr in thresholds:
    y_pred = (probs >= thr).astype(int)

    acc = accuracy_score(y_true, y_pred)
    f1  = f1_score(y_true, y_pred, zero_division=0)

    results.append((thr, acc, f1))

    if acc > best_acc:
        best_acc = acc
        best_thr_acc = thr

    if f1 > best_f1:
        best_f1 = f1
        best_thr_f1 = thr

print("===== BEST ACCURACY THRESHOLD =====")
print("Threshold:", best_thr_acc)
print("Accuracy:", best_acc)

print("\n===== BEST F1 THRESHOLD =====")
print("Threshold:", best_thr_f1)
print("F1:", best_f1)


===== BEST ACCURACY THRESHOLD =====
Threshold: 0.49
Accuracy: 0.7707257562221057

===== BEST F1 THRESHOLD =====
Threshold: 0.49
F1: 0.5075505061234895
CPU times: total: 2.86 s
Wall time: 3.17 s


In [370]:
%%time
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

y_true = test_eval["buy_and_never"].values   # 1 = buy & never, 0 = otherwise

# baseline: always predict "never buy & never-play event does NOT happen"
y_pred = np.zeros_like(y_true, dtype=int)

acc = accuracy_score(y_true, y_pred)
f1  = f1_score(y_true, y_pred, zero_division=0)

print("===== BASELINE: ALWAYS 'NEVER BUY' =====")
print("Accuracy:", acc)
print("F1:", f1)

===== BASELINE: ALWAYS 'NEVER BUY' =====
Accuracy: 0.7157456445298441
F1: 0.0
CPU times: total: 2.42 s
Wall time: 2.81 s


In [371]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import f1_score, accuracy_score

# # Small grids
# C_grid_buy   = [0.1, 1.0, 10.0]
# C_grid_never = [0.1, 1.0, 10.0]
# thresholds   = np.linspace(0, 1, 101)

# best_f1 = -1.0
# best_setup = None
# best_thr = None
# best_clf_buy = None
# best_clf_never = None

# # Precompute training matrices once
# X_buy = df_full[features].values
# y_buy = df_full["target_buy"].values

# purchased_df = df_full[df_full["purchase"] == 1].copy()
# X_never = purchased_df[features].values
# y_never = purchased_df["target_never"].values

# for C1 in C_grid_buy:
#     for C2 in C_grid_never:
#         # -------- Train Stage 1: Buy model --------
#         clf_buy_tmp = LogisticRegression(
#             C=C1,
#             class_weight="balanced",
#             max_iter=2500
#         )
#         clf_buy_tmp.fit(X_buy, y_buy)

#         # -------- Train Stage 2: Never-play model (only on purchased rows) --------
#         clf_never_tmp = LogisticRegression(
#             C=C2,
#             class_weight="balanced",
#             max_iter=6000
#         )
#         clf_never_tmp.fit(X_never, y_never)

#         # Make these the *current* global models so predict_buy_and_never_play uses them
#         clf_buy = clf_buy_tmp
#         clf_never = clf_never_tmp

#         # -------- Get probabilities on the fixed evaluation sample --------
#         probs = np.empty(len(test_data), dtype=float)
#         for idx, (u, i) in enumerate(zip(uids, iids)):
#             p = predict_buy_and_never_play(u, i)
#             probs[idx] = np.nan if p is None else p

#         mask = ~np.isnan(probs)
#         probs_eval = probs[mask]
#         y_eval = y_true[mask]

#         # -------- Sweep thresholds for this (C1, C2) pair --------
#         for thr in thresholds:
#             y_pred = (probs_eval >= thr).astype(int)
#             f1 = f1_score(y_eval, y_pred, zero_division=0)

#             if f1 > best_f1:
#                 best_f1 = f1
#                 best_setup = {"C_buy": C1, "C_never": C2}
#                 best_thr = thr
#                 best_clf_buy = clf_buy_tmp
#                 best_clf_never = clf_never_tmp

# print("Best setup:", best_setup)
# print("Best threshold:", best_thr)
# print("Best F1:", best_f1)

# # Keep the best models + threshold for the rest of your notebook
# clf_buy = best_clf_buy
# clf_never = best_clf_never
# best_threshold = best_thr