In [3]:
import ast

def _read_literal_lines(path):
    """Parse a file that stores one Python-literal record per line.

    Blank lines are skipped. Every other line is parsed with
    ``ast.literal_eval``, which only evaluates literals and never executes
    code.

    Parameters
    ----------
    path : str
        Location of the file to read.

    Returns
    -------
    list
        The parsed records, in file order.
    """
    # Explicit encoding so parsing does not depend on the platform default.
    with open(path, encoding="utf-8") as f:
        return [ast.literal_eval(line) for line in map(str.strip, f) if line]


rows_review_v1 = _read_literal_lines("australian_user_reviews.json")
rows_ui_v1 = _read_literal_lines("australian_users_items.json")

In [4]:
# Number of parsed records: (reviews file, users-items file)
tuple(len(rows) for rows in (rows_review_v1, rows_ui_v1))

(25799, 88310)

In [6]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

In [7]:
# Flatten every user's library into one row per (user, owned item).
ui_rows = [
    {
        "user_id": user["user_id"],
        "item_id": item["item_id"],
        "item_name": item["item_name"],
        "playtime": item["playtime_forever"],
        "purchase": 1,
    }
    for user in rows_ui_v1
    for item in user["items"]
]

# item_id -> item_name; a later occurrence of an id overwrites an earlier one.
item_name_lookup = {row["item_id"]: row["item_name"] for row in ui_rows}

df_ui = pd.DataFrame(ui_rows)

In [8]:
# Label: 1 when an owned game has zero recorded playtime, otherwise 0.
df_ui["never_play"] = df_ui["playtime"].eq(0).astype(int)

## TODO: Add some EDA on `df_ui` later

### e.g., the average number of games each user bought, ...

In [12]:
# Build Negative Samples: for every user, draw items they do not own.
NEG_PER_USER = 20  # upper bound on sampled negatives per user

all_items = set(df_ui["item_id"].unique())
# Sort the catalogue once. Iteration order of a set is not guaranteed and,
# for string ids, changes between interpreter runs because of hash
# randomisation, so sampling from `list(all_items - purchased)` would not be
# reproducible across kernel restarts even with a fixed seed. Filtering a
# sorted list keeps the candidate order stable.
all_items_sorted = sorted(all_items)

neg_samples = []
random.seed(42)

for user, group in tqdm(df_ui.groupby("user_id")):
    purchased = set(group["item_id"])
    not_bought = [item for item in all_items_sorted if item not in purchased]

    # A user may have fewer than NEG_PER_USER unowned items to choose from.
    sample_count = min(NEG_PER_USER, len(not_bought))
    sampled_items = random.sample(not_bought, sample_count)

    # Negatives carry purchase = 0; playtime is 0 and never_play is set to 1.
    neg_samples.extend(
        {
            "user_id": user,
            "item_id": item,
            "item_name": item_name_lookup[item],
            "playtime": 0,
            "purchase": 0,
            "never_play": 1,
        }
        for item in sampled_items
    )

df_neg = pd.DataFrame(neg_samples)

df_neg

100%|██████████| 70912/70912 [00:58<00:00, 1209.42it/s]


Unnamed: 0,user_id,item_id,item_name,playtime,purchase,never_play
0,--000--,283160,House of the Dying Sun,0,0,1
1,--000--,323060,Tharsis,0,0,1
2,--000--,312960,Starion Tactics,0,0,1
3,--000--,220820,Zombie Driver HD,0,0,1
4,--000--,400740,VERGE:Lost chapter,0,0,1
...,...,...,...,...,...,...
1418235,zzzmidmiss,250660,Bunny Must Die! Chelsea and the 7 Devils,0,0,1
1418236,zzzmidmiss,207690,Botanicula,0,0,1
1418237,zzzmidmiss,220420,Nancy Drew: The Deadly Device,0,0,1
1418238,zzzmidmiss,299580,Abducted,0,0,1


In [15]:
# Stack the observed purchases (positives) on top of the sampled negatives
# and expose the two labels under explicit target names.
df_full = pd.concat([df_ui, df_neg], ignore_index=True).assign(
    target_buy=lambda frame: frame["purchase"],
    target_never=lambda frame: frame["never_play"],
)

df_full

Unnamed: 0,user_id,item_id,item_name,playtime,purchase,never_play,target_buy,target_never
0,76561197970982479,10,Counter-Strike,6,1,0,1,0
1,76561197970982479,20,Team Fortress Classic,0,1,1,1,1
2,76561197970982479,30,Day of Defeat,7,1,0,1,0
3,76561197970982479,40,Deathmatch Classic,0,1,1,1,1
4,76561197970982479,50,Half-Life: Opposing Force,0,1,1,1,1
...,...,...,...,...,...,...,...,...
6571444,zzzmidmiss,250660,Bunny Must Die! Chelsea and the 7 Devils,0,0,1,0,1
6571445,zzzmidmiss,207690,Botanicula,0,0,1,0,1
6571446,zzzmidmiss,220420,Nancy Drew: The Deadly Device,0,0,1,0,1
6571447,zzzmidmiss,299580,Abducted,0,0,1,0,1


‼ TODO: Figure out later why sequences are created from purchased items only

In [17]:
# Item2Vec input: one "sentence" per user, listing the ids of the items they
# purchased (rows with purchase == 1 only).
user_sequences = [
    list(item_ids)
    for _, item_ids in df_ui.loc[df_ui["purchase"].eq(1)].groupby("user_id")["item_id"]
]

In [18]:
# Train Item2Vec: a Word2Vec model fitted on the per-user item sequences,
# treating every item_id as a token.
item2vec = Word2Vec(
    sentences=user_sequences,
    sg=1,            # skip-gram variant (per the gensim docs)
    vector_size=64,  # embedding length
    window=10,
    min_count=1,
    workers=4,
)

# Every item_id now has an embedding vector of length 64.

In [20]:
# Plain dict lookup: item_id -> learned embedding vector.
item_vecs = {key: item2vec.wv[key] for key in item2vec.wv.index_to_key}

In [22]:
# Attach each row's item embedding; rows whose item has no vector are dropped.
df_full["item_vec"] = df_full["item_id"].map(item_vecs)
# `.copy()` makes the filtered frame independent of the frame it was sliced
# from, so column assignments made on it afterwards do not raise
# SettingWithCopyWarning.
df_full = df_full[df_full["item_vec"].notna()].copy()

In [24]:
# Expand the embedding into one numeric column per dimension (v0 ... v63).
# The vectors are stacked into a single 2-D array once and the columns are
# sliced from it, instead of running 64 row-wise `.apply` passes over the
# whole frame.
embed_cols = [f"v{i}" for i in range(64)]
if len(df_full):
    embed_matrix = np.vstack(df_full["item_vec"].to_numpy())
else:
    # np.vstack rejects an empty sequence; keep the empty-frame case working.
    embed_matrix = np.empty((0, len(embed_cols)))

for i, col in enumerate(embed_cols):
    df_full[col] = embed_matrix[:, i]

# Release the temporary matrix; the values now live in df_full.
del embed_matrix

In [26]:
# Add extra features
#   item_pop        : count of non-null playtime entries per item in df_ui
#   user_never_rate : mean of never_play over each user's rows in df_ui
# NOTE(review): user_never_rate is computed over all of df_ui, before any
# train/test split, from the same never_play label that Stage 2 predicts.
# Check whether this leaks the target into the features.
item_pop = df_ui.groupby("item_id")["playtime"].count()
user_never_rate = df_ui.groupby("user_id")["never_play"].mean()

# Ids missing from the lookups come back as NaN and are filled with 0
# (should be few).
df_full["item_popularity"] = df_full["item_id"].map(item_pop).fillna(0)
df_full["user_never_rate"] = df_full["user_id"].map(user_never_rate).fillna(0)

In [36]:
# Train Stage 1 Model --> Predicting if user will buy the game or not
# Feature order: the 64 embedding columns, then the two aggregate features.
features = [*(f"v{i}" for i in range(64)), "item_popularity", "user_never_rate"]

# QUESTION: the model does not use item_name at all. Is that intended?

X = df_full[features].to_numpy()
y = df_full["target_buy"].to_numpy()

# Stratified 80/20 split on the buy label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

clf_buy = LogisticRegression(max_iter=2500).fit(X_train, y_train)

pred = clf_buy.predict(X_test)
print("\n===== Stage 1: Buy Prediction Model =====")
print("Accuracy:", accuracy_score(y_test, pred))
# print("F1:", f1_score(y_test, pred))
# print(confusion_matrix(y_test, pred))


===== Stage 1: Buy Prediction Model =====
Accuracy: 0.8896278599091525


In [34]:
# Train Stage 2 Model --> Predict if user will never play the game if buy
# Uses the `features` list defined in the Stage 1 cell.

# Only rows that are actual purchases take part in this stage.
purchased_df = df_full.loc[df_full["purchase"].eq(1)]

X2 = purchased_df[features].to_numpy()
y2 = purchased_df["target_never"].to_numpy()

# Stratified 80/20 split within the purchased rows only.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2, test_size=0.2, stratify=y2, random_state=42
)

clf_never = LogisticRegression(max_iter=3000).fit(X2_train, y2_train)

pred2 = clf_never.predict(X2_test)
print("\n===== Stage 2: NEVER PLAY MODEL =====")
print("Accuracy:", accuracy_score(y2_test, pred2))
# print("F1:", f1_score(y2_test, pred2))
# print(confusion_matrix(y2_test, pred2))


===== NEVER PLAY MODEL =====
Accuracy: 0.7375800714506104


In [37]:
# Final Prediction Function

def predict_buy_and_never_play(user_id, item_id):
    """
    Score a (user, item) pair with both trained models.

    Returns a single number: the positive-class probability from `clf_buy`
    (user buys the item) multiplied by the positive-class probability from
    `clf_never` (user never plays it after buying).

    Returns None when `item_id` has no entry in `item_vecs`. An item missing
    from `item_pop` or a user missing from `user_never_rate` contributes 0
    for that feature.
    """
    vec = item_vecs.get(item_id)
    if vec is None:
        return None  # unknown item

    popularity = item_pop.get(item_id, 0)
    never_rate = user_never_rate.get(user_id, 0)

    # One row: embedding values first, then popularity and never-play rate.
    feats = np.concatenate([vec, [popularity, never_rate]]).reshape(1, -1)

    p_buy, p_never = (
        model.predict_proba(feats)[0, 1] for model in (clf_buy, clf_never)
    )
    return p_buy * p_never

# Demo: score the first (user, item) row of the purchase table.
first_purchase = df_ui.iloc[0]
print("\nExample prediction:")
print(predict_buy_and_never_play(first_purchase["user_id"], first_purchase["item_id"]))


Example prediction:
0.23578636228693192
