In [109]:
pip install fastFM



In [110]:
import ast

rows_review_v1 = []
with open("australian_user_reviews.json") as f:
    for line in f:
        line = line.strip()
        if line:
            rows_review_v1.append(ast.literal_eval(line))

rows_ui_v1 =[]
with open("australian_users_items.json") as f:
    for line in f:
        line = line.strip()
        if line:
            rows_ui_v1.append(ast.literal_eval(line))

In [111]:
import pandas as pd

ui_rows = []
for row in rows_ui_v1:
    user = row["user_id"]
    for it in row["items"]:
        ui_rows.append({
            "user_id": user,
            "item_id": it["item_id"],
            "playtime": it["playtime_forever"]
        })

df_ui = pd.DataFrame(ui_rows)


In [112]:
review_rows = []
for row in rows_review_v1:
    user = row["user_id"]
    for r in row["reviews"]:
        review_rows.append({
            "user_id": user,
            "item_id": r["item_id"],
            "recommend": int(r["recommend"])
        })

df_reviews = pd.DataFrame(review_rows)


In [113]:
user_stats = df_ui.groupby("user_id").agg(
    user_total_items=("item_id", "count"),
    user_total_playtime=("playtime", "sum"),
    user_avg_playtime=("playtime", "mean"),
    user_zero_play_rate=("playtime", lambda x: (x==0).mean())
)

user_review_stats = df_reviews.groupby("user_id").agg(
    user_review_count=("recommend", "count"),
    user_recommend_rate=("recommend", "mean")
)


In [114]:
item_stats = df_ui.groupby("item_id").agg(
    item_purchases=("user_id", "count"),
    item_avg_playtime=("playtime", "mean"),
    item_zero_play_rate=("playtime", lambda x: (x == 0).mean())
)

item_review_stats = df_reviews.groupby("item_id").agg(
    item_review_count=("recommend", "count"),
    item_recommend_rate=("recommend", "mean")
)


In [115]:
df_pos = df_ui.copy()
df_pos["purchase"] = 1
df_pos["never_play"] = (df_pos["playtime"] == 0).astype(int)

df_pos = df_pos.join(user_stats, on="user_id")
df_pos = df_pos.join(item_stats, on="item_id")
df_pos = df_pos.join(user_review_stats, on="user_id")
df_pos = df_pos.join(item_review_stats, on="item_id")

df_pos = df_pos.fillna(0)


In [116]:
# Generate negative samples
import numpy as np

all_users = df_ui["user_id"].unique()
all_items = df_ui["item_id"].unique()
owned = set(zip(df_ui.user_id, df_ui.item_id))

neg_samples = []

for user in np.random.choice(all_users, size=200000):
    item = np.random.choice(all_items)
    if (user, item) not in owned:   # user never bought item
        neg_samples.append({
            "user_id": user,
            "item_id": item,
            "purchase": 0,
            "never_play": 0
        })

df_neg = pd.DataFrame(neg_samples)


In [117]:
df_neg = df_neg.join(user_stats, on="user_id")
df_neg = df_neg.join(item_stats, on="item_id")
df_neg = df_neg.join(user_review_stats, on="user_id")
df_neg = df_neg.join(item_review_stats, on="item_id")

df_neg = df_neg.fillna(0)


In [133]:
df = pd.concat([df_pos, df_neg], ignore_index=True)

In [142]:
df.columns

Index(['user_id', 'item_id', 'playtime', 'purchase', 'never_play',
       'user_total_items', 'user_total_playtime', 'user_avg_playtime',
       'user_zero_play_rate', 'item_purchases', 'item_avg_playtime',
       'item_zero_play_rate', 'user_review_count', 'user_recommend_rate',
       'item_review_count', 'item_recommend_rate', 'u_id', 'i_id', 'label3'],
      dtype='object')

In [152]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from fastFM import sgd


user_enc = LabelEncoder()
item_enc = LabelEncoder()

df["u_id"] = user_enc.fit_transform(df["user_id"])
df["i_id"] = item_enc.fit_transform(df["item_id"])


####################################################################
# 4. CREATE 3-CLASS LABEL
#    0 = no purchase
#    1 = purchased & played
#    2 = purchased & never played
####################################################################

df["label3"] = np.where(
    df["purchase"] == 0,
    0,
    np.where(df["never_play"] == 1, 2, 1)
).astype(int)

y = df["label3"].values


####################################################################
# 5. BUILD DESIGN MATRIX WITH YOUR REAL NUMERIC COLUMNS
####################################################################

numeric_cols = [
    "playtime",
    "user_total_items", "user_total_playtime", "user_avg_playtime",
    "user_zero_play_rate",
    "item_purchases", "item_avg_playtime", "item_zero_play_rate",
    "user_review_count", "user_recommend_rate",
    "item_review_count", "item_recommend_rate"
]
df[numeric_cols] = df[numeric_cols].fillna(0)

# sparse numeric matrix
X_num = csr_matrix(df[numeric_cols].values)

N = len(df)
n_users = df["u_id"].max() + 1
n_items = df["i_id"].max() + 1
row_idx = np.arange(N)

# sparse one-hot users
X_user = csr_matrix(
    (np.ones(N), (row_idx, df["u_id"])),
    shape=(N, n_users)
)

# sparse one-hot items
X_item = csr_matrix(
    (np.ones(N), (row_idx, df["i_id"])),
    shape=(N, n_items)
)

# final FM matrix
X = hstack([X_user, X_item, X_num], format="csr")


####################################################################
# 6. TRAIN/TEST SPLIT
####################################################################

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)


####################################################################
# 7. ONE-VS-REST FM CLASSIFICATION (fastFM requires -1/+1)
####################################################################

fm_params = dict(
    n_iter=40,       # can increase to 60–100 for better performance
    init_stdev=0.1,
    rank=10,
    step_size=0.001,
    l2_reg_w=0.01,
    l2_reg_V=0.01
)

n_classes = 3
fm_models = []

for c in range(n_classes):
    print(f"\nTraining FM for class {c} vs Rest...")

    # fastFM binary labels must be -1 / +1
    y_train_bin = np.where(y_train == c, 1, -1).astype(np.float64)
    y_train_bin = np.ascontiguousarray(y_train_bin)



    fm = sgd.FMClassification(**fm_params)
    fm.fit(X_train, y_train_bin)

    fm_models.append(fm)


####################################################################
# 8. PREDICT CLASS PROBABILITIES
####################################################################

probs_raw = np.zeros((X_test.shape[0], n_classes))

for c, fm in enumerate(fm_models):
    # Probability y=+1 => probability of class c
    probs_raw[:, c] = fm.predict_proba(X_test).ravel()

# Normalize so rows sum to 1
probs = probs_raw / (probs_raw.sum(axis=1, keepdims=True) + 1e-15)

y_pred = probs.argmax(axis=1)


####################################################################
# 9. MULTICLASS PERFORMANCE
####################################################################

print("\n========== MULTICLASS METRICS ==========")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Macro F1:", f1_score(y_test, y_pred, average='macro'))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


####################################################################
# 10. MAIN OUTPUT — PROBABILITY BUY & NEVER PLAY
####################################################################

p_neverplay = probs[:, 2]   # class 2 probability

neverplay_true = (y_test == 2).astype(int)
neverplay_pred = (p_neverplay > 0.5).astype(int)

print("\n========== NEVER-PLAY (class 2) BINARY METRICS ==========")
print("Accuracy:", accuracy_score(neverplay_true, neverplay_pred))
print("F1:", f1_score(neverplay_true, neverplay_pred))
print("\nConfusion Matrix:\n", confusion_matrix(neverplay_true, neverplay_pred))

print("\nFirst 10 P(buy & never play):")
print(p_neverplay[:10])


Training FM for class 0 vs Rest...

Training FM for class 1 vs Rest...

Training FM for class 2 vs Rest...

Accuracy: 0.03703721004537628
Macro F1: 0.02380963105702234

Confusion Matrix:
 [[ 39644      0      0]
 [656918      0      0]
 [373821      0      0]]

Accuracy: 0.6507595879232013
F1: 0.0

Confusion Matrix:
 [[696562      0]
 [373821      0]]

First 10 P(buy & never play):
[nan nan nan nan nan nan nan nan nan nan]
