Matrix Factorisation 

Using Truncated SVD

In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error
import os

# Tunable settings for this cell (values are the ones the notebook always used).
RANDOM_SEED = 42
N_TOP_USERS = 5000      # keep only the most active reviewers
TEST_FRACTION = 0.2     # share of observed interactions held out for testing
N_COMPONENTS = 20       # number of latent factors kept by TruncatedSVD

# Step 1: Load the datasets
# Paths are resolved against the parent of the current working directory.
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))
recommendations_path = os.path.join(BASE_DIR, "data/external/recommendations.csv")
games_tagged_path = os.path.join(BASE_DIR, "data/external/games_tagged.csv")
df = pd.read_csv(recommendations_path)
games = pd.read_csv(games_tagged_path)

# Step 2: Create improved interaction score
# Weighted blend of three per-review signals; missing playtime/helpfulness
# values count as 0 so they do not turn the whole score into NaN.
df['hours_log'] = df['hours_log'].fillna(0)
df['helpfulness_ratio'] = df['helpfulness_ratio'].fillna(0)
df['interaction'] = (
    0.6 * df['hours_log'] +
    0.3 * df['is_recommended_binary'] +
    0.1 * df['helpfulness_ratio']
)

# Step 3: Filter top users (those with the most rows in the dataset)
top_users = df['user_id'].value_counts().head(N_TOP_USERS).index
df = df[df['user_id'].isin(top_users)]

# Step 4: Create user-item interaction matrix
# Pivot WITHOUT a fill value first so that NaN marks "no interaction recorded";
# this lets us tell a genuinely observed pair apart from an empty cell even
# when the observed interaction score happens to be 0.
interaction_pivot = df.pivot_table(index='user_id', columns='app_id', values='interaction')
observed_mask = interaction_pivot.notna().to_numpy()
user_item_matrix = interaction_pivot.fillna(0)
R_full = user_item_matrix.to_numpy(dtype=float)
user_ids = list(user_item_matrix.index)
item_ids = list(user_item_matrix.columns)

# Step 5: Train-test split
# Only observed interactions may be held out. Sampling over every cell (as was
# done before) puts mostly empty cells in the test set, so RMSE would largely
# measure how well the model predicts 0 for pairs that never occurred.
# The random draws are unchanged; they are simply intersected with the
# observed cells, so the training matrix is the same as before.
np.random.seed(RANDOM_SEED)
test_mask = (np.random.rand(*R_full.shape) < TEST_FRACTION) & observed_mask
train_matrix = R_full.copy()
train_matrix[test_mask] = 0

# Step 6: Apply TruncatedSVD
# fit_transform returns the per-user factors; components_ holds the per-item
# factors. Their product is the low-rank reconstruction of the matrix.
svd = TruncatedSVD(n_components=N_COMPONENTS, random_state=RANDOM_SEED)
user_factors = svd.fit_transform(train_matrix)
item_factors = svd.components_
R_pred = np.dot(user_factors, item_factors)

# Step 7: Evaluate RMSE on the held-out observed interactions only
rmse = np.sqrt(mean_squared_error(R_full[test_mask], R_pred[test_mask]))
print(f"RMSE: {rmse:.4f}")

# Step 8: Create binary test matrix (only recommended games in test set)
binary_matrix = df.pivot_table(index='user_id', columns='app_id', values='is_recommended_binary', fill_value=0)
# Re-align to the interaction matrix so row/column positions match test_mask.
binary_matrix = binary_matrix.loc[user_item_matrix.index, user_item_matrix.columns]
R_true_binary = binary_matrix.values
# Keep the recommendation flag only for held-out cells; everything else is 0.
R_test_binary = np.where(test_mask, R_true_binary, 0)

# Step 9: Evaluate Precision@20 and Recall@20 using test-only ground truth
def precision_recall_at_k(R_true, R_pred, k=20):
    """Compute mean Precision@k and Recall@k over users with relevant items.

    Args:
        R_true: 2-D array-like (users x items); entries > 0 mark the items
            that are relevant (ground truth) for that user.
        R_pred: 2-D array-like of predicted scores with the same layout as
            ``R_true``. Items the caller does not want ranked should already
            carry a score of ``-inf``.
        k: Number of highest-scoring items treated as the recommendation
            list. Must be at least 1.

    Returns:
        Tuple ``(precision, recall)`` averaged over the rows of ``R_true``
        that contain at least one relevant item. Rows without relevant items
        are skipped. If no row has relevant items, ``(0.0, 0.0)`` is returned.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    R_true = np.asarray(R_true)
    R_pred = np.asarray(R_pred)

    precisions, recalls = [], []
    for true_row, pred_row in zip(R_true, R_pred):
        actual = np.flatnonzero(true_row > 0)
        if actual.size == 0:
            continue
        # The relevant items must stay in the ranking: pushing their scores to
        # -inf (as was done previously) makes it impossible for them to reach
        # the top-k, so every hit count would be zero.
        top_k = np.argsort(pred_row)[-k:]
        tp = np.intersect1d(actual, top_k).size
        precisions.append(tp / k)
        recalls.append(tp / actual.size)

    if not precisions:
        # Nothing to average; avoid np.mean([]) which yields NaN and a warning.
        return 0.0, 0.0
    return np.mean(precisions), np.mean(recalls)

# Rank only games the model did not see during training: interactions kept in
# the training matrix are pushed to -inf so they cannot occupy top-k slots.
# This mirrors Step 11 below, where already-played games are never recommended.
R_pred_eval = np.where(train_matrix > 0, -np.inf, R_pred)
precision, recall = precision_recall_at_k(R_test_binary, R_pred_eval, k=20)
print(f"Precision@20: {precision:.4f}")
print(f"Recall@20: {recall:.4f}")

# Step 10: Map game titles
id_to_title = games.set_index('app_id')['title'].to_dict()

# Step 11: Recommend top 20 games per user
TOP_N = 20
recommendations_all_users = []
for i, user_id in enumerate(user_ids):
    user_played = R_full[i] > 0
    scores = R_pred[i].copy()
    scores[user_played] = -np.inf  # never recommend a game the user already has
    top_indices = np.argsort(scores)[-TOP_N:][::-1]

    print(f"\nTop 20 recommended games for User {user_id}:")
    for j in top_indices:
        score = scores[j]
        if np.isneginf(score):
            # Fewer than TOP_N unplayed games exist for this user; the rest of
            # the list would only contain masked (already played) games.
            break
        game_id = item_ids[j]
        title = id_to_title.get(game_id, "Unknown Title")
        print(f"  {title} (Game ID: {game_id}, Score: {score:.4f})")
        recommendations_all_users.append({
            "User ID": user_id,
            "Recommended Game ID": game_id,
            "Game Title": title,
            "Predicted Score": score
        })

# Step 12: Save to CSV
# recommendations_df = pd.DataFrame(recommendations_all_users)
# recommendations_df.to_csv("top20_user_based_recommendations_evaluated.csv", index=False)
# print("\n✅ Recommendations saved to top20_user_based_recommendations_evaluated.csv")


Using NMF