# Matrix Factorization Model
This notebook trains a Matrix Factorization model (Truncated SVD) for a game recommendation system based on user interaction data.


In [13]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics.pairwise import cosine_similarity
import os
import pickle
import gc

# Configuration
pd.set_option('display.max_columns', None)
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))


## Step 2: Load Datasets
Load the user recommendation data and game metadata from CSV files.


In [14]:
# Load Datasets
recommendations_path = os.path.join(BASE_DIR, "data/external/recommendations.csv")
games_path = os.path.join(BASE_DIR, "data/external/games_tagged.csv")

print(f"Loading recommendations from: {recommendations_path}")
df = pd.read_csv(recommendations_path)

print(f"Loading game metadata from: {games_path}")
games = pd.read_csv(games_path)


Loading recommendations from: c:\Users\jians\Documents\GitHub\Game-Recommender-System\data/external/recommendations.csv
Loading game metadata from: c:\Users\jians\Documents\GitHub\Game-Recommender-System\data/external/games_tagged.csv


## Step 3: Create Interaction Score
Build a custom interaction score combining hours played, review helpfulness, and recommendation.


In [15]:
# Create Interaction Score
print("\nCreating interaction score...")
df['hours_log'] = df['hours_log'].fillna(0)
df['helpfulness_ratio'] = df['helpfulness_ratio'].fillna(0)
df['interaction'] = (
    0.6 * df['hours_log'] +
    0.3 * df['is_recommended_binary'] +
    0.1 * df['helpfulness_ratio']
)



Creating interaction score...


## Step 4: Filter Active Users and Items
Remove inactive users and unpopular games to reduce noise.


In [16]:
# Filter Active Users and Items
print("\nFiltering active users and active items...")
user_interaction_counts = df['user_id'].value_counts()
item_interaction_counts = df['app_id'].value_counts()

MIN_USER_INTERACTIONS = 5
MIN_ITEM_INTERACTIONS = 500

users_to_keep = user_interaction_counts[user_interaction_counts >= MIN_USER_INTERACTIONS].index
items_to_keep = item_interaction_counts[item_interaction_counts >= MIN_ITEM_INTERACTIONS].index

df = df[df['user_id'].isin(users_to_keep) & df['app_id'].isin(items_to_keep)]
print(f"Data after filtering: {len(df):,} records.")



Filtering active users and active items...
Data after filtering: 1,327 records.


## Step 5: Create User-Item Interaction Matrix
Pivot the data into a matrix where rows = users and columns = items (games).


In [17]:
# Create User-Item Interaction Matrix
print("\nCreating user-item interaction matrix...")
user_item_matrix = df.pivot_table(index='user_id', columns='app_id', values='interaction', fill_value=0)
print(f"User-item matrix shape: {user_item_matrix.shape}")

R_full = user_item_matrix.values
user_ids = list(user_item_matrix.index)
item_ids = list(user_item_matrix.columns)



Creating user-item interaction matrix...
User-item matrix shape: (253, 148)


## Step 6: Train-Test Split
Randomly hide 20% of interactions to evaluate model performance later.


In [18]:
# Train-Test Split
print("\nSplitting into train and test sets...")
np.random.seed(42)
test_mask = np.random.rand(*R_full.shape) < 0.2
train_matrix = R_full.copy()
train_matrix[test_mask] = 0

print(f"Train matrix non-zero entries: {np.count_nonzero(train_matrix):,}")
print(f"Test matrix non-zero entries: {np.count_nonzero(test_mask):,}")



Splitting into train and test sets...
Train matrix non-zero entries: 1,075
Test matrix non-zero entries: 7,458


## Step 7: Train Matrix Factorization Model
Use Truncated SVD to learn user and item latent factors.


In [19]:
# Train Matrix Factorization Model
print("\nTraining TruncatedSVD model...")
n_components = min(20, train_matrix.shape[1] - 1)
print(f"Using n_components = {n_components}")
svd = TruncatedSVD(n_components=n_components, random_state=42)
user_factors = svd.fit_transform(train_matrix)
item_factors = svd.components_

# Predict Ratings
R_pred = np.dot(user_factors, item_factors)



Training TruncatedSVD model...
Using n_components = 20


## Step 8: Evaluate RMSE and MAE
Measure prediction error on the hidden (test) set.


In [20]:
# Evaluate RMSE and MAE
print("\nEvaluating RMSE and MAE...")
rmse = np.sqrt(mean_squared_error(R_full[test_mask], R_pred[test_mask]))
mae = mean_absolute_error(R_full[test_mask], R_pred[test_mask])
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")



Evaluating RMSE and MAE...
RMSE: 0.5092
MAE: 0.1793


## Step 9: Prepare Binary Matrix for Evaluation
Prepare a binary version of the matrix (recommended/not recommended) for classification evaluation.


In [21]:
# Prepare Binary Matrix
print("\nPreparing binary matrix for evaluation...")
binary_matrix = df.pivot_table(index='user_id', columns='app_id', values='is_recommended_binary', fill_value=0)
binary_matrix = binary_matrix.loc[user_item_matrix.index, user_item_matrix.columns]
R_true_binary = binary_matrix.values
R_test_binary = np.where(test_mask, R_true_binary, 0)



Preparing binary matrix for evaluation...


## Step 10: Evaluate Precision@20, Recall@20, F1@20
Evaluate recommendation quality at top 20 predicted games.


In [22]:
# Precision, Recall, F1 Evaluation
print("\nEvaluating Precision@20, Recall@20, F1@20...")

def precision_recall_f1_at_k(R_true, R_pred, k=20):
    precisions, recalls, f1s = [], [], []
    for i in range(R_true.shape[0]):
        actual = set(np.where(R_true[i] > 0)[0])
        if not actual:
            continue
        pred_scores = R_pred[i].copy()
        top_k = set(np.argsort(pred_scores)[-k:])
        tp = len(actual & top_k)
        precision = tp / k if k else 0
        recall = tp / len(actual) if actual else 0
        f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)
    return np.mean(precisions), np.mean(recalls), np.mean(f1s)

precision, recall, f1 = precision_recall_f1_at_k(R_test_binary, R_pred, k=20)
print(f"Precision@20: {precision:.4f}")
print(f"Recall@20:    {recall:.4f}")
print(f"F1@20:        {f1:.4f}")



Evaluating Precision@20, Recall@20, F1@20...
Precision@20: 0.0102
Recall@20:    0.1426
F1@20:        0.0189


## Step 11: Save Model Artifacts
Save the item latent factors and item IDs to disk for future use in the Streamlit app.


In [None]:
print("\nSaving model artifacts...")

models_dir = os.path.join(BASE_DIR, "models")
os.makedirs(models_dir, exist_ok=True)

np.save(os.path.join(models_dir, "item_factors.npy"), item_factors.T)  # Save item latent vectors
with open(os.path.join(models_dir, "item_ids.pkl"), 'wb') as f:
    pickle.dump(item_ids, f)

print(f"Saved item_factors.npy and item_ids.pkl to {models_dir}")


Saving model artifacts...
Saved item_factors.npy and item_ids.pkl to c:\Users\jians\Documents\GitHub\Game-Recommender-System\models


In [24]:
# Step 11: Calculate Item-Item Cosine Similarity

print("\nCalculating item-item cosine similarity...")

item_factors_T = item_factors.T  # Transpose so rows = games
item_similarity_matrix = cosine_similarity(item_factors_T)

print(f"Item-Item Similarity Matrix Shape: {item_similarity_matrix.shape}")

# Step 12: Recommend Top-N Similar Games for Each Game

top_n = 10  # How many similar games you want to recommend
all_game_recommendations = []

print("\nGenerating Top-10 similar games for each game...")

# Create app_id mappings
index_to_app_id = {idx: app_id for idx, app_id in enumerate(item_ids)}
app_id_to_title = games.set_index('app_id')['title'].to_dict()

for game_idx in range(item_similarity_matrix.shape[0]):
    game_id = index_to_app_id.get(game_idx)
    game_title = app_id_to_title.get(game_id, "Unknown Title")

    # Get similarity scores for this game
    similarities = item_similarity_matrix[game_idx]

    # Get top N similar games (excluding itself)
    similar_indices = similarities.argsort()[::-1][1:top_n+1]  # Skip the first one (itself)

    print(f"\nTop {top_n} similar games for '{game_title}' (Game ID: {game_id}):")
    for rank, sim_idx in enumerate(similar_indices, start=1):
        similar_game_id = index_to_app_id.get(sim_idx)
        similar_game_title = app_id_to_title.get(similar_game_id, "Unknown Title")
        similarity_score = similarities[sim_idx]
        print(f"  {rank}. {similar_game_title} (Game ID: {similar_game_id}, Similarity: {similarity_score:.4f})")

        all_game_recommendations.append({
            "Game ID": game_id,
            "Game Title": game_title,
            "Similar Game ID": similar_game_id,
            "Similar Game Title": similar_game_title,
            "Similarity Score": similarity_score
        })

# Step 13: Save all Top-N similar games into a CSV

output_dir = os.path.join(BASE_DIR, "models")
os.makedirs(output_dir, exist_ok=True)

similar_games_csv_path = os.path.join(output_dir, "top10_similar_games_mf.csv")
similar_games_df = pd.DataFrame(all_game_recommendations)
similar_games_df.to_csv(similar_games_csv_path, index=False)

print(f"\nSaved Top-10 similar games for all games to: {similar_games_csv_path}")



Calculating item-item cosine similarity...
Item-Item Similarity Matrix Shape: (148, 148)

Generating Top-10 similar games for each game...

Top 10 similar games for 'Left 4 Dead 2' (Game ID: 550):
  1. Yu-Gi-Oh! Master Duel (Game ID: 1449850, Similarity: 0.5742)
  2. Rocksmith® 2014 Edition - Remastered (Game ID: 221680, Similarity: 0.5742)
  3. Lost Ark (Game ID: 1599340, Similarity: 0.5637)
  4. Graveyard Keeper (Game ID: 599140, Similarity: 0.5338)
  5. Crusader Kings III (Game ID: 1158310, Similarity: 0.5163)
  6. Dota 2 (Game ID: 570, Similarity: 0.5084)
  7. Call of Duty®: Black Ops III (Game ID: 311210, Similarity: 0.4753)
  8. Sid Meier’s Civilization® VI (Game ID: 289070, Similarity: 0.4655)
  9. Jurassic World Evolution 2 (Game ID: 1244460, Similarity: 0.4489)
  10. Phasmophobia (Game ID: 739630, Similarity: 0.4433)

Top 10 similar games for 'Dota 2' (Game ID: 570):
  1. NARAKA: BLADEPOINT (Game ID: 1203220, Similarity: 0.8540)
  2. Destiny 2 (Game ID: 1085660, Similarity: 0