In [1]:
# Cell 1: Import Libraries and Load Full Data
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix # Will be used later
import os

# --- Notebook-wide settings ---
# Print every column of a DataFrame instead of eliding the middle ones.
pd.set_option('display.max_columns', None)
# The project root is taken to be the parent of the current working directory.
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))

# --- Input file locations (resolved against BASE_DIR) ---
games_path = os.path.join(BASE_DIR, "data/external/games_tagged.csv")
recommendations_path = os.path.join(BASE_DIR, "data/external/recommendations.csv")

# --- Interaction records (full, unfiltered) ---
print(f"Loading recommendations from: {recommendations_path}")
recommendations_pd_full = pd.read_csv(recommendations_path)
# Coerce the game key to a plain integer column; done in place on the single
# column so the large frame is not copied.
recommendations_pd_full['app_id'] = recommendations_pd_full['app_id'].astype(int)
print(f"Loaded {len(recommendations_pd_full.index):,} full recommendations records.")

# --- Per-game metadata ---
print(f"Loading game metadata from: {games_path}")
games_pd = pd.read_csv(games_path)
# Same integer coercion as above so both frames share the app_id dtype.
games_pd['app_id'] = games_pd['app_id'].astype(int)
print(f"Loaded {len(games_pd.index):,} games metadata records.")

Loading recommendations from: c:\Users\wbrya\OneDrive\Documents\GitHub\MovieLens-Recommender-System\data/external/recommendations.csv
Loaded 41,154,794 full recommendations records.
Loading game metadata from: c:\Users\wbrya\OneDrive\Documents\GitHub\MovieLens-Recommender-System\data/external/games_tagged.csv
Loaded 50,872 games metadata records.


In [2]:
# Cell 2: Calculate Interaction Counts on Full Dataset
print("Calculating interaction counts...")

# value_counts() yields one row per distinct key, most frequent first:
#   - user_interaction_counts: number of interaction rows per user_id
#   - item_interaction_counts: number of interaction rows per app_id
user_interaction_counts = recommendations_pd_full['user_id'].value_counts()
item_interaction_counts = recommendations_pd_full['app_id'].value_counts()

print(f"Found {user_interaction_counts.size:,} unique users.")
print(f"Found {item_interaction_counts.size:,} unique items (games) with interactions.")

# Distribution summaries; useful when picking the filtering thresholds later.
print("\nUser Interaction Count Stats:", user_interaction_counts.describe(), sep="\n")
print("\nItem Interaction Count Stats:", item_interaction_counts.describe(), sep="\n")

# Because the counts are already sorted descending, head() is the top 5.
print("\nTop 5 Users by Interactions:", user_interaction_counts.head(), sep="\n")
print("\nTop 5 Items by Interactions:", item_interaction_counts.head(), sep="\n")

Calculating interaction counts...
Found 13,781,059 unique users.
Found 37,610 unique items (games) with interactions.

User Interaction Count Stats:
count    1.378106e+07
mean     2.986330e+00
std      8.118011e+00
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      3.000000e+00
max      6.045000e+03
Name: count, dtype: float64

Item Interaction Count Stats:
count     37610.000000
mean       1094.251369
std        7689.340463
min           1.000000
25%          13.000000
50%          39.000000
75%         179.750000
max      319492.000000
Name: count, dtype: float64

Top 5 Users by Interactions:
user_id
11764552    6045
5112758     4152
11656130    3840
5669734     3479
11553593    3392
Name: count, dtype: int64

Top 5 Items by Interactions:
app_id
440        319492
252490     270684
1091500    226414
730        219737
570        216914
Name: count, dtype: int64


In [3]:
# Cell 3: Filter Interactions Based on Thresholds

# --- Thresholds ---
MIN_USER_INTERACTIONS = 10     # a user must have at least this many interaction rows
MIN_ITEM_INTERACTIONS = 10000  # a game must have at least this many interaction rows

print(f"Filtering criteria: Users >= {MIN_USER_INTERACTIONS} interactions, Items >= {MIN_ITEM_INTERACTIONS} interactions.")

# --- Step 1: keep only sufficiently active users ---
# Both thresholds are evaluated against the counts computed on the FULL data.
users_to_keep = user_interaction_counts.loc[lambda counts: counts >= MIN_USER_INTERACTIONS].index
recommendations_pd_filtered_users = recommendations_pd_full.loc[
    recommendations_pd_full['user_id'].isin(users_to_keep)
]
print(f"Interactions after user filtering: {recommendations_pd_filtered_users.shape[0]:,}")

# --- Step 2: of those rows, keep only sufficiently popular games ---
items_to_keep = item_interaction_counts.loc[lambda counts: counts >= MIN_ITEM_INTERACTIONS].index
recommendations_pd_filtered = recommendations_pd_filtered_users.loc[
    recommendations_pd_filtered_users['app_id'].isin(items_to_keep)
]
print(f"Interactions after item filtering: {recommendations_pd_filtered.shape[0]:,}")

# --- Release the large objects that are no longer needed ---
# items_to_keep (Index), item_interaction_counts (Series), the user-filtered
# intermediate frame, and the original full frame.
del items_to_keep, item_interaction_counts, recommendations_pd_filtered_users, recommendations_pd_full

import gc
gc.collect()

Filtering criteria: Users >= 10 interactions, Items >= 10000 interactions.
Interactions after user filtering: 14,600,426
Interactions after item filtering: 8,289,865


0

In [4]:
# Cell 4: Sample from Filtered Data

SEED = 42  # fixed seed so the row order is reproducible
print(f"\nFiltered interaction count: {recommendations_pd_filtered.shape[0]:,}")

# frac=1 keeps every row; the only effect is a seeded shuffle of the row order.
recommendations_pd_final = recommendations_pd_filtered.sample(frac=1, random_state=SEED)

print(f"Final number of interactions being used (shuffled): {recommendations_pd_final.shape[0]:,}")

# --- Drop the unshuffled frame now that the shuffled copy exists ---
del recommendations_pd_filtered
import gc
gc.collect()


Filtered interaction count: 8,289,865
Final number of interactions being used (shuffled): 8,289,865


0

In [5]:
# Cell 5: Display Sample Data and Finalize Game Metadata

print("\nSample Final Recommendations Data (Filtered & Shuffled):")
print(recommendations_pd_final.head())

# Distinct app_ids that survived the interaction filtering.
final_game_ids = recommendations_pd_final['app_id'].unique()
print(f"\nUnique Games in final dataset: {final_game_ids.size:,}")

print("\nFiltering Games Metadata...")
# Restrict the metadata to games that actually appear in the final interactions.
games_pd_filtered = games_pd.loc[games_pd['app_id'].isin(final_game_ids)]
print(f"Retained metadata for {games_pd_filtered.shape[0]:,} games.")

print("\nSample Filtered Games Metadata (Relevant Columns):")
print(games_pd_filtered.loc[:, ['app_id', 'title', 'tags']].head())

# --- Final Cleanup for this Stage ---
# From here on, games_pd refers to the filtered metadata only.
games_pd = games_pd_filtered

# The temporary name and the ID array are no longer needed.
del games_pd_filtered, final_game_ids
import gc
gc.collect()


Sample Final Recommendations Data (Filtered & Shuffled):
           app_id  helpful  funny   user_id  review_id  hours_log  \
11388928   301640        0      0   8200358   11388928   0.182322   
6423607   1238840        6      0   6700445    6423607   2.509599   
12776548  1252330        0      0   5091898   12776548   1.887070   
24633783   356190        0      0  10368619   24633783   0.095310   
12428935  1167630        0      0   1001456   12428935   3.288402   

          hours_log_scaled  is_recommended_binary  review_year  review_month  \
11388928         -1.930017                      1         2018             3   
6423607          -0.530183                      1         2020            11   
12776548         -0.904629                      0         2021            12   
24633783         -1.982353                      0         2020            10   
12428935         -0.061742                      1         2021            12   

          review_day  review_age_years  helpfu

0

In [6]:
# Cell 6: Create User-Item Interaction Matrix
from scipy.sparse import csr_matrix

print("Creating the User-Item interaction matrix...")
print(f"Input data shape: {recommendations_pd_final.shape}")

# Column whose value becomes the matrix entry for a (user, item) pair.
interaction_value_col = 'hours_log_scaled'

# Aggregate to one value per (user, item) pair. The mean mirrors the default
# aggregation of DataFrame.pivot_table, so duplicate pairs are handled the same
# way as a pivot would handle them. Pairs whose mean is NaN carry no
# information and are dropped.
interaction_means = (
    recommendations_pd_final
    .groupby(['user_id', 'app_id'], sort=False)[interaction_value_col]
    .mean()
    .dropna()
    .reset_index()
)
print(f"Aggregated to {len(interaction_means):,} unique user-item pairs.")

# Map IDs to contiguous 0-based positions. sort=True orders the positions by
# ascending ID, i.e. the same row/column order a pivot table would produce.
user_codes, user_ids = pd.factorize(interaction_means['user_id'], sort=True)
item_codes, item_ids = pd.factorize(interaction_means['app_id'], sort=True)

# Build the CSR matrix directly from (value, (row, col)) triples. This avoids
# materialising a dense users x items table (several GB at this scale) only to
# convert it straight back to a sparse format.
print("Building sparse matrix (CSR format) directly from interaction triples...")
user_item_sparse_matrix = csr_matrix(
    (
        interaction_means[interaction_value_col].to_numpy(dtype=np.float64),
        (user_codes, item_codes),
    ),
    shape=(len(user_ids), len(item_ids)),
)
# Entries that are exactly 0 are indistinguishable from "no interaction" in the
# dense representation; drop them so nnz (and the sparsity figure) match it.
user_item_sparse_matrix.eliminate_zeros()

# --- Create Mappings for Interpretation ---
# ID -> matrix position, needed to link matrix rows/columns back to original IDs.
user_map = {user_id: row for row, user_id in enumerate(user_ids.tolist())}
item_map = {app_id: col for col, app_id in enumerate(item_ids.tolist())}

# Matrix position -> ID.
user_map_inv = {row: user_id for user_id, row in user_map.items()}
item_map_inv = {col: app_id for app_id, col in item_map.items()}

print(f"Sparse matrix shape: {user_item_sparse_matrix.shape}")
total_cells = user_item_sparse_matrix.shape[0] * user_item_sparse_matrix.shape[1]
# Guard against an empty matrix so the report does not raise ZeroDivisionError.
sparsity = 1.0 - user_item_sparse_matrix.nnz / total_cells if total_cells else 0.0
print(f"Sparsity: {sparsity:.4%}")

# --- Cleanup ---
del interaction_means, user_codes, item_codes, user_ids, item_ids
del recommendations_pd_final

import gc
gc.collect()

Creating the User-Item interaction matrix...
Input data shape: (8289865, 15)
Created pivot table with shape: (678264, 738) (Users x Items)
Converting pivot table to sparse matrix (CSR format)...
Sparse matrix shape: (678264, 738)
Sparsity: 98.3439%


0

In [7]:
# Cell 7: Calculate Item-Item Cosine Similarity
from sklearn.metrics.pairwise import cosine_similarity

print("Calculating Item-Item cosine similarity...")

# cosine_similarity compares ROWS. The interaction matrix is users x items, so
# it is transposed to items x users: each row is then one item's vector over
# all users, and row-vs-row similarity is item-vs-item similarity.
item_user_sparse_matrix = user_item_sparse_matrix.transpose()

# dense_output=True returns an (n_items x n_items) NumPy array, which keeps the
# later per-item row lookup simple.
item_similarity_matrix = cosine_similarity(item_user_sparse_matrix, dense_output=True)

# Reading the result:
#   - rows and columns are both item matrix indices;
#   - entry [i, j] is the cosine similarity between item i and item j;
#   - entry [i, i] is 1 for any item with at least one non-zero interaction.

print(f"Calculated item-item similarity matrix with shape: {item_similarity_matrix.shape}")

# Sanity check: the mean of the diagonal should be ~1 up to floating point error.
print(f"Diagonal check (should be close to 1): {item_similarity_matrix.diagonal().mean()}")

# Peek at the top-left 5x5 corner.
print("\nSlice of the similarity matrix:")
print(pd.DataFrame(item_similarity_matrix[:5, :5]))

# --- Cleanup ---
# Only the dense similarity matrix is needed from here on.
del item_user_sparse_matrix, user_item_sparse_matrix

import gc
gc.collect()

Calculating Item-Item cosine similarity...
Calculated item-item similarity matrix with shape: (738, 738)
Diagonal check (should be close to 1): 0.9999999999999977

Slice of the similarity matrix:
          0         1         2         3         4
0  1.000000  0.052652  0.091838  0.042068  0.089529
1  0.052652  1.000000  0.086355  0.170226  0.045969
2  0.091838  0.086355  1.000000  0.066886  0.074018
3  0.042068  0.170226  0.066886  1.000000  0.055073
4  0.089529  0.045969  0.074018  0.055073  1.000000


0

In [8]:
# Cell 8: Define Item-Based Recommendation Function using Cosine Similarity

def recommend_similar_games_cosine(target_app_id: int, top_n: int = 10) -> pd.DataFrame:
    """
    Recommend the games most similar to ``target_app_id``.

    Similarities are read from the pre-calculated item-item cosine similarity
    matrix (``item_similarity_matrix``); ``item_map`` / ``item_map_inv``
    translate between app_ids and matrix indices, and ``games_pd`` supplies
    the titles. All four are module-level objects created by earlier cells.

    Args:
        target_app_id: The app_id of the game for which to find recommendations.
        top_n: The maximum number of similar games to return. Values <= 0
            yield an empty result.

    Returns:
        A pandas DataFrame with columns ``app_id``, ``game_name`` and
        ``similarity_score``, ordered from most to least similar (ties keep
        ascending matrix-index order). The target game itself is never
        included. The DataFrame is empty, but still has these three columns,
        when ``target_app_id`` is unknown to the model or ``top_n`` <= 0.
    """
    result_columns = ['app_id', 'game_name', 'similarity_score']

    # --- Input Validation ---
    if target_app_id not in item_map:
        print(f"Warning: app_id {target_app_id} not found in the filtered interaction data used for the model.")
        return pd.DataFrame(columns=result_columns)
    if top_n <= 0:
        # Nothing requested; a negative value would otherwise produce a
        # meaningless slice.
        return pd.DataFrame(columns=result_columns)

    # --- Retrieve Similarity Scores ---
    item_index = item_map[target_app_id]
    similarity_scores = np.asarray(item_similarity_matrix[item_index]).ravel()

    # --- Rank, Exclude Input Game, Select Top N ---
    # A stable sort on the negated scores gives descending order while keeping
    # ties in ascending index order. The target is removed by its index rather
    # than by assuming it is ranked first: another item can tie with (or,
    # through rounding, exceed) the self-similarity, in which case dropping
    # "the first entry" would discard a real neighbour and return the target.
    ranked_indices = np.argsort(-similarity_scores, kind='stable')
    ranked_indices = ranked_indices[ranked_indices != item_index][:top_n]

    # --- Title Lookup ---
    # Build the app_id -> title mapping once instead of scanning games_pd for
    # every recommended game. keep='first' (the default) matches taking the
    # first matching row.
    unique_games = games_pd.drop_duplicates(subset='app_id')
    title_by_app_id = dict(zip(unique_games['app_id'].tolist(), unique_games['title'].tolist()))

    # --- Map Indices back to App IDs and Get Names ---
    recommendations = []
    for matrix_idx in ranked_indices.tolist():
        if matrix_idx not in item_map_inv:
            # Should not happen if item_map_inv is consistent with the matrix.
            print(f"Warning: Could not map matrix index {matrix_idx} back to an app_id.")
            continue
        recommended_app_id = item_map_inv[matrix_idx]
        recommendations.append({
            'app_id': recommended_app_id,
            # Fallback if the game has no metadata row.
            'game_name': title_by_app_id.get(recommended_app_id, "Title Not Found"),
            'similarity_score': similarity_scores[matrix_idx],
        })

    # Passing the columns explicitly keeps the schema stable even when the
    # list of recommendations is empty.
    return pd.DataFrame(recommendations, columns=result_columns)

# --- Quick Test ---
# Smoke-test the function with an app_id that is guaranteed to be in the model.
if 'item_map' in globals() and item_map: # Check if item_map exists and is not empty
    known_app_ids = list(item_map)
    # Use the fifth app_id (index 4) when there are at least five; otherwise
    # fall back to the last one so a small map does not raise IndexError.
    test_id = known_app_ids[min(4, len(known_app_ids) - 1)]
    print(f"Testing recommendation function with app_id: {test_id}")
    sample_recommendations = recommend_similar_games_cosine(test_id, top_n=10)
    print(sample_recommendations)
else:
    print("Skipping function test as item_map is not available.")

Testing recommendation function with app_id: 240
   app_id                       game_name  similarity_score
0      10                  Counter-Strike          0.089529
1      80  Counter-Strike: Condition Zero          0.074018
2     500                     Left 4 Dead          0.070424
3     220                     Half-Life 2          0.055073
4     550                   Left 4 Dead 2          0.048502
5      70                       Half-Life          0.045969
6     400                          Portal          0.035202
7    1250                   Killing Floor          0.034644
8     620                        Portal 2          0.031148
9     380        Half-Life 2: Episode One          0.028551


In [None]:
# Cell 9: Save Model Artifacts for Streamlit App
import pickle
import os


def _save_artifact(obj, path: str, description: str) -> None:
    """Pickle ``obj`` to ``path`` and report where it was written."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f)
    print(f"Saved {description} to: {path}")


print("Saving model artifacts...")

# Define filenames for the artifacts
similarity_matrix_filename = "item_similarity_matrix.pkl"
item_map_filename = "item_map.pkl"
item_map_inv_filename = "item_map_inv.pkl"
games_df_filename = "filtered_games_metadata.pkl" # Save the filtered games_pd

# Define the target directory and create it if it doesn't exist
models_dir = os.path.join(BASE_DIR, "models")
os.makedirs(models_dir, exist_ok=True)

# Full output path of each artifact
similarity_matrix_path = os.path.join(models_dir, similarity_matrix_filename)
item_map_path = os.path.join(models_dir, item_map_filename)
item_map_inv_path = os.path.join(models_dir, item_map_inv_filename)
games_df_path = os.path.join(models_dir, games_df_filename)

# --- Objects to Save ---
# 1. The Item-Item Cosine Similarity Matrix (NumPy array)
_save_artifact(item_similarity_matrix, similarity_matrix_path, "item similarity matrix")

# 2. Item Map (app_id -> matrix index dictionary)
_save_artifact(item_map, item_map_path, "item map")

# 3. Inverse Item Map (matrix index -> app_id dictionary)
_save_artifact(item_map_inv, item_map_inv_path, "inverse item map")

# 4. Filtered Games DataFrame (pandas DataFrame): games_pd as it stands when
#    this cell runs, i.e. the filtered metadata assigned in Cell 5.
_save_artifact(games_pd, games_df_path, "filtered games metadata DataFrame")

# NOTE: item_map and item_map_inv are intentionally NOT deleted here.
# recommend_similar_games_cosine reads them as globals, so deleting them would
# break the function for the rest of the session and make this cell raise
# NameError when re-run. They are small dictionaries, so keeping them costs
# essentially no memory.

Saving model artifacts...
Saved item similarity matrix to: c:\Users\wbrya\OneDrive\Documents\GitHub\MovieLens-Recommender-System\models\item_similarity_matrix.pkl
Saved item map to: c:\Users\wbrya\OneDrive\Documents\GitHub\MovieLens-Recommender-System\models\item_map.pkl
Saved inverse item map to: c:\Users\wbrya\OneDrive\Documents\GitHub\MovieLens-Recommender-System\models\item_map_inv.pkl
Saved filtered games metadata DataFrame to: c:\Users\wbrya\OneDrive\Documents\GitHub\MovieLens-Recommender-System\models\filtered_games_metadata.pkl
