In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import os

# Show every column when a DataFrame is rendered (the game table previewed below is wide).
pd.set_option('display.max_columns', None)
# Remove the row cap as well.
# NOTE(review): with no row cap, displaying a large frame without .head() renders
# every row - confirm this is intended for a multi-million-row dataset.
pd.set_option('display.max_rows', None)

In [2]:
# Move up one level from 'notebooks/' to the project root.
# NOTE: this relies on the kernel's working directory being the notebooks folder.
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))


def load_csv_if_exists(csv_path):
    """Read ``csv_path`` into a DataFrame, or return ``None`` if the file is absent.

    A missing file is reported with a warning instead of being skipped silently,
    so a later failure on a ``None`` frame can be traced back to the missing input.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame or None
        The parsed file, or ``None`` when ``csv_path`` does not exist.
    """
    import warnings  # local import keeps this cell self-contained

    if not os.path.exists(csv_path):
        warnings.warn(f"Data file not found, returning None: {csv_path}", stacklevel=2)
        return None
    return pd.read_csv(csv_path)


# Paths are joined component by component so the separators are consistent on every OS.
recommendations_path = os.path.join(BASE_DIR, "data", "external", "recommendations.csv")
recommendations_pd = load_csv_if_exists(recommendations_path)

games_path = os.path.join(BASE_DIR, "data", "external", "games_tagged.csv")
games_pd = load_csv_if_exists(games_path)

users_path = os.path.join(BASE_DIR, "data", "external", "users.csv")
users_pd = load_csv_if_exists(users_path)

In [3]:
# Number of rows in the loaded recommendations table.
len(recommendations_pd)


41154794

In [4]:
# Keep only the first 5,154,794 rows of the recommendations table; every later
# cell works on this truncated frame.
recommendations_pd = recommendations_pd.iloc[:5154794]

len(recommendations_pd)

5154794

In [5]:
# Preview the first rows of the recommendations table.
recommendations_pd.head()

Unnamed: 0,app_id,helpful,funny,user_id,review_id,hours_log,hours_log_scaled,is_recommended_binary,review_year,review_month,review_day,review_age_years,helpfulness_ratio,helpful_log,funny_log
0,975370,0,0,51580,0,3.618993,0.137106,1,2022,12,12,2.056126,0.0,0.0,0.0
1,304390,4,0,2586,1,2.525729,-0.520482,0,2017,2,17,7.871321,0.8,1.609438,0.0
2,1085660,2,0,253880,2,5.821566,1.46193,1,2019,11,17,5.125257,0.666667,1.098612,0.0
3,703080,0,0,259432,3,3.346389,-0.026863,1,2022,9,23,2.275154,0.0,0.0,0.0
4,526870,0,0,23869,4,2.186051,-0.724794,1,2021,1,10,3.975359,0.0,0.0,0.0


In [6]:
# Preview the first rows of the games table (one row per app_id, including its title).
games_pd.head()

Unnamed: 0,app_id,title,positive_ratio,user_reviews,price_final,price_original,discount,steam_deck,price_final_log,price_original_log,win_binary,mac_binary,linux_binary,steam_deck_binary,price_final_scaled,price_original_scaled,reviews_per_dollar,positive_ratio_per_dollar,rating_encoded_Mostly Negative,rating_encoded_Mostly Positive,rating_encoded_Negative,rating_encoded_Overwhelmingly Negative,rating_encoded_Overwhelmingly Positive,rating_encoded_Positive,rating_encoded_Very Negative,rating_encoded_Very Positive,release_year,release_month,release_day,game_age_years,user_reviews_log,user_reviews_log_scaled,tags
0,13500,Prince of Persia: Warrior Within™,84,2199,9.99,9.99,0.0,True,2.396986,2.396986,1,0,0,1,0.118957,0.109779,219.9,8.4,0,0,0,0,0,0,0,1,2008,11,21,16.112252,7.696213,1.810612,"['Action', 'Adventure', 'Parkour', 'Third Pers..."
1,22364,BRINK: Agents of Change,85,21,2.99,2.99,0.0,True,1.383791,1.383791,1,0,0,1,-0.488996,-0.498552,7.0,28.333333,0,0,0,0,0,1,0,0,2011,8,3,13.415469,3.091042,-0.722796,['Action']
2,113020,Monaco: What's Yours Is Mine,92,3722,14.99,14.99,0.0,True,2.771964,2.771964,1,1,1,1,0.553209,0.5443,248.133333,6.133333,0,0,0,0,0,0,0,1,2013,4,24,11.690623,8.222285,2.100016,"['Co-op', 'Stealth', 'Indie', 'Heist', 'Local ..."
3,226560,Escape Dead Island,61,873,14.99,14.99,0.0,True,2.771964,2.771964,1,0,0,1,0.553209,0.5443,58.2,4.066667,0,0,0,0,0,0,0,0,2014,11,18,10.121834,6.77308,1.302776,"['Zombies', 'Adventure', 'Survival', 'Action',..."
4,249050,Dungeon of the ENDLESS™,88,8784,11.99,11.99,0.0,True,2.56418,2.56418,1,1,0,1,0.292658,0.283587,732.0,7.333333,0,0,0,0,0,0,0,1,2014,10,27,10.182067,9.080801,2.572305,"['Roguelike', 'Strategy', 'Tower Defense', 'Pi..."


In [7]:
# users_pd.head()

# Item-Based Collaborative Filtering

## Recommend games that are similar to the ones a user liked in the past.
## Similarity between games is calculated from the ratings or preferences of other users.

## Example
1. Alex liked GTA 5 and Fallout 4.
2. Brad liked GTA 5 and Fallout 4, and also liked Skyrim.
3. Suggest Skyrim to Alex, because people who like GTA 5 and Fallout 4 are likely to enjoy Skyrim too.

## User-Item Interaction Matrix
- Rows = users
- Columns = items (games)
- Cells = interaction value between a user and an item




In [8]:
import pandas as pd
import implicit
from scipy.sparse import csr_matrix

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# 1. Prepare User-Item Matrix
# Dense user-by-game table: one row per user_id, one column per app_id, and each
# cell holds that pair's hours_log_scaled value. User/game pairs with no review
# come out of the pivot as NaN and are replaced with 0.
# NOTE(review): hours_log_scaled can be negative (see the preview rows above) -
# confirm that negative interaction values are what the ALS model should receive.
user_item_data = pd.pivot_table(
    recommendations_pd,
    values='hours_log_scaled',     # Interaction strength (playtime)
    index='user_id',               # Users as rows
    columns='app_id',              # Games as columns
).fillna(0)

## *user_item_data* looks like this

| user\_id | game\_app\_id\_1 | game\_app\_id\_2 | game\_app\_id\_3 | ... | game\_app\_id\_N |
| :-------- | :---------------- | :---------------- | :---------------- | :--- | :---------------- |
| user\_1   | 0                | 1.2               | 0                | ...  | 0                |
| user\_2   | 0.8               | 0                | 2.5               | ...  | 0                |
| user\_3   | 0                | 0                | 0                | ...  | 0.5               |
| ...       | ...               | ...               | ...               | ...  | ...               |
| user\_M   | 1.5               | 0                | 0                | ...  | 0                |

In [10]:
# Most cells of the user-item table are 0, so convert it to a Compressed Sparse
# Row (CSR) matrix, which stores only the non-zero entries.
sparse_user_item = csr_matrix(user_item_data)

# 2. Train ALS (Alternating Least Squares) Model
# ALS learns 'latent factors' - hidden user tastes and game features.
#   factors:        number of latent factors (model complexity)
#   regularization: penalty that helps prevent overfitting
#   random_state:   fixed seed so the random factor initialisation is the same on
#                   every Restart & Run All, keeping the similarity results repeatable
model = implicit.als.AlternatingLeastSquares(
    factors=50,
    regularization=0.1,
    random_state=42,
)

model.fit(sparse_user_item)
# Fitting decomposes the interaction matrix into two lower-dimensional matrices:
# 1. User Latent Factor Matrix: represents each user by a vector of latent factors.
# 2. Item Latent Factor Matrix: represents each item (game) by a vector of latent factors.

# NOTE: latent = hidden features or characteristics describing users and items.

  check_blas_config()
100%|██████████| 15/15 [01:57<00:00,  7.83s/it]


In [11]:
def recommend_similar_games(app_id, model, games_df, user_item_matrix, top_n=10):
    """
    Recommends games similar to the input app_id using a trained model.

    Parameters
    ----------
    app_id
        Identifier of the query game; must be one of ``user_item_matrix.columns``.
    model
        Trained model exposing ``similar_items(item_index, N=...)`` that returns
        a pair ``(item_indices, scores)``.
    games_df : pandas.DataFrame
        Game metadata with at least the columns ``app_id`` and ``title``.
    user_item_matrix : pandas.DataFrame
        User-item table whose column labels are app_ids, in the same order as
        the item indices used by ``model``.
    top_n : int, default 10
        Number of similar items requested from the model.

    Returns
    -------
    pandas.DataFrame
        Columns ``app_id``, ``game_name`` and ``similarity_score``, one row per
        item returned by the model. ``game_name`` is missing (None/NaN) when the
        app_id has no row in ``games_df``.

    Raises
    ------
    KeyError
        If ``app_id`` is not a column of ``user_item_matrix``.
    """
    item_columns = user_item_matrix.columns
    game_id = item_columns.get_loc(app_id)  # positional item index expected by the model
    similar_item_ids, scores = model.similar_items(game_id, N=top_n)

    # Build the app_id -> title lookup once instead of filtering games_df for every
    # result. keep='first' matches taking the first matching row for duplicated ids.
    title_by_app_id = (
        games_df.drop_duplicates(subset='app_id', keep='first')
        .set_index('app_id')['title']
        .to_dict()
    )

    recommendations = []
    for item_id, score in zip(similar_item_ids, scores):
        recommended_app_id = item_columns[item_id]
        recommendations.append({
            'app_id': recommended_app_id,
            # .get() yields None for games absent from games_df rather than raising.
            'game_name': title_by_app_id.get(recommended_app_id),
            'similarity_score': score,
        })

    # Explicit columns keep the result schema stable even when nothing is returned.
    return pd.DataFrame(recommendations, columns=['app_id', 'game_name', 'similarity_score'])

In [12]:
# Query game for the demo; first report whether it occurs in the review data at all.
example_app_id = 975370
print(f"Is app_id {example_app_id} in recommendations data? {example_app_id in recommendations_pd['app_id'].values}")

# The lookup only works for games that are columns of the user-item matrix.
if example_app_id not in user_item_data.columns:
    print(f"\nError: app_id {example_app_id} not found in user interaction data.")
else:
    similar_games = recommend_similar_games(
        app_id=example_app_id,
        model=model,
        games_df=games_pd,
        user_item_matrix=user_item_data,
    )
    print(f"\nTop games similar to app_id {example_app_id}:")
    print(similar_games)

Is app_id 975370 in recommendations data? True

Top games similar to app_id 975370:
    app_id                   game_name  similarity_score
0   975370              Dwarf Fortress          1.000000
1   392160             X4: Foundations          0.822539
2  1176470               Terra Invicta          0.809208
3   599140            Graveyard Keeper          0.797850
4  1527950                    Wartales          0.758888
5   602960                  Barotrauma          0.742106
6   784080  MechWarrior 5: Mercenaries          0.703004
7   973230     We Who Are About To Die          0.691038
8  1336490           Against the Storm          0.677302
9  1113120                       IXION          0.667671


In [13]:
import os
import pickle

# Join the path component by component so the separators are consistent on every OS.
model_filename = os.path.join(BASE_DIR, "models", "item_cf_als_model.pkl")

# open(..., 'wb') does not create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(model_filename), exist_ok=True)

# Serialise the trained model to disk.
with open(model_filename, 'wb') as file:
    pickle.dump(model, file)

def human_file_size(size_bytes):
    """Converts bytes to human-readable file size (KB, MB, GB, etc.).

    Parameters
    ----------
    size_bytes : int or float
        Size in bytes.

    Returns
    -------
    str
        The size with two decimals and a unit from B to TB, using 1024 as the
        step between units. Sizes of 1024 TB or more are still expressed in TB.
    """
    units = ['B', 'KB', 'MB', 'GB', 'TB']
    size = float(size_bytes)
    # Step through every unit except the largest; dividing only on these steps
    # leaves `size` expressed in the largest unit when the loop runs out.
    for unit in units[:-1]:
        if size < 1024:
            return f"{size:.2f} {unit}"
        size /= 1024
    return f"{size:.2f} {units[-1]}"  # Still large: report in the largest unit

# Look up the size of the saved pickle on disk and report it in a readable unit.
file_size_bytes = os.stat(model_filename).st_size
human_size = human_file_size(file_size_bytes)

print(f"\nModel saved to: {model_filename} (Size: {human_size})")


Model saved to: c:\Users\wbrya\OneDrive\Documents\GitHub\MovieLens-Recommender-System\models/item_cf_als_model.pkl (Size: 683.99 MB)
