In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import os

# Show every column so wide frames (the games table has dozens) are never elided.
pd.set_option('display.max_columns', None)
# Cap rendered rows: the tables loaded below hold millions of rows, and an unlimited
# row display would try to render all of them if a frame is ever shown without .head().
pd.set_option('display.max_rows', 100)

In [2]:
# The working directory is 'notebooks/', so the project root is its parent
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))


def _read_csv_if_present(csv_path):
    """Load ``csv_path`` into a DataFrame, or return None when the file does not exist."""
    if not os.path.exists(csv_path):
        return None
    return pd.read_csv(csv_path)


# Locations of the three input tables, relative to the project root
recommendations_path = os.path.join(BASE_DIR, "data/external/recommendations.csv")
games_path = os.path.join(BASE_DIR, "data/external/games_tagged.csv")
users_path = os.path.join(BASE_DIR, "data/external/users.csv")

# Each frame is None when its file is missing
recommendations_pd = _read_csv_if_present(recommendations_path)
games_pd = _read_csv_if_present(games_path)
users_pd = _read_csv_if_present(users_path)

In [3]:
# Report where the file was expected when loading failed; otherwise show the row count
if recommendations_pd is None:
    print("recommendations.csv not found at:", recommendations_path)
else:
    print(len(recommendations_pd))


41154794


In [4]:
# Row count of the full recommendations table; unlike the guarded check in the
# previous cell, this raises if the CSV was not found and the frame is None.
len(recommendations_pd)


41154794

In [5]:
# Keep only the leading 5,154,794 rows of the recommendations table for the rest of the notebook
recommendations_pd = recommendations_pd.iloc[:5_154_794]

# Row count after truncation
recommendations_pd.shape[0]

5154794

In [6]:
# Preview the first rows of the (truncated) recommendations table
recommendations_pd.head()

Unnamed: 0,app_id,helpful,funny,user_id,review_id,hours_log,hours_log_scaled,is_recommended_binary,review_year,review_month,review_day,review_age_years,helpfulness_ratio,helpful_log,funny_log
0,975370,0,0,51580,0,3.618993,0.137106,1,2022,12,12,2.056126,0.0,0.0,0.0
1,304390,4,0,2586,1,2.525729,-0.520482,0,2017,2,17,7.871321,0.8,1.609438,0.0
2,1085660,2,0,253880,2,5.821566,1.46193,1,2019,11,17,5.125257,0.666667,1.098612,0.0
3,703080,0,0,259432,3,3.346389,-0.026863,1,2022,9,23,2.275154,0.0,0.0,0.0
4,526870,0,0,23869,4,2.186051,-0.724794,1,2021,1,10,3.975359,0.0,0.0,0.0


In [7]:
# Preview the first rows of the games table (one row per app_id, with title and tags)
games_pd.head()

Unnamed: 0,app_id,title,positive_ratio,user_reviews,price_final,price_original,discount,steam_deck,price_final_log,price_original_log,win_binary,mac_binary,linux_binary,steam_deck_binary,price_final_scaled,price_original_scaled,reviews_per_dollar,positive_ratio_per_dollar,rating_encoded_Mostly Negative,rating_encoded_Mostly Positive,rating_encoded_Negative,rating_encoded_Overwhelmingly Negative,rating_encoded_Overwhelmingly Positive,rating_encoded_Positive,rating_encoded_Very Negative,rating_encoded_Very Positive,release_year,release_month,release_day,game_age_years,user_reviews_log,user_reviews_log_scaled,tags
0,13500,Prince of Persia: Warrior Within™,84,2199,9.99,9.99,0.0,True,2.396986,2.396986,1,0,0,1,0.118957,0.109779,219.9,8.4,0,0,0,0,0,0,0,1,2008,11,21,16.112252,7.696213,1.810612,"('Action', 'Adventure', 'Parkour', 'Third Pers..."
1,22364,BRINK: Agents of Change,85,21,2.99,2.99,0.0,True,1.383791,1.383791,1,0,0,1,-0.488996,-0.498552,7.0,28.333333,0,0,0,0,0,1,0,0,2011,8,3,13.415469,3.091042,-0.722796,"('Action',)"
2,113020,Monaco: What's Yours Is Mine,92,3722,14.99,14.99,0.0,True,2.771964,2.771964,1,1,1,1,0.553209,0.5443,248.133333,6.133333,0,0,0,0,0,0,0,1,2013,4,24,11.690623,8.222285,2.100016,"('Co-op', 'Stealth', 'Indie', 'Heist', 'Local ..."
3,226560,Escape Dead Island,61,873,14.99,14.99,0.0,True,2.771964,2.771964,1,0,0,1,0.553209,0.5443,58.2,4.066667,0,0,0,0,0,0,0,0,2014,11,18,10.121834,6.77308,1.302776,"('Zombies', 'Adventure', 'Survival', 'Action',..."
4,249050,Dungeon of the ENDLESS™,88,8784,11.99,11.99,0.0,True,2.56418,2.56418,1,1,0,1,0.292658,0.283587,732.0,7.333333,0,0,0,0,0,0,0,1,2014,10,27,10.182067,9.080801,2.572305,"('Roguelike', 'Strategy', 'Tower Defense', 'Pi..."


In [8]:
# Preview the first rows of the users table
users_pd.head()

Unnamed: 0,user_id,products,reviews
0,7360263,359,0
1,14020781,156,1
2,8762579,329,4
3,4820647,176,4
4,5167327,98,2


# Content Based Model



This content-based filtering model uses a TF-IDF vectorizer and cosine similarity. It recommends games that are similar to a given game the player has played, based on each game's title and tags.

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import pandas as pd

# Combine title and tags into one text document per game.
# Missing values are replaced with '' before stringifying: a NaN title would make the
# concatenated content NaN (which TfidfVectorizer refuses to vectorize), and a NaN tag
# would otherwise become the literal token 'nan' shared by every untagged game.
games_pd['tags'] = games_pd['tags'].fillna('').astype(str)
games_pd['content'] = games_pd['title'].fillna('').astype(str) + ' ' + games_pd['tags']

# TF-IDF Vectorizer over 1- to 3-word n-grams, with English stop words removed
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 3))
tfidf_matrix = tfidf.fit_transform(games_pd['content'])  # Sparse matrix, one row per game

In [10]:
# Brute-force cosine nearest-neighbour search over the TF-IDF rows
nn = NearestNeighbors(n_neighbors=6, metric='cosine', algorithm='brute')
nn.fit(tfidf_matrix)

# Reverse index to map app_id → row position.
# The lookups below use the value positionally (tfidf_matrix[idx], .iloc), so the map
# stores positions 0..n-1 instead of index labels, which only coincide with positions
# while games_pd keeps its default index.
indices = pd.Series(range(len(games_pd)), index=games_pd['app_id'])
# Keep the first row of any repeated app_id so a lookup always yields a single position
indices = indices[~indices.index.duplicated(keep='first')]


In [11]:
def get_recommendations_knn(app_id, n_recommendations=5):
    """Return the games whose TF-IDF content is closest to the given game.

    Parameters
    ----------
    app_id : int
        Application id to look up in the ``indices`` map.
    n_recommendations : int, default 5
        Maximum number of similar games to return.

    Returns
    -------
    pandas.DataFrame or list
        A frame with ``app_id`` and ``title`` columns in the order returned by
        ``nn.kneighbors``, or the list ``["Game not found"]`` when ``app_id`` is
        not present in ``indices``.
    """
    if app_id not in indices:
        return ["Game not found"]

    idx = indices[app_id]
    # A repeated app_id makes the lookup return a Series; use its first entry.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Request one extra neighbour because the query game is itself in the fitted data,
    # but never more rows than were fitted (kneighbors raises in that case).
    n_query = min(n_recommendations + 1, tfidf_matrix.shape[0])
    _, indices_knn = nn.kneighbors(tfidf_matrix[idx], n_neighbors=n_query)

    # indices_knn is a 2D array; take the first row. Drop the query game by its row
    # position rather than assuming it is ranked first: ties (e.g. games with identical
    # content) can place another row ahead of it.
    recommended_idx = [i for i in indices_knn[0] if i != idx][:n_recommendations]

    return games_pd[['app_id', 'title']].iloc[recommended_idx].reset_index(drop=True)


In [12]:
# Example lookup: 13500 is "Prince of Persia: Warrior Within™" in the games preview above
get_recommendations_knn(13500)  # Try any app_id from your dataset


Unnamed: 0,app_id,title
0,13600,Prince of Persia®: The Sands of Time
1,33320,Prince of Persia: The Forgotten Sands™
2,13530,Prince of Persia: The Two Thrones™
3,19980,Prince of Persia®
4,1207010,Me and myself


In [18]:
def get_recommendations_knn_verbose(app_id, n_recommendations=5):
    """Return the nearest games to ``app_id`` together with their tags and rating.

    Parameters
    ----------
    app_id : int
        Application id to look up in the ``indices`` map.
    n_recommendations : int, default 5
        Maximum number of similar games to return.

    Returns
    -------
    pandas.DataFrame or list
        A frame with ``app_id``, ``title``, ``tags`` and ``positive_ratio`` columns in
        the order returned by ``nn.kneighbors``, or the list ``["Game not found"]``
        when ``app_id`` is not present in ``indices``.
    """
    if app_id not in indices:
        return ["Game not found"]

    idx = indices[app_id]
    # A repeated app_id makes the lookup return a Series; use its first entry.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # One extra neighbour covers the query game itself; cap at the number of fitted rows
    # because kneighbors raises when asked for more.
    n_query = min(n_recommendations + 1, tfidf_matrix.shape[0])
    _, indices_knn = nn.kneighbors(tfidf_matrix[idx], n_neighbors=n_query)

    # Exclude the query game by row position instead of assuming it is the first hit;
    # ties in distance can rank another game ahead of it.
    recommended_idx = [i for i in indices_knn[0] if i != idx][:n_recommendations]

    # Include tags or other metadata
    return games_pd[['app_id', 'title', 'tags', 'positive_ratio']].iloc[recommended_idx].reset_index(drop=True)


In [21]:
# Example lookup: 1207010 is "Me and myself", the last hit in the earlier non-verbose example output
get_recommendations_knn_verbose(1207010)  # Try any app_id from your dataset


Unnamed: 0,app_id,title,tags,positive_ratio
0,447210,Blink,"('Adventure', 'Indie', 'Puzzle Platformer')",92
1,782380,Mirage of Dragon,"('Action', 'Adventure', 'Indie', 'Puzzle Platf...",74
2,1725470,Enionax,"('Precision Platformer', '2D Platformer', 'Ret...",75
3,342100,Sym,"('Indie', 'Puzzle Platformer', 'Platformer')",80
4,982540,Hyss,"('Adventure', 'Indie', 'Puzzle', 'Puzzle Platf...",93


In [17]:
def get_user_favorites(user_id, recommendations=None):
    """List the app_ids a user has positively recommended.

    Parameters
    ----------
    user_id : int
        User whose reviews are looked up.
    recommendations : pandas.DataFrame, optional
        Review table with ``user_id``, ``app_id`` and ``is_recommended_binary``
        columns. Defaults to the notebook-level ``recommendations_pd``.

    Returns
    -------
    list
        ``app_id`` values, in table order, of the rows where the user matches and
        ``is_recommended_binary`` equals 1. Empty when there are none.
    """
    if recommendations is None:
        recommendations = recommendations_pd

    is_liked_by_user = (
        (recommendations['user_id'] == user_id)
        & (recommendations['is_recommended_binary'] == 1)
    )
    return recommendations.loc[is_liked_by_user, 'app_id'].tolist()

def recommend_for_user_knn(user_id, top_n=5):
    """Recommend games similar to the ones a user has liked.

    Parameters
    ----------
    user_id : int
        User to recommend for.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or list
        Up to ``top_n`` rows of the frames returned by ``get_recommendations_knn``,
        with duplicates and already-liked games removed and a fresh 0..n-1 index.
        Empty (with ``app_id`` and ``title`` columns) when none of the liked games
        produced a frame. The list ``["No liked games found for this user."]`` is
        returned when the user has no liked games.
    """
    liked_games = get_user_favorites(user_id)
    if not liked_games:
        return ["No liked games found for this user."]

    # Collect the per-game frames and concatenate once instead of growing a frame in the loop.
    per_game_recs = []
    # dict.fromkeys drops repeated ids while keeping their original order.
    for app_id in dict.fromkeys(liked_games):
        game_recs = get_recommendations_knn(app_id)
        # Unknown games come back as a plain list, which pd.concat cannot handle; skip them.
        if isinstance(game_recs, pd.DataFrame):
            per_game_recs.append(game_recs)

    if not per_game_recs:
        return pd.DataFrame(columns=['app_id', 'title'])

    recs = pd.concat(per_game_recs, ignore_index=True)

    # Remove duplicates and already liked games
    recs = recs.drop_duplicates(subset='app_id')
    recs = recs[~recs['app_id'].isin(liked_games)]

    return recs.head(top_n).reset_index(drop=True)