In [1]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Remove pandas' column and row display limits so frames render untruncated.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [2]:
# Move up one level from 'notebooks/' to the project root.
# This is derived from the current working directory, so it assumes the kernel
# was started inside the notebooks/ folder.
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))


def load_csv_if_exists(csv_path):
    """Read ``csv_path`` into a DataFrame, or return ``None`` when the file is absent.

    A missing file is reported with a ``UserWarning`` that names the path, so a
    later failure on a ``None`` table can be traced back to the missing input
    instead of surfacing only as an ``AttributeError``.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to load.

    Returns
    -------
    pandas.DataFrame or None
        The parsed file, or ``None`` if ``csv_path`` does not exist.
    """
    if not os.path.exists(csv_path):
        # stacklevel=2 attributes the warning to the calling line, not this helper.
        warnings.warn(f"Data file not found, table left as None: {csv_path}", stacklevel=2)
        return None
    return pd.read_csv(csv_path)


recommendations_path = os.path.join(BASE_DIR, "data/external/recommendations.csv")
recommendations_pd = load_csv_if_exists(recommendations_path)

games_path = os.path.join(BASE_DIR, "data/external/games_tagged.csv")
games_pd = load_csv_if_exists(games_path)

users_path = os.path.join(BASE_DIR, "data/external/users.csv")
users_pd = load_csv_if_exists(users_path)

In [3]:
# Preview the first five rows (the DataFrame.head() default) of the user-review table.
recommendations_pd.head()

Unnamed: 0,app_id,helpful,funny,user_id,review_id,hours_log,hours_log_scaled,is_recommended_binary,review_year,review_month,review_day,review_age_years,helpfulness_ratio,helpful_log,funny_log
0,975370,0,0,51580,0,3.618993,0.137106,1,2022,12,12,2.056126,0.0,0.0,0.0
1,304390,4,0,2586,1,2.525729,-0.520482,0,2017,2,17,7.871321,0.8,1.609438,0.0
2,1085660,2,0,253880,2,5.821566,1.46193,1,2019,11,17,5.125257,0.666667,1.098612,0.0
3,703080,0,0,259432,3,3.346389,-0.026863,1,2022,9,23,2.275154,0.0,0.0,0.0
4,526870,0,0,23869,4,2.186051,-0.724794,1,2021,1,10,3.975359,0.0,0.0,0.0


In [4]:
# Preview the first five rows of the games table; `title` and `tags` are the
# columns used further down to build the content features.
games_pd.head()

Unnamed: 0,app_id,title,positive_ratio,user_reviews,price_final,price_original,discount,steam_deck,price_final_log,price_original_log,win_binary,mac_binary,linux_binary,steam_deck_binary,price_final_scaled,price_original_scaled,reviews_per_dollar,positive_ratio_per_dollar,rating_encoded_Mostly Negative,rating_encoded_Mostly Positive,rating_encoded_Negative,rating_encoded_Overwhelmingly Negative,rating_encoded_Overwhelmingly Positive,rating_encoded_Positive,rating_encoded_Very Negative,rating_encoded_Very Positive,release_year,release_month,release_day,game_age_years,user_reviews_log,user_reviews_log_scaled,tags
0,13500,Prince of Persia: Warrior Within™,84,2199,9.99,9.99,0.0,True,2.396986,2.396986,1,0,0,1,0.118957,0.109779,219.9,8.4,0,0,0,0,0,0,0,1,2008,11,21,16.112252,7.696213,1.810612,"('Action', 'Adventure', 'Parkour', 'Third Pers..."
1,22364,BRINK: Agents of Change,85,21,2.99,2.99,0.0,True,1.383791,1.383791,1,0,0,1,-0.488996,-0.498552,7.0,28.333333,0,0,0,0,0,1,0,0,2011,8,3,13.415469,3.091042,-0.722796,"('Action',)"
2,113020,Monaco: What's Yours Is Mine,92,3722,14.99,14.99,0.0,True,2.771964,2.771964,1,1,1,1,0.553209,0.5443,248.133333,6.133333,0,0,0,0,0,0,0,1,2013,4,24,11.690623,8.222285,2.100016,"('Co-op', 'Stealth', 'Indie', 'Heist', 'Local ..."
3,226560,Escape Dead Island,61,873,14.99,14.99,0.0,True,2.771964,2.771964,1,0,0,1,0.553209,0.5443,58.2,4.066667,0,0,0,0,0,0,0,0,2014,11,18,10.121834,6.77308,1.302776,"('Zombies', 'Adventure', 'Survival', 'Action',..."
4,249050,Dungeon of the ENDLESS™,88,8784,11.99,11.99,0.0,True,2.56418,2.56418,1,1,0,1,0.292658,0.283587,732.0,7.333333,0,0,0,0,0,0,0,1,2014,10,27,10.182067,9.080801,2.572305,"('Roguelike', 'Strategy', 'Tower Defense', 'Pi..."


In [5]:
# Preview the first five rows of the users table.
users_pd.head()

Unnamed: 0,user_id,products,reviews
0,7360263,359,0
1,14020781,156,1
2,8762579,329,4
3,4820647,176,4
4,5167327,98,2


# Content-Based Filtering

## Recommend games whose content (titles and tags) is similar to games the user has already played

#### Example
- Kenji liked Prince of Persia: Warrior Within™
- That game has tags like 'Action', 'Adventure', 'Parkour'
- The system finds other games with similar titles and tags (e.g., The Sands of Time, The Two Thrones)
- Suggest those similar games to Kenji based on content features alone, not on what other users did


In [38]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import pandas as pd

In [39]:
# Combine title and tags into one text document per game.
# Tags are converted to string just in case. Titles are filled with '' and cast
# to str as well: a missing title would otherwise make the concatenation NaN,
# and TfidfVectorizer refuses NaN documents.
games_pd['tags'] = games_pd['tags'].astype(str)
games_pd['content'] = games_pd['title'].fillna('').astype(str) + ' ' + games_pd['tags']

In [40]:
# TF-IDF Vectorizer
# Features are word 1- to 3-grams of the combined title + tags text, with
# scikit-learn's built-in English stop-word list removed.
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 3))
tfidf_matrix = tfidf.fit_transform(games_pd['content'])  # Sparse matrix, one row per entry of games_pd['content']
from sklearn.metrics.pairwise import linear_kernel

In [41]:
# Brute-force cosine nearest-neighbour search over the TF-IDF rows.
# n_neighbors=6 is only the estimator default; get_recommendations_knn passes
# its own n_neighbors on every query.
nn = NearestNeighbors(n_neighbors=6, metric='cosine', algorithm='brute')
nn.fit(tfidf_matrix)

# Reverse index to map app_id → row position.
# Row positions (0..n-1) are stored rather than games_pd.index labels because the
# values are used positionally (tfidf_matrix[idx] and games_pd.iloc[...]); the two
# only coincide when games_pd has a default RangeIndex.
indices = pd.Series(np.arange(len(games_pd)), index=games_pd['app_id'])
# Keep the first row for a repeated app_id so a lookup always yields a single position.
indices = indices[~indices.index.duplicated(keep='first')]


In [42]:
# 🧪 Evaluation Function
def evaluate_recommendation(user_id, recommended_app_ids, k=5):
    """
    Evaluate recommendations based on known liked games by user.

    A game counts as "liked" when the user has a row in the module-level
    ``recommendations_pd`` with ``is_recommended_binary == 1``.

    Parameters
    ----------
    user_id
        Value matched against ``recommendations_pd['user_id']``.
    recommended_app_ids : iterable
        Recommended app ids, best first. Repeated ids are counted once.
    k : int, default 5
        Only the first ``k`` recommendations are scored.

    Returns
    -------
    tuple of float
        ``(precision, recall, f1)``, each rounded to two decimals:

        * precision = hits / number of scored recommendations
        * recall    = hits / number of distinct games the user liked
        * f1        = harmonic mean of the two

        Every metric is 0.0 when there is nothing to score or no hit.
    """
    user_reviews = recommendations_pd[recommendations_pd['user_id'] == user_id]
    liked_app_ids = set(user_reviews.loc[user_reviews['is_recommended_binary'] == 1, 'app_id'])

    # De-duplicate while keeping rank order, then cut to the top k.
    top_k = list(dict.fromkeys(recommended_app_ids))[:k]
    hits = sum(1 for app_id in top_k if app_id in liked_app_ids)

    precision = hits / len(top_k) if top_k else 0.0
    # Recall is measured against everything the user liked. Scoring it only over
    # the recommended items (all of which are predicted positive) can never
    # produce a false negative, so it would collapse to either 0 or 1.
    recall = hits / len(liked_app_ids) if liked_app_ids else 0.0
    f1 = 2 * precision * recall / (precision + recall) if hits else 0.0

    return round(precision, 2), round(recall, 2), round(f1, 2)

In [43]:
# 🎮 Game-Based Recommendation Function
def get_recommendations_knn(app_id, user_id=None, k=5):
    """
    Recommends similar games based on game content (title + tags).
    Optionally evaluates performance if a user_id is provided.

    Parameters
    ----------
    app_id
        Id of the query game; must be a key of the module-level ``indices`` Series.
    user_id : optional
        When given, the recommendations are scored with
        ``evaluate_recommendation`` and the three metrics are printed.
    k : int, default 5
        Number of recommendations wanted. Fewer rows are returned when the
        fitted matrix has fewer than ``k`` other rows.

    Returns
    -------
    pandas.DataFrame or list
        Frame with columns ``app_id``, ``title``, ``tags``, ordered as returned
        by ``nn.kneighbors``; or the list ``["Game not found"]`` when ``app_id``
        is not in ``indices``.
    """
    if app_id not in indices:
        return ["Game not found"]

    # Show context game
    game_row = games_pd[games_pd['app_id'] == app_id]
    print(f"\n🎮 Recommending based on:\nTitle: {game_row['title'].values[0]}\nTags: {game_row['tags'].values[0]}")

    # Get similar games
    idx = indices[app_id]
    if isinstance(idx, pd.Series):
        # A repeated app_id key yields several rows; use the first one.
        idx = idx.iloc[0]

    # One extra neighbour leaves room for the query game itself, but never ask
    # for more neighbours than there are rows (kneighbors raises in that case).
    n_query = min(k + 1, tfidf_matrix.shape[0])
    _, indices_knn = nn.kneighbors(tfidf_matrix[idx], n_neighbors=n_query)

    # Remove the query row by identity instead of assuming it is ranked first:
    # another game with identical text ties at distance 0 and may precede it.
    recommended_idx = [row for row in indices_knn[0] if row != idx][:k]

    recommendations = games_pd[['app_id', 'title', 'tags']].iloc[recommended_idx].reset_index(drop=True)
    print("\n🧠 Top Recommendations:")
    print(recommendations)

    # Optional Evaluation
    if user_id is not None:
        recommended_app_ids = recommendations['app_id'].tolist()
        precision, recall, f1 = evaluate_recommendation(user_id, recommended_app_ids, k)

        print(f"\n📊 Evaluation for user {user_id}:")
        print(f"Precision@{k}: {precision}")
        print(f"Recall@{k}:    {recall}")
        print(f"F1 Score@{k}:  {f1}")

    return recommendations

In [None]:
# Just show similar games: no user_id is passed, so the evaluation step is skipped.
# 13500 is "Prince of Persia: Warrior Within™" (first row of the games preview).
get_recommendations_knn(13500)


🎮 Recommending based on:
Title: Prince of Persia: Warrior Within™
Tags: ('Action', 'Adventure', 'Parkour', 'Third Person', 'Great Soundtrack', 'Singleplayer', 'Platformer', 'Time Travel', 'Atmospheric', 'Classic', 'Hack and Slash', 'Time Manipulation', 'Gore', 'Fantasy', 'Story Rich', 'Dark', 'Open World', 'Controller', 'Dark Fantasy', 'Puzzle')

🧠 Top Recommendations:
    app_id                                   title  \
0    13600    Prince of Persia®: The Sands of Time   
1    33320  Prince of Persia: The Forgotten Sands™   
2    13530      Prince of Persia: The Two Thrones™   
3    19980                       Prince of Persia®   
4  1207010                           Me and myself   

                                                tags  
0  ('Action', 'Adventure', 'Parkour', 'Platformer...  
1  ('Action', 'Adventure', 'Platformer', 'Parkour...  
2  ('Action', 'Adventure', 'Platformer', 'Parkour...  
3  ('Action', 'Adventure', 'Parkour', 'Platformer...  
4  ('Adventure', 'Indie', '

Unnamed: 0,app_id,title,tags
0,13600,Prince of Persia®: The Sands of Time,"('Action', 'Adventure', 'Parkour', 'Platformer..."
1,33320,Prince of Persia: The Forgotten Sands™,"('Action', 'Adventure', 'Platformer', 'Parkour..."
2,13530,Prince of Persia: The Two Thrones™,"('Action', 'Adventure', 'Platformer', 'Parkour..."
3,19980,Prince of Persia®,"('Action', 'Adventure', 'Parkour', 'Platformer..."
4,1207010,Me and myself,"('Adventure', 'Indie', 'Puzzle Platformer', 'T..."


In [46]:
# Show similar games + evaluate against user’s past reviews
# The metrics compare the recommended games with the games user 253880 has
# marked as recommended (is_recommended_binary == 1) in recommendations_pd.
get_recommendations_knn(382400, user_id=253880)



🎮 Recommending based on:
Title: Dungeons 2 - A Game of Winter
Tags: ('Strategy', 'RPG', 'Simulation')

🧠 Top Recommendations:
    app_id                             title  \
0   382410  Dungeons 2 - A Chance of Dragons   
1   807320        Into the Breach Soundtrack   
2   284441             Tropico 5 - Espionage   
3   949680                   Fantasy Monarch   
4  1052420                   Badland Caravan   

                                                tags  
0                  ('Strategy', 'RPG', 'Simulation')  
1         ('Indie', 'Strategy', 'RPG', 'Simulation')  
2                  ('Strategy', 'RPG', 'Simulation')  
3        ('Strategy', 'RPG', 'Simulation', 'Nudity')  
4  ('Strategy', 'RPG', 'Simulation', 'Early Access')  

📊 Evaluation for user 253880:
Precision@5: 0.0
Recall@5:    0.0
F1 Score@5:  0.0


Unnamed: 0,app_id,title,tags
0,382410,Dungeons 2 - A Chance of Dragons,"('Strategy', 'RPG', 'Simulation')"
1,807320,Into the Breach Soundtrack,"('Indie', 'Strategy', 'RPG', 'Simulation')"
2,284441,Tropico 5 - Espionage,"('Strategy', 'RPG', 'Simulation')"
3,949680,Fantasy Monarch,"('Strategy', 'RPG', 'Simulation', 'Nudity')"
4,1052420,Badland Caravan,"('Strategy', 'RPG', 'Simulation', 'Early Access')"


In [33]:
def get_recommendations_knn(app_id, user_id=None, k=5):
    """
    Recommend games whose title + tags text is closest to the given game.

    Parameters
    ----------
    app_id
        Id of the query game; must be a key of the module-level ``indices`` Series.
    user_id : optional
        When given, the recommendations are scored with
        ``evaluate_recommendation`` and the three metrics are printed.
    k : int, default 5
        Number of recommendations wanted. Fewer rows are returned when the
        fitted matrix has fewer than ``k`` other rows.

    Returns
    -------
    pandas.DataFrame or list
        Frame with columns ``app_id``, ``title``, ``tags``, ordered as returned
        by ``nn.kneighbors``; or the list ``["Game not found"]`` when ``app_id``
        is not in ``indices``.
    """
    if app_id not in indices:
        return ["Game not found"]

    # Show context game info
    game_row = games_pd[games_pd['app_id'] == app_id]
    print(f"\nRecommending based on:\nTitle: {game_row['title'].values[0]}\nTags: {game_row['tags'].values[0]}")

    # Generate recommendations
    idx = indices[app_id]
    if isinstance(idx, pd.Series):
        # A repeated app_id key yields several rows; use the first one.
        idx = idx.iloc[0]

    # One extra neighbour leaves room for the query game itself, but never ask
    # for more neighbours than there are rows (kneighbors raises in that case).
    n_query = min(k + 1, tfidf_matrix.shape[0])
    _, indices_knn = nn.kneighbors(tfidf_matrix[idx], n_neighbors=n_query)

    # Exclude the query row by identity instead of assuming it is ranked first:
    # another game with identical text ties at distance 0 and may precede it.
    recommended_idx = [row for row in indices_knn[0] if row != idx][:k]

    recommendations = games_pd[['app_id', 'title', 'tags']].iloc[recommended_idx].reset_index(drop=True)
    print("\nTop Recommendations:")
    print(recommendations)

    # Evaluate (optional)
    if user_id is not None:
        recommended_app_ids = recommendations['app_id'].tolist()
        precision, recall, f1 = evaluate_recommendation(user_id, recommended_app_ids, k)

        print(f"\n📊 Evaluation for user {user_id}:")
        print(f"Precision@{k}: {precision}")
        print(f"Recall@{k}:    {recall}")
        print(f"F1 Score@{k}:  {f1}")

    # Hand the table back so callers can use the result, not only read the printout.
    return recommendations


In [34]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_recommendation(user_id, recommended_app_ids, k=5):
    """
    Score a recommendation list against the games a user is known to like.

    A game counts as "liked" when the user has a row in the module-level
    ``recommendations_pd`` with ``is_recommended_binary == 1``.

    Parameters
    ----------
    user_id
        Value matched against ``recommendations_pd['user_id']``.
    recommended_app_ids : iterable
        Recommended app ids, best first. Repeated ids are counted once.
    k : int, default 5
        Only the first ``k`` recommendations are scored.

    Returns
    -------
    tuple of float
        ``(precision, recall, f1)``, each rounded to two decimals:

        * precision = hits / number of scored recommendations
        * recall    = hits / number of distinct games the user liked
        * f1        = harmonic mean of the two

        Every metric is 0.0 when there is nothing to score or no hit.
    """
    user_reviews = recommendations_pd[recommendations_pd['user_id'] == user_id]
    liked_app_ids = set(user_reviews.loc[user_reviews['is_recommended_binary'] == 1, 'app_id'])

    # De-duplicate while keeping rank order, then cut to the top k.
    top_k = list(dict.fromkeys(recommended_app_ids))[:k]
    hits = sum(1 for app_id in top_k if app_id in liked_app_ids)

    precision = hits / len(top_k) if top_k else 0.0
    # Recall is measured against everything the user liked. Scoring it only over
    # the recommended items (all of which are predicted positive) can never
    # produce a false negative, so it would collapse to either 0 or 1.
    recall = hits / len(liked_app_ids) if liked_app_ids else 0.0
    f1 = 2 * precision * recall / (precision + recall) if hits else 0.0

    return round(precision, 2), round(recall, 2), round(f1, 2)


In [35]:
# Just show game info + recommendations (no user_id, so no evaluation is printed)
get_recommendations_knn(13500)



Recommending based on:
Title: Prince of Persia: Warrior Within™
Tags: ('Action', 'Adventure', 'Parkour', 'Third Person', 'Great Soundtrack', 'Singleplayer', 'Platformer', 'Time Travel', 'Atmospheric', 'Classic', 'Hack and Slash', 'Time Manipulation', 'Gore', 'Fantasy', 'Story Rich', 'Dark', 'Open World', 'Controller', 'Dark Fantasy', 'Puzzle')

Top Recommendations:
    app_id                                   title  \
0    13600    Prince of Persia®: The Sands of Time   
1    33320  Prince of Persia: The Forgotten Sands™   
2    13530      Prince of Persia: The Two Thrones™   
3    19980                       Prince of Persia®   
4  1207010                           Me and myself   

                                                tags  
0  ('Action', 'Adventure', 'Parkour', 'Platformer...  
1  ('Action', 'Adventure', 'Platformer', 'Parkour...  
2  ('Action', 'Adventure', 'Platformer', 'Parkour...  
3  ('Action', 'Adventure', 'Parkour', 'Platformer...  
4  ('Adventure', 'Indie', 'Puzz