In [69]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
import re

In [70]:
# Text preprocessing functions
def text_preprocessing(text):
    """Remove special characters, lowercase, and normalize spaces."""
    if pd.isna(text):
        return ""
    # Remove special characters
    text = re.sub(r'[^\w\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [8]:
# load required datasets
users = pd.read_csv("../data/users_meta.csv")
games = pd.read_csv("../data/games_preprocessed.csv")
games.head()

Unnamed: 0,platform_id,game_id,name,released,rating,metacritic,name_mod,genres_Action,genres_Adventure,genres_Arcade,...,tags_street racing,tags_tracks,tags_true exclusive,tags_unique,tags_vr mod,tags_w10 exclusive,tags_waves,tags_wizard,tags_wizards,tags_work
0,4,21,DiRT 4,2017-06-06,3.61,78.0,dirt 4,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4,25,Middle-earth: Shadow of War,2017-09-27,3.85,82.0,middleearth shadow of war,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,26,Nex Machina,2017-06-19,3.9,89.0,nex machina,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,27,Pyre,2017-07-25,4.17,84.0,pyre,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,4,28,Red Dead Redemption 2,2018-10-26,4.59,96.0,red dead redemption 2,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
users.columns

Index(['user_id', 'game_name', 'game_name_mod', 'purchase', 'play',
       'hours_played', 'matched_game', 'game_id', 'name_mod', 'rating',
       ...
       'tags_street racing', 'tags_tracks', 'tags_true exclusive',
       'tags_unique', 'tags_vr mod', 'tags_w10 exclusive', 'tags_waves',
       'tags_wizard', 'tags_wizards', 'tags_work'],
      dtype='object', length=373)

In [80]:
# curr user
curr_user_data = [
    {"user_id": 999999999,
    "game_id": 58732,           
    "game_name_mod": "fifa 19",   
    "hours_played": 100.0,             
    "play": 1},
    {
    "user_id": 999999999,
    "game_id": 29069,           
    "game_name_mod": "fifa 10",   
    "hours_played": 150.0,             
    "play": 1
        },
    {
    "user_id": 999999999,
    "game_id": 326243,           
    "game_name_mod": "elden ring",   
    "hours_played": 200.0,             
    "play": 1
        }
]

curr_user_df = pd.DataFrame(curr_user_data)  # wrap dict in a list
# drop unnecessarily columns
cols_to_keep = ["user_id", "game_id","game_name_mod", "hours_played", "play"]
ratings_df = users[cols_to_keep]
# add curr user
ratings_df = pd.concat([ratings_df, curr_user_df], ignore_index=True)
# select played games
played_df = ratings_df[ratings_df["play"] == 1]
played_df

Unnamed: 0,user_id,game_id,game_name_mod,hours_played,play
0,151603712,5679,the elder scrolls v skyrim,273.0,1
2,151603712,3070,fallout 4,87.0,1
4,151603712,11425,spore,14.9,1
6,151603712,5563,fallout new vegas,12.1,1
8,151603712,12020,left 4 dead 2,8.9,1
...,...,...,...,...,...
174552,128470551,430,grand theft auto vice city,1.5,1
174554,128470551,12725,rush,1.4,1
174555,999999999,58732,fifa 19,100.0,1
174556,999999999,29069,fifa 10,150.0,1


In [81]:
p = played_df.groupby("game_name_mod")["hours_played"].agg(["sum", "mean"])
p.sort_values(by = "sum", ascending = False).head()

Unnamed: 0_level_0,sum,mean
game_name_mod,Unnamed: 1_level_1,Unnamed: 2_level_1
dota 2,981133.3,261.635547
counterstrike global offensive,322736.3,245.426844
team fortress 2,173356.8,101.437566
counterstrike,134205.1,308.517471
sid meiers civilization v,99806.3,193.798641


## CF based on Playtime

In [82]:
users_playtime_matrix = played_df.pivot_table(index='user_id', columns='game_id', values='hours_played', fill_value=0)
users_playtime_matrix.head()

game_id,21,32,39,51,56,69,87,89,108,121,...,366892,367183,398414,401808,494382,602224,605674,670682,670693,802181
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5250,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
76767,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
86540,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
181212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
229911,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [83]:
sparse_playtime_matrix = csr_matrix(users_playtime_matrix.values)
sparse_playtime_matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 46817 stored elements and shape (9130, 1459)>

In [84]:
model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(sparse_playtime_matrix.T)

In [85]:
def recommend_games_for_user(user_id, user_game_matrix, model_knn, n_recommendations=5):
    user_row = user_game_matrix.loc[user_id]
    played_games = user_row[user_row > 0].index.tolist()

    recommendations = set()

    for game_id in played_games:
        game_idx = list(user_game_matrix.columns).index(game_id)
        distances, indices = model_knn.kneighbors(sparse_playtime_matrix.T[game_idx], n_neighbors=n_recommendations + 1)
        
        # Add similar games (ignore first = same game)
        similar_game_ids = [user_game_matrix.columns[i] for i in indices.flatten()[1:]]
        recommendations.update(similar_game_ids)

    # Remove already played games
    final_recommendations = list(recommendations - set(played_games))

    return final_recommendations[:n_recommendations]


In [86]:
recommendations = recommend_games_for_user(user_id=76767, user_game_matrix=users_playtime_matrix, model_knn=model_knn)
print("Recommended games:", recommendations)

Recommended games: [38146, 5636, 18693, 1030, 9609]


In [87]:
rec_games = games[games["game_id"].isin(recommendations)]
rec_games

Unnamed: 0,platform_id,game_id,name,released,rating,metacritic,name_mod,genres_Action,genres_Adventure,genres_Arcade,...,tags_street racing,tags_tracks,tags_true exclusive,tags_unique,tags_vr mod,tags_w10 exclusive,tags_waves,tags_wizard,tags_wizards,tags_work
161,4,1030,Limbo,2010-07-21,4.14,88.0,limbo,1,1,0,...,0,0,0,0,0,0,0,0,0,0
725,4,5636,Rage,2011-10-03,3.42,79.0,rage,1,0,0,...,0,0,0,0,0,0,0,0,0,0
833,4,9609,Euro Truck Simulator 2,2012-10-19,4.12,79.0,euro truck simulator 2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1786,4,18693,Painkiller: Recurring Evil,2012-02-29,2.87,38.0,painkiller recurring evil,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2253,4,38146,Platypus 2,2007-02-01,1.92,,platypus 2,1,0,1,...,0,0,0,0,0,0,0,0,0,0


In [91]:
users[users["user_id"] == 76767]

Unnamed: 0,user_id,game_name,game_name_mod,purchase,play,hours_played,matched_game,game_id,name_mod,rating,...,tags_street racing,tags_tracks,tags_true exclusive,tags_unique,tags_vr mod,tags_w10 exclusive,tags_waves,tags_wizard,tags_wizards,tags_work
48880,76767,Counter-Strike,counterstrike,1,0,1.0,counterstrike global offensive,4291,counterstrike global offensive,3.56,...,0,0,0,0,0,0,0,0,0,0
48881,76767,Counter-Strike,counterstrike,1,1,365.0,counterstrike global offensive,4291,counterstrike global offensive,3.56,...,0,0,0,0,0,0,0,0,0,0
48882,76767,Call of Duty World at War,call of duty world at war,1,0,1.0,call of duty world at war,5528,call of duty world at war,3.95,...,0,0,0,0,0,0,0,0,0,0
48883,76767,Call of Duty World at War,call of duty world at war,1,1,271.0,call of duty world at war,5528,call of duty world at war,3.95,...,0,0,0,0,0,0,0,0,0,0
48884,76767,Total War ATTILA,total war attila,1,0,1.0,total war attila,13468,total war attila,3.9,...,0,0,0,0,0,0,0,0,0,0
48885,76767,Total War ATTILA,total war attila,1,1,207.0,total war attila,13468,total war attila,3.9,...,0,0,0,0,0,0,0,0,0,0
48886,76767,Call of Duty Modern Warfare 2 - Multiplayer,call of duty modern warfare 2 multiplayer,1,0,1.0,call of duty modern warfare 2,4527,call of duty modern warfare 2,4.27,...,0,0,0,0,0,0,0,0,0,0
48887,76767,Call of Duty Modern Warfare 2 - Multiplayer,call of duty modern warfare 2 multiplayer,1,1,165.0,call of duty modern warfare 2,4527,call of duty modern warfare 2,4.27,...,0,0,0,0,0,0,0,0,0,0
48888,76767,Call of Duty Modern Warfare 2,call of duty modern warfare 2,1,0,1.0,call of duty modern warfare 2,4527,call of duty modern warfare 2,4.27,...,0,0,0,0,0,0,0,0,0,0
48889,76767,Call of Duty Modern Warfare 2,call of duty modern warfare 2,1,1,65.0,call of duty modern warfare 2,4527,call of duty modern warfare 2,4.27,...,0,0,0,0,0,0,0,0,0,0


In [None]:
curr_user_id = 999999999
recommendations = recommend_games_for_user(user_id=curr_user_id, user_game_matrix=users_playtime_matrix, model_knn=model_knn)
print("Recommended games:", recommendations)
user_rec_games = games[games["game_id"].isin(recommendations)]
user_rec_games

## Content based filtering

In [53]:
content_features = games.filter(regex="^genres_|^tags_")
feature_cols = list(content_features.columns)
# Compute similarity between all games
similarity_matrix = cosine_similarity(content_features.values)

In [98]:
def recommend_similar_games(game_name, games_df, similarity_matrix, top_n=10):
    game_name = text_preprocessing(game_name)
    idx = games_df[games_df['name_mod'] == game_name].index
    if len(idx) == 0:
        return []

    idx = idx[0]
    similarity_scores = list(enumerate(similarity_matrix[idx]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

    # Exclude the game itself
    recommended_indices = [i for i, score in similarity_scores[1:top_n+1]]

    return games_df.iloc[recommended_indices][["game_id", "name"]]


In [100]:
recommend_similar_games("fifa 10", games, similarity_matrix, top_n=10)

Unnamed: 0,game_id,name
185,1249,FIFA 17
480,3580,FIFA 15
741,5783,FIFA 14
2782,326229,FIFA 20
2500,58732,FIFA 19
388,3154,NBA 2K16
530,3855,NBA 2K14
1953,20159,Tony Hawk’s Pro Skater HD
2119,23341,Pro Evolution Soccer 2018
2151,28126,FIFA 18
