# Playground for LightFM & ScaNN

## 1. Imports

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt

## 2. Load data

In [2]:
# Read the three raw tables from the CSV files sitting next to the notebook.
users, games, recommendations = (
    pd.read_csv('./users.csv'),
    pd.read_csv('./games.csv'),
    pd.read_csv('./recommendations.csv'),
)

## 3. Data model

In [3]:
# Preview the first five rows of the users table.
users.head(n=5)

Unnamed: 0,user_id,products,reviews
0,7360263,359,0
1,14020781,156,1
2,8762579,329,4
3,4820647,176,4
4,5167327,98,2


In [4]:
# Preview the first five rows of the games table.
games.head(n=5)

Unnamed: 0,app_id,title,date_release,win,mac,linux,rating,positive_ratio,user_reviews,price_final,price_original,discount,steam_deck
0,13500,Prince of Persia: Warrior Within™,2008-11-21,True,False,False,Very Positive,84,2199,9.99,9.99,0.0,True
1,22364,BRINK: Agents of Change,2011-08-03,True,False,False,Positive,85,21,2.99,2.99,0.0,True
2,113020,Monaco: What's Yours Is Mine,2013-04-24,True,True,True,Very Positive,92,3722,14.99,14.99,0.0,True
3,226560,Escape Dead Island,2014-11-18,True,False,False,Mixed,61,873,14.99,14.99,0.0,True
4,249050,Dungeon of the ENDLESS™,2014-10-27,True,True,False,Very Positive,88,8784,11.99,11.99,0.0,True


In [5]:
# Preview the first five rows of the recommendations (reviews) table.
recommendations.head(n=5)

Unnamed: 0,app_id,helpful,funny,date,is_recommended,hours,user_id,review_id
0,975370,0,0,2022-12-12,True,36.3,51580,0
1,304390,4,0,2017-02-17,False,11.5,2586,1
2,1085660,2,0,2019-11-17,True,336.5,253880,2
3,703080,0,0,2022-09-23,True,27.4,259432,3
4,526870,0,0,2021-01-10,True,7.9,23869,4


## 4. Data overview

In [6]:
# Count distinct users, games and reviews, then report them together with the
# average number of reviews per user and per game.
nUsers = users['user_id'].nunique()
nGames = games['app_id'].nunique()
nRecommendations = recommendations['review_id'].nunique()

print(
    f'Number of users: {nUsers}',
    f'Number of games: {nGames}',
    f'Number of reviews: {nRecommendations}',
    f'Average number of reviews per user: {nRecommendations / nUsers}',
    f'Average number of reviews per game: {nRecommendations / nGames}',
    sep='\n',
)

Number of users: 14306064
Number of games: 50872
Number of reviews: 41154794
Average number of reviews per user: 2.8767377246459964
Average number of reviews per game: 808.9871442050637


In [7]:
# Ids that appear in the users table but never as the author of a review.
noReviewUsers = set(users['user_id']).difference(recommendations['user_id'])
print(f'Users without any review: {len(noReviewUsers)}')

Users without any review: 525005


## 5. Data merging

In [8]:
# Give every distinct user and game a contiguous zero-based index so that raw
# ids can be used as row / column positions of the interaction matrix.
userIds = users['user_id'].unique()
gameIds = games['app_id'].unique()

userMapper = {user_id: idx for idx, user_id in enumerate(userIds)}
gameMapper = {app_id: idx for idx, app_id in enumerate(gameIds)}

# Sanity check: is app id 223 a known game? A membership test is used because
# comparing `gameMapper.keys()` with an int is always False.
# NOTE(review): membership is the presumed intent of this check - confirm.
print(223 in gameMapper)

# Keep only the columns needed to build the interaction matrix.
data = recommendations[['app_id', 'user_id', 'is_recommended']]

False


In [9]:
# Reverse lookups: matrix index -> original id. They are built by inverting the
# forward mappers, because enumerating a DataFrame directly iterates over its
# column names rather than over the ids stored in it.
inverseGameMapper = {idx: app_id for app_id, idx in gameMapper.items()}
inverseUserMapper = {idx: user_id for user_id, idx in userMapper.items()}

## 6. Interaction matrix

In [10]:
from scipy.sparse import coo_matrix

# Translate raw ids into matrix positions and the boolean flag into a float
# weight (1.0 = recommended, 0.0 = not recommended).
# NOTE: `userIds` is rebound here and no longer holds the array of unique ids.
appIds = data['app_id'].map(lambda appId: gameMapper[appId])
userIds = data['user_id'].map(lambda userId: userMapper[userId])
isRecommended = data['is_recommended'].map(lambda flag: 1.0 if flag else 0.0)

# Rows are users, columns are games. No explicit shape is passed, so the matrix
# size is inferred from the largest row / column index that occurs in the data.
interaction = coo_matrix((isRecommended, (userIds, appIds))).tocsr()

## 7. Game feature matrix

###### TODO...

## 9. LightFM

In [11]:
from lightfm import LightFM

# Matrix-factorisation model trained with the WARP ranking loss. A fixed
# random_state seeds parameter initialisation and data shuffling so that
# re-running the notebook starts from the same state.
# NOTE(review): with num_threads > 1 results may still vary slightly between
# runs - confirm if exact reproducibility is required.
model = LightFM(learning_schedule='adagrad', loss='warp', random_state=42)

model.fit(interaction, epochs=50, num_threads=6)

<lightfm.lightfm.LightFM at 0x7f1e686222b0>

## ScaNN

In [12]:
import scann

# Scale every learned embedding row to unit L2 length, so that the dot product
# between two rows equals their cosine similarity. New arrays are created; the
# arrays stored on the model are left untouched.
itemEmbeddings = model.item_embeddings / np.linalg.norm(
    model.item_embeddings, axis=1, keepdims=True
)
userEmbeddings = model.user_embeddings / np.linalg.norm(
    model.user_embeddings, axis=1, keepdims=True
)

# Index the item embeddings with ScaNN: 5 neighbours per query, dot-product
# distance, asymmetric-hashing scoring with 2 dimensions per block.
scannSearcher = (
    scann.scann_ops_pybind
    .builder(itemEmbeddings, 5, "dot_product")
    .score_ah(2)
    .build()
)

2025-01-05 16:32:26.208499: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-05 16:32:26.210668: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-05 16:32:26.237157: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-05 16:32:26.238105: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-01-05 16:32:27.596043: I scann/base/single_machine_factory_scann.cc:153] Single-machine AH training with dat

## Recommender

In [21]:
def gameEmbedding(gameId):
  """Look up the embedding vector of a game.

  Parameters
  ----------
  gameId
      Raw game id, used as a key of `gameMapper`.

  Returns
  -------
  numpy.ndarray
      The matching row of `itemEmbeddings`. If the id is unknown, or its index
      lies beyond the rows of `itemEmbeddings`, a zero vector with the same
      number of components as an embedding row is returned.
  """
  n = gameMapper.get(gameId)

  if n is not None and n < len(itemEmbeddings):
    return itemEmbeddings[n]

  # The fallback must have the embedding dimension (number of columns), not the
  # number of rows, otherwise callers that add it to another embedding fail.
  return np.zeros(itemEmbeddings.shape[1])

In [22]:
def userEmbedding(userId):
  """Look up the embedding vector of a user.

  Parameters
  ----------
  userId
      Raw user id, used as a key of `userMapper`.

  Returns
  -------
  numpy.ndarray
      The matching row of `userEmbeddings`. If the id is unknown, or its index
      lies beyond the rows of `userEmbeddings`, a zero vector with the same
      number of components as an embedding row is returned.
  """
  if userId in userMapper:
    n = userMapper[userId]

    if n < len(userEmbeddings):
      return userEmbeddings[n]

  # The fallback must have the embedding dimension (number of columns), not the
  # number of rows, so it can be combined with real embeddings.
  return np.zeros(userEmbeddings.shape[1])

In [23]:
def listUserRecommendations(userId):
  """Return the rows of `recommendations` whose `user_id` equals `userId`."""
  writtenByUser = recommendations['user_id'] == userId
  return recommendations.loc[writtenByUser]

In [24]:
def userToVec(userId):
  """Build a per-game vector of a user's review flags.

  The result is a list with one slot per row of `games`, initialised to 0.
  For every review written by `userId`, the slot at the game's `gameMapper`
  index is set to that review's `is_recommended` value.
  """
  reviews = listUserRecommendations(userId)[['app_id', 'is_recommended']]

  result = [0] * len(games)

  # Walk the two columns in lock-step; a later review of the same game
  # overwrites an earlier one.
  reviewedIds = reviews['app_id'].tolist()
  flags = reviews['is_recommended'].tolist()
  for appId, flag in zip(reviewedIds, flags):
    result[gameMapper[appId]] = flag

  return result

In [25]:
def embedUser(newUser, d):
  """Embed a user as the mean embedding of the games flagged in `newUser`.

  Parameters
  ----------
  newUser
      Sequence with one entry per row of `games` (same order); truthy entries
      mark the games that contribute to the embedding.
  d
      Number of components of the embedding vectors.

  Returns
  -------
  numpy.ndarray
      Mean of `gameEmbedding(app_id)` over all flagged games, or a zero vector
      of length `d` when no game is flagged.
  """
  vec = np.zeros(d)
  m = 0

  # Fetch the id column once; indexing `games.iloc[idx]` inside the loop would
  # materialise a complete row for every single game.
  appIdColumn = games['app_id'].to_numpy()

  for idx in range(len(appIdColumn)):
    if newUser[idx]:
      vec += gameEmbedding(appIdColumn[idx])
      m += 1

  # `vec` is still all zeros when nothing was flagged, so it is returned as is.
  return vec / m if m > 0 else vec

## Similarity

In [26]:
# Compare the embedding learned for user id 0 with one rebuilt from the games
# that user recommended. The dimension is taken from the item embeddings
# instead of being hard-coded, so it follows the model configuration.
u1 = userEmbedding(0)
u2 = embedUser(userToVec(0), itemEmbeddings.shape[1])

# A user without any recommended game yields an all-zero vector; leave it
# untouched instead of dividing by a zero norm (which would produce NaN).
u2_norm = u2 / np.linalg.norm(u2) if np.any(u2) else u2
similarity = np.dot(u1, u2_norm)
print(similarity)

0.878040431219939


## Predict

In [19]:
# Query the ScaNN index for the five items closest to the rebuilt user vector.
# The result pairs row indices of `itemEmbeddings` with their scores; the
# indices are matrix positions, not raw app ids.
prediction = scannSearcher.search(u2_norm, final_num_neighbors=5)

prediction

(array([19243, 11649, 44630, 23145, 17121], dtype=uint32),
 array([1.0065706 , 0.9738897 , 0.9738897 , 0.96408546, 0.9608174 ],
       dtype=float32))

User ID at index 0 in users DataFrame: 7360263
Index of this user in user_embeddings: 0
Embedding at index 0: [ 0.3097066   0.06934527 -0.13778706 -0.48403978 -0.2601099   0.29225737
 -0.4298928   0.36122203 -0.41603038 -0.06746241]
Embedding for user 7360263 (mapped index): [ 0.3097066   0.06934527 -0.13778706 -0.48403978 -0.2601099   0.29225737
 -0.4298928   0.36122203 -0.41603038 -0.06746241]
