# Imports

In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import math
from collections import defaultdict
from tqdm import tqdm
import json

In [21]:
import tensorflow as tf

from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

from tensorflow import keras
from tensorflow.keras import layers

In [101]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error

# Load Data

In [22]:
import pandas as pd

df = pd.read_csv('../../dataset/weighted_score_above_08.csv')

df.head()

  df = pd.read_csv('../../dataset/weighted_score_above_08.csv')


Unnamed: 0,recommendationid,appid,game,author_steamid,author_num_games_owned,author_num_reviews,author_playtime_forever,author_playtime_last_two_weeks,author_playtime_at_review,author_last_played,...,voted_up,votes_up,votes_funny,weighted_vote_score,comment_count,steam_purchase,received_for_free,written_during_early_access,hidden_in_steam_china,steam_china_location
0,147449116,10,Counter-Strike,76561199183984450,51,12,2548,0,2480,1696305457,...,1,99,2,0.889438,0,1,0,0,1,
1,147374264,10,Counter-Strike,76561198099573060,226,13,2369,0,2361,1696096555,...,1,122,8,0.914834,0,1,0,0,1,
2,147357703,10,Counter-Strike,76561199080026894,118,23,13501,212,12957,1697630734,...,1,599,20,0.968375,3,1,0,0,1,
3,147345102,10,Counter-Strike,76561198068970227,28,1,10668,640,9906,1698261011,...,1,59,12,0.826206,0,0,0,0,1,
4,147284743,10,Counter-Strike,76561199137893460,19,5,543,10,526,1697810991,...,1,128,24,0.853612,0,1,0,0,1,


In [23]:
with open('../combined_df.json', 'r') as f:
    sample_data = [json.loads(line) for line in f]

In [24]:
valid_appids = {entry['appid'] for entry in sample_data}

df = df[df['appid'].isin(valid_appids)]

df_games = df[['appid', 'game']]
df_games = df_games.drop_duplicates(subset='appid')

appid2game = df_games.set_index('appid')['game'].to_dict()
game2appid = df_games.set_index('game')['appid'].to_dict()

del df_games

df = df[['appid', 'author_steamid', 'voted_up']]
df.rename(columns={'appid': 'movieId', 'author_steamid': 'userId', 'voted_up': 'rating'}, inplace=True)

In [25]:
item_id_to_genres = {entry['appid']: entry['genre'] for entry in sample_data}

In [7]:
# df.reset_index(drop=True, inplace=True)

In [8]:
# N_DISPLAY = 30
# has_no_genre_count = 0
# for i, item_id in enumerate(df['item_id'].unique()):
#     genres = item_id_to_genres.get(item_id, [])
#     if not genres:
#         has_no_genre_count += 1
#         continue
# print(f"Number of games without genres: {has_no_genre_count}")

In [9]:
# nan_count = df['rating'].isna().sum()
# print(f"Number of NaN values in 'rating': {nan_count}")

In [10]:
sample_data[0]

{'appid': 550, 'game': 'Left 4 Dead 2', 'genre': ['Action']}

In [11]:
# Create a DataFrame from sample_data
movies_data_clean = pd.DataFrame(sample_data)

# Rename columns for clarity
movies_data_clean.rename(columns={'game': 'title'}, inplace=True)

# Convert the 'genre' list into a pipe-separated string
movies_data_clean['genre'] = movies_data_clean['genre'].apply(lambda x: '|'.join(x) if isinstance(x, list) else '')

# Select only the required columns
movies_data_clean = movies_data_clean[['appid', 'title', 'genre']]
movies_data_clean.rename(columns={'appid': 'movieId'}, inplace=True)
movies_data_clean.rename(columns={'genre': 'genres'}, inplace=True)

movies_data_clean.head()

Unnamed: 0,movieId,title,genres
0,550,Left 4 Dead 2,Action
1,644560,Mirror,Adventure|Indie|RPG
2,264710,Subnautica,Adventure|Indie
3,294100,RimWorld,Indie|Simulation|Strategy
4,413150,Stardew Valley,Indie|RPG|Simulation


### Create Item Train Feature Vectors

In [12]:
movie_features_data = movies_data_clean.copy()

In [19]:
movie_features_data.head()

Unnamed: 0,movieId,title,genres
0,550,Left 4 Dead 2,Action
1,644560,Mirror,Adventure|Indie|RPG
2,264710,Subnautica,Adventure|Indie
3,294100,RimWorld,Indie|Simulation|Strategy
4,413150,Stardew Valley,Indie|RPG|Simulation


In [13]:
movies_data_clean.shape

(27315, 3)

In [14]:
movies_data_clean.loc[:, 'genre_count'] = movies_data_clean['genres'].str.count(r'\|') + 1

movies_data_clean = (
    movies_data_clean
    .sort_values('genre_count', ascending=False)
    .drop_duplicates('title', keep='first')
    .drop(columns='genre_count')
    .reset_index(drop=True)
)

In [15]:
movies_data_clean.shape

(27262, 3)

In [16]:
movies_data_clean['movieId'].nunique()

27262

#### Read Movie Ratings Data

In [26]:
df.head()

Unnamed: 0,movieId,userId,rating
0,10,76561199183984450,1
1,10,76561198099573060,1
2,10,76561199080026894,1
3,10,76561198068970227,1
4,10,76561199137893460,1


In [27]:
ratings_data = df.copy()

In [28]:
ratings_data.head()

Unnamed: 0,movieId,userId,rating
0,10,76561199183984450,1
1,10,76561198099573060,1
2,10,76561199080026894,1
3,10,76561198068970227,1
4,10,76561199137893460,1


In [29]:
ratings_data['rating'].min()

np.int64(0)

In [30]:
ratings_data.shape

(498094, 3)

In [31]:
ratings_data['movieId'].nunique()

27315

In [32]:
movie_ratings_data = (
    movies_data_clean.join(ratings_data[['userId', 'movieId', 'rating']].set_index('movieId'), on='movieId', how='inner')
    [['userId', 'movieId', 'title', 'rating', 'genres']]
).sort_values(['userId', 'movieId']).reset_index(drop=True)

In [33]:
movie_ratings_data.head()

Unnamed: 0,userId,movieId,title,rating,genres
0,76561197960265730,70,Half-Life,1,Action
1,76561197960266573,639790,DEEP SPACE WAIFU,1,Action|Casual|Indie
2,76561197960267615,241930,Middle-earth™: Shadow of Mordor™,0,Single-player|Steam Achievements|Full controll...
3,76561197960267644,236390,War Thunder,1,Action|Free to Play|Massively Multiplayer|Simu...
4,76561197960267644,251570,7 Days to Die,1,Action|Adventure|Indie|RPG|Simulation|Strategy...


In [34]:
movie_ratings_data.shape

(496932, 5)

In [35]:
movie_ratings_data['movieId'].nunique()

27262

In [36]:
movie_ratings_data['userId'].nunique()

379070

#### Create Movie Meta Data

In [37]:
movie_meta_data = (
    movie_ratings_data[['movieId', 'title', 'genres']]
      .drop_duplicates(subset='movieId')
)

In [38]:
movie_meta_data.head()

Unnamed: 0,movieId,title,genres
0,70,Half-Life,Action
1,639790,DEEP SPACE WAIFU,Action|Casual|Indie
2,241930,Middle-earth™: Shadow of Mordor™,Single-player|Steam Achievements|Full controll...
3,236390,War Thunder,Action|Free to Play|Massively Multiplayer|Simu...
4,251570,7 Days to Die,Action|Adventure|Indie|RPG|Simulation|Strategy...


### Explore Movies

In [39]:
grouped_movie_data = (
    movie_ratings_data.groupby(['movieId', 'title', 'genres']).agg(
        rating_count=('rating', 'count'),
        rating_mean=('rating', 'mean'),
    ).reset_index()
)[['movieId', 'rating_count', 'rating_mean', 'title', 'genres']]
grouped_movie_data.columns = ['movie_id', 'num_ratings', 'average_rating', 'title', 'genres']
grouped_movie_data['average_rating'] = np.round(grouped_movie_data['average_rating'], 1)
grouped_movie_data.head()

Unnamed: 0,movie_id,num_ratings,average_rating,title,genres
0,10,611,1.0,Counter-Strike,Action
1,20,85,1.0,Team Fortress Classic,Action
2,30,22,1.0,Day of Defeat,Action
3,40,20,1.0,Deathmatch Classic,Action
4,50,26,1.0,Half-Life: Opposing Force,Action


In [40]:
grouped_movie_data.sort_values(by=['num_ratings'], ascending=False)

Unnamed: 0,movie_id,num_ratings,average_rating,title,genres
26,730,3017,0.8,Counter-Strike 2,Action|Free To Play
9291,578080,2404,0.5,PUBG: BATTLEGROUNDS,Action|Adventure|Massively Multiplayer|Free To...
6301,413150,2240,1.0,Stardew Valley,Indie|RPG|Simulation
1127,105600,1959,1.0,Terraria,Action|Adventure|Indie|RPG
2737,271590,1944,0.7,Grand Theft Auto V,Action|Adventure
...,...,...,...,...,...
10807,680930,1,0.0,Fairy Lands: Rinka and the Fairy Gems,Casual|Indie
19616,1336210,1,1.0,The ER: Patient Typhon,Action|Adventure|Indie
19617,1336321,1,1.0,Jurassic World Evolution 2: Deluxe Upgrade Pack,Simulation|Strategy
10806,680910,1,1.0,HIVESWAP: Act 1 Original Soundtrack,Adventure|Indie|RPG


### Explore Genres

In [41]:
movie_ratings_data_genres = movie_ratings_data.assign(genres = movie_ratings_data['genres'].str.split('|'))
movie_ratings_data_exploded = movie_ratings_data_genres.explode('genres').rename(columns={'genres': 'genre'})

In [42]:
movie_ratings_data_exploded.head()

Unnamed: 0,userId,movieId,title,rating,genre
0,76561197960265730,70,Half-Life,1,Action
1,76561197960266573,639790,DEEP SPACE WAIFU,1,Action
1,76561197960266573,639790,DEEP SPACE WAIFU,1,Casual
1,76561197960266573,639790,DEEP SPACE WAIFU,1,Indie
2,76561197960267615,241930,Middle-earth™: Shadow of Mordor™,0,Single-player


In [43]:
grouped_genre_data = (
    movie_ratings_data_exploded.groupby(['genre'])
    .agg(
        movie_count=('movieId', 'nunique'),
        rating_mean=('rating', 'mean'),
        rating_count=('rating', 'count'),
    )
    .reset_index()
)
grouped_genre_data.columns = ['genre', 'num_movies', 'average_rating_per_genre', 'num_ratings_per_genre']
grouped_genre_data['average_rating_per_genre'] = np.round(
    grouped_genre_data['average_rating_per_genre'], 1
)

In [44]:
grouped_genre_data.sort_values('num_ratings_per_genre', ascending=False)

Unnamed: 0,genre,num_movies,average_rating_per_genre,num_ratings_per_genre
2,Action,11212,0.8,261073
37,Indie,15327,0.9,219341
4,Adventure,10452,0.8,198685
55,RPG,6430,0.8,133815
66,Simulation,7086,0.8,129943
...,...,...,...,...
118,アドベンチャー,1,0.0,1
124,冒险,1,0.0,1
20,Desporto,1,1.0,1
129,無料プレイ,1,0.0,1


### Create Item Feature Vectors

In [45]:
grouped_movie_ratings_data = (
    movie_ratings_data
    .groupby(['movieId', 'title', 'genres'], as_index=False)
    .agg(item_average_rating = ('rating', 'mean'))
)

In [46]:
grouped_movie_ratings_data.head()

Unnamed: 0,movieId,title,genres,item_average_rating
0,10,Counter-Strike,Action,1.0
1,20,Team Fortress Classic,Action,0.952941
2,30,Day of Defeat,Action,1.0
3,40,Deathmatch Classic,Action,0.95
4,50,Half-Life: Opposing Force,Action,1.0


In [48]:
# grouped_movie_ratings_data['item_year'] = (
#     grouped_movie_ratings_data['title']
#     .str.extract(r'\((\d{4})\)$')[0]
#     .astype(int)
# )

In [49]:
genre_dummies = grouped_movie_ratings_data['genres'].str.get_dummies(sep='|')

In [50]:
item_genre_dummies = genre_dummies.add_prefix('item_')

In [51]:
item_features = pd.concat([
    grouped_movie_ratings_data[['movieId', 'item_average_rating']],
    item_genre_dummies
], axis=1)

In [52]:
item_features.head()

Unnamed: 0,movieId,item_average_rating,item_360 Video,item_Accounting,item_Action,item_Additional High-Quality Audio,item_Adventure,item_Akció,item_Aksiyon,item_Animation & Modeling,...,item_冒险,item_动作,item_大型多人在线,item_抢先体验,item_模拟,item_無料プレイ,item_独立,item_竞速,item_策略,item_角色扮演
0,10,1.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,20,0.952941,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,30,1.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,40,0.95,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,50,1.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
item_features.shape

(27262, 136)

### Create User Feature Vectors

In [54]:
movie_ratings_data.head()

Unnamed: 0,userId,movieId,title,rating,genres
0,76561197960265730,70,Half-Life,1,Action
1,76561197960266573,639790,DEEP SPACE WAIFU,1,Action|Casual|Indie
2,76561197960267615,241930,Middle-earth™: Shadow of Mordor™,0,Single-player|Steam Achievements|Full controll...
3,76561197960267644,236390,War Thunder,1,Action|Free to Play|Massively Multiplayer|Simu...
4,76561197960267644,251570,7 Days to Die,1,Action|Adventure|Indie|RPG|Simulation|Strategy...


In [55]:
movie_ratings_genre_exploded = (
    movie_ratings_data
    .assign(genres = movie_ratings_data['genres'].str.split('|'))
    .explode('genres')
    .rename(columns={'genres':'genre'})
)

In [56]:
user_stats = (
    movie_ratings_genre_exploded
    .groupby('userId', as_index=True)['rating']
    .agg(
        user_rating_count = 'count',
        user_average_rating = 'mean'
    )
)

In [57]:
user_genres = (
    movie_ratings_genre_exploded
    .groupby(['userId', 'genre'], as_index=True)['rating']
    .mean()
    .unstack('genre') 
    .add_prefix('user_')
)

In [58]:
user_features = (
    user_stats
    .join(user_genres)
    .reset_index()  
)

In [59]:
user_features = user_features.fillna(0)

In [60]:
user_features.head()

Unnamed: 0,userId,user_rating_count,user_average_rating,user_360 Video,user_Accounting,user_Action,user_Additional High-Quality Audio,user_Adventure,user_Akció,user_Aksiyon,...,user_冒险,user_动作,user_大型多人在线,user_抢先体验,user_模拟,user_無料プレイ,user_独立,user_竞速,user_策略,user_角色扮演
0,76561197960265730,1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,76561197960266573,3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,76561197960267615,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,76561197960267644,22,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,76561197960267984,2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [61]:
user_features['userId'].nunique()

379070

In [62]:
combined_features_df = (
    movie_ratings_data
      .loc[:, ['userId', 'movieId', 'rating']]
      .merge(user_features, on='userId')
      .merge(item_features, on='movieId')
)

In [63]:
combined_features_df.head()

Unnamed: 0,userId,movieId,rating,user_rating_count,user_average_rating,user_360 Video,user_Accounting,user_Action,user_Additional High-Quality Audio,user_Adventure,...,item_冒险,item_动作,item_大型多人在线,item_抢先体验,item_模拟,item_無料プレイ,item_独立,item_竞速,item_策略,item_角色扮演
0,76561197960265730,70,1,1,1.0,0.0,0.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,76561197960266573,639790,1,3,1.0,0.0,0.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,76561197960267615,241930,0,6,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,76561197960267644,236390,1,22,1.0,0.0,0.0,1.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
4,76561197960267644,251570,1,22,1.0,0.0,0.0,1.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0


In [64]:
combined_features_df.shape

(496932, 274)

#### Train Two-Tower System

In [66]:
train_df, test_df = train_test_split(combined_features_df, test_size=0.2, random_state=42)

In [67]:
train_df.head()

Unnamed: 0,userId,movieId,rating,user_rating_count,user_average_rating,user_360 Video,user_Accounting,user_Action,user_Additional High-Quality Audio,user_Adventure,...,item_冒险,item_动作,item_大型多人在线,item_抢先体验,item_模拟,item_無料プレイ,item_独立,item_竞速,item_策略,item_角色扮演
287112,76561198250908770,20920,1,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
403296,76561198839931715,1202690,1,8,1.0,0.0,0.0,1.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
273397,76561198208193903,7940,1,1,1.0,0.0,0.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
412191,76561198862390476,644560,1,3,1.0,0.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
187675,76561198101008593,947510,0,2,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [68]:
test_df.head()

Unnamed: 0,userId,movieId,rating,user_rating_count,user_average_rating,user_360 Video,user_Accounting,user_Action,user_Additional High-Quality Audio,user_Adventure,...,item_冒险,item_动作,item_大型多人在线,item_抢先体验,item_模拟,item_無料プレイ,item_独立,item_竞速,item_策略,item_角色扮演
165530,76561198083779573,493490,1,3,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
349749,76561198358370335,22320,1,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
20184,76561197983242000,1416420,0,2,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
299817,76561198272977150,597820,1,2,1.0,0.0,0.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
495276,76561199467560260,1178490,1,3,1.0,0.0,0.0,1.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0


In [69]:
user_continuous = ['user_rating_count', 'user_average_rating'] + [c for c in user_features.columns if c not in ['userId', 'user_rating_count', 'user_average_rating']]

In [70]:
item_continuous = ['item_average_rating']
item_genres = [c for c in item_features.columns if c not in ['movieId', 'item_average_rating']]

In [71]:
user_standardizer = StandardScaler()
item_standardizer = StandardScaler()

In [72]:
train_df[user_continuous] = user_standardizer.fit_transform(train_df[user_continuous])
train_df[item_continuous] = item_standardizer.fit_transform(train_df[item_continuous])

In [73]:
test_df[user_continuous] = user_standardizer.transform(test_df[user_continuous])
test_df[item_continuous] = item_standardizer.transform(test_df[item_continuous])

In [74]:
X_train_user = train_df[user_continuous].to_numpy()
X_train_item = train_df[item_continuous + item_genres].to_numpy()
y_train = train_df['rating'].to_numpy()

In [75]:
X_train_user.shape

(397545, 136)

In [76]:
X_train_item.shape

(397545, 135)

In [77]:
X_test_user = test_df[user_continuous].to_numpy()
X_test_item = test_df[item_continuous + item_genres].to_numpy()
y_test = test_df['rating'].to_numpy()

In [78]:
user_input = keras.Input(shape=(X_train_user.shape[1],), name='user_input')
item_input = keras.Input(shape=(X_train_item.shape[1],), name='item_input')

In [79]:
# user tower: 256 → 128 → 32
u = layers.Dense(256, activation='relu')(user_input)
u = layers.Dense(128, activation='relu')(u)
u = layers.Dense(32,  activation='relu')(u)

In [80]:
# item tower: 256 → 128 → 32
i = layers.Dense(256, activation='relu')(item_input)
i = layers.Dense(128, activation='relu')(i)
i = layers.Dense(32,  activation='relu')(i)

In [81]:
dot = layers.Dot(axes=1)([u, i])

In [82]:
model = keras.Model(inputs=[user_input, item_input], outputs=dot)

In [83]:
tf.random.set_seed(1)
model.compile(
    optimizer=keras.optimizers.Adam(0.01),
    loss=keras.losses.MeanSquaredError()
)

#### Train the model

In [84]:
history = model.fit(
    x = [X_train_user, X_train_item],
    y = y_train,
    batch_size=128,
    epochs=30,
    validation_data=([X_test_user, X_test_item], y_test),
    verbose=2
)

Epoch 1/30
3106/3106 - 5s - 2ms/step - loss: 0.0424 - val_loss: 0.0263
Epoch 2/30
3106/3106 - 4s - 1ms/step - loss: 0.7500 - val_loss: 0.0132
Epoch 3/30
3106/3106 - 4s - 1ms/step - loss: 0.0133 - val_loss: 0.0122
Epoch 4/30
3106/3106 - 4s - 1ms/step - loss: 0.0132 - val_loss: 0.0124
Epoch 5/30
3106/3106 - 4s - 1ms/step - loss: 0.0133 - val_loss: 0.0121
Epoch 6/30
3106/3106 - 4s - 1ms/step - loss: 0.0131 - val_loss: 0.0119
Epoch 7/30
3106/3106 - 4s - 1ms/step - loss: 0.0131 - val_loss: 0.0120
Epoch 8/30
3106/3106 - 4s - 1ms/step - loss: 0.0128 - val_loss: 0.0120
Epoch 9/30
3106/3106 - 4s - 1ms/step - loss: 0.0127 - val_loss: 0.0122
Epoch 10/30
3106/3106 - 4s - 1ms/step - loss: 0.0130 - val_loss: 0.0125
Epoch 11/30
3106/3106 - 4s - 1ms/step - loss: 0.0129 - val_loss: 0.0120
Epoch 12/30
3106/3106 - 4s - 1ms/step - loss: 0.0129 - val_loss: 0.0119
Epoch 13/30
3106/3106 - 4s - 1ms/step - loss: 0.0128 - val_loss: 0.0123
Epoch 14/30
3106/3106 - 4s - 1ms/step - loss: 0.0128 - val_loss: 0.0120
E

#### Compare predictions for an existing user

In [91]:
user_movie_counts = df.groupby('userId')['movieId'].count()
first_user = user_movie_counts[user_movie_counts > 5].index[0]
print(first_user)

76561197960279927


In [92]:
user_id = 76561197960279927

In [93]:
test_df.head()

Unnamed: 0,userId,movieId,rating,user_rating_count,user_average_rating,user_360 Video,user_Accounting,user_Action,user_Additional High-Quality Audio,user_Adventure,...,item_冒险,item_动作,item_大型多人在线,item_抢先体验,item_模拟,item_無料プレイ,item_独立,item_竞速,item_策略,item_角色扮演
165530,76561198083779573,493490,1,-0.1513,0.463741,0.0,-0.003885,-1.098373,-0.0334,-0.91993,...,0,0,0,0,0,0,0,0,0,0
349749,76561198358370335,22320,1,-0.166164,0.463741,0.0,-0.003885,-1.098373,-0.0334,-0.91993,...,0,0,0,0,0,0,0,0,0,0
20184,76561197983242000,1416420,0,-0.158732,-2.498655,0.0,-0.003885,-1.098373,-0.0334,-0.91993,...,0,0,0,0,0,0,0,0,0,0
299817,76561198272977150,597820,1,-0.158732,0.463741,0.0,-0.003885,0.95497,-0.0334,-0.91993,...,0,0,0,0,0,0,0,0,0,0
495276,76561199467560260,1178490,1,-0.1513,0.463741,0.0,-0.003885,0.95497,-0.0334,1.12886,...,0,0,0,0,0,0,0,0,0,0


In [94]:
user_test_df = test_df[test_df['userId'] == user_id].copy()

In [95]:
user_test_df.head()

Unnamed: 0,userId,movieId,rating,user_rating_count,user_average_rating,user_360 Video,user_Accounting,user_Action,user_Additional High-Quality Audio,user_Adventure,...,item_冒险,item_动作,item_大型多人在线,item_抢先体验,item_模拟,item_無料プレイ,item_独立,item_竞速,item_策略,item_角色扮演
53,76561197960279927,389980,0,-0.039825,0.134586,0.0,-0.003885,0.95497,-0.0334,1.12886,...,0,0,0,0,0,0,0,0,0,0
52,76561197960279927,252490,1,-0.039825,0.134586,0.0,-0.003885,0.95497,-0.0334,1.12886,...,0,0,0,0,0,0,0,0,0,0


In [96]:
X_user = user_test_df[user_continuous].to_numpy()
X_item = user_test_df[item_continuous + item_genres].to_numpy()

In [97]:
y_true = user_test_df['rating'].to_numpy()

In [98]:
y_pred = model.predict([X_user, X_item]).flatten()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step


In [99]:
user_test_df.insert(3, 'predictions', y_pred)

In [100]:
user_test_df.head()

Unnamed: 0,userId,movieId,rating,predictions,user_rating_count,user_average_rating,user_360 Video,user_Accounting,user_Action,user_Additional High-Quality Audio,...,item_冒险,item_动作,item_大型多人在线,item_抢先体验,item_模拟,item_無料プレイ,item_独立,item_竞速,item_策略,item_角色扮演
53,76561197960279927,389980,0,0.262296,-0.039825,0.134586,0.0,-0.003885,0.95497,-0.0334,...,0,0,0,0,0,0,0,0,0,0
52,76561197960279927,252490,1,0.957904,-0.039825,0.134586,0.0,-0.003885,0.95497,-0.0334,...,0,0,0,0,0,0,0,0,0,0


In [102]:
rmse = root_mean_squared_error(y_true, y_pred)
print(rmse)

0.18784481287002563


### Make predictions for a new user

In [107]:
list(user_test_df.columns)

['userId',
 'movieId',
 'rating',
 'predictions',
 'user_rating_count',
 'user_average_rating',
 'user_360 Video',
 'user_Accounting',
 'user_Action',
 'user_Additional High-Quality Audio',
 'user_Adventure',
 'user_Akció',
 'user_Aksiyon',
 'user_Animation & Modeling',
 'user_Audio Production',
 'user_Aventura',
 'user_Avventura',
 'user_Ação',
 'user_Bağımsız Yapımcı',
 'user_Captions available',
 'user_Casual',
 'user_Co-op',
 'user_Commentary available',
 'user_Corrida',
 'user_Cross-Platform Multiplayer',
 'user_Design & Illustration',
 'user_Desporto',
 'user_Documentary',
 'user_Downloadable Content',
 'user_Early Access',
 'user_Education',
 'user_Episodic',
 'user_Esportes',
 'user_Estrategia',
 'user_Estratégia',
 'user_Family Sharing',
 'user_Free To Play',
 'user_Free to Play',
 'user_Full controller support',
 'user_Game Development',
 'user_Game demo',
 'user_Gore',
 'user_In-App Purchases',
 'user_Indie',
 'user_Indépendant',
 'user_Kaland',
 'user_MMO',
 'user_Massively

In [123]:
# new_user = {
#     'user_rating_count'   : 3,
#     'user_average_rating' : 0.0,
#     'user_Action'         : 0.0,
#     'user_Adventure'      : 5.0,
#     'user_Animation'      : 0.0,
#     'user_Children'       : 0.0,
#     'user_Comedy'         : 0.0,
#     'user_Crime'          : 0.0,
#     'user_Documentary'    : 0.0,
#     'user_Drama'          : 0.0,
#     'user_Fantasy'        : 5.0,
#     'user_Horror'         : 0.0,
#     'user_Mystery'        : 0.0,
#     'user_Romance'        : 0.0,
#     'user_Sci-Fi'         : 0.0,
#     'user_Thriller'       : 0.0,
# }

new_user = {key: 0.0 for key in user_features.columns if key not in ['userId']}
new_user['user_Simulation'] = 5.0

In [124]:
new_user_df = pd.DataFrame([new_user])

In [125]:
new_user_df[user_continuous] = user_standardizer.transform(new_user_df[user_continuous])

In [126]:
X_new_user = new_user_df[user_continuous].to_numpy() 

In [127]:
X_new_user.shape

(1, 136)

#### Get the user embedding vector

In [128]:
user_model = keras.Model(
    inputs = user_input,
    outputs = u,
    name = "user_embedding_model"
)

#### Get the item embedding vector

In [129]:
item_model = keras.Model(
    inputs = item_input,
    outputs = i,
    name = "item_embedding_model"
)

In [130]:
user_embedding = user_model.predict(X_new_user)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step


In [131]:
user_embedding.shape

(1, 32)

In [132]:
item_features_copy = item_features.copy()

In [133]:
item_features_copy[item_continuous] = item_standardizer.transform(item_features_copy[item_continuous])

In [134]:
X_all_items = item_features_copy[item_continuous + item_genres].to_numpy()

In [135]:
item_embeddings = item_model.predict(X_all_items, batch_size=512)

[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 


In [136]:
item_embeddings.shape

(27262, 32)

In [137]:
scores = item_embeddings.dot(user_embedding[0]) 

In [138]:
len(scores)

27262

In [139]:
top_k_indices = np.argsort(scores)[-10:][::-1]
top_k_movieIds = item_features.iloc[top_k_indices]['movieId']

In [140]:
recommendations = (
    pd.DataFrame({'movieId': top_k_movieIds})
      .merge(movie_meta_data, on='movieId', how='left')
      [['movieId', 'title', 'genres']]
)

In [141]:
recommendations

Unnamed: 0,movieId,title,genres
0,1247100,SpellMaster: The Saga,Single-player|Steam Achievements|Full controll...
1,889480,Virtual Virtual Reality,Adventure|Indie
2,682990,Drug Dealer Simulator,Action|Adventure|Indie|Simulation
3,683320,GRIS,Adventure|Indie
4,683940,Lawgivers,Indie|Simulation
5,684410,Bridge Constructor Portal,Simulation|Strategy
6,684630,American Truck Simulator - New Mexico,Indie|Simulation
7,684680,Polygoneer,Action|Casual|Indie
8,684690,Grid Cartographer,Design & Illustration
9,684851,DCS: World War II Assets Pack,Simulation
