# Imports

In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import math
from collections import defaultdict
from tqdm import tqdm
import json

In [21]:
import tensorflow as tf

from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

from tensorflow import keras
from tensorflow.keras import layers

In [101]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error

# Load Data

In [22]:
import pandas as pd

df = pd.read_csv('../../dataset/weighted_score_above_08.csv')

df.head()

  df = pd.read_csv('../../dataset/weighted_score_above_08.csv')


Unnamed: 0,recommendationid,appid,game,author_steamid,author_num_games_owned,author_num_reviews,author_playtime_forever,author_playtime_last_two_weeks,author_playtime_at_review,author_last_played,...,voted_up,votes_up,votes_funny,weighted_vote_score,comment_count,steam_purchase,received_for_free,written_during_early_access,hidden_in_steam_china,steam_china_location
0,147449116,10,Counter-Strike,76561199183984450,51,12,2548,0,2480,1696305457,...,1,99,2,0.889438,0,1,0,0,1,
1,147374264,10,Counter-Strike,76561198099573060,226,13,2369,0,2361,1696096555,...,1,122,8,0.914834,0,1,0,0,1,
2,147357703,10,Counter-Strike,76561199080026894,118,23,13501,212,12957,1697630734,...,1,599,20,0.968375,3,1,0,0,1,
3,147345102,10,Counter-Strike,76561198068970227,28,1,10668,640,9906,1698261011,...,1,59,12,0.826206,0,0,0,0,1,
4,147284743,10,Counter-Strike,76561199137893460,19,5,543,10,526,1697810991,...,1,128,24,0.853612,0,1,0,0,1,


In [23]:
with open('../combined_df.json', 'r') as f:
    sample_data = [json.loads(line) for line in f]

In [24]:
valid_appids = {entry['appid'] for entry in sample_data}

df = df[df['appid'].isin(valid_appids)]

df_games = df[['appid', 'game']]
df_games = df_games.drop_duplicates(subset='appid')

appid2game = df_games.set_index('appid')['game'].to_dict()
game2appid = df_games.set_index('game')['appid'].to_dict()

del df_games

df = df[['appid', 'author_steamid', 'voted_up']]
df.rename(columns={'appid': 'movieId', 'author_steamid': 'userId', 'voted_up': 'rating'}, inplace=True)

In [25]:
item_id_to_genres = {entry['appid']: entry['genre'] for entry in sample_data}

In [7]:
# df.reset_index(drop=True, inplace=True)

In [8]:
# N_DISPLAY = 30
# has_no_genre_count = 0
# for i, item_id in enumerate(df['item_id'].unique()):
#     genres = item_id_to_genres.get(item_id, [])
#     if not genres:
#         has_no_genre_count += 1
#         continue
# print(f"Number of games without genres: {has_no_genre_count}")

In [9]:
# nan_count = df['rating'].isna().sum()
# print(f"Number of NaN values in 'rating': {nan_count}")

In [10]:
sample_data[0]

{'appid': 550, 'game': 'Left 4 Dead 2', 'genre': ['Action']}

In [11]:
# Create a DataFrame from sample_data
movies_data_clean = pd.DataFrame(sample_data)

# Rename columns for clarity
movies_data_clean.rename(columns={'game': 'title'}, inplace=True)

# Convert the 'genre' list into a pipe-separated string
movies_data_clean['genre'] = movies_data_clean['genre'].apply(lambda x: '|'.join(x) if isinstance(x, list) else '')

# Select only the required columns
movies_data_clean = movies_data_clean[['appid', 'title', 'genre']]
movies_data_clean.rename(columns={'appid': 'movieId'}, inplace=True)
movies_data_clean.rename(columns={'genre': 'genres'}, inplace=True)

movies_data_clean.head()

Unnamed: 0,movieId,title,genres
0,550,Left 4 Dead 2,Action
1,644560,Mirror,Adventure|Indie|RPG
2,264710,Subnautica,Adventure|Indie
3,294100,RimWorld,Indie|Simulation|Strategy
4,413150,Stardew Valley,Indie|RPG|Simulation


# Validation Methodology

In [None]:
# def leave_one_out_split(interactions_dict):
#     train_interactions = {}
#     test_interactions = {}

#     for user, items in interactions_dict.items():
#         if len(items) < 2:
#             continue  # already filtered but double check
#         train_interactions[user] = items[:-1]
#         test_interactions[user] = [items[-1]]  # last one is held out

#     return train_interactions, test_interactions

def leave_one_out_split(interactions_dict):
    train, test = {}, {}
    for user, items in interactions_dict.items():
        if len(items) < 2:
            continue
        train[user] = items[:-1]
        test[user] = [items[-1]]  # still just the last one
    return train, test

train_interactions, test_interactions = leave_one_out_split(user_item_interactions)


# Two Tower

### Create Item Train Feature Vectors

In [142]:
movie_features_data = movies_data_clean.copy()

In [143]:
movie_features_data.head()

Unnamed: 0,movieId,title,genres
0,934710,英语杀,Casual|Free To Play|Indie|RPG|Strategy|Account...
1,953610,HOT GIRLS VR,Action|Adventure|Casual|Free to Play|Indie|Mas...
2,919280,MovieMator Video Editor,Accounting|Animation & Modeling|Audio Producti...
3,1052500,御龙在天-平衡国战版,Action|Adventure|Casual|Free To Play|Indie|Mas...
4,1242700,墲人之境：探索,Action|Adventure|Casual|Indie|Massively Multip...


In [144]:
movies_data_clean.shape

(27262, 3)

In [145]:
movies_data_clean.loc[:, 'genre_count'] = movies_data_clean['genres'].str.count(r'\|') + 1

movies_data_clean = (
    movies_data_clean
    .sort_values('genre_count', ascending=False)
    .drop_duplicates('title', keep='first')
    .drop(columns='genre_count')
    .reset_index(drop=True)
)

In [146]:
movies_data_clean.shape

(27262, 3)

In [147]:
movies_data_clean['movieId'].nunique()

27262

#### Read Movie Ratings Data

In [149]:
ratings_data = df.copy()

In [150]:
ratings_data.head()

Unnamed: 0,movieId,userId,rating
0,10,76561199183984450,1
1,10,76561198099573060,1
2,10,76561199080026894,1
3,10,76561198068970227,1
4,10,76561199137893460,1


In [151]:
ratings_data['rating'].min()

np.int64(0)

In [152]:
ratings_data.shape

(498094, 3)

In [153]:
ratings_data['movieId'].nunique()

27315

In [189]:
movie_ratings_data = (
    movies_data_clean.join(ratings_data[['userId', 'movieId', 'rating']].set_index('movieId'), on='movieId', how='inner')
    [['userId', 'movieId', 'title', 'rating', 'genres']]
).sort_values(['userId', 'movieId']).reset_index(drop=True)

In [None]:
# Step 1a: Filter users with more than 1 rating
user_rating_counts = movie_ratings_data.groupby('userId')['movieId'].count()
eligible_users = user_rating_counts[user_rating_counts > 1].index

# Step 1b: Filter movies with more than 1 rating
movie_rating_counts = movie_ratings_data.groupby('movieId')['userId'].count()
eligible_movies = movie_rating_counts[movie_rating_counts > 1].index

# Step 1c: Apply both filters to movie_ratings_data
movie_ratings_data = movie_ratings_data[
    movie_ratings_data['userId'].isin(eligible_users) &
    movie_ratings_data['movieId'].isin(eligible_movies)
].copy()

In [None]:
# Group by userId to create user-item interaction history
user_item_interactions = (
    movie_ratings_data
    .sort_values(['userId', 'movieId'])
    .groupby('userId')['movieId']
    .apply(list)
    .to_dict()
)

In [273]:
train_interactions, test_interactions = leave_one_out_split(user_item_interactions)

In [None]:
train_rows = [(user, movie) for user, movies in train_interactions.items() for movie in movies]
test_rows = [(user, movie[0]) for user, movie in test_interactions.items()]

train_df = pd.DataFrame(train_rows, columns=['userId', 'movieId'])
test_df = pd.DataFrame(test_rows, columns=['userId', 'movieId'])

In [191]:
movie_ratings_data.head()

Unnamed: 0,userId,movieId,title,rating,genres
3,76561197960267644,236390,War Thunder,1,Action|Free to Play|Massively Multiplayer|Simu...
4,76561197960267644,251570,7 Days to Die,1,Action|Adventure|Indie|RPG|Simulation|Strategy...
5,76561197960267644,386360,SMITE,1,Action|Adventure|Casual|Massively Multiplayer|...
6,76561197960267644,1254120,Bless Unleashed,1,Action|Massively Multiplayer|RPG|Free To Play
12,76561197960269579,221180,Eufloria HD,1,Indie|Strategy


In [192]:
movie_ratings_data.shape

(160583, 5)

In [193]:
movie_ratings_data['movieId'].nunique()

15105

In [194]:
movie_ratings_data['userId'].nunique()

47935

#### Create Movie Meta Data

In [195]:
movie_meta_data = (
    movie_ratings_data[['movieId', 'title', 'genres']]
      .drop_duplicates(subset='movieId')
)

In [196]:
movie_meta_data.head()

Unnamed: 0,movieId,title,genres
3,236390,War Thunder,Action|Free to Play|Massively Multiplayer|Simu...
4,251570,7 Days to Die,Action|Adventure|Indie|RPG|Simulation|Strategy...
5,386360,SMITE,Action|Adventure|Casual|Massively Multiplayer|...
6,1254120,Bless Unleashed,Action|Massively Multiplayer|RPG|Free To Play
12,221180,Eufloria HD,Indie|Strategy


### Explore Movies

In [197]:
grouped_movie_data = (
    movie_ratings_data.groupby(['movieId', 'title', 'genres']).agg(
        rating_count=('rating', 'count'),
        rating_mean=('rating', 'mean'),
    ).reset_index()
)[['movieId', 'rating_count', 'rating_mean', 'title', 'genres']]
grouped_movie_data.columns = ['movie_id', 'num_ratings', 'average_rating', 'title', 'genres']
grouped_movie_data['average_rating'] = np.round(grouped_movie_data['average_rating'], 1)
grouped_movie_data.head()

Unnamed: 0,movie_id,num_ratings,average_rating,title,genres
0,10,188,1.0,Counter-Strike,Action
1,20,23,1.0,Team Fortress Classic,Action
2,30,9,1.0,Day of Defeat,Action
3,40,7,1.0,Deathmatch Classic,Action
4,50,20,1.0,Half-Life: Opposing Force,Action


In [198]:
grouped_movie_data.sort_values(by=['num_ratings'], ascending=False)

Unnamed: 0,movie_id,num_ratings,average_rating,title,genres
2279,292030,737,1.0,The Witcher 3: Wild Hunt,RPG
4089,413150,626,1.0,Stardew Valley,Indie|RPG|Simulation
1472,242760,550,1.0,The Forest,Action|Adventure|Indie|Simulation
1966,271590,539,0.8,Grand Theft Auto V,Action|Adventure
10170,1174180,536,1.0,Red Dead Redemption 2,Action|Adventure
...,...,...,...,...,...
4633,457010,1,1.0,StarsOne,Action|Indie|Massively Multiplayer|Early Access
4630,456220,1,0.0,Two Worlds II - Call of the Tenebrae,Action|Adventure|Indie|RPG
12422,1538970,1,1.0,Hammerwatch II,Action|Adventure|Indie|RPG
4625,455810,1,1.0,Rainbow Six Siege - Ops Icon Charm Bundle,Action


### Explore Genres

In [199]:
movie_ratings_data_genres = movie_ratings_data.assign(genres = movie_ratings_data['genres'].str.split('|'))
movie_ratings_data_exploded = movie_ratings_data_genres.explode('genres').rename(columns={'genres': 'genre'})

In [200]:
movie_ratings_data_exploded.head()

Unnamed: 0,userId,movieId,title,rating,genre
3,76561197960267644,236390,War Thunder,1,Action
3,76561197960267644,236390,War Thunder,1,Free to Play
3,76561197960267644,236390,War Thunder,1,Massively Multiplayer
3,76561197960267644,236390,War Thunder,1,Simulation
4,76561197960267644,251570,7 Days to Die,1,Action


In [201]:
grouped_genre_data = (
    movie_ratings_data_exploded.groupby(['genre'])
    .agg(
        movie_count=('movieId', 'nunique'),
        rating_mean=('rating', 'mean'),
        rating_count=('rating', 'count'),
    )
    .reset_index()
)
grouped_genre_data.columns = ['genre', 'num_movies', 'average_rating_per_genre', 'num_ratings_per_genre']
grouped_genre_data['average_rating_per_genre'] = np.round(
    grouped_genre_data['average_rating_per_genre'], 1
)

In [202]:
grouped_genre_data.sort_values('num_ratings_per_genre', ascending=False)

Unnamed: 0,genre,num_movies,average_rating_per_genre,num_ratings_per_genre
1,Action,6422,0.9,84069
32,Indie,8062,0.9,69085
3,Adventure,5940,0.9,67011
50,RPG,3899,0.9,43439
60,Simulation,3856,0.9,37224
...,...,...,...,...
34,Kaland,1,1.0,1
40,Multijogador Massivo,1,0.0,1
42,Nezávislé,1,1.0,1
65,Simulátory,1,1.0,1


### Create Item Feature Vectors

In [203]:
grouped_movie_ratings_data = (
    movie_ratings_data
    .groupby(['movieId', 'title', 'genres'], as_index=False)
    .agg(item_average_rating = ('rating', 'mean'))
)

In [204]:
grouped_movie_ratings_data.head()

Unnamed: 0,movieId,title,genres,item_average_rating
0,10,Counter-Strike,Action,1.0
1,20,Team Fortress Classic,Action,1.0
2,30,Day of Defeat,Action,1.0
3,40,Deathmatch Classic,Action,1.0
4,50,Half-Life: Opposing Force,Action,1.0


In [205]:
genre_dummies = grouped_movie_ratings_data['genres'].str.get_dummies(sep='|')

In [206]:
item_genre_dummies = genre_dummies.add_prefix('item_')

In [207]:
item_features = pd.concat([
    grouped_movie_ratings_data[['movieId', 'item_average_rating']],
    item_genre_dummies
], axis=1)

In [208]:
item_features.head()

Unnamed: 0,movieId,item_average_rating,item_Accounting,item_Action,item_Additional High-Quality Audio,item_Adventure,item_Akció,item_Aksiyon,item_Animation & Modeling,item_Audio Production,...,item_体育,item_免费开玩,item_动作,item_大型多人在线,item_抢先体验,item_模拟,item_独立,item_竞速,item_策略,item_角色扮演
0,10,1.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,20,1.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,30,1.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,40,1.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,50,1.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [209]:
item_features.shape

(15105, 127)

### Create User Feature Vectors

In [210]:
movie_ratings_data.head()

Unnamed: 0,userId,movieId,title,rating,genres
3,76561197960267644,236390,War Thunder,1,Action|Free to Play|Massively Multiplayer|Simu...
4,76561197960267644,251570,7 Days to Die,1,Action|Adventure|Indie|RPG|Simulation|Strategy...
5,76561197960267644,386360,SMITE,1,Action|Adventure|Casual|Massively Multiplayer|...
6,76561197960267644,1254120,Bless Unleashed,1,Action|Massively Multiplayer|RPG|Free To Play
12,76561197960269579,221180,Eufloria HD,1,Indie|Strategy


In [211]:
movie_ratings_genre_exploded = (
    movie_ratings_data
    .assign(genres = movie_ratings_data['genres'].str.split('|'))
    .explode('genres')
    .rename(columns={'genres':'genre'})
)

In [212]:
user_stats = (
    movie_ratings_genre_exploded
    .groupby('userId', as_index=True)['rating']
    .agg(
        user_rating_count = 'count',
        user_average_rating = 'mean'
    )
)

In [213]:
user_genres = (
    movie_ratings_genre_exploded
    .groupby(['userId', 'genre'], as_index=True)['rating']
    .mean()
    .unstack('genre') 
    .add_prefix('user_')
)

In [214]:
user_features = (
    user_stats
    .join(user_genres)
    .reset_index()  
)

In [215]:
user_features = user_features.fillna(0)

In [216]:
user_features.head()

Unnamed: 0,userId,user_rating_count,user_average_rating,user_Accounting,user_Action,user_Additional High-Quality Audio,user_Adventure,user_Akció,user_Aksiyon,user_Animation & Modeling,...,user_体育,user_免费开玩,user_动作,user_大型多人在线,user_抢先体验,user_模拟,user_独立,user_竞速,user_策略,user_角色扮演
0,76561197960267644,22,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,76561197960269579,2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,76561197960269822,5,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,76561197960271099,7,0.285714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,76561197960272407,5,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [217]:
user_features['userId'].nunique()

47935

In [218]:
combined_features_df = (
    movie_ratings_data
      .loc[:, ['userId', 'movieId', 'rating']]
      .merge(user_features, on='userId')
      .merge(item_features, on='movieId')
)

In [219]:
combined_features_df.head()

Unnamed: 0,userId,movieId,rating,user_rating_count,user_average_rating,user_Accounting,user_Action,user_Additional High-Quality Audio,user_Adventure,user_Akció,...,item_体育,item_免费开玩,item_动作,item_大型多人在线,item_抢先体验,item_模拟,item_独立,item_竞速,item_策略,item_角色扮演
0,76561197960267644,236390,1,22,1.0,0.0,1.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,76561197960267644,251570,1,22,1.0,0.0,1.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,76561197960267644,386360,1,22,1.0,0.0,1.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,76561197960267644,1254120,1,22,1.0,0.0,1.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,76561197960269579,221180,1,2,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [220]:
combined_features_df.shape

(160583, 256)

#### Train Two-Tower System

In [221]:
train_df, test_df = train_test_split(combined_features_df, test_size=0.2, random_state=42)

In [222]:
train_df.head()

Unnamed: 0,userId,movieId,rating,user_rating_count,user_average_rating,user_Accounting,user_Action,user_Additional High-Quality Audio,user_Adventure,user_Akció,...,item_体育,item_免费开玩,item_动作,item_大型多人在线,item_抢先体验,item_模拟,item_独立,item_竞速,item_策略,item_角色扮演
77362,76561198112662576,418370,1,7,1.0,0.0,1.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
68765,76561198090608579,2199210,1,33,1.0,0.0,1.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
84790,76561198131348759,245390,1,3,1.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
86551,76561198137042170,1245620,1,3,1.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
105017,76561198232847568,1173400,1,61,0.868852,0.0,0.916667,0.0,0.846154,0.0,...,0,0,0,0,0,0,0,0,0,0


In [223]:
test_df.head()

Unnamed: 0,userId,movieId,rating,user_rating_count,user_average_rating,user_Accounting,user_Action,user_Additional High-Quality Audio,user_Adventure,user_Akció,...,item_体育,item_免费开玩,item_动作,item_大型多人在线,item_抢先体验,item_模拟,item_独立,item_竞速,item_策略,item_角色扮演
117005,76561198300270231,406970,1,1339,0.759522,0.0,0.738281,0.0,0.800948,0.0,...,0,0,0,0,0,0,0,0,0,0
20136,76561198006560453,200510,1,110,0.981818,0.0,1.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
129146,76561198371932407,292030,1,4,1.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
149488,76561199002086696,2000950,1,42,1.0,0.0,1.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
38361,76561198036560642,1406990,1,15,1.0,0.0,1.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [224]:
user_continuous = ['user_rating_count', 'user_average_rating'] + [c for c in user_features.columns if c not in ['userId', 'user_rating_count', 'user_average_rating']]

In [225]:
item_continuous = ['item_average_rating']
item_genres = [c for c in item_features.columns if c not in ['movieId', 'item_average_rating']]

In [226]:
user_standardizer = StandardScaler()
item_standardizer = StandardScaler()

In [227]:
train_df[user_continuous] = user_standardizer.fit_transform(train_df[user_continuous])
train_df[item_continuous] = item_standardizer.fit_transform(train_df[item_continuous])

In [228]:
test_df[user_continuous] = user_standardizer.transform(test_df[user_continuous])
test_df[item_continuous] = item_standardizer.transform(test_df[item_continuous])

In [229]:
X_train_user = train_df[user_continuous].to_numpy()
X_train_item = train_df[item_continuous + item_genres].to_numpy()
y_train = train_df['rating'].to_numpy()

In [230]:
X_train_user.shape

(128466, 127)

In [231]:
X_train_item.shape

(128466, 126)

In [232]:
X_test_user = test_df[user_continuous].to_numpy()
X_test_item = test_df[item_continuous + item_genres].to_numpy()
y_test = test_df['rating'].to_numpy()

In [233]:
user_input = keras.Input(shape=(X_train_user.shape[1],), name='user_input')
item_input = keras.Input(shape=(X_train_item.shape[1],), name='item_input')

In [234]:
# user tower: 256 → 128 → 32
u = layers.Dense(256, activation='relu')(user_input)
u = layers.Dense(128, activation='relu')(u)
u = layers.Dense(32,  activation='relu')(u)

In [235]:
# item tower: 256 → 128 → 32
i = layers.Dense(256, activation='relu')(item_input)
i = layers.Dense(128, activation='relu')(i)
i = layers.Dense(32,  activation='relu')(i)

In [236]:
dot = layers.Dot(axes=1)([u, i])

In [237]:
model = keras.Model(inputs=[user_input, item_input], outputs=dot)

In [238]:
tf.random.set_seed(1)
model.compile(
    optimizer=keras.optimizers.Adam(0.01),
    loss=keras.losses.MeanSquaredError()
)

#### Train the model

In [239]:
history = model.fit(
    x = [X_train_user, X_train_item],
    y = y_train,
    batch_size=128,
    epochs=30,
    validation_data=([X_test_user, X_test_item], y_test),
    verbose=2
)

Epoch 1/30
1004/1004 - 2s - 2ms/step - loss: 0.1397 - val_loss: 0.0535
Epoch 2/30
1004/1004 - 1s - 1ms/step - loss: 0.0531 - val_loss: 0.0522
Epoch 3/30
1004/1004 - 1s - 1ms/step - loss: 0.0534 - val_loss: 0.0533
Epoch 4/30
1004/1004 - 2s - 1ms/step - loss: 0.0531 - val_loss: 0.0522
Epoch 5/30
1004/1004 - 1s - 1ms/step - loss: 0.0471 - val_loss: 0.0431
Epoch 6/30
1004/1004 - 1s - 1ms/step - loss: 0.1498 - val_loss: 0.0461
Epoch 7/30
1004/1004 - 1s - 1ms/step - loss: 0.0442 - val_loss: 0.0438
Epoch 8/30
1004/1004 - 1s - 1ms/step - loss: 0.0379 - val_loss: 0.0340
Epoch 9/30
1004/1004 - 1s - 1ms/step - loss: 0.0419 - val_loss: 0.0347
Epoch 10/30
1004/1004 - 1s - 1ms/step - loss: 0.0329 - val_loss: 0.0327
Epoch 11/30
1004/1004 - 1s - 1ms/step - loss: 0.0327 - val_loss: 0.0347
Epoch 12/30
1004/1004 - 2s - 2ms/step - loss: 0.0329 - val_loss: 0.0345
Epoch 13/30
1004/1004 - 1s - 1ms/step - loss: 0.0330 - val_loss: 0.0341
Epoch 14/30
1004/1004 - 1s - 1ms/step - loss: 0.0330 - val_loss: 0.0339
E

#### Compare predictions for an existing user

In [240]:
user_movie_counts = df.groupby('userId')['movieId'].count()
first_user = user_movie_counts[user_movie_counts > 5].index[0]
print(first_user)

76561197960279927


In [241]:
user_id = 76561197960279927

In [242]:
test_df.head()

Unnamed: 0,userId,movieId,rating,user_rating_count,user_average_rating,user_Accounting,user_Action,user_Additional High-Quality Audio,user_Adventure,user_Akció,...,item_体育,item_免费开玩,item_动作,item_大型多人在线,item_抢先体验,item_模拟,item_独立,item_竞速,item_策略,item_角色扮演
117005,76561198300270231,406970,1,9.169071,-0.53174,-0.003946,-0.021641,-0.045465,0.238874,-0.003946,...,0,0,0,0,0,0,0,0,0,0
20136,76561198006560453,200510,1,0.454861,0.432825,-0.003946,0.639136,-0.045465,0.700057,-0.003946,...,0,0,0,0,0,0,0,0,0,0
129146,76561198371932407,292030,1,-0.29673,0.511717,-0.003946,-1.885622,-0.045465,0.700057,-0.003946,...,0,0,0,0,0,0,0,0,0,0
149488,76561199002086696,2000950,1,-0.027292,0.511717,-0.003946,0.639136,-0.045465,0.700057,-0.003946,...,0,0,0,0,0,0,0,0,0,0
38361,76561198036560642,1406990,1,-0.218735,0.511717,-0.003946,0.639136,-0.045465,0.700057,-0.003946,...,0,0,0,0,0,0,0,0,0,0


In [243]:
user_test_df = test_df[test_df['userId'] == user_id].copy()

In [244]:
user_test_df.head()

Unnamed: 0,userId,movieId,rating,user_rating_count,user_average_rating,user_Accounting,user_Action,user_Additional High-Quality Audio,user_Adventure,user_Akció,...,item_体育,item_免费开玩,item_动作,item_大型多人在线,item_抢先体验,item_模拟,item_独立,item_竞速,item_策略,item_角色扮演
20,76561197960279927,555160,1,-0.211644,0.511717,-0.003946,0.639136,-0.045465,0.700057,-0.003946,...,0,0,0,0,0,0,0,0,0,0


In [245]:
X_user = user_test_df[user_continuous].to_numpy()
X_item = user_test_df[item_continuous + item_genres].to_numpy()

In [246]:
y_true = user_test_df['rating'].to_numpy()

In [247]:
y_pred = model.predict([X_user, X_item]).flatten()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step


In [248]:
user_test_df.insert(3, 'predictions', y_pred)

In [249]:
user_test_df.head()

Unnamed: 0,userId,movieId,rating,predictions,user_rating_count,user_average_rating,user_Accounting,user_Action,user_Additional High-Quality Audio,user_Adventure,...,item_体育,item_免费开玩,item_动作,item_大型多人在线,item_抢先体验,item_模拟,item_独立,item_竞速,item_策略,item_角色扮演
20,76561197960279927,555160,1,0.996454,-0.211644,0.511717,-0.003946,0.639136,-0.045465,0.700057,...,0,0,0,0,0,0,0,0,0,0


In [250]:
rmse = root_mean_squared_error(y_true, y_pred)
print(rmse)

0.003546416759490967


### Make predictions for a new user

In [251]:
list(user_test_df.columns)

['userId',
 'movieId',
 'rating',
 'predictions',
 'user_rating_count',
 'user_average_rating',
 'user_Accounting',
 'user_Action',
 'user_Additional High-Quality Audio',
 'user_Adventure',
 'user_Akció',
 'user_Aksiyon',
 'user_Animation & Modeling',
 'user_Audio Production',
 'user_Aventura',
 'user_Ação',
 'user_Bağımsız Yapımcı',
 'user_Captions available',
 'user_Casual',
 'user_Co-op',
 'user_Corrida',
 'user_Cross-Platform Multiplayer',
 'user_Design & Illustration',
 'user_Downloadable Content',
 'user_Early Access',
 'user_Education',
 'user_Episodic',
 'user_Esportes',
 'user_Estrategia',
 'user_Estratégia',
 'user_Family Sharing',
 'user_Free To Play',
 'user_Free to Play',
 'user_Full controller support',
 'user_Game Development',
 'user_Game demo',
 'user_Gore',
 'user_In-App Purchases',
 'user_Indie',
 'user_Indépendant',
 'user_Kaland',
 'user_MMO',
 'user_Massively Multiplayer',
 'user_Miscellaneous',
 'user_Movie',
 'user_Multi-player',
 'user_Multijogador Massivo',
 '

In [252]:
# new_user = {
#     'user_rating_count'   : 3,
#     'user_average_rating' : 0.0,
#     'user_Action'         : 0.0,
#     'user_Adventure'      : 5.0,
#     'user_Animation'      : 0.0,
#     'user_Children'       : 0.0,
#     'user_Comedy'         : 0.0,
#     'user_Crime'          : 0.0,
#     'user_Documentary'    : 0.0,
#     'user_Drama'          : 0.0,
#     'user_Fantasy'        : 5.0,
#     'user_Horror'         : 0.0,
#     'user_Mystery'        : 0.0,
#     'user_Romance'        : 0.0,
#     'user_Sci-Fi'         : 0.0,
#     'user_Thriller'       : 0.0,
# }

new_user = {key: 0.0 for key in user_features.columns if key not in ['userId']}
new_user['user_Simulation'] = 5.0

In [253]:
new_user_df = pd.DataFrame([new_user])

In [254]:
new_user_df[user_continuous] = user_standardizer.transform(new_user_df[user_continuous])

In [255]:
X_new_user = new_user_df[user_continuous].to_numpy() 

In [256]:
X_new_user.shape

(1, 127)

#### Get the user embedding vector

In [257]:
user_model = keras.Model(
    inputs = user_input,
    outputs = u,
    name = "user_embedding_model"
)

#### Get the item embedding vector

In [258]:
item_model = keras.Model(
    inputs = item_input,
    outputs = i,
    name = "item_embedding_model"
)

In [259]:
user_embedding = user_model.predict(X_new_user)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step


In [260]:
user_embedding.shape

(1, 32)

In [261]:
item_features_copy = item_features.copy()

In [262]:
item_features_copy[item_continuous] = item_standardizer.transform(item_features_copy[item_continuous])

In [263]:
X_all_items = item_features_copy[item_continuous + item_genres].to_numpy()

In [264]:
item_embeddings = item_model.predict(X_all_items, batch_size=512)

[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 


In [265]:
item_embeddings.shape

(15105, 32)

In [266]:
scores = item_embeddings.dot(user_embedding[0]) 

In [267]:
len(scores)

15105

In [268]:
top_k_indices = np.argsort(scores)[-10:][::-1]
top_k_movieIds = item_features.iloc[top_k_indices]['movieId']

In [269]:
recommendations = (
    pd.DataFrame({'movieId': top_k_movieIds})
      .merge(movie_meta_data, on='movieId', how='left')
      [['movieId', 'title', 'genres']]
)

In [270]:
recommendations

Unnamed: 0,movieId,title,genres
0,1175380,Arma 3 Creator DLC: Spearhead 1944,Aksiyon|Simülasyon|Strateji
1,1235751,The Sims™ 4 Vintage Glamour Stuff,Simulação
2,1235720,The Sims™ 4 Get Famous,Simulação
3,458360,eMedia Piano and Keyboard Method,Education
4,347990,Assetto Corsa - Dream Pack 1,Indie|Corrida|Simulação|Esportes
5,364970,CopperCube 5,Animation & Modeling|Design & Illustration|Edu...
6,251810,Leadwerks Game Engine,Animation & Modeling|Design & Illustration|Edu...
7,877850,Tribe XR | DJ Academy,Casual|Indie|Simulation|Strategy|Audio Product...
8,883680,Four Kings One War,Strategy|Early Access
9,2105980,会计模拟器,Strategy|Early Access
