In [263]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc, average_precision_score
from sklearn.mixture import GaussianMixture
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior() 
from tensorflow.python.framework import ops
from collections import Counter
import pickle

In [165]:
path = '/home/jewelle/data_bootcamp/LHL-final-project/'

df = pd.read_csv(path + 'final.csv')

In [166]:
df.head()

Unnamed: 0,userID,game,action,hours_played,all_reviews,developer,publisher,popular_tags,game_details,genre
0,151603712,Fallout 4,purchase,1.0,"Mostly Positive,(90,387),- 70% of the 90,387 u...",Bethesda Game Studios,"Bethesda Softworks,Bethesda Softworks","Open World,Post-apocalyptic,Exploration,Single...","Single-player,Steam Achievements,Full controll...",RPG
1,151603712,Fallout 4,play,87.0,"Mostly Positive,(90,387),- 70% of the 90,387 u...",Bethesda Game Studios,"Bethesda Softworks,Bethesda Softworks","Open World,Post-apocalyptic,Exploration,Single...","Single-player,Steam Achievements,Full controll...",RPG
2,87445402,Fallout 4,purchase,1.0,"Mostly Positive,(90,387),- 70% of the 90,387 u...",Bethesda Game Studios,"Bethesda Softworks,Bethesda Softworks","Open World,Post-apocalyptic,Exploration,Single...","Single-player,Steam Achievements,Full controll...",RPG
3,87445402,Fallout 4,play,83.0,"Mostly Positive,(90,387),- 70% of the 90,387 u...",Bethesda Game Studios,"Bethesda Softworks,Bethesda Softworks","Open World,Post-apocalyptic,Exploration,Single...","Single-player,Steam Achievements,Full controll...",RPG
4,25096601,Fallout 4,purchase,1.0,"Mostly Positive,(90,387),- 70% of the 90,387 u...",Bethesda Game Studios,"Bethesda Softworks,Bethesda Softworks","Open World,Post-apocalyptic,Exploration,Single...","Single-player,Steam Achievements,Full controll...",RPG


In [167]:
df.shape

(92530, 10)

In [168]:
df.dtypes

userID            int64
game             object
action           object
hours_played    float64
all_reviews      object
developer        object
publisher        object
popular_tags     object
game_details     object
genre            object
dtype: object

In [169]:
df = df.sort_values(['userID', 'game', 'hours_played'])

In [170]:
df.head()

Unnamed: 0,userID,game,action,hours_played,all_reviews,developer,publisher,popular_tags,game_details,genre
51895,5250,Alien Swarm,purchase,1.0,"Very Positive,(18,439),- 94% of the 18,439 use...",Valve,"Valve,Valve","Free to Play,Co-op,Action,Multiplayer,Aliens,O...","Single-player,Multi-player,Co-op,Steam Achieve...",Action
51896,5250,Alien Swarm,play,4.9,"Very Positive,(18,439),- 94% of the 18,439 use...",Valve,"Valve,Valve","Free to Play,Co-op,Action,Multiplayer,Aliens,O...","Single-player,Multi-player,Co-op,Steam Achieve...",Action
58359,5250,Counter-Strike,purchase,1.0,"Overwhelmingly Positive,(66,438),- 96% of the ...",Valve,"Valve,Valve","Action,FPS,Multiplayer,Shooter,Classic,Team-Ba...","Multi-player,Online Multi-Player,Local Multi-P...",Action
59534,5250,Day of Defeat,purchase,1.0,"Very Positive,(2,022),- 86% of the 2,022 user ...",Valve,"Valve,Valve","FPS,World War II,Multiplayer,Shooter,Action,Wa...","Multi-player,Valve Anti-Cheat enabled",Action
60143,5250,Deathmatch Classic,purchase,1.0,"Very Positive,(953),- 80% of the 953 user revi...",Valve,"Valve,Valve","Action,FPS,Classic,Multiplayer,Shooter,First-P...","Multi-player,Online Multi-Player,Local Multi-P...",Action


In [171]:
df.loc[(df['action'] == 'purchase') & (df['hours_played'] == 1.0), 'hours_played'] = 0

In [172]:
clean_df = df.drop_duplicates(['userID', 'game'], keep = 'last')

In [173]:
clean_df.drop(['action'], axis=1, inplace=True)

In [174]:
clean_df.head()

Unnamed: 0,userID,game,hours_played,all_reviews,developer,publisher,popular_tags,game_details,genre
51896,5250,Alien Swarm,4.9,"Very Positive,(18,439),- 94% of the 18,439 use...",Valve,"Valve,Valve","Free to Play,Co-op,Action,Multiplayer,Aliens,O...","Single-player,Multi-player,Co-op,Steam Achieve...",Action
58359,5250,Counter-Strike,0.0,"Overwhelmingly Positive,(66,438),- 96% of the ...",Valve,"Valve,Valve","Action,FPS,Multiplayer,Shooter,Classic,Team-Ba...","Multi-player,Online Multi-Player,Local Multi-P...",Action
59534,5250,Day of Defeat,0.0,"Very Positive,(2,022),- 86% of the 2,022 user ...",Valve,"Valve,Valve","FPS,World War II,Multiplayer,Shooter,Action,Wa...","Multi-player,Valve Anti-Cheat enabled",Action
60143,5250,Deathmatch Classic,0.0,"Very Positive,(953),- 80% of the 953 user revi...",Valve,"Valve,Valve","Action,FPS,Classic,Multiplayer,Shooter,First-P...","Multi-player,Online Multi-Player,Local Multi-P...",Action
12807,5250,Dota 2,0.0,"Very Positive,(1,015,621),- 85% of the 1,015,6...",Valve,"Valve,Valve","Free to Play,MOBA,Multiplayer,Strategy,e-sport...","Multi-player,Co-op,Steam Trading Cards,Steam W...","Action,Free to Play,Strategy"


In [175]:
n_users = len(clean_df.userID.unique())
n_games = len(clean_df.game.unique())

print('There are {0} users and {1} games in the data'.format(n_users, n_games))

There are 10140 users and 1976 games in the data


In [176]:
sparsity = clean_df.shape[0] / float(n_users * n_games)

print('{:.2%} of the user-item matrix is filled'.format(sparsity))

0.28% of the user-item matrix is filled


In [177]:
user_counter = Counter()
for user in clean_df.userID.tolist():
    user_counter[user] +=1

game_counter = Counter()
for game in clean_df.game.tolist():
    game_counter[game] += 1

In [178]:
user2idx = {user: i for i, user in enumerate(clean_df.userID.unique())}
idx2user = {i: user for user, i in user2idx.items()}

game2idx = {game: i for i, game in enumerate(clean_df.game.unique())}
idx2game = {i: game for game, i in game2idx.items()}

In [179]:
user_idx = clean_df['userID'].apply(lambda x: user2idx[x]).values
game_idx = clean_df['gameIdx'] = clean_df['game'].apply(lambda x: game2idx[x]).values
hours = clean_df['hours_played'].values

In [264]:
#check for missing values in clean_df
total = clean_df.isnull().sum().sort_values(ascending=False)
percent = (clean_df.isnull().sum()/clean_df.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head()

Unnamed: 0,Total,Percent
publisher,890,0.015854
genre,352,0.00627
developer,196,0.003491
all_reviews,178,0.003171
popular_tags,102,0.001817


In [265]:
steam_clean = clean_df.dropna(how='any', subset=['publisher', 'genre', 'developer', 
                                                 'all_reviews', 'popular_tags', 'game_details'])

In [266]:
steam_clean.head()

Unnamed: 0,userID,game,hours_played,all_reviews,developer,publisher,popular_tags,game_details,genre,gameIdx
51896,5250,Alien Swarm,4.9,"Very Positive,(18,439),- 94% of the 18,439 use...",Valve,"Valve,Valve","Free to Play,Co-op,Action,Multiplayer,Aliens,O...","Single-player,Multi-player,Co-op,Steam Achieve...",Action,0
58359,5250,Counter-Strike,0.0,"Overwhelmingly Positive,(66,438),- 96% of the ...",Valve,"Valve,Valve","Action,FPS,Multiplayer,Shooter,Classic,Team-Ba...","Multi-player,Online Multi-Player,Local Multi-P...",Action,1
59534,5250,Day of Defeat,0.0,"Very Positive,(2,022),- 86% of the 2,022 user ...",Valve,"Valve,Valve","FPS,World War II,Multiplayer,Shooter,Action,Wa...","Multi-player,Valve Anti-Cheat enabled",Action,2
60143,5250,Deathmatch Classic,0.0,"Very Positive,(953),- 80% of the 953 user revi...",Valve,"Valve,Valve","Action,FPS,Classic,Multiplayer,Shooter,First-P...","Multi-player,Online Multi-Player,Local Multi-P...",Action,3
12807,5250,Dota 2,0.0,"Very Positive,(1,015,621),- 85% of the 1,015,6...",Valve,"Valve,Valve","Free to Play,MOBA,Multiplayer,Strategy,e-sport...","Multi-player,Co-op,Steam Trading Cards,Steam W...","Action,Free to Play,Strategy",4


In [267]:
#check for missing values in steam_clean
total = steam_clean.isnull().sum().sort_values(ascending=False)
percent = (steam_clean.isnull().sum()/steam_clean.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head()

Unnamed: 0,Total,Percent
gameIdx,0,0.0
genre,0,0.0
game_details,0,0.0
popular_tags,0,0.0
publisher,0,0.0


In [268]:
#steam refunds games played for less than 2 hours, so logical to remove games with fewer hours palyed than 2.0
steam_df = steam_clean[steam_clean['hours_played'] > 2.0] 

In [269]:
#also filter for games with user count > 20
steam_df.groupby('game')['userID'].count()

steam_train = steam_df[steam_df.groupby('game').userID.transform('count')>19].copy()

In [270]:
steam_train.head()

Unnamed: 0,userID,game,hours_played,all_reviews,developer,publisher,popular_tags,game_details,genre,gameIdx
51896,5250,Alien Swarm,4.9,"Very Positive,(18,439),- 94% of the 18,439 use...",Valve,"Valve,Valve","Free to Play,Co-op,Action,Multiplayer,Aliens,O...","Single-player,Multi-player,Co-op,Steam Achieve...",Action,0
27686,5250,Portal 2,13.6,"Overwhelmingly Positive,(104,354),- 98% of the...",Valve,"Valve,Valve","Puzzle,Co-op,First-Person,Sci-fi,Comedy,Single...","Single-player,Co-op,Steam Achievements,Full co...","Action,Adventure",8
34318,76767,Banished,24.0,"Very Positive,(23,931),- 89% of the 23,931 use...",Shining Rock Software LLC,"Shining Rock Software LLC,Shining Rock Softwar...","City Builder,Strategy,Simulation,Survival,Indi...","Single-player,Steam Achievements","Indie,Simulation,Strategy",13
58286,76767,Counter-Strike,365.0,"Overwhelmingly Positive,(66,438),- 96% of the ...",Valve,"Valve,Valve","Action,FPS,Multiplayer,Shooter,Classic,Team-Ba...","Multi-player,Online Multi-Player,Local Multi-P...",Action,1
27625,76767,Portal 2,15.0,"Overwhelmingly Positive,(104,354),- 98% of the...",Valve,"Valve,Valve","Puzzle,Co-op,First-Person,Sci-fi,Comedy,Single...","Single-player,Co-op,Steam Achievements,Full co...","Action,Adventure",8


In [271]:
n_users = len(steam_train.userID.unique())
n_games = len(steam_train.game.unique())

print('There are {0} users and {1} games in the data'.format(n_users, n_games))

There are 6521 users and 174 games in the data


In [272]:
sparsity = steam_train.shape[0] / float(n_users * n_games)

print('{:.2%} of the user-item matrix is filled'.format(sparsity))

1.69% of the user-item matrix is filled


In [273]:
user_counter = Counter()
for user in steam_train.userID.tolist():
    user_counter[user] +=1

game_counter = Counter()
for game in steam_train.game.tolist():
    game_counter[game] += 1

In [274]:
user2idx = {user: i for i, user in enumerate(steam_train.userID.unique())}
idx2user = {i: user for user, i in user2idx.items()}

game2idx = {game: i for i, game in enumerate(steam_train.game.unique())}
idx2game = {i: game for game, i in game2idx.items()}

In [275]:
user_idx = steam_train['userID'].apply(lambda x: user2idx[x]).values
game_idx = steam_train['gameIdx'] = steam_train['game'].apply(lambda x: game2idx[x]).values
hours = steam_train['hours_played'].values

In [276]:
zero_matrix = np.zeros(shape = (n_users, n_games)) # Create a zero matrix
user_game_pref = zero_matrix.copy()
user_game_pref[user_idx, game_idx] = 1 # Fill preference matrix

user_game_interactions = zero_matrix.copy()
# Confidence matrix
user_game_interactions[user_idx, game_idx] = hours + 1 

In [277]:
k = 5

# Count the number of purchases for each user
purchase_counts = np.apply_along_axis(np.bincount, 1, user_game_pref.astype(int))
buyers_idx = np.where(purchase_counts[:, 1] >= 2 * k)[0] #find the users who purchase 2 * k games
print('{0} users bought {1} or more games'.format(len(buyers_idx), 2 * k))

456 users bought 10 or more games


In [278]:
test_frac = 0.2 # Let's save 10% of the data for validation and 10% for testing.
test_users_idx = np.random.choice(buyers_idx,
                                  size = int(np.ceil(len(buyers_idx) * test_frac)),
                                  replace = False)

In [279]:
val_users_idx = test_users_idx[:int(len(test_users_idx) / 2)]
test_users_idx = test_users_idx[int(len(test_users_idx) / 2):]

In [280]:
def data_process(dat, train, test, user_idx, k):
    for user in user_idx:
        purchases = np.where(dat[user, :] == 1)[0]
        mask = np.random.choice(purchases, size = k, replace = False)
        
        train[user, mask] = 0
        test[user, mask] = dat[user, mask]
    return train, test

In [281]:
train_matrix = user_game_pref.copy()
test_matrix = zero_matrix.copy()
val_matrix = zero_matrix.copy()

# Mask the train matrix and create the validation and test matrices
train_matrix, val_matrix = data_process(user_game_pref, train_matrix, val_matrix, val_users_idx, k)
train_matrix, test_matrix = data_process(user_game_pref, train_matrix, test_matrix, test_users_idx, k)

In [282]:
test_matrix[test_users_idx[0], test_matrix[test_users_idx[0], :].nonzero()[0]]

array([1., 1., 1., 1., 1.])

In [283]:
ops.reset_default_graph() # Create a new graphs

pref = tf.placeholder(tf.float32, (n_users, n_games))  # Here's the preference matrix
interactions = tf.placeholder(tf.float32, (n_users, n_games)) # Here's the hours played matrix
users_idx = tf.placeholder(tf.int32, (None))

In [285]:
n_features = 30

# The X matrix represents the user latent preferences with a shape of user x latent features
X = tf.Variable(tf.truncated_normal([n_users, n_features], mean = 0, stddev = 0.05))

# The Y matrix represents the game latent features with a shape of game x latent features
Y = tf.Variable(tf.truncated_normal([n_games, n_features], mean = 0, stddev = 0.05))

#initilization of the confidence parameter
conf_alpha = tf.Variable(tf.random_uniform([1], 0, 1))

In [286]:
#user bias
user_bias = tf.Variable(tf.truncated_normal([n_users, 1], stddev = 0.2))

# Concatenate the vector to the user matrix
X_plus_bias = tf.concat([X, 
                         #tf.convert_to_tensor(user_bias, dtype = tf.float32),
                         user_bias,
                         tf.ones((n_users, 1), dtype = tf.float32)], axis = 1)

In [287]:
# game bias
item_bias = tf.Variable(tf.truncated_normal([n_games, 1], stddev = 0.2))

# Cocatenate the vector to the game matrix
Y_plus_bias = tf.concat([Y, 
                         tf.ones((n_games, 1), dtype = tf.float32),
                         item_bias],
                         axis = 1)

In [288]:
pred_pref = tf.matmul(X_plus_bias, Y_plus_bias, transpose_b=True)

# Construct the confidence matrix with the hours played and alpha paramter
conf = 1 + conf_alpha * interactions

In [289]:
cost = tf.reduce_sum(tf.multiply(conf, tf.square(tf.subtract(pref, pred_pref))))
l2_sqr = tf.nn.l2_loss(X) + tf.nn.l2_loss(Y) + tf.nn.l2_loss(user_bias) + tf.nn.l2_loss(item_bias)
lambda_c = 0.01
loss = cost + lambda_c * l2_sqr

In [290]:
lr = 0.05
optimize = tf.train.AdagradOptimizer(learning_rate = lr).minimize(loss)

In [291]:
# This is a function that helps to calculate the top k precision 
def top_k_precision(pred, mat, k, user_idx):
    precisions = []
    
    for user in user_idx:
        rec = np.argsort(-pred[user, :]) 
        
        top_k = rec[:k]
        labels = mat[user, :].nonzero()[0]
        
        precision = len(set(top_k) & set(labels)) / float(k) # Calculate the precisions from actual labels
        precisions.append(precision)
    return np.mean(precisions) 

In [292]:
#train model

iterations = 100
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    for i in range(iterations):
        sess.run(optimize, feed_dict = {pref: train_matrix,
                                        interactions: user_game_interactions})
        
        if i % 10 == 0:
            mod_loss = sess.run(loss, feed_dict = {pref: train_matrix,
                                                   interactions: user_game_interactions})            
            mod_pred = pred_pref.eval()
            train_precision = top_k_precision(mod_pred, train_matrix, k, val_users_idx)
            val_precision = top_k_precision(mod_pred, val_matrix, k, val_users_idx)
            print('Iterations {0}...'.format(i),
                  'Training Loss {:.2f}...'.format(mod_loss),
                  'Train Precision {:.3f}...'.format(train_precision),
                  'Val Precision {:.3f}'.format(val_precision)
                )

    rec = pred_pref.eval()
    test_precision = top_k_precision(rec, test_matrix, k, test_users_idx)
    print('\n')
    print('Test Precision {:.3f}'.format(test_precision))

Iterations 0... Training Loss 1166758.25... Train Precision 0.126... Val Precision 0.074
Iterations 10... Training Loss 72578.23... Train Precision 0.617... Val Precision 0.009
Iterations 20... Training Loss 49959.28... Train Precision 0.704... Val Precision 0.009
Iterations 30... Training Loss 41471.16... Train Precision 0.726... Val Precision 0.009
Iterations 40... Training Loss 36905.39... Train Precision 0.743... Val Precision 0.004
Iterations 50... Training Loss 33977.66... Train Precision 0.761... Val Precision 0.004
Iterations 60... Training Loss 31931.21... Train Precision 0.778... Val Precision 0.000
Iterations 70... Training Loss 30416.03... Train Precision 0.800... Val Precision 0.000
Iterations 80... Training Loss 29234.18... Train Precision 0.826... Val Precision 0.000
Iterations 90... Training Loss 28294.54... Train Precision 0.835... Val Precision 0.000


Test Precision 0.000


In [293]:
n_examples = 10
users = np.random.choice(test_users_idx, size = n_examples, replace = False)
rec_games = np.argsort(-rec)

In [294]:
for user in users:
    print('User #{0} recommendations ...'.format(idx2user[user]))
    purchase_history = np.where(train_matrix[user, :] != 0)[0]
    recommendations = rec_games[user, :]

    
    new_recommendations = recommendations[~np.in1d(recommendations, purchase_history)][:k]
    
    print('Recommendations')
    print(', '.join([idx2game[game] for game in new_recommendations]))
    print('\n')
    print('Actual purchases')
    print(', '.join([idx2game[game] for game in np.where(test_matrix[user, :] != 0)[0]]))
    print('\n')
    print('Precision of {0}'.format(len(set(new_recommendations) & set(np.where(test_matrix[user, :] != 0)[0])) / float(k)))
    print('--------------------------------------')
    print('\n')

User #83090686 recommendations ...
Recommendations
Clicker Heroes, Trove, PAYDAY 2, Counter-Strike, DayZ


Actual purchases
Portal 2, Left 4 Dead 2, Don't Starve, Scribblenauts Unlimited, Gunpoint


Precision of 0.0
--------------------------------------


User #100519466 recommendations ...
Recommendations
War Thunder, Counter-Strike, Left 4 Dead 2, Dota 2, PlanetSide 2


Actual purchases
Portal 2, Don't Starve, Garry's Mod, PlanetSide 2, Euro Truck Simulator 2


Precision of 0.2
--------------------------------------


User #64973908 recommendations ...
Recommendations
Sniper Elite V2, L.A. Noire, The Wolf Among Us, Left 4 Dead, Alan Wake


Actual purchases
PAYDAY 2, Dungeon Defenders II, Euro Truck Simulator 2, Saints Row IV, Bastion


Precision of 0.0
--------------------------------------


User #68809194 recommendations ...
Recommendations
Killing Floor, Grand Theft Auto IV, Borderlands 2, Counter-Strike, War Thunder


Actual purchases
Mass Effect 2, Team Fortress 2, Dota 2, The 