In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pickle
from sklearn import preprocessing
from scipy.sparse.linalg import svds
import seaborn as sns
import matplotlib.pyplot as plt
from progressbar import progressbar
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.metrics.pairwise import euclidean_distances
from scipy.spatial.distance import cdist

In [2]:
reviewsPath = 'C:/Users/AdamS/Desktop/SteamRevs/Data/updated/highRevs.feather'
distancePath = 'C:/Users/AdamS/Desktop/SteamRevs/Data/updated/nonBiasDistanceDf.feather'
tsnePath = 'C:/Users/AdamS/Desktop/SteamRevs/Data/updated/tsne2.feather'
gamesPath = 'C:/Users/AdamS/Desktop/SteamRevs/Data/updated/betterGames.feather'

In [3]:
df = pd.read_feather(reviewsPath)
positive = df[df.voted_up == True].head(1_000_000).copy()
negative = df[df.voted_up == False].copy()

In [4]:
sample = positive.sample(10_000)

In [5]:
positive = positive.drop(sample.index)

In [6]:
sample = sample[sample['steamid'].isin(positive['steamid'])]

In [7]:
myRevs = df[df.steamid == 76561198142214326]

In [8]:
myRevs

Unnamed: 0,steamid,appid,voted_up,votes_up,votes_funny,weighted_vote_score,playtime_forever,playtime_at_review,num_games_owned,num_reviews,review,unix_timestamp_created,unix_timestamp_updated,reviewcount
6314372,76561198142214326,394360,True,0,0,0.0,42684,23131,96,5,It's fun,1561917886,1561917886,3
6314373,76561198142214326,22380,True,0,0,0.0,11148,9622,96,5,One of the greatest games I've ever played. Do...,1511584487,1511584487,3
6314374,76561198142214326,72850,True,0,0,0.0,32956,31707,96,5,Probably my most played game across multiple p...,1480200544,1511586027,3


In [9]:
positive = pd.concat([positive, myRevs])

In [10]:
positive[positive.steamid == 76561198142214326]

Unnamed: 0,steamid,appid,voted_up,votes_up,votes_funny,weighted_vote_score,playtime_forever,playtime_at_review,num_games_owned,num_reviews,review,unix_timestamp_created,unix_timestamp_updated,reviewcount
6314372,76561198142214326,394360,True,0,0,0.0,42684,23131,96,5,It's fun,1561917886,1561917886,3
6314373,76561198142214326,22380,True,0,0,0.0,11148,9622,96,5,One of the greatest games I've ever played. Do...,1511584487,1511584487,3
6314374,76561198142214326,72850,True,0,0,0.0,32956,31707,96,5,Probably my most played game across multiple p...,1480200544,1511586027,3


In [11]:
negative = negative[negative['steamid'].isin(positive['steamid'])]

In [12]:
def setInteraction(df_highest_revs):
    df_highest_revs['interaction'] = 0
    conditions = [
        (df_highest_revs['playtime_forever'] < 5) & df_highest_revs['voted_up'],
        (df_highest_revs['playtime_forever'] > 5) & (df_highest_revs['playtime_forever'] < 20) & df_highest_revs['voted_up'],
        (df_highest_revs['playtime_forever'] > 20) & (df_highest_revs['playtime_forever'] < 100) & df_highest_revs['voted_up'],
        (df_highest_revs['playtime_forever'] > 100) & (df_highest_revs['playtime_forever'] < 500) & df_highest_revs['voted_up'],
        (df_highest_revs['playtime_forever'] > 500) & (df_highest_revs['playtime_forever'] < 100) & df_highest_revs['voted_up'],
        (df_highest_revs['playtime_forever'] > 1000) & df_highest_revs['voted_up'],
    ]
    values = [0, 1, 10, 100, 1000, 10000]
    df_highest_revs['interaction'] = np.select(conditions, values)
    df_highest_revs['interaction']

In [13]:
setInteraction(positive)
negative['interaction'] = 5

In [14]:
%%time
user_item_p = positive.copy().groupby(['steamid','appid'])['interaction'].first().unstack(fill_value = 0.0)

Wall time: 9.41 s


In [15]:
user_item_p

appid,10,20,30,40,50,60,70,80,130,220,...,1020820,1025050,1025250,1029630,1031480,1032430,1035610,1037190,1046030,1051310
steamid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
76561197960267615,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
76561197960269218,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
76561197960269425,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
76561197960270304,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
76561197960270929,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76561199152799217,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
76561199153533082,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
76561199154221669,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
76561199154274167,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
%%time
user_item_n = negative.groupby(['steamid','appid'])['interaction'].first().unstack(fill_value = 0.0)

Wall time: 2.55 s


In [17]:
def getMatrix(user_item):
    U, sigma, Vt = svds(user_item.to_numpy().astype('float'), k = 64)
    U.shape, sigma.shape, Vt.shape
    sigma_diag_matrix=np.diag(sigma)
    train_full_matrix = np.dot(np.dot(U, sigma_diag_matrix), Vt)
    prediction_train_full = pd.DataFrame(train_full_matrix, columns = user_item.columns, index=user_item.index)
    prediction_train_full.reset_index(inplace = True)
    prediction_train_full.set_index('steamid')
    return(prediction_train_full)

In [18]:
%%time
full_p = getMatrix(user_item_p)

Wall time: 3min 50s


In [19]:
%%time
full_n = getMatrix(user_item_n)

Wall time: 1min 20s


In [20]:
distanceDf = pd.read_feather(distancePath)
tsne_df = pd.read_feather(tsnePath)

In [21]:
dists = cdist(distanceDf, distanceDf, 'euclid')
dists = pd.DataFrame(dists)
dists.set_index(tsne_df.appid, inplace = True)
dists.columns = (tsne_df.appid)

In [22]:
games = pd.read_feather(gamesPath)

In [23]:
dists3 = dists.copy()
dists3.set_index(games.name, inplace = True)
dists3.columns = (games.name)

In [24]:
def getRecs(userID, prediction_train_full, df_highest_revs):
    topRecs = prediction_train_full[prediction_train_full.steamid == userID].iloc[0].sort_values(ascending = False)[1:10].to_frame()
    topRecs.columns = ['pred']
    return topRecs

def getBest(userID):
    bestGames = getRecs(userID, full_p, positive)
    bestGames.reset_index(inplace = True)
    if userID in negative['steamid'].unique():
        worstGames = getRecs(userID, full_n, negative)
        worstGames.reset_index(inplace = True)
        bestGames['negative'] = bestGames['appid'].isin(worstGames['appid'])
        bestGames = bestGames[bestGames['negative'] == False]
    return bestGames

def getHighInteraction(userID):
    high_interaction = user_item_p.loc[userID].sort_values(ascending = False).head(5)
    high_interaction = pd.DataFrame(high_interaction)
    high_interaction.reset_index(inplace = True)
    high_interaction.columns = ['appid','interaction']
    high_interaction = high_interaction[high_interaction['interaction'] > 0]
    return high_interaction

def removeHigh(bestGames, interGames):
    bestGames['high'] = bestGames['appid'].isin(interGames['appid'])
    bestGames = bestGames[bestGames['high'] == False]
    return bestGames

def getClosest(inter):
    result = []
    for x in inter.appid:
        result.append(dists.loc[x].sort_values().index[1])
    inter['closest'] = result
    return(inter)

def addClosest(best, inter):
    closest = inter[['closest']]
    closest.columns = ['appid']
    best = pd.concat([best,closest]).drop_duplicates().reset_index(drop=True)
    return best

def createRecs(userID):
    bestGames = getBest(userID)
    inter = getHighInteraction(userID)
    bestGames = removeHigh(bestGames, inter)
    inter = getClosest(inter)
    bestGames = addClosest(bestGames, inter)
    return(bestGames.head(10)[['appid']])

In [25]:
testRecs = createRecs(76561198142214326)
testRecs.merge(games, how='left', on='appid')

Unnamed: 0,appid,index,name,dev,positive,negative,owners,price,genre,tags,binary_tags
0,236850,4325,Europa Universalis IV,"Paradox Development Studio, Paradox Tinto",100640,15254,3500000,39.99,"Simulation, Strategy","{'1980s': None, '1990's': None, '2.5D': None, ...","Alternate History, Diplomacy, Economy, Educati..."
1,281990,1974,Stellaris,Paradox Development Studio,129713,16893,3500000,39.99,"Simulation, Strategy","{'1980s': None, '1990's': None, '2.5D': None, ...","4X, Atmospheric, Diplomacy, Exploration, Futur..."
2,8930,7316,Sid Meier's Civilization V,"Firaxis Games, Aspyr (Mac), Aspyr (Linux)",183922,7380,15000000,29.99,Strategy,"{'1980s': None, '1990's': None, '2.5D': None, ...","4X, Addictive, Classic, Co-op, Diplomacy, Econ..."
3,636480,6020,Ravenfield,SteelRaven7,56661,1738,1500000,17.99,"Action, Indie, Early Access","{'1980s': None, '1990's': None, '2.5D': None, ...","Action, Adventure, Atmospheric, Early Access, ..."
4,42960,7697,Victoria II,Paradox Development Studio,15838,1304,1500000,19.99,Strategy,"{'1980s': None, '1990's': None, '2.5D': None, ...","Alternate History, Diplomacy, Economy, Grand S..."
5,294100,2235,RimWorld,Ludeon Studios,156074,3023,3500000,34.99,"Indie, Simulation, Strategy","{'1980s': None, '1990's': None, '2.5D': None, ...","2D, Base-Building, Building, Cartoony, City Bu..."
6,22320,7478,The Elder Scrolls III: Morrowind Game of the Y...,Bethesda Game Studios,20891,1005,1500000,14.99,RPG,"{'1980s': None, '1990's': None, '2.5D': None, ...","Action, Action RPG, Adventure, Atmospheric, Ch..."
7,73170,7868,Darkest Hour: A Hearts of Iron Game,Martin Ivanov,1676,189,350000,9.99,Strategy,"{'1980s': None, '1990's': None, '2.5D': None, ...","Cold War, Grand Strategy, Historical, Military..."
8,22370,7481,Fallout 3: Game of the Year Edition,Bethesda Game Studios,32162,8077,3500000,19.99,RPG,"{'1980s': None, '1990's': None, '2.5D': None, ...","Action, Action RPG, Adventure, Atmospheric, Cu..."


In [52]:
testRecs

Unnamed: 0,appid,name
0,218620,218620
1,252950,252950
2,319630,319630
3,391220,391220
4,20920,20920
5,20900,20900
6,80,80
7,400,400
8,817130,817130
9,391220,391220


In [26]:
total = 0
count = 0
for x in sample['steamid']:
        userID = x
        test = createRecs(userID)
        test['check'] = test[test['appid'].isin(sample[sample.steamid == userID].appid)]
        total = total + test['check'].count()
        count = count + 1

In [35]:
count

9994

In [36]:
print('Number of Games Successfully Recommended: ' + str(total))
print('Number of Users attempted to Recommended to: ' + str(count))

Number of Games Successfully Recommended: 1416
Number of Users attempted to Recommended to: 9994
