In [1]:
import ast
import numpy as np
import random
import sklearn
from scipy import stats
from collections import defaultdict
from sklearn import svm
from sklearn.svm import LinearSVC

In [2]:
def parseDataItems(fname):
    for l in open(fname):
        dataList = []
        temp = l.split("},")
        print len(temp)
        for i in range(0, len(temp)):
            entry = (temp[i] + "}").strip()
            entryEval = ast.literal_eval(entry)
            dataList.append(entryEval)
    return dataList
        
print "Reading data..."
dataItems = parseDataItems("steam/new_all_items_3.json")
print "done"

Reading data...
10258
done


In [3]:
def parseDataUsers(fname):
    for l in open(fname):
        dataList = []
        temp = l.split("]}, {")
        print len(temp)
        for i in range(0, len(temp)):
            entry = ("{" + temp[i] + "]}").strip()
            entryEval = ast.literal_eval(entry)
            dataList.append(entryEval)
    return dataList
        
print "Reading data..."
dataUsers = parseDataUsers("steam/australia_users_items_2.json")
print "done"

Reading data...
88310
done


In [16]:
# Number of free games
countFreeGames = 0
for item in dataItems:
    if item.has_key('original_price') and item['original_price'] == "Free":
        countFreeGames += 1

print "Number of free games: {0}".format(countFreeGames)

Number of free games: 227


In [4]:
# Ratios of games played to games purchased
playedGamesRatioDict = defaultdict()
usersPlayedGamesDict = defaultdict(list)
usersNotPlayedGamesDict = defaultdict(list)
countPlayedAllGames = 0
allGamesPlayedList = []
for user in dataUsers:
    userPlayedCount = 0
    if user.has_key('items'):
        for item in user['items']:
            if item['playtime_forever'] > 0:
                userPlayedCount += 1
                usersPlayedGamesDict[user['user_id']].append(item['item_id'])
            else:
                usersNotPlayedGamesDict[user['user_id']].append(item['item_id'])
    if user['items_count'] > 0:
        playedGamesRatioDict[user['user_id']] = userPlayedCount/float(user['items_count'])
        if playedGamesRatioDict[user['user_id']] == 1.0:
            countPlayedAllGames += 1
            allGamesPlayedList.append(user['items_count'])
    else:
        playedGamesRatioDict[user['user_id']] = 0

In [24]:
# Average ratio of games played to games purchased
print (np.mean(playedGamesRatioDict.values()))

0.5472901772481864

In [57]:
itemCounts = [d['items_count'] for d in dataUsers if d.has_key('items_count')] 

# Average number of games purchased per user
print (np.mean(itemCounts))

# Max
print "Max: {0}".format((max(itemCounts)))

print itemCounts.index(max(itemCounts))
#print dataUsers[3708]

# Min
print "Min: {0}".format((min(itemCounts)))
print itemCounts.index(min(itemCounts))
#print dataUsers[9]

# Median
print "Median: {0}".format((np.median(itemCounts)))

# Mode
print "Mode: {0}".format(stats.mode(np.array(itemCounts)))

# Has one game
hasOneGame = [d for d in itemCounts if d == 1]
print len(hasOneGame)

# Played all games
print "Played all games: {0}".format((countPlayedAllGames))
print (np.mean(allGamesPlayedList))
print max(allGamesPlayedList)

# Average number of games purchased per user

58.3536292606
Max: 7762
3708
Min: 0
9
Median: 26.0
Mode: ModeResult(mode=array([0]), count=array([16806]))
3304
Played all games: 4044
4.40306627102
57


In [None]:
#playedPairs = []
#for user in dataUsers[(len(dataUsers)/2):]:
#    # Get all games that user has played
#    for i in range(0, len(usersPlayedGamesDict)):
#        playedPairs.append((user, usersPlayedGamesDict[i]))

In [6]:
allItemsList = [item['item_id'] for item in dataItems] 
allUsersList = [user['user_id'] for user in dataUsers]

In [16]:
# Build validation set

# Pairs where user played game
validationPlayed = []
while len(validationPlayed) < 5000:
    randUser = random.choice(usersPlayedGamesDict.keys())
    randGame = random.choice(usersPlayedGamesDict[randUser])
    pair = (randUser, randGame)
    if pair not in validationPlayed:
        validationPlayed.append(pair)
        
# Pairs where user purchased game but did not play
validationPurchased = []
while len(validationPurchased) < 100:
    randUser = random.choice(usersNotPlayedGamesDict.keys())
    randGame = random.choice(usersNotPlayedGamesDict[randUser])
    pair = (randUser, randGame)
    if pair not in validationPurchased:
        validationPurchased.append(pair)

# Pairs where user did not purchase or play
validationNegative = []
while len(validationNegative) < 5000:
    user = random.choice(allUsersList)
    item = random.choice(allItemsList)
    if (user, item) not in validationPlayed: #and (user, item) not in validationPurchased:
        validationNegative.append((user, item))
        
validation = validationPlayed + validationPurchased

In [94]:
# Validation set labels
def createLabels():
    resultsDict = defaultdict()
    for (u,g) in validation:
        for userEntry in dataUsers:
            if u in usersPlayedGamesDict:
                if g in usersPlayedGamesDict[u]:
                    resultsDict[(u,g)] = 1
                else:
                    resultsDict[(u,g)] = 0
    return resultsDict

validation = validationPlayed + validationPurchased
resultsDict = createLabels()

In [24]:
# Dictionary of item's list of reviews
reviewsDict = defaultdict()
for item in dataItems:
    reviewsDict[item['item_id']] = item['reviews']

In [8]:
# Dictionary of user's items(items represented as dictionary)
usersItemsListDict = defaultdict()
for user in dataUsers:
    usersItemsListDict[user['user_id']] = user['items']

In [26]:
# Predictions using simple review categories for use in feature array
def feature_reviews(u, g):
    for userEntry in dataUsers:
        if g in reviewsDict:
            if "Very Negative" in reviewsDict[g] or "Negative" in reviewsDict[g]:
                return 1
    return 0  

In [48]:
# Predictions using simple review categories
predictions = []
for (u,g) in validation:
    for userEntry in dataUsers:
        if g in reviewsDict:
            if "Very Negative" in reviewsDict[g] or "Negative" in reviewsDict[g]:
                predictions.append(1)
        else:
            predictions.append(0)

In [46]:
match = [(x==y) for x,y in zip(predictions, resultsDict.values())]

In [47]:
print sum(match) * 1.0/len(match)

0.639285714286


In [9]:
# Dictionary of games' genres
gameGenreDict = defaultdict(list)
for game in dataItems:
    gameGenreDict[game['item_id']] = game['genre']

In [10]:
# Jaccard of user's played genres 
# Jaccard for each of user's played games (genres)

# Get all the genres the user has played
def get_user_genres(user):
    userPlayedGames = []
    userGenres = []
    allUserGenres = []
    if user in usersPlayedGamesDict:#usersItemsListDict:
        userPlayedGames = usersPlayedGamesDict[user]#[g for g in userItemsListDict[user] if ]
        for gameId in userPlayedGames:
            if gameId in allItemsList:
                userGenres.append(gameGenreDict[gameId])
    for listOfGenres in userGenres:
        for g in listOfGenres:
            allUserGenres.append(g)
    return set(allUserGenres)
            
    
print get_user_genres('76561197970982479')   


set(['3D Vision', 'Fantasy', 'Hunting', 'Political', 'Level Editor', 'Rogue-like', 'Turn-Based', 'Gambling', 'Comedy', 'Dynamic Narration', 'Crowdfunded', 'Linear', 'Benchmark', 'Post-apocalyptic', 'Quick-Time Events', 'Dungeon Crawler', 'Online Co-Op', 'Western', 'Lovecraftian', 'First-Person', 'Ninja', 'Robots', 'Adventure', 'Building', 'Magic', 'Games Workshop', 'Great Soundtrack', 'Steampunk', 'Choose Your Own Adventure', 'Fighting', 'Simulation', '2D', 'Realistic', 'Exploration', 'Remake', 'Military', 'Assassin', '4 Player Local', 'Third Person', 'Parody', 'God Game', 'Twin Stick Shooter', 'Pirates', 'Singleplayer', 'Indie', 'Aliens', "Shoot 'Em Up", 'Real-Time with Pause', 'Dragons', 'Trading', 'Cyberpunk', 'Gun Customization', 'Bullet Time', 'Education', 'Swordplay', 'Score Attack', 'Open World', 'Underwater', 'Management', 'Arcade', 'Satire', 'Resource Management', 'Sports', 'Romance', 'Spectacle fighter', 'Retro', 'Bullet Hell', 'Real-Time', 'Family Friendly', 'Soundtrack', 'D

In [50]:
predictions = []
for (u,g) in validation:
    userGenres = get_user_genres(u)
    gameGenres = set()
    if g in gameGenreDict:
        gameGenres = set(gameGenreDict[g])
    commonGenres = userGenres.intersection(gameGenres)
    if len(commonGenres) > 0:
        predictions.append(0)
    else:
        predictions.append(1)
    

print predictions

[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]


In [51]:
match = [(x==y) for x,y in zip(predictions, resultsDict.values())]
print sum(match) * 1.0/len(match)

0.576642335766


In [61]:
# Returns the Jaccard similarity of two lists
def jaccard_sim(listA, listB):
    A = set(listA)
    B = set(listB)
    num = A.intersection(B)
    denom = A.union(B)
    #print num
    #print denom
    #print "j"
    if denom > 0:
        return len(num) / float(len(denom))
    return 0.0
    
# Returns the Jaccard similarities between the game and all of the user's played games
def feature_jaccard(u, g):
    usersPlayedGames = []
    sims = []
    userGamesGenres = []
    itemGenres = []
    if g in gameGenreDict:
        itemGenres = gameGenreDict[g]
    if u in usersPlayedGamesDict:
        usersPlayedGames = usersPlayedGamesDict[u]
    for game in usersPlayedGames:
        if game in gameGenreDict:
            userGameGenres = gameGenreDict[game]
            #print userGameGenres
            sims.append(jaccard_sim(userGameGenres, itemGenres))
    return sims

In [91]:
# Create the feature vector
def feature(u, g):
    feat = [1]
    userGenres = get_user_genres(u, g)
    gameGenres = set()
    if g in gameGenreDict:
        gameGenres = set(gameGenreDict[g])
    commonGenres = userGenres.intersection(gameGenres)
    if len(commonGenres) > 0:
        feat.append(0)
    else:
        feat.append(1)
    #feat.append(feature_reviews(u,g))
    jaccards = feature_jaccard(u,g)
    if jaccards != []:
        max_jaccard = max(jaccards)
        avg_jaccard = np.mean(jaccards)
        feat.append(max_jaccard)
        feat.append(avg_jaccard)
    else:
        feat.append(0)
        feat.append(0)
    return feat

In [92]:
# Create y
def createY(u, g):
    result = 0
    for user in dataUsers:
        if u == user['user_id']:
            if u in usersPlayedGamesDict:
                if g in usersPlayedGamesDict[u]:
                    result = 1
            break
    return result

In [95]:
# SVM to predict whether a user will play a game
# TODO use a training set
X = [feature(u,g) for (u,g) in validation]
y = [createY(u, g) for (u, g) in validation]

clf = LinearSVC(C=1.0)
clf.fit(X, y)
train_predictions = clf.predict(X)
matchTrain = [(x==y) for x,y in zip(resultsDict.values(), train_predictions)]
# Accuracy of the predictor
print sum(matchTrain) * 1.0/len(matchTrain) # train accuracy

0.970184386034
