In [9]:
import ast
import numpy as np
import random
from scipy import stats
from collections import defaultdict

In [2]:
def parseDataItems(fname):
    for l in open(fname):
        dataList = []
        temp = l.split("},")
        print len(temp)
        for i in range(0, len(temp)):
            entry = (temp[i] + "}").strip()
            entryEval = ast.literal_eval(entry)
            dataList.append(entryEval)
    return dataList
        
print "Reading data..."
dataItems = parseDataItems("steam/new_all_items_3.json")
print "done"

Reading data...
10258
done


In [3]:
def parseDataUsers(fname):
    for l in open(fname):
        dataList = []
        temp = l.split("]}, {")
        print len(temp)
        for i in range(0, len(temp)):
            entry = ("{" + temp[i] + "]}").strip()
            entryEval = ast.literal_eval(entry)
            dataList.append(entryEval)
    return dataList
        
print "Reading data..."
dataUsers = parseDataUsers("steam/australia_users_items_2.json")
print "done"

Reading data...
88310
done


In [12]:
dataItems[1]

{'discount_final_price': '',
 'discount_original_price': '',
 'genre': ['Indie',
  'Simulation',
  'Casual',
  'Massively Multiplayer',
  'Adventure'],
 'item_id': '287140',
 'original_price': '$4.99',
 'release_date': 'May 8, 2015',
 'reviews': ['Mostly Positive']}

In [16]:
# Number of free games
countFreeGames = 0
for item in dataItems:
    if item.has_key('original_price') and item['original_price'] == "Free":
        countFreeGames += 1

print "Number of free games: {0}".format(countFreeGames)

Number of free games: 227


In [17]:
# Ratios of games played to games purchased
playedGamesRatioDict = defaultdict()
usersPlayedGamesDict = defaultdict(list)
usersNotPlayedGamesDict = defaultdict(list)
countPlayedAllGames = 0
allGamesPlayedList = []
for user in dataUsers:
    userPlayedCount = 0
    if user.has_key('items'):
        for item in user['items']:
            if item['playtime_forever'] > 0:
                userPlayedCount += 1
                usersPlayedGamesDict[user['user_id']].append(item['item_id'])
            else:
                usersNotPlayedGamesDict[user['user_id']].append(item['item_id'])
    if user['items_count'] > 0:
        playedGamesRatioDict[user['user_id']] = userPlayedCount/float(user['items_count'])
        if playedGamesRatioDict[user['user_id']] == 1.0:
            countPlayedAllGames += 1
            allGamesPlayedList.append(user['items_count'])
    else:
        playedGamesRatioDict[user['user_id']] = 0

In [22]:
playedGamesRatioDict

defaultdict(None,
            {'Suddenly_Whales': 0.5688073394495413,
             'legomagalaga': 0.8631578947368421,
             '76561198090125828': 0.8571428571428571,
             '76561198036840335': 0.8260869565217391,
             '76561198018399888': 0.8275862068965517,
             'DarknessFallsUponUs': 0.6,
             '76561197998166599': 0.6666666666666666,
             '76561198079096218': 0,
             '76561198010441325': 1.0,
             '76561198067910635': 0.423728813559322,
             '76561198073956487': 0,
             'wormwoodgaming': 0.631578947368421,
             'saltyman4': 0.8181818181818182,
             '76561198076857808': 0.6153846153846154,
             'WWABT': 0.7906976744186046,
             'NTMYZ': 0.5740740740740741,
             '76561198050944421': 0.8,
             '76561198055723594': 0,
             '91102984210101': 0.7540983606557377,
             '76561198032260429': 0.7272727272727273,
             '76561198107508313': 0.6315789

In [24]:
# Average ratio of games played to games purchased
print (np.mean(playedGamesRatioDict.values()))

0.5472901772481864

In [57]:
itemCounts = [d['items_count'] for d in dataUsers if d.has_key('items_count')] 

# Average number of games purchased per user
print (np.mean(itemCounts))

# Max
print "Max: {0}".format((max(itemCounts)))

print itemCounts.index(max(itemCounts))
#print dataUsers[3708]

# Min
print "Min: {0}".format((min(itemCounts)))
print itemCounts.index(min(itemCounts))
#print dataUsers[9]

# Median
print "Median: {0}".format((np.median(itemCounts)))

# Mode
print "Mode: {0}".format(stats.mode(np.array(itemCounts)))

# Has one game
hasOneGame = [d for d in itemCounts if d == 1]
print len(hasOneGame)

# Played all games
print "Played all games: {0}".format((countPlayedAllGames))
print (np.mean(allGamesPlayedList))
print max(allGamesPlayedList)

# Average number of games purchased per user

58.3536292606
Max: 7762
3708
Min: 0
9
Median: 26.0
Mode: ModeResult(mode=array([0]), count=array([16806]))
3304
Played all games: 4044
4.40306627102
57


In [None]:
playedPairs = []
for user in dataUsers[(len(dataUsers)/2):]:
    # Get all games that user has played
    for i in range(0, len(usersPlayedGamesDict)):
        playedPairs.append((user, usersPlayedGamesDict[i]))

In [13]:
allItemsList = [item['item_id'] for item in dataItems] 
allUsersList = [user['user_id'] for user in dataUsers]

In [18]:
# Build validation set

# 100 pairs where user played game
validationPlayed = []
while len(validationPlayed) < 100:
    randUser = random.choice(usersPlayedGamesDict.keys())
    randGame = random.choice(usersPlayedGamesDict[randUser])
    pair = (randUser, randGame)
    if pair not in validationPlayed:
        validationPlayed.append(pair)
        
# 100 pairs where user purchased game but did not play
validationPurchased = []
while len(validationPurchased) < 100:
    randUser = random.choice(usersNotPlayedGamesDict.keys())
    randGame = random.choice(usersNotPlayedGamesDict[randUser])
    pair = (randUser, randGame)
    if pair not in validationPurchased:
        validationPurchased.append(pair)

# 100 pairs where user did not purchase or play
validationNegative = []
while len(validationNegative) < 100:
    user = random.choice(allUsersList)
    item = random.choice(allItemsList)
    if (user, item) not in validationPlayed and (user, item) not in validationPurchased:
        validationNegative.append((user, item))

In [32]:
# Validation set labels
def createLabels():
    validation = validationPlayed + validationPurchased + validationNegative
    resultsDict = defaultdict()
    for (u,g) in validation:
        for userEntry in dataUsers:
            if u in usersPlayedGamesDict:
                if g in usersPlayedGamesDict[u]:
                    resultsDict[(u,g)] = 1
                else:
                    resultsDict[(u,g)] = 0
    return resultsDict

resultsDict = createLabels()

In [34]:
# Dictionary of item's list of reviews
reviewsDict = defaultdict()
for item in dataItems:
    reviewsDict[item['item_id']] = item['reviews']

In [48]:
# Predictions using simple review categories
predictions = []
for (u,g) in validation:
    for userEntry in dataUsers:
        if g in reviewsDict:
            if "Very Negative" in reviewsDict[g] or "Negative" in reviewsDict[g]:
                predictions.append(1)
        else:
            predictions.append(0)

In [46]:
match = [(x==y) for x,y in zip(predictions, results.values())]

In [47]:
print sum(match) * 1.0/len(match)

0.639285714286
