In [1]:
import ast
import numpy as np
import random
import sklearn
from scipy import stats
from collections import defaultdict
from sklearn import svm
from sklearn.svm import LinearSVC

In [2]:
# Parse item data
def parseDataItems(fname):
    for l in open(fname):
        dataList = []
        temp = l.split("},")
        print len(temp)
        for i in range(0, len(temp)):
            entry = (temp[i] + "}").strip()
            entryEval = ast.literal_eval(entry)
            dataList.append(entryEval)
    return dataList
        
print "Reading data..."
dataItems = parseDataItems("steam/new_all_items_3.json")
print "done"

Reading data...
10258
done


In [3]:
# Parse user data
def parseDataUsers(fname):
    for l in open(fname):
        dataList = []
        temp = l.split("]}, {")
        print len(temp)
        for i in range(0, len(temp)):
            entry = ("{" + temp[i] + "]}").strip()
            entryEval = ast.literal_eval(entry)
            dataList.append(entryEval)
    return dataList
        
print "Reading data..."
dataUsers = parseDataUsers("steam/australia_users_items_2.json")
print "done"

Reading data...
88310
done


In [4]:
# Training, validation, and test splits
length = len(dataUsers)
dataUsersTrain = dataUsers[0:53310]
dataUsersValidate = dataUsers[53310:70810]
dataUsersTest = dataUsers[70810:]

In [5]:
allItemsList = [item['item_id'] for item in dataItems] 
allUsersList = [user['user_id'] for user in dataUsers]

In [6]:
# Dictionary of games' genres
gameGenreDict = defaultdict(list)
for game in dataItems:
    gameGenreDict[game['item_id']] = game['genre']

In [7]:
# Builds a list of user-game pairs that don't exist in the given dictionary
def build_set_neg(size, usersPlayedGamesD):
    samples = []
    while len(samples) < size:
        user = random.choice(allUsersList)
        item = random.choice(allItemsList)
        pair = (user, item)
        if user not in usersPlayedGamesD or (user in usersPlayedGamesD and item not in usersPlayedGamesD[user]):
            samples.append(pair)
    return samples

In [8]:
# Get all of the user-game pairs in the training data
usersPlayedGamesDictTrain = defaultdict(list)
trainPlayed = []
for user in dataUsersTrain: 
    if user.has_key('items'):
        for item in user['items']:
            if item['playtime_forever'] > 0:
                usersPlayedGamesDictTrain[user['user_id']].append(item['item_id'])
                trainPlayed.append((user['user_id'], item['item_id']))
trainNegative = build_set_neg(len(trainPlayed), usersPlayedGamesDictTrain)
train = trainPlayed + trainNegative

In [9]:
print train[0]
print len(train)
print len(set(train))

('76561197970982479', '10')
5579698
5536681


In [10]:
# Get all of the user-game pairs in the validation data
usersPlayedGamesDictValidate = defaultdict(list)
validatePlayed = []
for user in dataUsersValidate: 
    if user.has_key('items'):
        for item in user['items']:
            if item['playtime_forever'] > 0:
                usersPlayedGamesDictValidate[user['user_id']].append(item['item_id'])
                validatePlayed.append((user['user_id'], item['item_id']))
validateNegative = build_set_neg(len(validatePlayed), usersPlayedGamesDictValidate)
validate = validatePlayed + validateNegative

In [11]:
print validate[0]
print len(validate)
print len(set(validate))

('76561198048192342', '220')
576170
576020


In [12]:
# Get all of the user-game pairs in the test data
usersPlayedGamesDictTest = defaultdict(list)
testPlayed = []
for user in dataUsersTest: 
    if user.has_key('items'):
        for item in user['items']:
            if item['playtime_forever'] > 0:
                usersPlayedGamesDictTest[user['user_id']].append(item['item_id'])
                testPlayed.append((user['user_id'], item['item_id']))
testNegative = build_set_neg(len(testPlayed), usersPlayedGamesDictTest)
test = testPlayed + testNegative

In [51]:
# Create labels for the pairs of users and games
labelsTrain = []
for i in range(0, 25000):#len(trainPlayed)):
    labelsTrain.append(1)
for i in range(0, 25000):#len(trainNegative)):
    labelsTrain.append(0)

labelsValidate = []
for i in range(0, 25000):#len(validatePlayed)):
    labelsValidate.append(1)
for i in range(0, len(validateNegative)):
    labelsValidate.append(0)
    
labelsTest = []
for i in range(0, 5000):#len(testPlayed)):
    labelsTest.append(1)
for i in range(0, 5000):#len(testNegative)):
    labelsTest.append(0)

In [14]:
# Returns the Jaccard similarity of two lists
def jaccard_sim(listA, listB):
    A = set(listA)
    B = set(listB)
    num = A.intersection(B)
    denom = A.union(B)
    if len(denom) > 0:
        return len(num) / float(len(denom))
    return 0.0
    
# Returns the Jaccard similarities between the game and all of the user's played games
def feature_jaccard(u, g, usersPlayedGamesDict):
    usersPlayedGames = []
    sims = []
    userGamesGenres = []
    itemGenres = []
    if g in gameGenreDict:
        itemGenres = gameGenreDict[g]
    if u in usersPlayedGamesDict:
        usersPlayedGames = usersPlayedGamesDict[u]
    for game in usersPlayedGames:
        if game in gameGenreDict:
            userGameGenres = gameGenreDict[game]
            sims.append(jaccard_sim(userGameGenres, itemGenres))
    return sims

In [15]:
# Get all the genres the user has played
def get_user_genres(user, usersPlayedGamesD):
    userPlayedGames = []
    userGenres = []
    allUserGenres = []
    if user in usersPlayedGamesD:
        userPlayedGames = usersPlayedGamesD[user]
        for gameId in userPlayedGames:
            if gameId in allItemsList:
                userGenres.append(gameGenreDict[gameId])
    for listOfGenres in userGenres:
        for g in listOfGenres:
            allUserGenres.append(g)
    return set(allUserGenres)

In [26]:
# Dictionary of item's list of reviews
reviewsDict = defaultdict()
for item in dataItems:
    reviewsDict[item['item_id']] = item['reviews']
        
# Predictions using simple review categories for use in feature array
def feature_reviews(u, g, dataU):
    for userEntry in dataU:
        if g in reviewsDict:
            if "Very Negative" in reviewsDict[g] or "Mostly Negative" in reviewsDict[g] or "Negative" in reviewsDict[g]:
                return 0
    return 1

In [47]:
# Create the feature vector # simple
def feature(u, g, usersPlayedGamesD, setType):
    feat = [1]
    userGenres = get_user_genres(u, usersPlayedGamesD)
    gameGenres = set()
    if g in gameGenreDict:
        gameGenres = set(gameGenreDict[g])
    commonGenres = userGenres.intersection(gameGenres)
    if len(commonGenres) > 0:
        feat.append(1)
    else:
        feat.append(0)
    return feat

In [59]:
# Ratios of games played to games purchased
playedGamesRatioDict = defaultdict()
gamesPlayedRatioDict = defaultdict()
usersNotPlayedGamesDict = defaultdict(list)
countPlayedAllGames = 0
allGamesPlayedList = []
for user in dataUsers:
    userPlayedCount = 0
    if user.has_key('items'):
        for item in user['items']:
            if item['playtime_forever'] > 0:
                userPlayedCount += 1
            else:
                usersNotPlayedGamesDict[user['user_id']].append(item['item_id'])
    if user['items_count'] > 0:
        playedGamesRatioDict[user['user_id']] = userPlayedCount/float(user['items_count'])
        if playedGamesRatioDict[user['user_id']] == 1.0:
            countPlayedAllGames += 1
            allGamesPlayedList.append(user['items_count'])
    else:
        playedGamesRatioDict[user['user_id']] = 0

In [73]:
def getGameRatios(dataU):
    gamesPlayedRatioDict = defaultdict(int)
    gamesPlayedCountDict = defaultdict(int)
    gamesPurchasedCountDict = defaultdict(int)
    for userEntry in dataU:
        userItems = userEntry['items']
        for itemEntry in userItems:
            if itemEntry['playtime_forever'] > 0:
                gamesPlayedCountDict[itemEntry['item_id']] += 1
            gamesPurchasedCountDict[itemEntry['item_id']] += 1
    for gameId in gamesPlayedCountDict:
        if gamesPurchasedCountDict[gameId] > 0:
            gamesPlayedRatioDict[gameId] = gamesPlayedCountDict[gameId] / float(gamesPurchasedCountDict[gameId])
    return gamesPlayedRatioDict
    
                
            

In [74]:
print getGameRatios(dataUsersTrain)

defaultdict(<type 'int'>, {'221540': 0.803125, '287140': 0.5681818181818182, '337730': 0.7777777777777778, '330100': 0.8888888888888888, '425870': 1.0, '511340': 0.6153846153846154, '368230': 0.8898678414096917, '351340': 0.5875, '415990': 0.75, '213570': 0.04918032786885246, '261920': 0.8, '355100': 0.6, '306520': 0.3055555555555556, '358640': 0.1092436974789916, '304540': 0.25, '431250': 0.8888888888888888, '339860': 0.6304347826086957, '377590': 0.2714285714285714, '501110': 0.5, '498600': 0.26666666666666666, '493650': 0.6666666666666666, '449140': 0.8642266824085005, '410340': 0.96875, '235380': 0.896551724137931, '392370': 1.0, '218130': 0.5957828622700763, '413690': 0.4235294117647059, '308270': 0.8421052631578947, '323270': 0.125, '382350': 0.52, '439940': 0.4357142857142857, '311170': 0.14705882352941177, '441770': 0.09090909090909091, '370920': 0.5833333333333334, '279500': 0.125, '341530': 0.373015873015873, '405000': 0.75, '389460': 0.6136363636363636, '386180': 0.944444444

In [87]:
# Create the feature vector
def feature(u, g, usersPlayedGamesD, gameRatios):
    feat = [1]
    userGenres = get_user_genres(u, usersPlayedGamesD)
    gameGenres = set()
    if g in gameGenreDict:
        gameGenres = set(gameGenreDict[g])
    commonGenres = userGenres.intersection(gameGenres)
    if len(commonGenres) > 0:
        feat.append(1)
    else:
        feat.append(0)
    #feat.append(feature_reviews(u,g))
    jaccards = feature_jaccard(u,g,usersPlayedGamesD)
    if jaccards != []:
        max_jaccard = max(jaccards)
        avg_jaccard = np.mean(jaccards)
        feat.append(max_jaccard)
        feat.append(avg_jaccard)
    else:
        feat.append(0)
        feat.append(0)
        
    # Play rate
    #gameRatios = getGameRatios(dataUsersTrain)
    if g in gameRatios:
        feat.append(gameRatios[g])
    else:
        feat.append(0)
    
    return feat

In [33]:
# SVM to predict whether a user will play a game
train = trainPlayed[:5000] + trainNegative[:5000]
X_train = [feature(u,g, usersPlayedGamesDictTrain,"train") for (u,g) in train] # change
y_train = labelsTrain

In [40]:
y_train = labelsTrain
len(y_train)

10000

In [57]:
# SVM on validation set to determine hyperparameter
lams = [0.01, 0.1, 1.0, 100.0, 1000.0]
for lam in lams:
    clf = LinearSVC(C=lam)
    clf.fit(X_train, y_train)
    
    validate = validatePlayed[:5000] + validateNegative[:5000]
    X_validate = [feature(u,g,usersPlayedGamesDictValidate) for (u,g) in validate]
    y_validate = labelsValidate
    validate_predictions = clf.predict(X_validate)
    matchValidate = [(x==y) for x,y in zip(labelsValidate, validate_predictions)]
    print lam
    print sum(matchValidate) * 1.0/len(matchValidate)

0.01
0.9172
0.1
0.9172
1.0
0.9172
100.0
0.9172
1000.0
0.9172


In [88]:
# SVM on training set
train = trainPlayed[:25000] + trainNegative[:25000]
gameRatios = getGameRatios(dataUsersTrain)
X_train = [feature(u,g,usersPlayedGamesDictTrain,gameRatios) for (u,g) in train]
y_train = labelsTrain

clf = LinearSVC(C=1.0)
clf.fit(X_train, y_train)

train_predictions = clf.predict(X_train)
matchTrain = [(x==y) for x,y in zip(labelsTrain, train_predictions)]
print sum(matchTrain) * 1.0/len(matchTrain)

0.91916


In [53]:
print len(X_train)
print len(y_train)

50000
50000


In [54]:
# SVM on test set
clf = LinearSVC(C=1.0)
clf.fit(X_train, y_train)
    
    
test = testPlayed[:5000] + testNegative[:5000]
X_test = [feature(u,g,usersPlayedGamesDictTest, "test") for (u,g) in test]
y_test = labelsTest

test_predictions = clf.predict(X_test)
matchTest = [(x==y) for x,y in zip(labelsTest, test_predictions)]
print sum(matchTest) * 1.0/len(matchTest)

0.8554


In [41]:
# SVM on test set
clf = LinearSVC(C=1.0)
clf.fit(X_train, y_train)
    
    
test = testPlayed[:5000] + testNegative[:5000]
X_test = [feature(u,g,usersPlayedGamesDictTest,"test") for (u,g) in test]
y_test = labelsTest

test_predictions = clf.predict(X_test)
matchTest = [(x==y) for x,y in zip(labelsTest, test_predictions)]
print sum(matchTest) * 1.0/len(matchTest)

0.9102


In [57]:
# JUST TESTING ALTERNATIVES
# Predictions using simple review categories
predictions = []
for (u,g) in test:
    if g in reviewsDict:
        if "Very Negative" not in reviewsDict[g] and "Mostly Negative" not in reviewsDict[g] and "Negative" not in reviewsDict[g]:
                predictions.append(0)
    else:
        predictions.append(1)

match = [(x==y) for x,y in zip(predictions, labelsTest)]

In [58]:
print sum(match) * 1.0/len(match)

0.569479429051


In [82]:
gameRatios = getGameRatios(dataUsersTrain)
predictions = []
for (u,g) in test:
    if g in gameRatios and gameRatios[g] > .5:
        predictions.append(1)
    else:
        predictions.append(0)

match = [(x==y) for x,y in zip(predictions, labelsTest)]

In [83]:
print sum(match) * 1.0/len(match)

0.7177
