In [4]:
import ast
import numpy as np
import random
import sklearn
from scipy import stats
from collections import defaultdict
from sklearn import svm
from sklearn.svm import LinearSVC

In [5]:
# Parse item data
def parseDataItems(fname):
    for l in open(fname):
        dataList = []
        temp = l.split("},")
        print len(temp)
        for i in range(0, len(temp)):
            entry = (temp[i] + "}").strip()
            entryEval = ast.literal_eval(entry)
            dataList.append(entryEval)
    return dataList
        
print "Reading data..."
dataItems = parseDataItems("steam/new_all_items_3.json")
print "done"

Reading data...
10258
done


In [6]:
# Parse user data
def parseDataUsers(fname):
    for l in open(fname):
        dataList = []
        temp = l.split("]}, {")
        print len(temp)
        for i in range(0, len(temp)):
            entry = ("{" + temp[i] + "]}").strip()
            entryEval = ast.literal_eval(entry)
            dataList.append(entryEval)
    return dataList
        
print "Reading data..."
dataUsers = parseDataUsers("steam/australia_users_items_2.json")
print "done"

Reading data...
88310
done


In [7]:
# Training, validation, and test splits
length = len(dataUsers)
dataUsersTrain = dataUsers[0:53310]
dataUsersValidate = dataUsers[53310:70810]
dataUsersTest = dataUsers[70810:]

In [8]:
allItemsList = [item['item_id'] for item in dataItems] 
allUsersList = [user['user_id'] for user in dataUsers]

In [23]:
# Dictionary of games' genres
gameGenreDict = defaultdict(list)
for game in dataItems:
    gameGenreDict[game['item_id']] = game['genre']

In [9]:
# Builds a list of user-game pairs that don't exist in the given dictionary
def build_set_neg(size, usersPlayedGamesD):
    samples = []
    while len(samples) < size:
        user = random.choice(allUsersList)
        item = random.choice(allItemsList)
        pair = (user, item)
        if user not in usersPlayedGamesD or (user in usersPlayedGamesD and item not in usersPlayedGamesD[user]):
            samples.append(pair)
    return samples

In [10]:
# Get all of the user-game pairs in the training data
usersPlayedGamesDictTrain = defaultdict(list)
trainPlayed = []
for user in dataUsersTrain: 
    if user.has_key('items'):
        for item in user['items']:
            if item['playtime_forever'] > 0:
                usersPlayedGamesDictTrain[user['user_id']].append(item['item_id'])
                trainPlayed.append((user['user_id'], item['item_id']))
trainNegative = build_set_neg(len(trainPlayed), usersPlayedGamesDictTrain)
train = trainPlayed + trainNegative

In [40]:
print train[0]
print len(train)
print len(set(train))

('76561197970982479', '10')
5579698
5536594


In [12]:
# Get all of the user-game pairs in the validation data
usersPlayedGamesDictValidate = defaultdict(list)
validatePlayed = []
for user in dataUsersValidate: 
    if user.has_key('items'):
        for item in user['items']:
            if item['playtime_forever'] > 0:
                usersPlayedGamesDictValidate[user['user_id']].append(item['item_id'])
                validatePlayed.append((user['user_id'], item['item_id']))
validateNegative = build_set_neg(len(validatePlayed), usersPlayedGamesDictValidate)
validate = validatePlayed + validateNegative

In [42]:
print validate[0]
print len(validate)
print len(set(validate))

('76561198048192342', '220')
576170
576025


In [13]:
# Get all of the user-game pairs in the test data
usersPlayedGamesDictTest = defaultdict(list)
testPlayed = []
for user in dataUsersTest: 
    if user.has_key('items'):
        for item in user['items']:
            if item['playtime_forever'] > 0:
                usersPlayedGamesDictTest[user['user_id']].append(item['item_id'])
                testPlayed.append((user['user_id'], item['item_id']))
testNegative = build_set_neg(len(testPlayed), usersPlayedGamesDictTest)
test = testPlayed + testNegative

In [60]:
# Create labels for the pairs of users and games
labelsTrain = []
for i in range(0, 25000):#len(trainPlayed)):
    labelsTrain.append(1)
for i in range(0, 50000):#len(trainNegative)):
    labelsTrain.append(0)

labelsValidate = []
for i in range(0, 25000):#len(validatePlayed)):
    labelsValidate.append(1)
for i in range(0, len(validateNegative)):
    labelsValidate.append(0)
    
labelsTest = []
for i in range(0, 5000):#len(testPlayed)):
    labelsTest.append(1)
for i in range(0, 5000):#len(testNegative)):
    labelsTest.append(0)

In [25]:
# Returns the Jaccard similarity of two lists
def jaccard_sim(listA, listB):
    A = set(listA)
    B = set(listB)
    num = A.intersection(B)
    denom = A.union(B)
    if len(denom) > 0:
        return len(num) / float(len(denom))
    return 0.0
    
# Returns the Jaccard similarities between the game and all of the user's played games
def feature_jaccard(u, g, usersPlayedGamesDict):
    usersPlayedGames = []
    sims = []
    userGamesGenres = []
    itemGenres = []
    if g in gameGenreDict:
        itemGenres = gameGenreDict[g]
    if u in usersPlayedGamesDict:
        usersPlayedGames = usersPlayedGamesDict[u]
    for game in usersPlayedGames:
        if game in gameGenreDict:
            userGameGenres = gameGenreDict[game]
            sims.append(jaccard_sim(userGameGenres, itemGenres))
    return sims

In [26]:
# Get all the genres the user has played
def get_user_genres(user, usersPlayedGamesD):
    userPlayedGames = []
    userGenres = []
    allUserGenres = []
    if user in usersPlayedGamesD:
        userPlayedGames = usersPlayedGamesD[user]
        for gameId in userPlayedGames:
            if gameId in allItemsList:
                userGenres.append(gameGenreDict[gameId])
    for listOfGenres in userGenres:
        for g in listOfGenres:
            allUserGenres.append(g)
    return set(allUserGenres)

In [38]:
# Create the feature vector
def feature(u, g, usersPlayedGamesD):
    feat = [1]
    userGenres = get_user_genres(u, usersPlayedGamesD)
    gameGenres = set()
    if g in gameGenreDict:
        gameGenres = set(gameGenreDict[g])
    commonGenres = userGenres.intersection(gameGenres)
    if len(commonGenres) > 0:
        feat.append(1)
    else:
        feat.append(0)
    #feat.append(feature_reviews(u,g))
    jaccards = feature_jaccard(u,g,usersPlayedGamesD)
    if jaccards != []:
        max_jaccard = max(jaccards)
        avg_jaccard = np.mean(jaccards)
        feat.append(max_jaccard)
        feat.append(avg_jaccard)
    else:
        feat.append(0)
        feat.append(0)
    return feat

In [44]:
# SVM to predict whether a user will play a game
train = trainPlayed[:5000] + trainNegative[:5000]
X_train = [feature(u,g, usersPlayedGamesDictTrain) for (u,g) in train_sub] # change
y_train = labelsTrain

In [57]:
# SVM on validation set to determine hyperparameter
lams = [0.01, 0.1, 1.0, 100.0, 1000.0]
for lam in lams:
    clf = LinearSVC(C=lam)
    clf.fit(X_train, y_train)
    
    validate = validatePlayed[:5000] + validateNegative[:5000]
    X_validate = [feature(u,g,usersPlayedGamesDictValidate) for (u,g) in validate]
    y_validate = labelsValidate
    validate_predictions = clf.predict(X_validate)
    matchValidate = [(x==y) for x,y in zip(labelsValidate, validate_predictions)]
    print lam
    print sum(matchValidate) * 1.0/len(matchValidate)

0.01
0.9172
0.1
0.9172
1.0
0.9172
100.0
0.9172
1000.0
0.9172


In [None]:
# SVM on training set
clf = LinearSVC(C=1.0)
clf.fit(X_train, y_train)
    
train = trainPlayed[:5000] + trainNegative[:5000]
X_train = [feature(u,g,usersPlayedGamesDictTrain) for (u,g) in train]
y_train = labelsTrain

train_predictions = clf.predict(X_train)
matchTrain = [(x==y) for x,y in zip(labelsTrain, train_predictions)]
print sum(matchTrain) * 1.0/len(matchTrain)

In [61]:
# SVM on test set
clf = LinearSVC(C=1.0)
clf.fit(X_train, y_train)
    
    
test = testPlayed[:5000] + testNegative[:5000]
X_test = [feature(u,g,usersPlayedGamesDictTest) for (u,g) in test]
y_test = labelsTest

test_predictions = clf.predict(X_test)
matchTest = [(x==y) for x,y in zip(labelsTest, test_predictions)]
print sum(matchTest) * 1.0/len(matchTest)

0.9102
