# import

In [130]:
from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise import SVD
from surprise import KNNBasic
import json
import string
import matplotlib.pyplot as plt 
import json_lines
import itertools
import pandas as pd

In [131]:
with open('result.jl', 'rb') as f:
    entities = [x for x in json_lines.reader(f)]
    stories = [x for x in entities if str(x['pageType']) == "story" and str(x['storyType']) == '/book/Harry-Potter/']
    reviews = [x for x in entities if str(x['pageType']) == "review"]
    users = [x for x in entities if str(x['pageType']) == "user"]
    heldoutUser = users[0]
    users = users[1:]
    print(len(stories),len(reviews),len(users))

4037 129050 5175


# construct dicts

In [132]:
storyLinkToIdDict = {}
IdToStoryDict = {}

#create a dict between storied and their id
storyId = 0
for story in stories:
    storyLinkToIdDict[story['storyLink']] = storyId
    IdToStoryDict[storyId] = story
    storyId += 1
    
userLinkToIdDict = {}
IdToUserDict = {}

userId = 0
for user in users:
    userLinkToIdDict[user['name']] = userId
    IdToUserDict[userId] = user
    userId += 1

for review in reviews:
    if(review['reviewer'] not in userLinkToIdDict):
        userLinkToIdDict[review['reviewer']] = userId
        IdToUserDict[userId] = review['reviewer']
        userId+=1

reviewLinkToIdDict = {}
IdToReviewDict = {}
reviewId = 0
for review in reviews:
    reviewLinkToIdDict[review['reviewOf'] + '|' + review['reviewer']] = reviewId
    IdToReviewDict[reviewId] = review
    reviewId += 1

## make scores dict

In [133]:
import operator
import collections
from collections import Counter
import numpy as np

storyReviewDic = Counter({})
storyScores = {}

cnt = 0
for review in reviews:
    if(review['reviewOf'] in storyLinkToIdDict):
        #storyReviewDic[storyLinkToIdDict[review['reviewOf']]] += review['sentimentScore']
        userId = userLinkToIdDict[review['reviewer']]
        storyId = storyLinkToIdDict[review['reviewOf']]
        score = review['sentimentScore']
        storyScores[(userId, storyId)] = {
                    "storyId" : storyId, 
                    "userId" : userId,
                    "score" :  score
                }
        cnt += 1 

### add in favorites data

In [134]:
for user in users:
    userId = userLinkToIdDict[user['name']]
    
    for favorite in user['favorites']:
        if(favorite['favStory'] in storyLinkToIdDict):
            
            storyId = storyLinkToIdDict[favorite['favStory']]
            score = 2
            if((userId, storyId) not in storyScores):
                storyScores[(userId, storyId)] = {
                    "storyId" : storyId, 
                    "userId" : userId,
                    "score" :  0
                }
            storyScores[(userId, storyId)]['score'] += score


In [135]:
inputScores = []
userList = set()
storyList = set()
for score, body in storyScores.items():
    inputScores.append(body)
    userList.add(body['userId'])
    userList.add(body['storyId'])
#print(inputScores[:100])

In [137]:
def train(examples):
    df = pd.DataFrame(examples)
    reader = Reader(rating_scale=(-5, 10))
    data = Dataset.load_from_df(df[['userId', 'storyId', 'score']], reader)
    trainset = data.build_full_trainset()
    algo = KNNWithMeans()
    algo.fit(trainset)
    testset = trainset.build_anti_testset()
    #predictions = algo.test(testset)
    #cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    return algo    

trainedAlgo = train(inputScores)

Computing the msd similarity matrix...
Done computing similarity matrix.


In [138]:
def getTopNPredictions(algo, examples, userId, n = 10):
    df = pd.DataFrame(examples)
    df_filtered = df.query('userId==' + str(userId))
    #print(df_filtered)
    test_items = []
    for story in stories:
        storyId = storyLinkToIdDict[story['storyLink']]
        test_items.append({
                    "storyId" : storyId, 
                    "userId" : userId,
                    "score" :  0
                })
    df = pd.DataFrame(test_items)
    #remove values the user already knows
    mask = np.logical_not(df['storyId'].isin(set(df_filtered['storyId'])))
    df = df[mask]
    
    reader = Reader(rating_scale=(-5, 10))
    data = Dataset.load_from_df(df[['userId', 'storyId', 'score']], reader)
    trainset = data.build_full_trainset()
    testset = trainset.build_testset()
    predictions = algo.test(testset)
    
    top_n = []
    for uid, iid, true_r, est, _ in predictions:
        top_n.append((IdToStoryDict[iid]['storyLink'], iid, est))
    
    top_n.sort(key=lambda x: x[2], reverse=True)
    top_n = top_n[:n]
    return top_n
    
getTopNPredictions(trainedAlgo, inputScores, 500, 10)

[('/s/11255094/1/Rumpelstiltskin', 2429, 4.8756),
 ('/s/8045114/1/A-Marauder-s-Plan', 1, 3.9314999999999998),
 ('/s/4101650/1/Backward-With-Purpose-Part-I-Always-and-Always', 52, 3.8399),
 ('/s/11401300/1/A-Happy-Reunion', 4027, 3.7981999999999996),
 ('/s/2636645/1/I-Tauwght-I-Taw-A-Putty-Tat', 1390, 3.5465999999999998),
 ('/s/12175260/1/The-Sound-Of-Silence', 3977, 3.5465),
 ('/s/9901082/1/Family-Matters', 2377, 3.3299000000000003),
 ('/s/12026958/1/Sewing-Obsession', 3999, 2.987400000000001),
 ('/s/2703459/1/All-Bets', 2350, 2.63255),
 ('/s/8087429/1/Socks', 1527, 2.6249000000000002)]

In [139]:
def predict(user, n = 10):
    global userId, inputScores, trainedAlgo 
    if(user['name'] not in userLinkToIdDict):
        print('bad', userId)
        userLinkToIdDict[user['name']] = userId
        IdToUserDict[userId] = user
        userId += 1
        trainedAlgo = train(inputScores)
    uid = userLinkToIdDict[heldoutUser['name']]
    return getTopNPredictions(trainedAlgo, inputScores, userLinkToIdDict[user['name']], n)
    

In [140]:
predict(heldoutUser)

bad 5174
Computing the msd similarity matrix...
Done computing similarity matrix.


[('/s/3345078/1/Shadows-Within-the-Light', 622, 9.971400000000003),
 ('/s/3163139/1/Raspberry-Jam', 3252, 7.0242),
 ('/s/10714425/1/Messing-With-Time', 1050, 6.009800000000002),
 ('/s/11726446/1/third-time-s-a-charm', 2052, 5.071400000000001),
 ('/s/11266090/1/The-Beauty-In-Me', 2195, 5.0267),
 ('/s/9486886/1/Moratorium', 2347, 4.156037633764049),
 ('/s/8950627/1/Tainted', 2758, 4.117799999999999),
 ('/s/7481386/1/Killing-Time', 3671, 3.9542),
 ('/s/8326928/1/Three-s-The-Charm', 707, 3.940999999999999),
 ('/s/3951749/1/Harry-Potter-and-the-Quantum-Leap', 389, 3.8918999999999997)]