In [9]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
import numpy
import string
import random
import pandas as pd
from surprise import SVD, Reader, Dataset

In [10]:
# Colab-specific setup: mount Google Drive at /content/drive so that the
# "drive/MyDrive/..." paths used by later cells can be opened.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
def assertFloat(x):
    """Check that ``x`` can be converted to a float.

    Any exception raised by ``float(x)`` (e.g. ValueError, TypeError)
    propagates to the caller. Returns None on success.
    """
    converted = float(x)
    assert type(converted) is float

def assertFloatList(items, N):
    """Check that ``items`` has exactly ``N`` entries, each convertible to float.

    Raises AssertionError on a length mismatch; any exception raised by
    ``float(entry)`` propagates. Returns None on success.
    """
    assert len(items) == N
    converted_types = list(map(lambda entry: type(float(entry)), items))
    assert converted_types == N * [float]

In [12]:
def readGz(path):
    """Yield one evaluated Python object per line of a gzip-compressed text file.

    Args:
        path: location of the .gz file, opened in text mode.

    Yields:
        The result of ``eval`` on each line, in file order.
    """
    # The context manager closes the handle once the generator is exhausted
    # (or discarded); the bare gzip.open() call used before leaked it.
    with gzip.open(path, 'rt') as f:
        for l in f:
            # SECURITY: eval() executes arbitrary code found in the file. Only use
            # this on trusted data; if every line is a plain literal,
            # ast.literal_eval would be a safer choice.
            yield eval(l)

In [13]:
def readCSV(path):
    """Yield ``(user, book, rating)`` tuples from a gzip-compressed CSV file.

    The first line is treated as a header and skipped. Every other non-blank
    line must hold exactly three comma-separated fields; the third is converted
    to ``int``.

    Args:
        path: location of the .csv.gz file.

    Yields:
        (str, str, int) tuples in file order.

    Raises:
        ValueError: if a non-blank line does not have exactly three fields or
            the third field is not an integer.
    """
    # The context manager closes the handle when iteration finishes; the
    # previous version never closed it.
    with gzip.open(path, 'rt') as f:
        f.readline()  # skip the header row
        for l in f:
            l = l.strip()
            if not l:
                # Blank (e.g. trailing) lines would otherwise break the 3-way unpack.
                continue
            u,b,r = l.split(',')
            yield u,b,int(r)

In [17]:
### Would-read baseline: just rank which books are popular and which are not, and return '1' if a book is among the top-ranked

# Share of all training interactions that the "popular" set has to cover
# before we stop adding books to it.
POPULAR_COVERAGE = 0.725

allRatings = []
bookCount = defaultdict(int)
totalRead = 0

# NOTE: the loop variable is deliberately not named `r`. A later cell defines the
# predictor function r(u, b); re-running this cell with a loop variable `r`
# would overwrite that function with an int.
for user,book,rating in readCSV("drive/MyDrive/CSE 258/data/train_Interactions.csv.gz"):
  rating = int(rating)
  allRatings.append([user,book,rating])
  bookCount[book] += 1
  totalRead += 1

# (count, book) pairs ordered from most-read to least-read. The pairs are all
# distinct (book ids are dict keys), so this matches sort() followed by reverse().
mostPopular = sorted(((bookCount[x], x) for x in bookCount), reverse=True)

# Greedily take the most popular books until they account for more than
# POPULAR_COVERAGE of all interactions.
return1 = set()
count = 0
for ic, i in mostPopular:
  count += ic
  return1.add(i)
  if count > totalRead*POPULAR_COVERAGE: break

In [18]:
# Interaction indexes built from allRatings:
#   UsersPerItem[book] -> set of users with a training interaction for that book
#   ItemsPerUser[user] -> set of books that user has a training interaction with
UsersPerItem = defaultdict(set)
ItemsPerUser = defaultdict(set)
# The rating is unused here. It is not bound to the name `r`, because a later
# cell defines the predictor function r(u, b) and re-running this cell would
# otherwise overwrite that function.
for u,b,_rating in allRatings:
  UsersPerItem[b].add(u)
  ItemsPerUser[u].add(b)

In [None]:
##################################################
# Read prediction                                #
##################################################

In [19]:
def Jaccard(s1, s2):
  """Return the Jaccard similarity |s1 & s2| / |s1 | s2| of two sets.

  Args:
    s1, s2: sets to compare.

  Returns:
    A value in [0, 1]; 0 when both sets are empty (the ratio is undefined
    there, and the previous version raised ZeroDivisionError).
  """
  union = len(s1.union(s2))
  if union == 0:
    return 0
  return len(s1.intersection(s2)) / union

In [20]:
def r(u, b, threshold=0.025):
  """Predict whether user ``u`` would read book ``b``.

  Decision rule:
    1. A book in the popular set ``return1`` is always predicted as read.
    2. A user with no known books gets 0 for any non-popular book.
    3. Otherwise predict 1 if any other book the user has read has a Jaccard
       similarity (over the sets of users per book) above ``threshold``.

  Args:
    u: user id.
    b: book id.
    threshold: minimum Jaccard similarity that counts as a match
      (default 0.025, the value previously hard-coded).

  Returns:
    int: 1 or 0. The last branch used to return a bool, which was written to
    the predictions file as "True"/"False".
  """
  if b in return1:
    return 1
  # .get() avoids inserting empty entries into the defaultdict indexes for
  # users/books that only occur in the prediction pairs.
  user_books = ItemsPerUser.get(u, ())
  if not user_books:
    return 0
  book_users = UsersPerItem.get(b, set())
  for b_ in user_books:
    if b_ == b:
      continue
    # One sufficiently similar book is enough, so stop at the first match.
    if Jaccard(book_users, UsersPerItem.get(b_, set())) > threshold:
      return 1
  return 0

In [21]:
# Write a read / not-read label for every (user, book) pair in pairs_Read.csv.
# The context managers close both files even if a prediction raises part-way
# through; previously the input file was never closed.
with open("drive/MyDrive/CSE 258/predictions_Read.csv", 'w') as predictions, \
     open("drive/MyDrive/CSE 258/pairs_Read.csv") as pairs:
  for l in pairs:
    if l.startswith("userID"):
      #header
      predictions.write(l)
      continue
    u,b = l.strip().split(',')
    # int() keeps the label column 0/1 even when r() hands back a bool.
    predictions.write(u + ',' + b + "," + str(int(r(u,b))) + '\n')

In [None]:
##################################################
# Rating prediction                              #
##################################################

In [22]:
# Wrap the training interactions in a Surprise dataset and build a trainset from
# all of them (no hold-out split is made here).
df = pd.DataFrame(data=allRatings, columns=['user', 'item', 'rating'])
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(df.loc[:, ['user', 'item', 'rating']], reader)
trainset = data.build_full_trainset()

In [23]:
# 5-factor SVD model. random_state fixes the RNG Surprise uses to initialise the
# model, so a Restart & Run All reproduces the same fitted model and predictions.
algo = SVD(n_factors=5, random_state=0)
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7be71f7814b0>

In [24]:
# Write the model's estimated rating for every (user, item) pair in
# pairs_Rating.csv. The context managers close both files even if a prediction
# raises part-way through; previously the input file was never closed.
with open("drive/MyDrive/CSE 258/predictions_Rating.csv", 'w') as predictions, \
     open("drive/MyDrive/CSE 258/pairs_Rating.csv") as pairs:
  for l in pairs:
    if l.startswith("userID"):
      #header
      predictions.write(l)
      continue
    u,i = l.strip().split(',')
    predictions.write(u + ',' + i + "," + str(algo.predict(uid=u, iid=i).est) + "\n")