In [12]:

import re
from surprise import Dataset
from surprise import Reader
import pandas as pd

from collections import defaultdict
import numpy as np

import heapq
from collections import defaultdict
from operator import itemgetter

In [13]:
ratingsPath = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vQfeqh15cSYmWTpMlHnBidjNmNmVOnQ3uOLDoVJ_ijrOdE2ltOAOvTQIWIGrmTtOWagDoB7zX2j4pTx/pub?output=csv'
moviesPath = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vSdI3atwf1ZAUxuCmOFwfVtr8cEdMLvS0aZWB5T0Of3O6vTNlx_6OymOI9wkR3miwlrIW1lrIWdvUks/pub?output=csv'

ratings = pd.read_csv(ratingsPath)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [14]:
movies = pd.read_csv(moviesPath)
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [25]:
ratings_data = ratings.values
ratings_data[0:5]

array([[1.00000000e+00, 1.00000000e+00, 4.00000000e+00, 9.64982703e+08],
       [1.00000000e+00, 3.00000000e+00, 4.00000000e+00, 9.64981247e+08],
       [1.00000000e+00, 6.00000000e+00, 4.00000000e+00, 9.64982224e+08],
       [1.00000000e+00, 4.70000000e+01, 5.00000000e+00, 9.64983815e+08],
       [1.00000000e+00, 5.00000000e+01, 5.00000000e+00, 9.64982931e+08]])

In [26]:
movies_data = movies.values
movies_data[0:5]

array([[1, 'Toy Story (1995)',
        'Adventure|Animation|Children|Comedy|Fantasy'],
       [2, 'Jumanji (1995)', 'Adventure|Children|Fantasy'],
       [3, 'Grumpier Old Men (1995)', 'Comedy|Romance'],
       [4, 'Waiting to Exhale (1995)', 'Comedy|Drama|Romance'],
       [5, 'Father of the Bride Part II (1995)', 'Comedy']], dtype=object)

In [27]:
movieID_to_name = {}
name_to_movieID = {}

for row in movies_data:
    movieID = int(row[0])
    movieName = row[1]
    movieID_to_name[movieID] = movieName
    name_to_movieID[movieName] = movieID

#print(movieID_to_name)
#print(name_to_movieID)

In [28]:
def getUserRatings(user):
    userRatings = []
    hitUser = False

    for row in ratings_data:
        userID = int(row[0])
        if (user == userID):
            movieID = int(row[1])
            rating = float(row[2])
            userRatings.append((movieID, rating))
            hitUser = True
        if (hitUser and (user != userID)):
            break

    return userRatings
getUserRatings(1)[0:5]

[(1, 4.0), (3, 4.0), (6, 4.0), (47, 5.0), (50, 5.0)]

In [35]:
def getPopularityRanks():
    ratings = defaultdict(int)
    rankings = defaultdict(int)

    for row in ratings_data:
        movieID = int(row[1])
        ratings[movieID] += 1
    rank = 1
    for movieID, ratingCount in sorted(ratings.items(), key=lambda x: x[1], reverse=True):
        rankings[movieID] = rank
        rank += 1
    return rankings
rankingsDict = getPopularityRanks()
{k: rankingsDict[k] for k in list(rankingsDict)[:5]}

{356: 1, 318: 2, 296: 3, 593: 4, 2571: 5}

In [38]:
def getGenres():
    genres = defaultdict(list)
    genreIDs = {}
    maxGenreID = 0


    for row in movies_data:
        movieID = int(row[0])
        genreList = row[2].split('|')
        genreIDList = []
        for genre in genreList:
            if genre in genreIDs:
                genreID = genreIDs[genre]
            else:
                genreID = maxGenreID
                genreIDs[genre] = genreID
                maxGenreID += 1
            genreIDList.append(genreID)
        genres[movieID] = genreIDList
    # Convert integer-encoded genre lists to bitfields that we can treat as vectors
    for (movieID, genreIDList) in genres.items():
        bitfield = [0] * maxGenreID
        for genreID in genreIDList:
            bitfield[genreID] = 1
        genres[movieID] = bitfield            
    
    return genres
genres = getGenres()
{k: genres[k] for k in list(genres)[:5]}

{1: [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 2: [1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 3: [0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 4: [0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 5: [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

In [39]:
def getYears():
    p = re.compile(r"(?:\((\d{4})\))?\s*$")
    years = defaultdict(int)

    for row in movies_data:
        movieID = int(row[0])
        title = row[1]
        m = p.search(title)
        year = m.group(1)
        if year:
            years[movieID] = int(year)
    return years
years = getYears()
{k: years[k] for k in list(years)[:5]}


{1: 1995, 2: 1995, 3: 1995, 4: 1995, 5: 1995}

In [40]:
def getMovieName(movieID):
    if movieID in movieID_to_name:
        return movieID_to_name[movieID]
    else:
        return ""
    
getMovieName(1)

'Toy Story (1995)'

In [41]:
def getMovieID( movieName):
    if movieName in name_to_movieID:
        return name_to_movieID[movieName]
    else:
        return 0
getMovieID(1)

0

In [79]:
from surprise import KNNBasic
import heapq
from collections import defaultdict
from operator import itemgetter

k = 10


In [84]:
reader = Reader(rating_scale=(1, 5))

data = Dataset.load_from_df(ratings[["userId", "movieId","rating"]], reader)

trainSet = data.build_full_trainset()

trainSet.all_items()

range(0, 9724)

In [90]:
testUserInnerID = None

for i in range(0,1000):
    try:
       testUserInnerID = trainSet.to_inner_uid(i) 
    except:
        print()
    
    if testUserInnerID != None:
        print(i, testUserInnerID)



1 0
2 1
3 2
4 3
5 4
6 5
7 6
8 7
9 8
10 9
11 10
12 11
13 12
14 13
15 14
16 15
17 16
18 17
19 18
20 19
21 20
22 21
23 22
24 23
25 24
26 25
27 26
28 27
29 28
30 29
31 30
32 31
33 32
34 33
35 34
36 35
37 36
38 37
39 38
40 39
41 40
42 41
43 42
44 43
45 44
46 45
47 46
48 47
49 48
50 49
51 50
52 51
53 52
54 53
55 54
56 55
57 56
58 57
59 58
60 59
61 60
62 61
63 62
64 63
65 64
66 65
67 66
68 67
69 68
70 69
71 70
72 71
73 72
74 73
75 74
76 75
77 76
78 77
79 78
80 79
81 80
82 81
83 82
84 83
85 84
86 85
87 86
88 87
89 88
90 89
91 90
92 91
93 92
94 93
95 94
96 95
97 96
98 97
99 98
100 99
101 100
102 101
103 102
104 103
105 104
106 105
107 106
108 107
109 108
110 109
111 110
112 111
113 112
114 113
115 114
116 115
117 116
118 117
119 118
120 119
121 120
122 121
123 122
124 123
125 124
126 125
127 126
128 127
129 128
130 129
131 130
132 131
133 132
134 133
135 134
136 135
137 136
138 137
139 138
140 139
141 140
142 141
143 142
144 143
145 144
146 145
147 146
148 147
149 148
150 149
151 150
152 151
1

In [91]:
sim_options = {'name': 'cosine',
               'user_based': True
               }

model = KNNBasic(sim_options=sim_options)
model.fit(trainSet)
simsMatrix = model.compute_similarities()

# Get top N similar users to our test subject
# (Alternate approach would be to select users up to some similarity threshold - try it!)

similarityRow = simsMatrix[testUserInnerID]

similarUsers = []
for innerID, score in enumerate(similarityRow):
    if (innerID != testUserInnerID):
        similarUsers.append( (innerID, score) )

kNeighbors = heapq.nlargest(k, similarUsers, key=lambda t: t[1])

# Get the stuff they rated, and add up ratings for each item, weighted by user similarity
candidates = defaultdict(float)
for similarUser in kNeighbors:
    innerID = similarUser[0]
    userSimilarityScore = similarUser[1]
    theirRatings = trainSet.ur[innerID]
    for rating in theirRatings:
        candidates[rating[0]] += (rating[1] / 5.0) * userSimilarityScore
    
# Build a dictionary of stuff the user has already seen
watched = {}
for itemID, rating in trainSet.ur[testUserInnerID]:
    watched[itemID] = 1
    
# Get top-rated items from similar users:
pos = 0
for itemID, ratingSum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
    if not itemID in watched:
        movieID = trainSet.to_raw_iid(itemID)
        print(getMovieName(int(movieID)), ratingSum)
        pos += 1
        if (pos > 10):
            break

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Young Frankenstein (1974) 1.9931179103749337
10 Things I Hate About You (1999) 1.9909456490341095
Go (1999) 1.8928823777339514
When a Man Loves a Woman (1994) 1.798894263466008
To Wong Foo, Thanks for Everything! Julie Newmar (1995) 1.7959493410609457
Island of Dr. Moreau, The (1996) 1.6964556734283276
Thin Red Line, The (1998) 1.6937470934662184
Arrival, The (1996) 1.598728063579491
Bowfinger (1999) 1.5940840825379714
Legends of the Fall (1994) 1.4951763004734298
Back to the Future Part III (1990) 1.4951763004734295
