In [12]:
import os
import csv
import sys
import re
from surprise import Dataset
from surprise import Reader
import pandas as pd

from collections import defaultdict
import numpy as np


In [13]:
# Published Google Sheets CSV exports for the ratings and movies tables.
# Both are fetched over HTTP, so running these cells needs network access.
ratingsPath = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vQfeqh15cSYmWTpMlHnBidjNmNmVOnQ3uOLDoVJ_ijrOdE2ltOAOvTQIWIGrmTtOWagDoB7zX2j4pTx/pub?output=csv'
moviesPath = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vSdI3atwf1ZAUxuCmOFwfVtr8cEdMLvS0aZWB5T0Of3O6vTNlx_6OymOI9wkR3miwlrIW1lrIWdvUks/pub?output=csv'

# Load the ratings table and preview its first rows.
# NOTE(review): the preview shows a leading "Unnamed: 0" column (a saved row
# index) ahead of userId/movieId/rating/timestamp; code that indexes rows by
# position must account for it.
ratings = pd.read_csv(ratingsPath)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [14]:
# Load the movie metadata table and preview its first rows.
# NOTE(review): as with the ratings table, the preview shows a leading
# "Unnamed: 0" column ahead of movieId/title/genres.
movies = pd.read_csv(moviesPath)
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [25]:
# Select the columns by name so that the positional layout the helper
# functions rely on (0 = userId, 1 = movieId, 2 = rating, 3 = timestamp)
# holds whether or not the CSV's leading "Unnamed: 0" index column is still
# present in `ratings`. Taking `ratings.values` directly only produced this
# layout if that column had been dropped by an earlier, no-longer-visible step.
ratings_data = ratings[['userId', 'movieId', 'rating', 'timestamp']].values
ratings_data[0:5]

array([[1.00000000e+00, 1.00000000e+00, 4.00000000e+00, 9.64982703e+08],
       [1.00000000e+00, 3.00000000e+00, 4.00000000e+00, 9.64981247e+08],
       [1.00000000e+00, 6.00000000e+00, 4.00000000e+00, 9.64982224e+08],
       [1.00000000e+00, 4.70000000e+01, 5.00000000e+00, 9.64983815e+08],
       [1.00000000e+00, 5.00000000e+01, 5.00000000e+00, 9.64982931e+08]])

In [26]:
# Select the columns by name so that the positional layout the helper
# functions rely on (0 = movieId, 1 = title, 2 = genres) holds whether or not
# the CSV's leading "Unnamed: 0" index column is still present in `movies`.
# Taking `movies.values` directly only produced this layout if that column
# had been dropped by an earlier, no-longer-visible step.
movies_data = movies[['movieId', 'title', 'genres']].values
movies_data[0:5]

array([[1, 'Toy Story (1995)',
        'Adventure|Animation|Children|Comedy|Fantasy'],
       [2, 'Jumanji (1995)', 'Adventure|Children|Fantasy'],
       [3, 'Grumpier Old Men (1995)', 'Comedy|Romance'],
       [4, 'Waiting to Exhale (1995)', 'Comedy|Drama|Romance'],
       [5, 'Father of the Bride Part II (1995)', 'Comedy']], dtype=object)

In [27]:
# Two-way lookup tables between integer movie IDs (column 0 of movies_data)
# and titles (column 1). If a title occurs more than once, the last row with
# that title is the one kept in name_to_movieID.
movieID_to_name = {int(record[0]): record[1] for record in movies_data}
name_to_movieID = {record[1]: int(record[0]) for record in movies_data}

In [28]:
def getUserRatings(user):
    """Return every (movieID, rating) pair recorded for one user.

    Reads the notebook-level ``ratings_data`` rows, using column 0 as the
    user ID, column 1 as the movie ID and column 2 as the rating.

    Every row is examined. An earlier version stopped at the first row
    belonging to a different user once the target user had been seen, which
    silently dropped ratings whenever a user's rows were not stored
    contiguously.

    Args:
        user: Integer user ID to look up.

    Returns:
        A list of ``(int movieID, float rating)`` tuples in row order; an
        empty list if the user has no rows.
    """
    return [
        (int(row[1]), float(row[2]))
        for row in ratings_data
        if int(row[0]) == user
    ]
# Preview the first five (movieID, rating) pairs returned for user 1.
getUserRatings(1)[0:5]

[(1, 4.0), (3, 4.0), (6, 4.0), (47, 5.0), (50, 5.0)]

In [35]:
def getPopularityRanks():
    """Rank movies by how many ratings each one received.

    Tallies the rows of the notebook-level ``ratings_data`` per movie ID
    (column 1), then numbers the movies 1, 2, 3, ... from the most-rated
    downwards. The sort is stable, so movies with equal counts keep the order
    in which they first appear in the data.

    Returns:
        A ``defaultdict(int)`` mapping movie ID -> rank, with keys inserted in
        rank order. Looking up a movie that has no ratings yields 0.
    """
    counts = defaultdict(int)
    for record in ratings_data:
        counts[int(record[1])] += 1

    # Sort the movie IDs themselves, keyed by their tally, most-rated first.
    most_rated_first = sorted(counts, key=counts.get, reverse=True)

    rankings = defaultdict(int)
    for position, movie_id in enumerate(most_rated_first, start=1):
        rankings[movie_id] = position
    return rankings
# Build the rank table once, then preview its first five entries
# (the first five keys in the dictionary's insertion order).
rankingsDict = getPopularityRanks()
{k: rankingsDict[k] for k in list(rankingsDict)[:5]}

{356: 1, 318: 2, 296: 3, 593: 4, 2571: 5}

In [38]:
def getGenres():
    """Encode each movie's genres as a fixed-length list of 0/1 flags.

    Reads the notebook-level ``movies_data`` rows, using column 0 as the
    movie ID and column 2 as a ``'|'``-separated genre string. Each distinct
    genre label is assigned a position in first-seen order; a movie's vector
    has a 1 at the position of every genre it lists and 0 elsewhere.

    Returns:
        A ``defaultdict(list)`` mapping movie ID -> list of ints whose length
        is the number of distinct genre labels. Looking up an unknown movie
        yields an empty list.
    """
    genre_position = {}  # genre label -> index into the flag vector
    encoded = defaultdict(list)

    # Pass 1: translate each genre string into a list of positions. The vector
    # width is only known once every row has been seen.
    for record in movies_data:
        movie_id = int(record[0])
        labels = record[2].split('|')
        encoded[movie_id] = [
            genre_position.setdefault(label, len(genre_position))
            for label in labels
        ]

    # Pass 2: expand each position list into a 0/1 vector of the final width.
    # Only values of existing keys are replaced, so iterating is safe.
    width = len(genre_position)
    for movie_id, positions in encoded.items():
        flags = [0] * width
        for position in positions:
            flags[position] = 1
        encoded[movie_id] = flags

    return encoded
# Build the genre vectors once, then preview the first five movies.
genres = getGenres()
{k: genres[k] for k in list(genres)[:5]}

{1: [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 2: [1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 3: [0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 4: [0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 5: [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

In [39]:
def getYears():
    """Extract the release year from the end of each movie title.

    Reads the notebook-level ``movies_data`` rows, using column 0 as the
    movie ID and column 1 as the title. A year is recognised only as four
    digits in parentheses at the very end of the title (trailing whitespace
    allowed), e.g. ``'Toy Story (1995)'``.

    Returns:
        A ``defaultdict(int)`` mapping movie ID -> year for the titles that
        carry one. Titles without a trailing year are not stored, so looking
        them up yields 0.
    """
    # The year group is optional, so search() always finds a match (at worst
    # the empty string at the end of the title) and never returns None.
    trailing_year = re.compile(r"(?:\((\d{4})\))?\s*$")
    years = defaultdict(int)

    for record in movies_data:
        movie_id = int(record[0])
        found = trailing_year.search(record[1]).group(1)
        if found:
            years[movie_id] = int(found)
    return years
# Build the year table once, then preview the first five movies.
years = getYears()
{k: years[k] for k in list(years)[:5]}


{1: 1995, 2: 1995, 3: 1995, 4: 1995, 5: 1995}

In [40]:
def getMovieName(movieID):
    """Return the title stored for ``movieID``, or ``""`` if it is unknown.

    Looks the ID up in the notebook-level ``movieID_to_name`` dictionary.
    """
    return movieID_to_name.get(movieID, "")
    
# Look up the title stored for movie ID 1.
getMovieName(1)

'Toy Story (1995)'

In [41]:
def getMovieID(movieName):
    """Return the movie ID stored for ``movieName``, or 0 if it is unknown.

    Looks the title up in the notebook-level ``name_to_movieID`` dictionary;
    the title must match a stored key exactly.
    """
    return name_to_movieID.get(movieName, 0)
# NOTE(review): getMovieID looks titles up by name, but this call passes the
# integer 1, so the lookup misses and the fallback value 0 is returned. A
# title string such as 'Toy Story (1995)' was probably intended — confirm.
getMovieID(1)

0