In [1]:
import numpy as np
import pandas as pd 

In [2]:
# !curl -O http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
# !unzip ml-latest-small.zip

In [3]:
cd /Users/wannjiun/Desktop/nycdsa/project_5_recommender/ml-latest-small

/Users/wannjiun/Desktop/nycdsa/project_5_recommender/ml-latest-small


In [4]:
ls

README.txt   links.csv    movies.csv   ratings.csv  tags.csv


In [5]:
!head links.csv
!echo # line break
!head movies.csv 
!echo # line break
!head ratings.csv
!echo # line break
!head tags.csv

movieId,imdbId,tmdbId
1,0114709,862
2,0113497,8844
3,0113228,15602
4,0114885,31357
5,0113041,11862
6,0113277,949
7,0114319,11860
8,0112302,45325
9,0114576,9091

movieId,title,genres
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
6,Heat (1995),Action|Crime|Thriller
7,Sabrina (1995),Comedy|Romance
8,Tom and Huck (1995),Adventure|Children
9,Sudden Death (1995),Action

userId,movieId,rating,timestamp
1,31,2.5,1260759144
1,1029,3.0,1260759179
1,1061,3.0,1260759182
1,1129,2.0,1260759185
1,1172,4.0,1260759205
1,1263,2.0,1260759151
1,1287,2.0,1260759187
1,1293,2.0,1260759148
1,1339,3.5,1260759125

userId,movieId,tag,timestamp
15,339,sandra 'boring' bullock,1138537770
15,1955,dentist,1193435061
15,7478,Cambodia,1170560997
15,32892,Russian,1170626366
15,34162,forgettable,1141391765
15,35957,short,114139187

In [3]:
df = pd.read_csv('ratings.csv', sep=',')
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [4]:
df_id = pd.read_csv('links.csv', sep=',')
df_id.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [5]:
df = pd.merge(df, df_id, on=['movieId'])
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,imdbId,tmdbId
0,1,31,2.5,1260759144,112792,9909.0
1,7,31,3.0,851868750,112792,9909.0
2,31,31,4.0,1273541953,112792,9909.0
3,32,31,4.0,834828440,112792,9909.0
4,36,31,3.0,847057202,112792,9909.0


In [6]:
num_users = df.userId.unique().shape[0]
num_movies = df.movieId.unique().shape[0]
print str(num_users) + ' users'
print str(num_movies) + ' movies'
print 'Max user id: ' + str(max(df.userId))  
print 'Max movie id: ' + str(max(df.movieId))  

671 users
9066 movies
Max user id: 671
Max movie id: 163949


In [7]:
ratings = np.zeros((num_users, max(df.movieId)))
for row in df.itertuples():
    ratings[row[1]-1, row[2]-1] = row[3]
ratings = ratings[:,:9000]
ratings.shape
## need to find a way to include all movies

(671, 9000)

In [8]:
sparsity = float(len(ratings.nonzero()[0]))
sparsity /= (ratings.shape[0] * ratings.shape[1])
sparsity *= 100
print 'Sparsity: {:3.2f}%'.format(sparsity)

Sparsity: 1.40%


In [9]:
def train_test_split(ratings):
    test = np.zeros(ratings.shape)
    train = ratings.copy()
    for user in xrange(ratings.shape[0]):
        test_ratings = np.random.choice(
            ratings[user, :].nonzero()[0], 
            size=10, 
            replace=True)
        train[user, test_ratings] = 0.0
        test[user, test_ratings] = ratings[user, test_ratings]
        
    # Test and training are truly disjoint
    assert(np.all((train * test) == 0)) 
    return train, test

In [10]:
train, test = train_test_split(ratings)

In [11]:
def fast_similarity(ratings, kind='user', epsilon=1e-9):
    # epsilon -> small number for handling dived-by-zero errors
    if kind == 'user':
        sim = ratings.dot(ratings.T) + epsilon
    elif kind == 'movie':
        sim = ratings.T.dot(ratings) + epsilon
    norms = np.array([np.sqrt(np.diagonal(sim))])
    return (sim / norms / norms.T)

In [12]:
%timeit fast_similarity(train, kind='user')

10 loops, best of 3: 41.1 ms per loop


In [16]:
user_similarity = fast_similarity(train, kind='user')
movie_similarity = fast_similarity(train, kind='movie')
print movie_similarity[:5, :5]
movie_similarity.shape

[[ 1.          0.39363361  0.27803334  0.14285441  0.22600865]
 [ 0.39363361  1.          0.22662766  0.17497687  0.28149169]
 [ 0.27803334  0.22662766  1.          0.17723886  0.31053007]
 [ 0.14285441  0.17497687  0.17723886  1.          0.14308289]
 [ 0.22600865  0.28149169  0.31053007  0.14308289  1.        ]]


(9000, 9000)

In [13]:
def predict_fast_simple(ratings, similarity, kind='user'):
    if kind == 'user':
        return similarity.dot(ratings) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif kind == 'movie':
        return ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])

In [14]:
from sklearn.metrics import mean_squared_error

def get_mse(pred, actual):
    # Ignore nonzero terms.
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return mean_squared_error(pred, actual)

In [17]:
item_prediction = predict_fast_simple(train, movie_similarity, kind='movie')
user_prediction = predict_fast_simple(train, user_similarity, kind='user')

print 'User-based CF MSE: ' + str(get_mse(user_prediction, test))
print 'Item-based CF MSE: ' + str(get_mse(item_prediction, test))

User-based CF MSE: 9.79305668292
Item-based CF MSE: 13.3171073128


In [18]:
import requests
import json

from IPython.display import Image
from IPython.display import display
from IPython.display import HTML

# Get base url filepath structure. w185 corresponds to size of movie poster.
headers = {'Accept': 'application/json'}
payload = {'api_key': 'bb3beb7ec7af6d1c0c23ca7381b62a89'} 
response = requests.get("http://api.themoviedb.org/3/configuration", params=payload, headers=headers)
response = json.loads(response.text)
base_url = response['images']['base_url'] + 'w185'

def get_poster(imdbid, base_url):
    # Get IMDB movie ID
    movie_id = "tt0" + str(imdbid) 
    
    # Query themoviedb.org API for movie poster path.
    movie_url = 'http://api.themoviedb.org/3/movie/{:}/images'.format(movie_id)
    headers = {'Accept': 'application/json'}
    payload = {'api_key': 'bb3beb7ec7af6d1c0c23ca7381b62a89'} 
    response = requests.get(movie_url, params=payload, headers=headers)
    try:
        file_path = json.loads(response.text)['posters'][0]['file_path']
    except:
        file_path = ""
        
    return (base_url + file_path, imdbid)  

In [19]:
# Load in movie data
idx_to_movie = {}
for row in df_id.itertuples():
    idx_to_movie[row[1]-1] = row[2]
idx_to_movie    
        
def top_k_movies(similarity, mapper, movie_idx, k=6):
    return [mapper[x] for x in np.argsort(similarity[movie_idx,:])[:-k-1:-1]]

In [20]:
idx = 5
movies = top_k_movies(movie_similarity, idx_to_movie, idx)
movies = filter(lambda imdb: len(str(imdb)) == 6, movies)
movies = movies[:5]

In [22]:
n_display = 5
URL = [0]*n_display
IMDB = [0]*n_display
i = 0
for movie in movies:
    (URL[i], IMDB[i]) = get_poster(movie, base_url)
    i += 1 
    
images = ''
for i in range(n_display):
    images += "<img style='width: 100px; margin: 0px; \
                float: left; border: 1px solid black;' src='%s' />" \
                % URL[i]

display(HTML(images))    