In [1]:
import pandas as pd
import numpy as np
from surprise import SVD
from surprise.model_selection import cross_validate, train_test_split
from surprise import Dataset
from surprise import Reader
from surprise.prediction_algorithms.knns import KNNWithZScore, KNNBaseline
from surprise.prediction_algorithms.matrix_factorization import NMF
from pandas.io.json import json_normalize
from pymongo import MongoClient

In [2]:
def convert_ids(ids_in_csv):
    return pd.to_numeric(ids_in_csv, errors='coerce').astype('int64')

In [4]:
ratings_df = pd.read_csv('../data/the-movies-dataset/ratings_small.csv')
movies_df = pd.read_csv('../data/the-movies-dataset/movies_metadata.csv'
                        , converters={'id': lambda x: convert_ids(x), 'imdb_id': lambda x: convert_ids(x)}
                       ,usecols=['id', 'original_title', 'belongs_to_collection'
                                 , 'budget', 'genres', 'homepage'
                                 ,'imdb_id', 'overview', 'popularity', 'poster_path'
                                 , 'production_companies','release_date', 'revenue', 'runtime',
                                 'spoken_languages', 'status', 'tagline', 'title', 'video',
                                 'vote_average', 'vote_count'])

In [21]:
###May need Fuzzy matching, but for now:
movies_df = movies_df[movies_df.spoken_languages == """[{'iso_639_1': 'en', 'name': 'English'}]"""]


ratings_with_movie_names = ratings_df.merge(movies_df[['id', 'original_title']], how='left', left_on='movieId', right_on='id')
ratings_with_movie_names = ratings_with_movie_names[ratings_with_movie_names.original_title.isnull() == False]

In [22]:
algo = SVD(verbose=True)
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(ratings_with_movie_names[['userId', 'movieId', 'rating']], reader)
trainset = data.build_full_trainset()

In [23]:
algo.fit(trainset)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1dcfc7e48>

In [24]:
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, n_jobs=-1, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8972  0.9026  0.8992  0.8975  0.8952  0.8983  0.0025  
MAE (testset)     0.6934  0.6957  0.6947  0.6914  0.6931  0.6937  0.0015  
Fit time          0.82    0.83    0.82    0.79    0.78    0.81    0.02    
Test time         0.02    0.02    0.02    0.02    0.02    0.02    0.00    


{'test_rmse': array([0.89718608, 0.9026489 , 0.89917479, 0.89753453, 0.89516795]),
 'test_mae': array([0.69338561, 0.69567222, 0.69471855, 0.69137792, 0.69314943]),
 'fit_time': (0.8225510120391846,
  0.8273391723632812,
  0.8201608657836914,
  0.7931690216064453,
  0.7766008377075195),
 'test_time': (0.023442745208740234,
  0.023396968841552734,
  0.022737741470336914,
  0.022162914276123047,
  0.0221860408782959)}

In [25]:
from collections import defaultdict
def get_top_n(predictions, n=10):
    '''Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    '''

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n


In [26]:
testset = trainset.build_anti_testset()

In [27]:
predictions = algo.test(testset)

In [28]:
top_n = get_top_n(predictions)

In [53]:
predicted_movies_by_name = defaultdict(list)

for key, value in top_n.items():
    predicted_movies_by_name[key] = [get_movie_name(mov_id[0]) for mov_id in value]

In [29]:
my_n = top_n[1]
my_n

[(858, 4.297286367294758),
 (913, 4.089548032465652),
 (4226, 4.044374780143709),
 (926, 4.035205935163958),
 (296, 4.013958868571714),
 (318, 3.9832182557981923),
 (1945, 3.964012402289571),
 (3088, 3.9619225813821988),
 (260, 3.942522271972541),
 (898, 3.9411604573913017)]

In [30]:
ratings_with_movie_names.head()

Unnamed: 0,userId,movieId,rating,timestamp,id,original_title
10,1,1371,2.5,1260759135,1371.0,Rocky III
11,1,1405,1.0,1260759203,1405.0,Greed
13,1,2105,4.0,1260759139,2105.0,American Pie
15,1,2193,2.0,1260759198,2193.0,My Tutor
16,1,2294,2.0,1260759108,2294.0,Jay and Silent Bob Strike Back


In [43]:
ratings_with_movie_names[ratings_with_movie_names.id == my_n[1][0]]['original_title'].iloc[0]

'The Thomas Crown Affair'

In [48]:
def get_movie_name(movie_id):
    return ratings_with_movie_names[ratings_with_movie_names.id == movie_id]['original_title'].iloc[0]

In [49]:
get_movie_name(858)

'Sleepless in Seattle'

In [50]:
[get_movie_name(mov_id[0]) for mov_id in my_n]

['Sleepless in Seattle',
 'The Thomas Crown Affair',
 'Shriek If You Know What I Did Last Friday the Thirteenth',
 'Galaxy Quest',
 'Terminator 3: Rise of the Machines',
 'The Million Dollar Hotel',
 'Nell',
 'My Darling Clementine',
 'The 39 Steps',
 'Birdman of Alcatraz']

In [41]:
ratings_with_movie_names[ratings_with_movie_names.id == 858]

Unnamed: 0,userId,movieId,rating,timestamp,id,original_title
180,4,858,5.0,949779022,858.0,Sleepless in Seattle
370,5,858,2.5,1163373651,858.0,Sleepless in Seattle
602,8,858,5.0,1154400181,858.0,Sleepless in Seattle
1140,15,858,5.0,997938703,858.0,Sleepless in Seattle
2730,17,858,5.0,1127469000,858.0,Sleepless in Seattle
3324,19,858,5.0,855191478,858.0,Sleepless in Seattle
3564,20,858,2.0,1238729822,858.0,Sleepless in Seattle
3678,21,858,4.0,853850728,858.0,Sleepless in Seattle
3823,22,858,4.0,1131662354,858.0,Sleepless in Seattle
4093,23,858,5.0,1148670263,858.0,Sleepless in Seattle


In [54]:
predicted_movies_by_name

defaultdict(list,
            {1: ['Sleepless in Seattle',
              'The Thomas Crown Affair',
              'Shriek If You Know What I Did Last Friday the Thirteenth',
              'Galaxy Quest',
              'Terminator 3: Rise of the Machines',
              'The Million Dollar Hotel',
              'Nell',
              'My Darling Clementine',
              'The 39 Steps',
              'Birdman of Alcatraz'],
             2: ['Lonely Hearts',
              'Sleepless in Seattle',
              'Straw Dogs',
              'The Thomas Crown Affair',
              'Galaxy Quest',
              'Hard Target',
              'Point Break',
              'The Million Dollar Hotel',
              'License to Wed',
              'Nell'],
             3: ['Sleepless in Seattle',
              'Lonely Hearts',
              'The Thomas Crown Affair',
              'Galaxy Quest',
              'Hard Target',
              'While You Were Sleeping',
              'Shriek If You Know 

In [76]:

def print_user_prediction(userId, predictions_dict):

    users_viewed_movies = ratings_with_movie_names[ratings_with_movie_names['userId'] == userId][['rating', 'original_title']]
    print(f'User {userId} has viewed the following movies:\n')

    for row in users_viewed_movies.itertuples():
        rating = row[1]
        original_title = row[2]
        print(f'\t{original_title}, Rating: {rating}')
        
    print(f'\nThe following movies are recommended for User {userId}\n')
    recommended_movies = [get_movie_name(mov_id[0]) for mov_id in predictions_dict[userId]]
    
    for movie in recommended_movies:
        print(f'\t{movie}')

In [90]:
print_user_prediction(321, top_n)

User 321 has viewed the following movies:

	The Endless Summer, Rating: 5.0
	Rumble Fish, Rating: 5.0
	A River Runs Through It, Rating: 4.0
	Terminator 3: Rise of the Machines, Rating: 5.0
	The Poseidon Adventure, Rating: 5.0
	Apollo 13, Rating: 4.0
	Men in Black II, Rating: 4.0
	A View to a Kill, Rating: 4.0
	Videodrome, Rating: 4.0
	The Thomas Crown Affair, Rating: 5.0
	Hollywoodland, Rating: 4.0
	Bridge to Terabithia, Rating: 4.0
	Rope, Rating: 3.0
	A Time to Kill, Rating: 4.0
	Little Buddha, Rating: 3.0
	Jungle Fever, Rating: 4.0
	Return of the Jedi, Rating: 2.0
	Don Juan DeMarco, Rating: 2.0
	Twin Peaks: Fire Walk with Me, Rating: 2.0
	My Name Is Bruce, Rating: 2.0
	The Bachelor, Rating: 3.0
	Cold Mountain, Rating: 4.0
	In the Name of the King: A Dungeon Siege Tale, Rating: 2.0
	Short Circuit, Rating: 1.0

The following movies are recommended for User 321

	Sleepless in Seattle
	Galaxy Quest
	Birdman of Alcatraz
	Lonely Hearts
	The Million Dollar Hotel
	Shriek If You Know What I D