In [24]:
# Matrix-factorization (SVD) movie recommender system built on MovieLens ratings
# Template adapted from https://github.com/khanhnamle1994/movielens/blob/master/SVD_Model.ipynb

# Import libraries
import numpy as np
import pandas as pd

# Load the MovieLens "latest-small" ratings and movie metadata.
# Per the dataset README the ml-latest-small CSV files are comma-separated and
# UTF-8 encoded; decoding them as latin-1 garbles every non-ASCII movie title
# (accented letters become two-character sequences), so read them as UTF-8.
ratings = pd.read_csv('ml-latest-small/ratings.csv', sep=',', encoding='utf-8')
movies = pd.read_csv('ml-latest-small/movies.csv', sep=',', encoding='utf-8')

# Alternative datasets previously used with this template (kept for reference;
# adjust the paths and separators to match your local copy):
# ratings = pd.read_csv('ml-25m/ratings.csv', sep='\t')
# users = pd.read_csv('ml-1m/users.csv', sep='\t', encoding='latin-1')
# users = pd.read_csv('ml-25m/users.csv', sep='\t')
# movies = pd.read_csv('ml-25m/movies.csv', sep = '\t')

In [26]:
# Preview the first rows of the movie metadata (movieId, title, genres)
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [27]:
# Count the distinct users and movies that appear in the ratings table
n_users = len(ratings['userId'].unique())
n_movies = len(ratings['movieId'].unique())
print(f'Number of users = {n_users} | Number of movies = {n_movies}')

Number of users = 610 | Number of movies = 9724


In [28]:
# Build the user-by-movie ratings matrix: one row per userId, one column per
# movieId; user/movie pairs that have no rating are filled with 0.
Ratings = (
    ratings
    .pivot(index='userId', columns='movieId', values='rating')
    .fillna(0)
)
Ratings.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
# Centre each user's row on that user's mean before factorizing.
# Note: the mean is taken over every column of the row, so the zeros standing
# in for unrated movies are included in it.
R = Ratings.to_numpy()
user_ratings_mean = R.mean(axis=1)
Ratings_demeaned = R - user_ratings_mean[:, np.newaxis]

In [36]:
# Checking the sparsity of the dataset: the share of user/movie pairs with no rating
sparsity = round(1.0 - len(ratings) / float(n_users * n_movies), 3)
# The ratings loaded in this notebook come from ml-latest-small, not MovieLens 1M,
# so the message names that dataset. Formatting to one decimal place also avoids
# float artefacts (e.g. 98.30000000000001) that str(sparsity * 100) can produce.
print(f'The sparsity level of the MovieLens ml-latest-small dataset is {sparsity * 100:.1f}%')

The sparsity level of MovieLens1M dataset is 98.3%


In [37]:
# SVD set up: truncated SVD of the de-meaned ratings matrix
from scipy.sparse.linalg import svds

N_LATENT_FACTORS = 50  # number of singular values/vectors to keep
U, sigma, Vt = svds(Ratings_demeaned, k=N_LATENT_FACTORS)

In [38]:
# Matrix form of the diagonal.
# `svds` hands back the singular values as a 1-D array, while the reconstruction
# cell needs them as a diagonal matrix. Guard on the dimensionality so that
# re-running this cell is harmless: np.diag applied to an already-2-D matrix
# would extract its diagonal back into a 1-D array and break the matrix product
# in the following cell.
if sigma.ndim == 1:
    sigma = np.diag(sigma)

In [39]:
# Rebuild the low-rank approximation from the SVD factors, then add each
# user's mean back onto their row to undo the earlier centring.
low_rank_approximation = U @ sigma @ Vt
all_user_predicted_ratings = low_rank_approximation + user_ratings_mean[:, np.newaxis]

In [40]:
# Wrap the predicted ratings in a DataFrame whose columns carry the movieId
# labels. Rows keep the default 0-based positional index, not the userId.
preds = pd.DataFrame(data=all_user_predicted_ratings, columns=Ratings.columns)
preds.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
0,2.167328,0.402751,0.840184,-0.076281,-0.551337,2.504091,-0.890114,-0.026443,0.196974,1.593259,...,-0.023453,-0.019967,-0.026939,-0.026939,-0.023453,-0.026939,-0.023453,-0.023453,-0.023453,-0.058732
1,0.211459,0.006658,0.033455,0.017419,0.18343,-0.062473,0.083037,0.024158,0.04933,-0.15253,...,0.019498,0.016777,0.022219,0.022219,0.019498,0.022219,0.019498,0.019498,0.019498,0.032281
2,0.003588,0.030518,0.046393,0.008176,-0.006247,0.107328,-0.012416,0.003779,0.007297,-0.059362,...,0.005909,0.006209,0.00561,0.00561,0.005909,0.00561,0.005909,0.005909,0.005909,0.008004
3,2.051549,-0.387104,-0.252199,0.087562,0.130465,0.27021,0.477835,0.040313,0.025858,-0.017365,...,0.004836,0.004172,0.0055,0.0055,0.004836,0.0055,0.004836,0.004836,0.004836,-0.023311
4,1.344738,0.778511,0.065749,0.111744,0.273144,0.584426,0.25493,0.128788,-0.085541,1.023455,...,-0.008042,-0.007419,-0.008664,-0.008664,-0.008042,-0.008664,-0.008042,-0.008042,-0.008042,-0.010127


In [49]:
def recommend_movies(predictions, userID, movies, original_ratings, num_recommendations):
    """Return a user's rated movies and the top predicted movies they have not rated.

    Parameters
    ----------
    predictions : pandas.DataFrame
        Predicted ratings, one row per user and one column per movieId. The row
        for ``userID`` is looked up by position as row ``userID - 1``.
        NOTE(review): this assumes user IDs are the contiguous integers 1..n in
        row order -- confirm this holds for the data being used.
    userID : int
        1-based ID of the user to produce recommendations for.
    movies : pandas.DataFrame
        Movie metadata; must contain a ``movieId`` column.
    original_ratings : pandas.DataFrame
        Observed ratings with ``userId``, ``movieId`` and ``rating`` columns.
    num_recommendations : int
        Number of recommendations to return.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``user_full``: the user's existing ratings joined with the movie
        metadata, sorted by rating (highest first).
        ``recommendations``: the ``num_recommendations`` not-yet-rated movies
        with the highest predicted rating, in descending order of prediction,
        carrying only the columns of ``movies``.

    Raises
    ------
    IndexError
        If ``userID`` is smaller than 1 or beyond the last row of ``predictions``.
    """
    # A userID below 1 would become a negative position and silently select a
    # different user's row from the end of the table, so reject it explicitly.
    if userID < 1:
        raise IndexError('userID must be >= 1, got {0}'.format(userID))

    # Get and sort the user's predictions (user IDs start at 1, rows at 0).
    # Use the `predictions` argument rather than a notebook-level global, and
    # label the result explicitly so the merge below does not depend on what
    # the row or column index happens to be called.
    user_row_number = userID - 1
    user_predictions = (
        predictions.iloc[user_row_number]
        .sort_values(ascending=False)
        .rename('Predictions')
        .rename_axis('movieId')
        .reset_index()
    )

    # Get the user's data and merge in the movie information.
    user_data = original_ratings[original_ratings.userId == userID]
    user_full = (
        user_data
        .merge(movies, how='left', on='movieId')
        .sort_values(['rating'], ascending=False)
    )

    print ('User {0} has already rated {1} movies.'.format(userID, user_full.shape[0]))
    print ('Recommending highest {0} predicted ratings movies not already rated.'.format(num_recommendations))

    # Recommend the highest predicted rating movies that the user hasn't rated yet.
    # The helper 'Predictions' column is only used for ordering and is dropped
    # so the result has exactly the columns of `movies`.
    unseen_movies = movies[~movies['movieId'].isin(user_full['movieId'])]
    recommendations = (
        unseen_movies
        .merge(user_predictions, how='left', on='movieId')
        .sort_values('Predictions', ascending=False)
        .head(num_recommendations)
        .drop(columns='Predictions')
    )

    return user_full, recommendations

In [52]:
# Recommendations for user 350: their existing ratings plus the 20 best unseen movies
already_rated, predictions = recommend_movies(
    predictions=preds,
    userID=350,
    movies=movies,
    original_ratings=ratings,
    num_recommendations=20,
)

User 350 has already rated 40 movies.
Recommending highest 20 predicted ratings movies not already rated.


In [53]:
# Top 20 movies that user 350 has rated, highest rating first
already_rated.head(20)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
38,350,1356,5.0,864941018,Star Trek: First Contact (1996),Action|Adventure|Sci-Fi|Thriller
0,350,1,4.0,864940931,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
9,350,260,4.0,864940972,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
37,350,1210,4.0,864941053,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Sci-Fi
26,350,786,4.0,864940972,Eraser (1996),Action|Drama|Thriller
25,350,785,4.0,864941161,Kingpin (1996),Comedy
24,350,748,4.0,864941229,"Arrival, The (1996)",Action|Sci-Fi|Thriller
18,350,671,4.0,864941229,Mystery Science Theater 3000: The Movie (1996),Comedy|Sci-Fi
11,350,494,4.0,864940972,Executive Decision (1996),Action|Adventure|Thriller
39,350,1393,4.0,864941118,Jerry Maguire (1996),Drama|Romance
