In [None]:
# Matrix factorization machine learning for movie recommender system
# Template taken from https://github.com/khanhnamle1994/movielens/blob/master/SVD_Model.ipynb

# Import libraries
import numpy as np
import pandas as pd

# Reading ratings file
ratings = pd.read_csv('ml-latest-small/ratings.csv', sep=',', encoding='latin-1')
# ratings = pd.read_csv('ml-25m/ratings.csv', sep='\t')
# Reading users file
#users = pd.read_csv('ml-1m/users.csv', sep='\t', encoding='latin-1')
# users = pd.read_csv('ml-25m/users.csv', sep='\t')
# Reading movies file
movies = pd.read_csv('ml-latest-small/movies.csv', sep=',', encoding='latin-1')
# movies = pd.read_csv('ml-25m/movies.csv', sep = '\t')

In [None]:
movies.head()

In [None]:
#Setup users and movie datasets and count the unique rows
n_users = ratings.userId.unique().shape[0]
n_movies = ratings.movieId.unique().shape[0]
print ('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_movies))

In [None]:
# Create ratings matrix, each row is a unique user and each column is a movie
Ratings = ratings.pivot(index = 'userId', columns ='movieId', values = 'rating').fillna(0)
Ratings.head()

In [None]:
R = Ratings.to_numpy()
user_ratings_mean = np.mean(R, axis = 1)
Ratings_demeaned = R - user_ratings_mean.reshape(-1, 1)

In [None]:
# Checking the sparsity of the dataset
sparsity = round(1.0 - len(ratings) / float(n_users * n_movies), 3)
print ('The sparsity level of MovieLens1M dataset is ' +  str(sparsity * 100) + '%')

In [None]:
# SVD set up
from scipy.sparse.linalg import svds
U, sigma, Vt = svds(Ratings_demeaned, k = 50)

In [None]:
# matrix form of the diagonal
sigma = np.diag(sigma)

In [None]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)

In [None]:
preds = pd.DataFrame(all_user_predicted_ratings, columns = Ratings.columns)
preds.head()

In [None]:
def recommend_movies(predictions, userID, movies, original_ratings, num_recommendations):
    
    # Get and sort the user's predictions
    user_row_number = userID - 1 # User ID starts at 1, not 0
    sorted_user_predictions = preds.iloc[user_row_number].sort_values(ascending=False) # User ID starts at 1
    
    # Get the user's data and merge in the movie information.
    user_data = original_ratings[original_ratings.userId == (userID)]
    user_full = (user_data.merge(movies, how = 'left', left_on = 'movieId', right_on = 'movieId').
                     sort_values(['rating'], ascending=False)
                 )

    print ('User {0} has already rated {1} movies.'.format(userID, user_full.shape[0]))
    print ('Recommending highest {0} predicted ratings movies not already rated.'.format(num_recommendations))
    
    # Recommend the highest predicted rating movies that the user hasn't seen yet.
    recommendations = (movies[~movies['movieId'].isin(user_full['movieId'])].
         merge(pd.DataFrame(sorted_user_predictions).reset_index(), how = 'left',
               left_on = 'movieId',
               right_on = 'movieId').
         rename(columns = {user_row_number: 'Predictions'}).
         sort_values('Predictions', ascending = False).
                       iloc[:num_recommendations, :-1]
                      )

    return user_full, recommendations

In [None]:
already_rated, predictions = recommend_movies(preds, 450, movies, ratings, 20)

In [None]:
# Top 20 movies that User 1310 has rated 
already_rated.head(20)

In [None]:
# Top 20 movies that User 1310 hopefully will enjoy
predictions

In [None]:
ratings = pd.read_csv('ml-25m/ratings.csv', sep=',')
movies = pd.read_csv('ml-25m/movies.csv', sep=',')

In [None]:
ratings.head()

In [None]:
#Setup users and movie datasets and count the unique rows
n_users = ratings.userId.unique().shape[0]
n_movies = ratings.movieId.unique().shape[0]
print ('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_movies))

In [None]:
# Create ratings matrix, each row is a unique user and each column is a movie
Ratings = ratings.pivot(index = 'userId', columns ='movieId', values = 'rating').fillna(0)
Ratings.head()

In [69]:
mat = scipy.sparse.coo_matrix((ratings.userId, (ratings.rating, ratings.movieId)), shape=(n_users, n_movies))

NameError: name 'scipy' is not defined