In [1]:
import pandas as pd
from scipy import linalg as la
import numpy as np
from scipy.sparse import csr_matrix
from pandas.api.types import CategoricalDtype
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import linalg as spla

In [2]:
#Benjamin's path: path = "/Users/Armen/Desktop/SpringDataProject/"
#Ben C's path:
path = "/Users/benchristensen/Desktop/Recommneder_System_Project/ml-20m/"
r = pd.read_csv(path + "ratings.csv", nrows = 10000)

In [3]:
#Create variable "count" for number of ratings for each movie
merged = r.merge(r.groupby("movieId").size().reset_index(name='count'), how='right', on='movieId')

In [4]:
#Drop any movies that have 17 or fewer ratings
df = merged[merged["count"]>17].sort_values(['userId', 'movieId'])

In [5]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,count
12,1,32,3.5,1112484819,24
36,1,47,3.5,1112484727,28
64,1,50,3.5,1112484580,28
127,1,253,4.0,1112484940,20
147,1,260,4.0,1112484826,34


In [6]:
#Generate user-movie matrix with ratings as its values
user_c = CategoricalDtype(sorted(df.userId.unique()), ordered=True)
movie_c = CategoricalDtype(sorted(df.movieId.unique()), ordered=True)

row = df.userId.astype(user_c).cat.codes
col = df.movieId.astype(movie_c).cat.codes
sparse_matrix = csr_matrix((df['rating'], (row, col)), \
                           shape=(user_c.categories.size, movie_c.categories.size))



In [7]:
#Change to dense for viewing its data
dense = sparse_matrix.todense()

In [8]:
#Create sparse dataframe from user-movie matrix
dfs = pd.SparseDataFrame(sparse_matrix, \
                         index=user_c.categories, \
                         columns=movie_c.categories, \
                         default_fill_value=2.5)

In [9]:
#Initialize nearest neighbors model
neigh = NearestNeighbors(n_neighbors=1000, algorithm='brute', metric='cosine')

In [10]:
#Fit on the movie ratings we have
model = neigh.fit(sparse_matrix)

In [11]:
m = pd.read_csv(path + "movies.csv")

In [28]:
def recommend_movies(df, matrix, fitted_model, user_id, n):
    alpha=1
    movieids_seen = df[df.userId == user_id].movieId.values
    #mask is the indices of the movies that userid has seen in the user-movie matrix
    mask = []
    for i, j in enumerate(movie_c.categories):
        if j in df[df.userId == user_id].movieId.values:
            mask.append(i)
    #use the mask to get the indices of the movies userid hasn't seen
    not_seen = list(set(range(len(movie_c.categories))) - set(mask))
    #cut out the movies (columns) userid has seen from our matrix
    ######new = dense[:,not_seen]
    new = matrix[:,not_seen]
    #get the indices for nearest neighbors of userId
    indices = fitted_model.kneighbors(matrix[user_id-1], 3)[1][0]
    
    #neighbors is the actual rows of the matrix with each neighbors ratings
    neighbors = new[indices].todense()
    #find how many times each movie has been rated by the neighbors
    neigh_movies = []

    #find how many times each movie has been rated by the neighbors
    ratings_count = []
    #Loop through the movie_ids not seen by our user
    for movie_id in np.array(movie_c.categories)[not_seen]:
        count = 0
        #Loop through the user_ids of our user's neighbor
        for id_ in indices + 1:
            #Check if the userId has rated the movie
            if movie_id in df[df.userId == id_].movieId.values:
                count += 1
        ratings_count.append(count)
    ratings_count = np.array(ratings_count)
    #Choose the movie to recommend (weight frequently-seen-movies higher)
    position = np.argmax(np.mean(neighbors - 2.5, axis=0)*alpha*ratings_count.reshape((-1,1)))
    #get the movieid from the matrix column value
    movie_id = np.array(movie_c.categories)[not_seen][position]
    print('movie_id',movie_id)
    return movie_id, m[m.movieId == movie_id].title

In [29]:
print(recommend_movies(df, sparse_matrix, model, 12, 1))

movie_id 21
(21, 20    Get Shorty (1995)
Name: title, dtype: object)


In [None]:
x = np.array([[1,1,.5],
        [1,6,2],
        [2,7,4],
        [3,2,4.5],
        [3,3,3],
        [3,4,1],
        [4,1,2],
        [5,4,1.5],
        [6,3,5],
        [7,10,4.5],
        [8,5,4.5],
        [9,8,.5],
        [9,9,1.5],
        [10,3,3.5]])
df = pd.DataFrame(x,columns=['userId','movieId','rating'])

user_c = CategoricalDtype(sorted(df.userId.unique()), ordered=True)
movie_c = CategoricalDtype(sorted(df.movieId.unique()), ordered=True)

row = df.userId.astype(user_c).cat.codes
col = df.movieId.astype(movie_c).cat.codes
sparse_matrix = csr_matrix((df['rating'], (row, col)), \
                           shape=(user_c.categories.size, movie_c.categories.size))

dense = sparse_matrix.todense()