In [1]:
import pandas as pd
from scipy import linalg as la
import numpy as np
from scipy.sparse import csr_matrix
from pandas.api.types import CategoricalDtype
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import linalg as spla

In [2]:
#Benjamin's path: path = "/Users/Armen/Desktop/SpringDataProject/"
#Ben C's path:
path = "/Users/benchristensen/Desktop/Recommneder_System_Project/ml-20m/"
r = pd.read_csv(path + "ratings.csv", nrows = 10000)

In [3]:
#Create variable "count" for number of ratings for each movie
merged = r.merge(r.groupby("movieId").size().reset_index(name='count'), how='right', on='movieId')

In [4]:
#Find the minimum number of movies reviewed by a user
reviews_per_user = merged.groupby("userId").userId.apply(lambda x: len(x))


In [5]:
reviews_per_user.describe().min()

20.0

In [6]:
#Drop any movies that have 17 or fewer ratings
df = merged[merged["count"]>17].sort_values(['userId', 'movieId'])

In [7]:
#Generate user-movie matrix with ratings as its values
user_c = CategoricalDtype(sorted(df.userId.unique()), ordered=True)
movie_c = CategoricalDtype(sorted(df.movieId.unique()), ordered=True)

row = df.userId.astype(user_c).cat.codes
col = df.movieId.astype(movie_c).cat.codes
sparse_matrix = csr_matrix((df['rating'], (row, col)), \
                           shape=(user_c.categories.size, movie_c.categories.size))



In [8]:
#Change to dense for viewing its data
dense = sparse_matrix.todense()

In [9]:
dense.shape

(89, 74)

In [10]:
#Create sparse dataframe from user-movie matrix
dfs = pd.SparseDataFrame(sparse_matrix, \
                         index=user_c.categories, \
                         columns=movie_c.categories, \
                         default_fill_value=2.5)

In [11]:
#Initialize nearest neighbors model
neigh = NearestNeighbors(n_neighbors=1000, algorithm='brute', metric='cosine')

In [12]:
#Fit on the movie ratings we have
model = neigh.fit(sparse_matrix)

In [13]:
m = pd.read_csv(path + "movies.csv")

In [14]:
m.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [22]:
def movies_not_seen(df, user_id, matrix, movie_c):
    """
    This function determines what movies a user has not seen; i.e. potential movies to recommend
    Inputs:
    df: movie dataframe
    user_id: user we want to recommend something to
    matrix: 
    Outputs:
    """
    #get the movies the user has seen
    movieids_seen = df[df.userId == user_id].movieId.values
    
    #mask is the indices of the movies that userid has seen in the user-movie matrix
    mask = []
    for i, j in enumerate(movie_c.categories):
        if j in df[df.userId == user_id].movieId.values:
            mask.append(i)
            
    #use the mask to get the indices of the movies userid hasn't seen
    not_seen = list(set(range(len(movie_c.categories))) - set(mask))
    
    #cut out the movies (columns) userid has seen from our matrix
    new = matrix[:,not_seen]
    
    return new, not_seen

def count_not_seen(df, movie_c, not_seen, indices):
    """
    This function returns the number a times a movie not seen by the user was seen by the user's nearest neighbors
    Input:
    Output
    """
    #find how many times each movie has been rated by the neighbors
    ratings_count = []
    #Loop through the movie_ids not seen by our user
    for movie_id in np.array(movie_c.categories)[not_seen]:
        count = 0
        #Loop through the user_ids of our user's neighbor
        for id_ in indices + 1:
            #Check if the userId has rated the movie
            if movie_id in df[df.userId == id_].movieId.values:
                count += 1
        ratings_count.append(count)
    return np.array(ratings_count)

def knn_recommend_movies(df, matrix, fitted_model, user_id, n, movie_c):
    """
    This function uses the K-nearest neighbors algorithm to suggest a movie for the user to watch.
    
    Inputs:
    df: sparse dataframe containing user and movie information. Contains columns user_id, movie_id, rating
    matrix: sparse matrix containing user_id on the y axis and movie_id on the x_axis, with the rating for each movie in the rows
    fitted_model: the type on model we run this clustering method on  
    user_id: the user we want to recommend a movie for
    n: number of movies to recommend (MAY NOT USE_ CHECK BEFORE FINAL)
    
    Outputs: 
    movie_id, title. The id number and title of the recommended movie
    """
    alpha=1      #we recommend 1 movie 
    new, not_seen = movies_not_seen(df, user_id, matrix, movie_c)
    
    #Initialize nearest neighbors model
    neigh = NearestNeighbors(n_neighbors=1000, algorithm='brute', metric='cosine')
    #Fit on the movie ratings we have
    model = neigh.fit(sparse_matrix)
    
    #get the indices for nearest neighbors of userId
    indices = fitted_model.kneighbors(matrix[user_id-1], 3)[1][0]
    
    #neighbors is the actual rows of the matrix with each neighbors ratings
    neighbors = new[indices].todense()

    #find how many times each movie has been rated by the neighbors
    ratings_count = count_not_seen(df, movie_c, not_seen, indices)
    
    #Choose the movie to recommend (weight frequently-seen-movies higher)
    position = np.argmax(np.mean(neighbors - 2.5, axis=0)*alpha*ratings_count.reshape((-1,1)))
    
    #get the movieid from the matrix column value
    movie_id = np.array(movie_c.categories)[not_seen][position]
    print('movie_id',movie_id)
    return movie_id, m[m.movieId == movie_id].title

In [23]:
print(knn_recommend_movies(df, sparse_matrix, model, 12, 1, movie_c))

<class 'scipy.sparse.csr.csr_matrix'>
movie_id 21
(21, 20    Get Shorty (1995)
Name: title, dtype: object)


In [None]:
x = np.array([[1,1,.5],
        [1,6,2],
        [2,7,4],
        [3,2,4.5],
        [3,3,3],
        [3,4,1],
        [4,1,2],
        [5,4,1.5],
        [6,3,5],
        [7,10,4.5],
        [8,5,4.5],
        [9,8,.5],
        [9,9,1.5],
        [10,3,3.5]])
df = pd.DataFrame(x,columns=['userId','movieId','rating'])

user_c = CategoricalDtype(sorted(df.userId.unique()), ordered=True)
movie_c = CategoricalDtype(sorted(df.movieId.unique()), ordered=True)

row = df.userId.astype(user_c).cat.codes
col = df.movieId.astype(movie_c).cat.codes
sparse_matrix = csr_matrix((df['rating'], (row, col)), \
                           shape=(user_c.categories.size, movie_c.categories.size))

dense = sparse_matrix.todense()

In [None]:
def model(avg_rating, score_from_model, recommend = False):
    recommended_score = avg_rating + score_from_model
    if recommended_score >= 3.5:
        recommend = True
    pass

In [None]:
def knn_recommend_movies_old(df, matrix, fitted_model, user_id, n):
    """
    This function uses the K-nearest neighbors algorithm to suggest a movie for the user to watch.
    
    Inputs:
    df: sparse dataframe containing user and movie information. Contains columns user_id, movie_id, rating
    matrix: sparse matrix containing user_id on the y axis and movie_id on the x_axis, with the rating for each movie in the rows
    fitted_model: the type on model we run this clustering method on  
    user_id: the user we want to recommend a movie for
    n: number of movies to recommend (MAY NOT USE_ CHECK BEFORE FINAL)
    
    Outputs: 
    movie_id, title. The id number and title of the recommended movie
    """
    alpha=1
    movieids_seen = df[df.userId == user_id].movieId.values
    #mask is the indices of the movies that userid has seen in the user-movie matrix
    mask = []
    for i, j in enumerate(movie_c.categories):
        if j in df[df.userId == user_id].movieId.values:
            mask.append(i)
    #use the mask to get the indices of the movies userid hasn't seen
    not_seen = list(set(range(len(movie_c.categories))) - set(mask))
    #cut out the movies (columns) userid has seen from our matrix
    new = matrix[:,not_seen]
    #get the indices for nearest neighbors of userId
    indices = fitted_model.kneighbors(matrix[user_id-1], 3)[1][0]
    #neighbors is the actual rows of the matrix with each neighbors ratings
    neighbors = new[indices].todense()
    #find how many times each movie has been rated by the neighbors
    neigh_movies = []

    #find how many times each movie has been rated by the neighbors
    ratings_count = []
    #Loop through the movie_ids not seen by our user
    for movie_id in np.array(movie_c.categories)[not_seen]:
        count = 0
        #Loop through the user_ids of our user's neighbor
        for id_ in indices + 1:
            #Check if the userId has rated the movie
            if movie_id in df[df.userId == id_].movieId.values:
                count += 1
        ratings_count.append(count)
    ratings_count = np.array(ratings_count)
    #Choose the movie to recommend (weight frequently-seen-movies higher)
    position = np.argmax(np.mean(neighbors - 2.5, axis=0)*alpha*ratings_count.reshape((-1,1)))
    #get the movieid from the matrix column value
    movie_id = np.array(movie_c.categories)[not_seen][position]
    print('movie_id',movie_id)
    return movie_id, m[m.movieId == movie_id].title

print(knn_recommend_movies_old(df, sparse_matrix, model, 12, 1))

## inherit from sklearn?
 - Scoring Accuracy:
 - predict for every movie of the user (binary recommend/do not recommend)
 - that will be your 'x test'
 - create a 'y test' that gives if the user liked each movie or not
 - percent correctly predicted.