In [219]:
import pandas as pd
from scipy import linalg as la
import numpy as np
from scipy.sparse import csr_matrix
from pandas.api.types import CategoricalDtype
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import linalg as spla

In [220]:
# Toy ratings table: one (userId, movieId, rating) triple per entry, grouped
# by user.  Every column ends up as float64 because the array is homogeneous.
x = np.array(
    [
        # user 1
        (1, 10, 0.5), (1, 60, 2.0),
        # user 2
        (2, 70, 4.0), (2, 20, 5.0), (2, 30, 2.0), (2, 40, 1.0), (2, 90, 5.0),
        # user 3
        (3, 20, 4.5), (3, 30, 3.0), (3, 40, 1.0),
        # user 4
        (4, 10, 2.0),
        # user 5
        (5, 40, 1.5),
        # user 6
        (6, 30, 5.0), (6, 20, 4.0), (6, 40, 0.5), (6, 70, 1.0), (6, 90, 4.5),
        # user 7
        (7, 100, 4.5),
        # user 8
        (8, 50, 4.5),
        # user 9
        (9, 80, 0.5), (9, 90, 1.5),
        # user 10
        (10, 30, 3.5), (10, 20, 4.5), (10, 50, 2.0), (10, 70, 2.0), (10, 90, 4.0),
    ],
    dtype=np.float64,
)
df = pd.DataFrame(data=x, columns=['userId', 'movieId', 'rating'])
df.head()

Unnamed: 0,userId,movieId,rating
0,1.0,10.0,0.5
1,1.0,60.0,2.0
2,2.0,70.0,4.0
3,2.0,20.0,5.0
4,2.0,30.0,2.0


In [221]:
# Ordered categoricals give every user / movie a stable integer code, which is
# used directly as its row / column position in the rating matrix.
user_c = CategoricalDtype(sorted(df.userId.unique()), ordered=True)
movie_c = CategoricalDtype(sorted(df.movieId.unique()), ordered=True)

row = df.userId.astype(user_c).cat.codes
col = df.movieId.astype(movie_c).cat.codes
sparse_matrix = csr_matrix((df['rating'], (row, col)),
                           shape=(user_c.categories.size, movie_c.categories.size))

# pd.SparseDataFrame was removed in pandas 1.0, so the filled matrix is built
# with numpy instead: cells that carry a rating keep it, every other cell
# gets the neutral fill value 2.5.
rated = np.zeros(sparse_matrix.shape, dtype=bool)
rated[row.values, col.values] = True
dense = np.where(rated, sparse_matrix.toarray(), 2.5)

# Labelled view of the same matrix (users as index, movies as columns).
dfs = pd.DataFrame(dense,
                   index=user_c.categories,
                   columns=movie_c.categories)

In [222]:
# Show the filled user x movie rating matrix (rows follow user_c.categories,
# columns follow movie_c.categories; unrated cells hold the 2.5 fill value).
dense

array([[0.5, 2.5, 2.5, 2.5, 2.5, 2. , 2.5, 2.5, 2.5, 2.5],
       [2.5, 5. , 2. , 1. , 2.5, 2.5, 4. , 2.5, 5. , 2.5],
       [2.5, 4.5, 3. , 1. , 2.5, 2.5, 2.5, 2.5, 2.5, 2.5],
       [2. , 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5],
       [2.5, 2.5, 2.5, 1.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5],
       [2.5, 4. , 5. , 0.5, 2.5, 2.5, 1. , 2.5, 4.5, 2.5],
       [2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 4.5],
       [2.5, 2.5, 2.5, 2.5, 4.5, 2.5, 2.5, 2.5, 2.5, 2.5],
       [2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 0.5, 1.5, 2.5],
       [2.5, 4.5, 3.5, 2.5, 2. , 2.5, 2. , 2.5, 4. , 2.5]])

In [223]:
# Inspect which pandas Index type holds the movie categories.
type(movie_c.categories)

pandas.core.indexes.numeric.Float64Index

In [224]:

# All ratings given by users 1 and 2.
df[df.userId.isin([1, 2])]

Unnamed: 0,userId,movieId,rating
0,1.0,10.0,0.5
1,1.0,60.0,2.0
2,2.0,70.0,4.0
3,2.0,20.0,5.0
4,2.0,30.0,2.0
5,2.0,40.0,1.0
6,2.0,90.0,5.0


In [225]:
# Brute-force nearest-neighbour search using cosine distance, fitted on the
# sparse user x movie rating matrix.
neigh = NearestNeighbors(metric='cosine', algorithm='brute')
model = neigh.fit(sparse_matrix)
# Movie metadata lookup is currently disabled:
#m = pd.read_csv("/Users/Armen/Desktop/SpringDataProject/movies.csv")

In [248]:
def recommend_movies(fitted_model, user_id, n, n_neighbors=3):
    """Recommend one movie that ``user_id`` has not rated yet.

    The user's nearest neighbours are looked up with ``fitted_model``; every
    movie the user has not rated is then scored as

        mean(neighbour rating - 2.5) * alpha * (number of neighbours who rated it)

    and the movie with the highest score is returned.

    Relies on the notebook-level names ``df`` (ratings table), ``dense``
    (filled user x movie matrix), ``user_c`` and ``movie_c`` (categoricals
    whose category order matches the matrix rows / columns).

    Parameters
    ----------
    fitted_model : object
        Fitted estimator exposing ``kneighbors(X, n_neighbors)`` that returns
        ``(distances, indices)``.
    user_id : scalar
        A value present in ``user_c.categories``.
    n : int
        Currently unused; kept so existing calls keep working.
    n_neighbors : int, default 3
        Number of neighbours to query.

    Returns
    -------
    The movie id (taken from ``movie_c.categories``) with the highest score.

    Raises
    ------
    KeyError
        If ``user_id`` is not one of the known users.
    ValueError
        If the user has already rated every movie.
    """
    alpha = 1
    neutral_rating = 2.5  # fill value used for unrated cells of `dense`

    movie_ids = np.asarray(movie_c.categories)
    user_ids = np.asarray(user_c.categories)
    # Look the matrix row up by label instead of assuming ids are 1..N.
    user_pos = user_c.categories.get_loc(user_id)

    movieids_seen = df[df.userId == user_id].movieId.values
    # column positions of the movies this user has not rated
    not_seen = np.flatnonzero(~np.isin(movie_ids, movieids_seen))
    if not_seen.size == 0:
        raise ValueError('user {} has already rated every movie'.format(user_id))
    candidate_ids = movie_ids[not_seen]

    # kneighbors expects a 2-D (n_queries, n_features) array, hence [[...]].
    # NOTE(review): in this notebook the model is fitted on `sparse_matrix`
    # (unrated = 0) while the query row comes from `dense` (unrated = 2.5) --
    # confirm that mismatch is intended.
    _, neighbor_pos = fitted_model.kneighbors(dense[[user_pos]], n_neighbors)
    neighbor_pos = np.asarray(neighbor_pos)[0]

    # neighbours' ratings restricted to the candidate (unseen) columns
    neighbors = dense[:, not_seen][neighbor_pos]
    mean_deviation = np.asarray(np.mean(neighbors - neutral_rating, axis=0)).ravel()

    # how many of the neighbours actually rated each candidate movie,
    # aligned one-to-one with `candidate_ids` (0 when nobody rated it)
    neighbor_rows = df[df.userId.isin(user_ids[neighbor_pos])]
    ratings_count = (neighbor_rows.drop_duplicates(['userId', 'movieId'])
                     .movieId.value_counts()
                     .reindex(candidate_ids, fill_value=0)
                     .values)

    #Ben's calculation, weighted per movie by its neighbour rating count
    position = np.argmax(mean_deviation * alpha * ratings_count)
    #get the movieid from the matrix column value
    movie_id = candidate_ids[position]
    print('movie_id',movie_id)
    return movie_id#m[m.movieId == movie_id].title

In [249]:
# Recommend a movie for user 3; the third argument (n) is accepted but not
# used by recommend_movies.
x = recommend_movies(model, 3, 1)

[1 3 3]
[ 0.         -0.16666667  0.         -0.16666667  0.          2.
  0.        ]
5
movie_id 90.0


