In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import sklearn
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/movielens-dataset/movies.csv
/kaggle/input/movielens-dataset/ratings.csv


In [2]:
# Movie metadata. Only the last expression of a cell is rendered automatically,
# so the preview of `movies` has to go through display() to be shown at all.
movies = pd.read_csv("../input/movielens-dataset/movies.csv")
display(movies.head())

# User ratings: one row per rating with userId, movieId, rating and timestamp.
ratings = pd.read_csv('../input/movielens-dataset/ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


In [3]:
# Dataset size: distinct users, distinct movies, and total number of rating rows.
n_users = ratings['userId'].unique().size
n_movies = ratings['movieId'].unique().size
n_ratings = ratings.shape[0]

# Number of ratings per user: count each user's movieId entries and label the column.
user_freq = (
    ratings[['userId', 'movieId']]
    .groupby('userId')
    .count()
    .reset_index()
    .rename(columns={'movieId': 'n_ratings'})
)
user_freq.head()

Unnamed: 0,userId,n_ratings
0,1,113
1,2,29
2,3,73
3,4,124
4,5,68


In [4]:
# Mean rating per movie, indexed by movieId.
mean_rating = ratings.groupby('movieId')[['rating']].mean()

# Lowest rated movie (by raw mean). display() is required because a bare
# expression in the middle of a cell is evaluated and then silently discarded.
lowest_rated = mean_rating['rating'].idxmin()
display(movies.loc[movies['movieId'] == lowest_rated])

# Highest rated movie (by raw mean).
highest_rated = mean_rating['rating'].idxmax()
display(movies.loc[movies['movieId'] == highest_rated])

# To inspect the individual ratings behind those extremes:
# ratings[ratings['movieId']==highest_rated]
# ratings[ratings['movieId']==lowest_rated]

# Per-movie rating count and mean. These are the inputs a Bayesian average
# would need; the Bayesian average itself is not computed here.
# Aggregating the 'rating' Series directly yields flat 'count'/'mean' columns.
movie_stats = ratings.groupby('movieId')['rating'].agg(['count', 'mean'])

In [5]:
def create_matrix(df):
    """Build a sparse movie-by-user rating matrix and id/index lookup tables.

    Args:
        df: DataFrame with 'userId', 'movieId' and 'rating' columns.

    Returns:
        A 5-tuple ``(X, user_mapper, movie_mapper, user_inv_mapper,
        movie_inv_mapper)`` where

        * ``X`` is a ``csr_matrix`` of shape (n_movies, n_users); row ``m``,
          column ``u`` holds the rating given to movie index ``m`` by user
          index ``u``,
        * ``user_mapper`` / ``movie_mapper`` map an id to its column / row
          index (ids are numbered in sorted order),
        * ``user_inv_mapper`` / ``movie_inv_mapper`` map an index back to
          the original id.
    """
    # A single np.unique pass per column yields both the sorted distinct ids
    # and, via return_inverse, the position of every row's id in that sorted
    # array. This replaces repeated np.unique calls and per-row dict lookups,
    # and keeps N/M and the mappers derived from the same source.
    user_ids, user_index = np.unique(np.asarray(df["userId"]), return_inverse=True)
    movie_ids, movie_index = np.unique(np.asarray(df["movieId"]), return_inverse=True)

    N = len(user_ids)
    M = len(movie_ids)

    user_mapper = dict(zip(user_ids, range(N)))
    movie_mapper = dict(zip(movie_ids, range(M)))

    user_inv_mapper = dict(zip(range(N), user_ids))
    movie_inv_mapper = dict(zip(range(M), movie_ids))

    # Rows are movies and columns are users, so each row is a movie's rating vector.
    X = csr_matrix((np.asarray(df["rating"]), (movie_index, user_index)), shape=(M, N))

    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

In [6]:
# Build the movie-by-user rating matrix and the id <-> index lookup tables from the ratings frame.
X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_matrix(ratings)

In [7]:
def find_similar_movies(movie_id, X, k, metric = 'cosine', show_distance = False):
    """Return the k movies whose rating vectors are nearest to `movie_id`.

    Relies on the module-level `movie_mapper` (movieId -> row index of X) and
    `movie_inv_mapper` (row index of X -> movieId).

    Args:
        movie_id: id of the query movie; must be a key of `movie_mapper`.
        X: matrix with one row per movie (as produced by `create_matrix`).
        k: number of similar movies to return.
        metric: distance metric passed to `NearestNeighbors`.
        show_distance: if True, return ``(movieId, distance)`` tuples instead
            of bare movieIds.

    Returns:
        A list of at most `k` movieIds (or ``(movieId, distance)`` tuples),
        ordered from nearest to farthest, never containing `movie_id` itself.

    Raises:
        KeyError: if `movie_id` is not present in `movie_mapper`.
    """
    movie_ind = movie_mapper[movie_id]
    movie_vec = X[movie_ind].reshape(1, -1)

    # Ask for one extra neighbour because the query movie is part of X and is
    # normally its own nearest neighbour; never ask for more rows than X has.
    n_neighbors = min(k + 1, X.shape[0])

    kNN = NearestNeighbors(n_neighbors = n_neighbors, algorithm = 'brute', metric = metric)
    kNN.fit(X)

    # Always fetch distances so the return shape of kneighbors is fixed;
    # with return_distance=True it returns (distances, indices), which the
    # previous single-array handling could not process.
    distances, indices = kNN.kneighbors(movie_vec, return_distance=True)

    neighbours = []
    for dist, ind in zip(distances[0], indices[0]):
        # Drop the query movie by index rather than by position: with tied
        # distances it is not guaranteed to be the first result.
        if ind == movie_ind:
            continue
        neighbour_id = movie_inv_mapper[int(ind)]
        neighbours.append((neighbour_id, float(dist)) if show_distance else neighbour_id)

    # If the query was not among the results (ties), trim the extra neighbour.
    return neighbours[:k]

In [8]:
# Lookup table from movieId to title, used to print readable recommendations.
movie_titles = {mid: title for mid, title in zip(movies['movieId'], movies['title'])}

# Movie to base the recommendations on.
movie_id = 6
movie_title = movie_titles[movie_id]

similar_ids = find_similar_movies(movie_id, X, k=5)

print("Since you watched", movie_title)
for similar_id in similar_ids:
    print(movie_titles[similar_id])

Since you watched Heat (1995)
Rock, The (1996)
Eraser (1996)
Broken Arrow (1996)
Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
Mission: Impossible (1996)
