In [1]:
import pandas as pd
links = pd.read_csv("ml-latest-small/links.csv")
movies = pd.read_csv("ml-latest-small/movies.csv")
ratings = pd.read_csv("ml-latest-small/ratings.csv")
tags = pd.read_csv("ml-latest-small/tags.csv")

In [2]:
for i in (links,movies,ratings,tags):
    print(i.head(15),end="\n\n\n")

    movieId  imdbId   tmdbId
0         1  114709    862.0
1         2  113497   8844.0
2         3  113228  15602.0
3         4  114885  31357.0
4         5  113041  11862.0
5         6  113277    949.0
6         7  114319  11860.0
7         8  112302  45325.0
8         9  114576   9091.0
9        10  113189    710.0
10       11  112346   9087.0
11       12  112896  12110.0
12       13  112453  21032.0
13       14  113987  10858.0
14       15  112760   1408.0


    movieId                               title  \
0         1                    Toy Story (1995)   
1         2                      Jumanji (1995)   
2         3             Grumpier Old Men (1995)   
3         4            Waiting to Exhale (1995)   
4         5  Father of the Bride Part II (1995)   
5         6                         Heat (1995)   
6         7                      Sabrina (1995)   
7         8                 Tom and Huck (1995)   
8         9                 Sudden Death (1995)   
9        10             

In [3]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [4]:
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
n_ratings = len(ratings)
n_movies = len(ratings['movieId'].unique())
n_users = len(ratings['userId'].unique())

In [6]:
print(n_ratings ,n_movies,n_users,n_ratings/n_users,n_ratings/n_movies)

100836 9724 610 165.30491803278687 10.369806663924312


In [7]:
user_freq = ratings[['userId', 'movieId']].groupby('userId').count().reset_index()
user_freq.columns = ['userId', 'n_ratings']
user_freq.head(15)

Unnamed: 0,userId,n_ratings
0,1,232
1,2,29
2,3,39
3,4,216
4,5,44
5,6,314
6,7,152
7,8,47
8,9,46
9,10,140


In [8]:
mean_rating = ratings.groupby('movieId')[['rating']].mean()

In [9]:
lowest_rated = mean_rating['rating'].idxmin()
movies.loc[movies['movieId'] == lowest_rated]
highest_rated = mean_rating['rating'].idxmax()
movies.loc[movies['movieId'] == highest_rated]

Unnamed: 0,movieId,title,genres
48,53,Lamerica (1994),Adventure|Drama


In [10]:
movie_stats = ratings.groupby('movieId')[['rating']].agg(['count', 'mean'])
movie_stats.columns = movie_stats.columns.droplevel()
movie_stats

Unnamed: 0_level_0,count,mean
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,215,3.920930
2,110,3.431818
3,52,3.259615
4,7,2.357143
5,49,3.071429
...,...,...
193581,1,4.000000
193583,1,3.500000
193585,1,3.500000
193587,1,3.500000


In [11]:
from scipy.sparse import csr_matrix

In [12]:
# wrong data set -> collobara
def create_matrix(df):
	
	N = len(df['userId'].unique())
	M = len(df['movieId'].unique())
	
	# Map Ids to indices
	user_mapper = dict(zip(np.unique(df["userId"]), list(range(N))))
	movie_mapper = dict(zip(np.unique(df["movieId"]), list(range(M))))
	
	# Map indices to IDs
	user_inv_mapper = dict(zip(list(range(N)), np.unique(df["userId"])))
	movie_inv_mapper = dict(zip(list(range(M)), np.unique(df["movieId"])))
	
	user_index = [user_mapper[i] for i in df['userId']]
	movie_index = [movie_mapper[i] for i in df['movieId']]

	X = csr_matrix((df["rating"], (movie_index, user_index)), shape=(M, N))
	
	return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_matrix(ratings)

In [13]:
from sklearn.neighbors import NearestNeighbors
def find_similar_movies(movie_id, X, k, metric='cosine', show_distance=False):
	
	neighbour_ids = []
	
	movie_ind = movie_mapper[movie_id]
	movie_vec = X[movie_ind]
	k+=1
	kNN = NearestNeighbors(n_neighbors=k, algorithm="brute", metric=metric)
	kNN.fit(X)
	movie_vec = movie_vec.reshape(1,-1)
	neighbour = kNN.kneighbors(movie_vec, return_distance=show_distance)
	for i in range(0,k):
		n = neighbour.item(i)
		neighbour_ids.append(movie_inv_mapper[n])
	neighbour_ids.pop(0)
	return neighbour_ids


In [18]:
movie_titles = dict(zip(movies['movieId'], movies['title']))

movie_id = 1376

similar_ids = find_similar_movies(movie_id, X, k=10)
movie_title = movie_titles[movie_id]

print(f"Movie name -> {movie_title}")
for i in similar_ids:
	print(movie_titles[i])


Movie name -> Star Trek IV: The Voyage Home (1986)
Star Trek II: The Wrath of Khan (1982)
Star Trek VI: The Undiscovered Country (1991)
Star Trek III: The Search for Spock (1984)
Star Trek: Insurrection (1998)
Star Trek: The Motion Picture (1979)
Star Trek V: The Final Frontier (1989)
Star Trek: First Contact (1996)
Superman (1978)
Superman II (1980)
X-Files: Fight the Future, The (1998)
