# Import Libraries

In [1]:
import os
import pandas as pd
import numpy as np
from sqlalchemy import create_engine, MetaData, Table
from sqlalchemy.orm import sessionmaker, relationship
from sqlalchemy.ext.declarative import declarative_base
import sqlalchemy
from sqlalchemy.sql import func
from datetime import datetime

# Credentials

In [2]:
pg_user_dev = 'postgres'
pg_pass_dev = 'admin'
pg_db_dev = 'postgres'
pg_host_dev = 'localhost'
pg_port_dev = 5432
DEV_DB = f"postgresql://{pg_user_dev}:{pg_pass_dev}@{pg_host_dev}:{pg_port_dev}/{pg_db_dev}"

# DB Session

In [3]:
# Start the session
engine = create_engine(DEV_DB, echo=False, client_encoding="UTF-8")
Session = sessionmaker(bind=engine)
Session.configure(bind=engine)
session = Session()
meta = MetaData(bind=engine)
Base = declarative_base()

# Define Tables

In [4]:
# Read in tables from the database
class MovieUsers(Base):
    __table__ = Table('movie_users', meta, autoload=True, schema="movie_recommender")
    ratings = relationship('Ratings', backref='movie_users', lazy=True)
    tags = relationship('Tags', backref='movie_users', lazy=True)

# Read in tables from the database
class Movies(Base):
    __table__ = Table('movies', meta, autoload=True, schema="movie_recommender")
    ratings = relationship('Ratings', backref='movies', lazy=True)
    tags = relationship('Tags', backref='movies', lazy=True)
    links = relationship('Links', backref='movies', uselist=False, lazy=True)


class Ratings(Base):
    __table__ = Table('ratings', meta, autoload=True, schema="movie_recommender")

class Tags(Base):
    __table__ = Table('tags', meta, autoload=True, schema="movie_recommender")

class Links(Base):
    __table__ = Table('links', meta, autoload=True, schema="movie_recommender")

In [5]:
ratings = pd.read_sql_table(
    "ratings",
    con=engine,
    schema = "movie_recommender"
)

In [6]:
movies = pd.read_sql_table(
    "movies",
    con=engine,
    schema = "movie_recommender"
)

In [7]:
users = pd.read_sql_table(
    "movie_users",
    con=engine,
    schema = "movie_recommender"
)

In [8]:
genres = [x for x in movies.columns if 'genre_' in x]

In [9]:
movies

Unnamed: 0,movieId,title,genre_no_genres_listed,genre_action,genre_adventure,genre_animation,genre_children,genre_comedy,genre_crime,genre_documentary,...,genre_film_noir,genre_horror,genre_imax,genre_musical,genre_mystery,genre_romance,genre_sci_fi,genre_thriller,genre_war,genre_western
0,1,Toy Story (1995),False,False,True,True,True,True,False,False,...,False,False,False,False,False,False,False,False,False,False
1,2,Jumanji (1995),False,False,True,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,3,Grumpier Old Men (1995),False,False,False,False,False,True,False,False,...,False,False,False,False,False,True,False,False,False,False
3,4,Waiting to Exhale (1995),False,False,False,False,False,True,False,False,...,False,False,False,False,False,True,False,False,False,False
4,5,Father of the Bride Part II (1995),False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),False,True,False,True,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
9738,193583,No Game No Life: Zero (2017),False,False,False,True,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
9739,193585,Flint (2017),False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9740,193587,Bungo Stray Dogs: Dead Apple (2018),False,True,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


# Top Movies by Genre

In [10]:
# Get Top 20 Most Popular Movies
movie_counts_subquery = session.query(Ratings.movieId, func.count(Ratings.movieId).label(
    "movies_count")).group_by(Ratings.movieId).subquery()
movie_counts = session.query(Movies.movieId, Movies.title, movie_counts_subquery.c.movies_count).join(
    movie_counts_subquery, (Movies.movieId == movie_counts_subquery.c.movieId)).order_by(
    sqlalchemy.desc("movies_count")).all()

In [11]:
movie_counts = pd.DataFrame(movie_counts, columns = ["movieId", "title", "movies_count"])

In [12]:
movie_counts

Unnamed: 0,movieId,title,movies_count
0,356,Forrest Gump (1994),329
1,318,"Shawshank Redemption, The (1994)",317
2,296,Pulp Fiction (1994),307
3,593,"Silence of the Lambs, The (1991)",279
4,2571,"Matrix, The (1999)",278
...,...,...,...
9719,61319,Somers Town (2008),1
9720,4032,"Everlasting Piece, An (2000)",1
9721,173205,The Meyerowitz Stories (2017),1
9722,173209,War Machine (2017),1


In [13]:
# Get Most Popular Movies for Specific Genre
genre = "genre_drama"
movie_counts_genre_subquery = session.query(Ratings.movieId, func.count(Ratings.movieId).label(
    "movies_count")).join(Movies).filter(getattr(Movies, genre) == True).group_by(Ratings.movieId).subquery()
movie_counts_genre = session.query(Movies.movieId, Movies.title, movie_counts_genre_subquery.c.movies_count).join(
    movie_counts_genre_subquery, (Movies.movieId == movie_counts_genre_subquery.c.movieId)).order_by(
    sqlalchemy.desc("movies_count")).all()

In [14]:
movie_counts_genre = pd.DataFrame(movie_counts_genre, columns = ["movieId", "title", "movies_count"])

In [15]:
movie_counts_genre

Unnamed: 0,movieId,title,movies_count
0,356,Forrest Gump (1994),329
1,318,"Shawshank Redemption, The (1994)",317
2,296,Pulp Fiction (1994),307
3,110,Braveheart (1995),237
4,527,Schindler's List (1993),220
...,...,...,...
4344,8239,Viridiana (1961),1
4345,757,Ashes of Time (Dung che sai duk) (1994),1
4346,33201,Between Your Legs (Entre las piernas) (1999),1
4347,33237,San Francisco (1936),1


In [16]:
movie_counts_genres = {}
for genre in genres:
    movie_counts_genre_subquery = session.query(Ratings.movieId, func.count(Ratings.movieId).label(
    "movies_count")).join(Movies).filter(getattr(Movies, genre) == True).group_by(Ratings.movieId).subquery()
    movie_counts_genre = session.query(
        Movies.movieId, Movies.title, movie_counts_genre_subquery.c.movies_count).join(
    movie_counts_genre_subquery, (Movies.movieId == movie_counts_genre_subquery.c.movieId)).order_by(
    sqlalchemy.desc("movies_count")).all()
    movie_counts_genre = pd.DataFrame(movie_counts_genre, columns = ["movieId", "title", "movie_count"])
    movie_counts_genres[genre] = movie_counts_genre

In [17]:
movie_counts_genres["genre_action"]

Unnamed: 0,movieId,title,movie_count
0,2571,"Matrix, The (1999)",278
1,260,Star Wars: Episode IV - A New Hope (1977),251
2,480,Jurassic Park (1993),238
3,110,Braveheart (1995),237
4,589,Terminator 2: Judgment Day (1991),224
...,...,...,...
1823,64695,Sword of the Stranger (Sutorejia: Mukô hadan) ...,1
1824,63826,Splinter (2008),1
1825,62299,Alone in the Dark II (2008),1
1826,62008,Dead Fury (2008),1


# Top Genres

In [18]:
genres

['genre_no_genres_listed',
 'genre_action',
 'genre_adventure',
 'genre_animation',
 'genre_children',
 'genre_comedy',
 'genre_crime',
 'genre_documentary',
 'genre_drama',
 'genre_fantasy',
 'genre_film_noir',
 'genre_horror',
 'genre_imax',
 'genre_musical',
 'genre_mystery',
 'genre_romance',
 'genre_sci_fi',
 'genre_thriller',
 'genre_war',
 'genre_western']

In [19]:
cols = [sqlalchemy.func.count(1).filter(getattr(Movies, x) == True) for x in genres]

In [20]:
# Get Most Popular Movies for Specific Genre
top_genres = session.query(*cols).all()
top_genres = {genres[x]:top_genres[0][x] for x in range(len(genres))}
top_genres = pd.Series(top_genres).sort_values(ascending = False)
top_genres

genre_drama               4361
genre_comedy              3756
genre_thriller            1894
genre_action              1828
genre_romance             1596
genre_adventure           1263
genre_crime               1199
genre_sci_fi               980
genre_horror               978
genre_fantasy              779
genre_children             664
genre_animation            611
genre_mystery              573
genre_documentary          440
genre_war                  382
genre_musical              334
genre_western              167
genre_imax                 158
genre_film_noir             87
genre_no_genres_listed      34
dtype: int64

# Build up Movies from each Genre

In [21]:
top_10_genres = top_genres.index[0:10]

In [22]:
top_10_genres

Index(['genre_drama', 'genre_comedy', 'genre_thriller', 'genre_action',
       'genre_romance', 'genre_adventure', 'genre_crime', 'genre_sci_fi',
       'genre_horror', 'genre_fantasy'],
      dtype='object')

In [23]:
top_10_genres_movie_counts = {key:value for key,value in movie_counts_genres.items() if key in top_10_genres}

In [24]:
ratings_matrix = pd.pivot_table(data = ratings, index = 'userId', values = 'rating', columns = 'movieId')

In [25]:
user_id = 1

In [26]:
movies_rated = ratings_matrix.loc[user_id].dropna().index

In [27]:
num_movies_rated = len(movies_rated)

In [28]:
min_neighbours = 1
max_neighbours = 40

In [29]:
min_support = 3

In [30]:
ratings_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


In [31]:
ratings_sub_matrix = ratings_matrix.drop(user_id).loc[:,movies_rated]

In [32]:
num_matching_ratings = ratings_sub_matrix.notnull().sum(axis=1)

In [33]:
# If there are more movies then the minimum support, we want to only consider
# The users that have the minimum matching movies
if num_movies_rated >= min_support:
    # Calculate the number of matching ratings
    num_matching_ratings = ratings_sub_matrix.notnull().sum(axis=1)
    # Consider only the users greater then the min support
    matching_users = num_matching_ratings[num_matching_ratings >= min_support].index
    ratings_sub_matrix = ratings_sub_matrix.loc[matching_users]

In [34]:
ratings_sub_matrix

movieId,1,3,6,47,50,70,101,110,151,157,...,3671,3702,3703,3729,3740,3744,3793,3809,4006,5060
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,,,,,,,,,,,...,,,5.0,,,,,,,
4,,,,2.0,,,,,,,...,,,,,,,,3.0,,
5,4.0,,,,4.0,,,4.0,,,...,,,,,,,,,,
6,,5.0,4.0,4.0,1.0,,,5.0,4.0,,...,,,,,,,,,,
7,4.5,,,,4.5,,,,,,...,,,,,,,3.5,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,3.0,4.5,4.0,,3.5,,4.0,...,,,,,,,,,,
607,4.0,,,,,,,5.0,,,...,,,,,,,3.0,,,3.0
608,2.5,2.0,,4.5,4.5,3.0,,4.0,,,...,,,,,,,4.0,1.5,,
609,3.0,,,,,,,3.0,,,...,,,,,,,,,,


In [35]:
user_ratings = ratings_matrix.loc[user_id].dropna()

In [36]:
ratings_diffs = (ratings_sub_matrix - user_ratings)**2

In [37]:
user_dists = np.sqrt(ratings_diffs.sum(axis = 1))

In [38]:
user_sims = 1 / (1 + user_dists);

In [39]:
user_sims.sort_values()

userId
68     0.044330
599    0.048389
217    0.048433
160    0.050518
474    0.051035
         ...   
366    0.585786
278    0.666667
511    0.666667
550    0.666667
77     1.000000
Length: 576, dtype: float64

In [40]:
most_similar_users = user_sims.sort_values(ascending = False).iloc[0:max_neighbours]

In [41]:
most_similar_users

userId
77     1.000000
278    0.666667
511    0.666667
550    0.666667
366    0.585786
258    0.500000
9      0.500000
523    0.500000
360    0.500000
481    0.500000
53     0.500000
49     0.500000
538    0.472136
319    0.472136
154    0.472136
398    0.449490
65     0.430501
530    0.414214
25     0.414214
162    0.414214
90     0.414214
13     0.414214
515    0.400000
300    0.400000
499    0.400000
473    0.387426
338    0.376179
581    0.376179
476    0.366025
582    0.366025
595    0.366025
157    0.366025
189    0.356789
505    0.356789
460    0.356789
30     0.356789
272    0.356789
504    0.348331
205    0.348331
491    0.340542
dtype: float64

In [42]:
compare_ratings = ratings[
    (ratings['userId'] == 278) & ratings['movieId'].isin(movies_rated)].sort_values(by = 'movieId')

In [43]:
pd.DataFrame(most_similar_users, columns = ['similarity'])

Unnamed: 0_level_0,similarity
userId,Unnamed: 1_level_1
77,1.0
278,0.666667
511,0.666667
550,0.666667
366,0.585786
258,0.5
9,0.5
523,0.5
360,0.5
481,0.5


In [44]:
ratings_sub_matrix = pd.merge(ratings_matrix, pd.DataFrame(most_similar_users, columns = ['similarity']), 
                        left_index = True, right_index = True)

In [45]:
ratings_sub_matrix = ratings_sub_matrix.dropna(axis = 1, how = 'all')

In [46]:
movies_to_rate = [x for x in ratings_sub_matrix.columns if x != 'similarity']

In [47]:
for col in movies_to_rate:
    ratings_sub_matrix[col] = ratings_sub_matrix[col] * ratings_sub_matrix['similarity']
    total_weights = ratings_sub_matrix[ratings_sub_matrix[col].notnull()]['similarity'].sum()
    ratings_sub_matrix[col] = ratings_sub_matrix[col]/total_weights

In [48]:
num_ratings = pd.DataFrame(ratings_sub_matrix[movies_to_rate].notnull().sum(), columns = ['num_ratings'])

In [49]:
movie_ratings = pd.DataFrame(ratings_sub_matrix[movies_to_rate].sum(), columns = ['rating'])

In [50]:
movie_ratings = pd.merge(movie_ratings, num_ratings, left_index = True, right_index = True)

In [51]:
movie_ratings

Unnamed: 0,rating,num_ratings
1,3.890420,5
2,4.288675,2
4,3.000000,1
7,4.000000,1
10,3.000000,1
...,...,...
190209,4.000000,1
190213,1.000000,1
190215,1.500000,1
190219,1.000000,1


In [52]:
movie_ratings['relative_confidence'] = np.log(1 + movie_ratings['num_ratings'])

In [53]:
movie_ratings.sort_values(by = 'relative_confidence', ascending = False)

Unnamed: 0,rating,num_ratings,relative_confidence
318,4.694323,19,2.995732
79132,4.347668,17,2.890372
356,4.451967,16,2.833213
2959,4.551260,16,2.833213
527,4.764767,15,2.772589
...,...,...,...
2392,3.000000,1,0.693147
2396,5.000000,1,0.693147
2401,4.000000,1,0.693147
2427,4.500000,1,0.693147


In [54]:
movie_ratings['weighted_rating'] = movie_ratings['rating'] * movie_ratings['relative_confidence']

In [55]:
movie_ratings.sort_values(by = 'weighted_rating', ascending = False)[0:20]

Unnamed: 0,rating,num_ratings,relative_confidence,weighted_rating
318,4.694323,19,2.995732,14.062935
527,4.764767,15,2.772589,13.210738
2571,4.661323,15,2.772589,12.923933
2959,4.55126,16,2.833213,12.894689
356,4.451967,16,2.833213,12.613372
79132,4.347668,17,2.890372,12.566377
4993,4.478256,14,2.70805,12.127343
109487,4.593233,13,2.639057,12.121805
58559,4.686635,12,2.564949,12.020981
7153,4.524997,13,2.639057,11.941727


In [56]:
movie_ratings.sort_values(by = 'num_ratings', ascending = False)[0:20]

Unnamed: 0,rating,num_ratings,relative_confidence,weighted_rating
318,4.694323,19,2.995732,14.062935
79132,4.347668,17,2.890372,12.566377
356,4.451967,16,2.833213,12.613372
2959,4.55126,16,2.833213,12.894689
527,4.764767,15,2.772589,13.210738
2571,4.661323,15,2.772589,12.923933
4993,4.478256,14,2.70805,12.127343
7153,4.524997,13,2.639057,11.941727
109487,4.593233,13,2.639057,12.121805
58559,4.686635,12,2.564949,12.020981


In [57]:
movie_ratings.sort_values(by = 'rating', ascending = False)[0:20]

Unnamed: 0,rating,num_ratings,relative_confidence,weighted_rating
110102,5.0,1,0.693147,3.465736
2687,5.0,1,0.693147,3.465736
4090,5.0,1,0.693147,3.465736
671,5.0,1,0.693147,3.465736
7888,5.0,1,0.693147,3.465736
98243,5.0,1,0.693147,3.465736
122900,5.0,1,0.693147,3.465736
72998,5.0,1,0.693147,3.465736
71264,5.0,1,0.693147,3.465736
520,5.0,1,0.693147,3.465736


# Create function for recommending videos

In [58]:
movies_rated = ratings_matrix.loc[user_id].dropna().index

In [59]:
movies_rated

Int64Index([   1,    3,    6,   47,   50,   70,  101,  110,  151,  157,
            ...
            3671, 3702, 3703, 3729, 3740, 3744, 3793, 3809, 4006, 5060],
           dtype='int64', name='movieId', length=232)

In [60]:
ratings_matrix.loc[user_id].dropna()

movieId
1       4.0
3       4.0
6       4.0
47      5.0
50      5.0
       ... 
3744    4.0
3793    5.0
3809    4.0
4006    4.0
5060    5.0
Name: 1, Length: 232, dtype: float64

In [61]:
def get_nearest_neighbors(user_id):
    
    # Get the ratings for the user
    user_ratings = ratings_matrix.loc[user_id].dropna()
    
    # Get a list of all movies rated by the user
    movies_rated = user_ratings.index
    
    # The total number of movies rated by the user
    num_movies_rated = len(user_ratings)
    
    #Only consider the users that are not yourself and the movies that the user has rated
    ratings_sub_matrix = ratings_matrix.drop(user_id).loc[:,movies_rated]
    
    # Get the number of matching movies rated by other users
    num_matching_ratings = ratings_sub_matrix.notnull().sum(axis=1)
    
    # If there are more movies then the minimum support, we want to only consider
    # The users that have the minimum matching movies
    if num_movies_rated >= min_support:
        # Consider only the users greater then the min support
        matching_users = num_matching_ratings[num_matching_ratings >= min_support].index
        ratings_sub_matrix = ratings_sub_matrix.loc[matching_users]
        
    # Get the difference of movie ratings between the user and all other users
    ratings_diffs = (ratings_sub_matrix - user_ratings)**2
    # Get the Euclidian Distance
    user_dists = np.sqrt(ratings_diffs.sum(axis = 1))
    # Get similarity score
    user_sims = 1 / (1 + user_dists);
    
    user_sims = pd.DataFrame(user_sims, columns = ['similarity']).sort_values(
        by = 'similarity', ascending = False)
    most_similar_users = user_sims.iloc[0:max_neighbours]
    
    # Merge the most similar users and their similarity onto their movie ratings
    ratings_sub_matrix = pd.merge(ratings_matrix, most_similar_users, 
                        left_index = True, right_index = True)
    
    # Remove all movies that have not been rated
    ratings_sub_matrix = ratings_sub_matrix.dropna(axis = 1, how = 'all')
    
    movies_to_rate = [x for x in ratings_sub_matrix.columns if x != 'similarity']
    
    for col in movies_to_rate:
        # Weight all of the ratings by the user similarity
        ratings_sub_matrix[col] = ratings_sub_matrix[col] * ratings_sub_matrix['similarity']
        total_weights = ratings_sub_matrix[ratings_sub_matrix[col].notnull()]['similarity'].sum()
        ratings_sub_matrix[col] = ratings_sub_matrix[col]/total_weights
    
    # Get the total number of ratings
    num_ratings = pd.DataFrame(ratings_sub_matrix[movies_to_rate].notnull().sum(), columns = ['num_ratings'])
    # Get the weighted sum of the ratings
    movie_ratings = pd.DataFrame(ratings_sub_matrix[movies_to_rate].sum(), columns = ['rating'])
    movie_ratings = pd.merge(movie_ratings, num_ratings, left_index = True, right_index = True)
    
    # Get the relative confidence score, the more total ratings, the more confident we can be in the score
    movie_ratings['relative_confidence'] = np.log(1 + movie_ratings['num_ratings'])
    # Weight the rating with the confidence
    movie_ratings['weighted_rating'] = movie_ratings['rating'] * movie_ratings['relative_confidence']
    
    movie_ratings = movie_ratings.sort_values(by='weighted_rating', ascending = False)
    
    return movie_ratings

In [62]:
get_nearest_neighbors(user_id = 1)

Unnamed: 0,rating,num_ratings,relative_confidence,weighted_rating
318,4.694323,19,2.995732,14.062935
527,4.764767,15,2.772589,13.210738
2571,4.661323,15,2.772589,12.923933
2959,4.551260,16,2.833213,12.894689
356,4.451967,16,2.833213,12.613372
...,...,...,...,...
169,1.000000,1,0.693147,0.693147
1760,0.500000,1,0.693147,0.346574
86644,0.500000,1,0.693147,0.346574
130634,0.500000,1,0.693147,0.346574


# Function for Predicting Nearest Neighbours

In [63]:
my_ratings = ratings_matrix.loc[user_id].dropna().reset_index()
my_ratings.columns = ['movieId', 'rating']
my_ratings = my_ratings.to_dict(orient = 'records')

In [64]:
my_ratings

[{'movieId': 1, 'rating': 4.0},
 {'movieId': 3, 'rating': 4.0},
 {'movieId': 6, 'rating': 4.0},
 {'movieId': 47, 'rating': 5.0},
 {'movieId': 50, 'rating': 5.0},
 {'movieId': 70, 'rating': 3.0},
 {'movieId': 101, 'rating': 5.0},
 {'movieId': 110, 'rating': 4.0},
 {'movieId': 151, 'rating': 5.0},
 {'movieId': 157, 'rating': 5.0},
 {'movieId': 163, 'rating': 5.0},
 {'movieId': 216, 'rating': 5.0},
 {'movieId': 223, 'rating': 3.0},
 {'movieId': 231, 'rating': 5.0},
 {'movieId': 235, 'rating': 4.0},
 {'movieId': 260, 'rating': 5.0},
 {'movieId': 296, 'rating': 3.0},
 {'movieId': 316, 'rating': 3.0},
 {'movieId': 333, 'rating': 5.0},
 {'movieId': 349, 'rating': 4.0},
 {'movieId': 356, 'rating': 4.0},
 {'movieId': 362, 'rating': 5.0},
 {'movieId': 367, 'rating': 4.0},
 {'movieId': 423, 'rating': 3.0},
 {'movieId': 441, 'rating': 4.0},
 {'movieId': 457, 'rating': 5.0},
 {'movieId': 480, 'rating': 4.0},
 {'movieId': 500, 'rating': 3.0},
 {'movieId': 527, 'rating': 5.0},
 {'movieId': 543, 'rati

In [65]:
my_ratings = [{'movieId': 356, 'title': 'Forrest Gump (1994)', 'movies_count': 329, 'rating': 4},
 {'movieId': 296, 'title': 'Pulp Fiction (1994)', 'movies_count': 307, 'rating': 5},
 {'movieId': 593,
  'title': 'Silence of the Lambs, The (1991)',
  'movies_count': 279, 'rating': 5},
 {'movieId': 2959, 'title': 'Fight Club (1999)', 'movies_count': 218, 'rating': 5},
 {'movieId': 2858, 'title': 'American Beauty (1999)', 'movies_count': 204, 'rating': 5},
 {'movieId': 858, 'title': 'Godfather, The (1972)', 'movies_count': 192, 'rating': 5},
 {'movieId': 1265, 'title': 'Groundhog Day (1993)', 'movies_count': 143, 'rating': 3},
 {'movieId': 79132, 'title': 'Inception (2010)', 'movies_count': 143, 'rating': 4},
 {'movieId': 1197, 'title': 'Princess Bride, The (1987)', 'movies_count': 142, 'rating': 4},
 {'movieId': 1704, 'title': 'Good Will Hunting (1997)', 'movies_count': 141, 'rating': 4.5},
 {'movieId': 1136,
  'title': 'Monty Python and the Holy Grail (1975)',
  'movies_count': 136, 'rating': 3},
 {'movieId': 293,
  'title': 'Léon: The Professional (a.k.a. The Professional) (Léon) (1994)',
  'movies_count': 133, 'rating' : 5},
 {'movieId': 1089, 'title': 'Reservoir Dogs (1992)', 'movies_count': 131, 'rating': 4},
 {'movieId': 1221,
  'title': 'Godfather: Part II, The (1974)',
  'movies_count': 129, 'rating': 5},
 {'movieId': 1682, 'title': 'Truman Show, The (1998)', 'movies_count': 125, 'rating': 4.5},
 {'movieId': 1206, 'title': 'Clockwork Orange, A (1971)', 'movies_count': 120, 'rating': 3},
 {'movieId': 4963, 'title': "Ocean's Eleven (2001)", 'movies_count': 119, 'rating': 5},
 {'movieId': 5989, 'title': 'Catch Me If You Can (2002)', 'movies_count': 115, 'rating': 5}]

In [66]:
ratings_matrix.loc[user_id].dropna()

movieId
1       4.0
3       4.0
6       4.0
47      5.0
50      5.0
       ... 
3744    4.0
3793    5.0
3809    4.0
4006    4.0
5060    5.0
Name: 1, Length: 232, dtype: float64

In [67]:
def get_nearest_neighbors_prediction(my_ratings):
    my_ratings = pd.DataFrame([{key:value for key,value in y.items() if key in [
        'movieId','rating']} for y in my_ratings]).set_index('movieId')

    my_ratings = my_ratings['rating']

    # Get a list of all movies rated by the user
    movies_rated = my_ratings.index

    # The total number of movies rated by the user
    num_movies_rated = len(my_ratings)

    #Only consider the users that are not yourself and the movies that the user has rated
    ratings_sub_matrix = ratings_matrix.loc[:,movies_rated]

    # Get the number of matching movies rated by other users
    num_matching_ratings = ratings_sub_matrix.notnull().sum(axis=1)

    # If there are more movies then the minimum support, we want to only consider
    # The users that have the minimum matching movies
    if num_movies_rated >= min_support:
        # Consider only the users greater then the min support
        matching_users = num_matching_ratings[num_matching_ratings >= min_support].index
        ratings_sub_matrix = ratings_sub_matrix.loc[matching_users]

    # Get the difference of movie ratings between the user and all other users
    ratings_diffs = (ratings_sub_matrix - my_ratings)**2
    # Get the Euclidian Distance
    user_dists = np.sqrt(ratings_diffs.sum(axis = 1))
    # Get similarity score
    user_sims = 1 / (1 + user_dists);

    user_sims = pd.DataFrame(user_sims, columns = ['similarity']).sort_values(
        by = 'similarity', ascending = False)
    most_similar_users = user_sims.iloc[0:max_neighbours]

    # Merge the most similar users and their similarity onto their movie ratings
    ratings_sub_matrix = pd.merge(ratings_matrix, most_similar_users, 
                        left_index = True, right_index = True)

    # Remove all movies that have not been rated
    ratings_sub_matrix = ratings_sub_matrix.dropna(axis = 1, how = 'all')

    movies_to_rate = [x for x in ratings_sub_matrix.columns if x != 'similarity']

    for col in movies_to_rate:
        # Weight all of the ratings by the user similarity
        ratings_sub_matrix[col] = ratings_sub_matrix[col] * ratings_sub_matrix['similarity']
        total_weights = ratings_sub_matrix[ratings_sub_matrix[col].notnull()]['similarity'].sum()
        ratings_sub_matrix[col] = ratings_sub_matrix[col]/total_weights

    # Get the total number of ratings
    num_ratings = pd.DataFrame(ratings_sub_matrix[movies_to_rate].notnull().sum(), columns = ['num_ratings'])
    # Get the weighted sum of the ratings
    movie_ratings = pd.DataFrame(ratings_sub_matrix[movies_to_rate].sum(), columns = ['rating'])
    movie_ratings = pd.merge(movie_ratings, num_ratings, left_index = True, right_index = True)

    # Get the relative confidence score, the more total ratings, the more confident we can be in the score
    movie_ratings['relative_confidence'] = np.log(1 + movie_ratings['num_ratings'])
    # Weight the rating with the confidence
    movie_ratings['weighted_rating'] = movie_ratings['rating'] * movie_ratings['relative_confidence']

    movie_ratings = movie_ratings.sort_values(by='weighted_rating', ascending = False)
    
    return movie_ratings

In [68]:
movie_ratings = get_nearest_neighbors_prediction(my_ratings)

In [69]:
pd.merge(movie_ratings, movies, left_index = True, right_index = True)[0:40]

Unnamed: 0,rating,num_ratings,relative_confidence,weighted_rating,movieId,title,genre_no_genres_listed,genre_action,genre_adventure,genre_animation,...,genre_film_noir,genre_horror,genre_imax,genre_musical,genre_mystery,genre_romance,genre_sci_fi,genre_thriller,genre_war,genre_western
296,4.83201,23,3.178054,15.356387,338,Virtuosity (1995),False,True,False,False,...,False,False,False,False,False,False,True,True,False,False
593,4.872475,18,2.944439,14.346706,735,Cemetery Man (Dellamorte Dellamore) (1994),False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
318,4.669991,20,3.044522,14.217891,360,I Love Trouble (1994),False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
356,4.229635,20,3.044522,12.87722,412,"Age of Innocence, The (1993)",False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
527,4.149405,21,3.091042,12.825987,616,"Aristocats, The (1970)",False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
858,5.0,10,2.397895,11.989476,1129,Escape from New York (1981),False,True,True,False,...,False,False,False,False,False,False,True,True,False,False
50,4.748835,11,2.484907,11.800411,55,Georgia (1995),False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2571,4.417652,13,2.639057,11.658437,3439,Teenage Mutant Ninja Turtles II: The Secret of...,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
110,4.204665,15,2.772589,11.657805,128,Jupiter's Wife (1994),False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
457,4.283814,14,2.70805,11.600783,522,Romper Stomper (1992),False,True,False,False,...,False,False,False,False,False,False,False,False,False,False


In [70]:
pd.merge(movie_ratings, movies, left_index = True, right_index = True)[0:40]

Unnamed: 0,rating,num_ratings,relative_confidence,weighted_rating,movieId,title,genre_no_genres_listed,genre_action,genre_adventure,genre_animation,...,genre_film_noir,genre_horror,genre_imax,genre_musical,genre_mystery,genre_romance,genre_sci_fi,genre_thriller,genre_war,genre_western
296,4.83201,23,3.178054,15.356387,338,Virtuosity (1995),False,True,False,False,...,False,False,False,False,False,False,True,True,False,False
593,4.872475,18,2.944439,14.346706,735,Cemetery Man (Dellamorte Dellamore) (1994),False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
318,4.669991,20,3.044522,14.217891,360,I Love Trouble (1994),False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
356,4.229635,20,3.044522,12.87722,412,"Age of Innocence, The (1993)",False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
527,4.149405,21,3.091042,12.825987,616,"Aristocats, The (1970)",False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
858,5.0,10,2.397895,11.989476,1129,Escape from New York (1981),False,True,True,False,...,False,False,False,False,False,False,True,True,False,False
50,4.748835,11,2.484907,11.800411,55,Georgia (1995),False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2571,4.417652,13,2.639057,11.658437,3439,Teenage Mutant Ninja Turtles II: The Secret of...,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
110,4.204665,15,2.772589,11.657805,128,Jupiter's Wife (1994),False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
457,4.283814,14,2.70805,11.600783,522,Romper Stomper (1992),False,True,False,False,...,False,False,False,False,False,False,False,False,False,False


# Get Predicted Rating for Every User

In [71]:
user_id = 1

In [72]:
my_ratings = ratings_matrix.loc[user_id].dropna().reset_index()
my_ratings.columns = ['movieId', 'rating']
my_ratings = my_ratings.to_dict(orient = 'records')

In [73]:
my_ratings

[{'movieId': 1, 'rating': 4.0},
 {'movieId': 3, 'rating': 4.0},
 {'movieId': 6, 'rating': 4.0},
 {'movieId': 47, 'rating': 5.0},
 {'movieId': 50, 'rating': 5.0},
 {'movieId': 70, 'rating': 3.0},
 {'movieId': 101, 'rating': 5.0},
 {'movieId': 110, 'rating': 4.0},
 {'movieId': 151, 'rating': 5.0},
 {'movieId': 157, 'rating': 5.0},
 {'movieId': 163, 'rating': 5.0},
 {'movieId': 216, 'rating': 5.0},
 {'movieId': 223, 'rating': 3.0},
 {'movieId': 231, 'rating': 5.0},
 {'movieId': 235, 'rating': 4.0},
 {'movieId': 260, 'rating': 5.0},
 {'movieId': 296, 'rating': 3.0},
 {'movieId': 316, 'rating': 3.0},
 {'movieId': 333, 'rating': 5.0},
 {'movieId': 349, 'rating': 4.0},
 {'movieId': 356, 'rating': 4.0},
 {'movieId': 362, 'rating': 5.0},
 {'movieId': 367, 'rating': 4.0},
 {'movieId': 423, 'rating': 3.0},
 {'movieId': 441, 'rating': 4.0},
 {'movieId': 457, 'rating': 5.0},
 {'movieId': 480, 'rating': 4.0},
 {'movieId': 500, 'rating': 3.0},
 {'movieId': 527, 'rating': 5.0},
 {'movieId': 543, 'rati

In [74]:
my_predicted_ratings_matrix = get_nearest_neighbors_prediction(my_ratings)

In [76]:
my_predicted_ratings_matrix['movieId'] = my_predicted_ratings_matrix.index

In [77]:
my_predicted_ratings_matrix['userId'] = user_id

In [81]:
predicted_ratings = []

for current_user in users['userId'].values:
    print(current_user)
    my_predicted_ratings_matrix = get_nearest_neighbors_prediction(my_ratings)
    my_predicted_ratings_matrix['movieId'] = my_predicted_ratings_matrix.index
    my_predicted_ratings_matrix['userId'] = current_user
    predicted_ratings.append(my_predicted_ratings_matrix)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


In [84]:
predicted_ratings = pd.concat(predicted_ratings)

In [86]:
ratings

Unnamed: 0,id,userId,movieId,rating,timestamp
0,1,1,1,4.0,2000-07-30 14:45:03
1,2,1,3,4.0,2000-07-30 14:20:47
2,3,1,6,4.0,2000-07-30 14:37:04
3,4,1,47,5.0,2000-07-30 15:03:35
4,5,1,50,5.0,2000-07-30 14:48:51
...,...,...,...,...,...
100831,100832,610,166534,4.0,2017-05-03 17:53:22
100832,100833,610,168248,5.0,2017-05-03 18:21:31
100833,100834,610,168250,5.0,2017-05-08 15:50:47
100834,100835,610,168252,5.0,2017-05-03 17:19:12


In [85]:
predicted_ratings

Unnamed: 0,rating,num_ratings,relative_confidence,weighted_rating,movieId,userId
318,4.694323,19,2.995732,14.062935,318,1
527,4.794772,16,2.833213,13.584613,527,1
2571,4.702609,16,2.833213,13.323495,2571,1
2959,4.608082,17,2.890372,13.319069,2959,1
79132,4.347668,17,2.890372,12.566377,79132,1
...,...,...,...,...,...,...
4131,1.000000,1,0.693147,0.693147,4131,610
130634,0.500000,1,0.693147,0.346574,130634,610
1760,0.500000,1,0.693147,0.346574,1760,610
167296,0.500000,1,0.693147,0.346574,167296,610
