In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tests as t
import progressbar
from scipy.sparse import csr_matrix
from IPython.display import HTML


%matplotlib inline

# Load the cleaned movie and review tables; only the first 2000 reviews are kept.
# The 'Unnamed: 0' column is dropped from both frames right after loading.
movies = pd.read_csv('./Datasets/movies_clean.csv').drop(columns='Unnamed: 0')
reviews = (
    pd.read_csv('./Datasets/reviews_clean.csv')
    .head(2000)
    .drop(columns='Unnamed: 0')
)

print(reviews.head())

   user_id  movie_id  rating   timestamp            date_time
0        1    114508       8  1381006850  2013-10-06 02:30:50
1        2    358273       9  1579057827  2020-01-15 08:40:27
2        2  10039344       5  1578603053  2020-01-10 02:20:53
3        2   6751668       9  1578955697  2020-01-14 04:18:17
4        2   7131622       8  1579559244  2020-01-21 03:57:24


In [32]:
# Keep only the three columns used to build the user-by-movie rating matrix
rating_columns = ['user_id', 'movie_id', 'rating']
user_items = reviews[rating_columns]
user_items.head()

Unnamed: 0,user_id,movie_id,rating
0,1,114508,8
1,2,358273,9
2,2,10039344,5
3,2,6751668,9
4,2,7131622,8


In [33]:
# Collapse to one rating per (user_id, movie_id) pair, keeping the maximum
# rating when a pair occurs more than once ...
max_rating_per_pair = user_items.groupby(['user_id', 'movie_id'])['rating'].max()
# ... then pivot movie_id into columns: rows are users, columns are movies,
# and cells are NaN where the user has no rating for the movie.
user_by_movie = max_rating_per_pair.unstack()
user_by_movie.head()

movie_id,33477,37059,38650,39628,40522,41716,42332,45152,45591,45607,...,8579674,8632862,8772262,9086228,9243946,9495224,9541602,10039344,10367276,10648440
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,10.0,,,,,,,5.0,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [153]:
def movies_watched(user_id):
    """Return the movie ids that ``user_id`` has a rating for.

    Parameters
    ----------
    user_id : label of a row in the module-level ``user_by_movie`` frame.

    Returns
    -------
    numpy.ndarray
        Column labels (movie ids) of ``user_by_movie`` whose entry in the
        user's row is not null.
    """
    # Look the row up once, drop the unrated (NaN) movies, keep the labels
    user_row = user_by_movie.loc[user_id]
    return user_row.dropna().index.values

In [154]:
def create_user_movie_dict():
    """Map every user in ``user_by_movie`` to the movie ids they have rated.

    Returns
    -------
    dict
        ``{user_id: numpy.ndarray of movie ids}`` with one entry per row label
        of the module-level ``user_by_movie`` frame, as returned by
        ``movies_watched``.
    """
    # Iterate over the real row labels instead of range(1, n_users + 1):
    # user ids are not guaranteed to be the contiguous integers 1..n, and a
    # gap would raise KeyError in movies_watched (or silently skip ids > n).
    user_ids = user_by_movie.index
    n_users = len(user_ids)

    user_id_movies = dict()

    bar = progressbar.ProgressBar(maxval=n_users+1, widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
    bar.start()

    for cnter, user_id in enumerate(user_ids, start=1):
        user_id_movies[user_id] = movies_watched(user_id)
        bar.update(cnter)

    bar.finish()

    return user_id_movies
        
# Build the user -> rated-movie-ids lookup once; the filtering step below reuses it
movies_seen = create_user_movie_dict()    



In [39]:
def create_movies_to_analyze(movies_seen, lower_bound):
    """Keep only the users who rated more than ``lower_bound`` movies.

    Parameters
    ----------
    movies_seen : dict
        Maps user_id to a sized collection of movie ids.
    lower_bound : int
        Exclusive threshold; a user is kept when ``len(movies) > lower_bound``.

    Returns
    -------
    dict
        New dict holding the entries of ``movies_seen`` that pass the filter,
        in their original order.
    """
    return {
        user_id: movies
        for user_id, movies in movies_seen.items()
        if len(movies) > lower_bound
    }

# Keep only users with more than 2 rated movies (the bound is exclusive)
movies_to_analyze = create_movies_to_analyze(movies_seen, 2)

In [40]:
# Inspect the filtered mapping (user_id -> array of rated movie ids)
movies_to_analyze

{2: array([  358273,  6751668,  7131622,  7975244,  7984734,  8579674,
        10039344], dtype=int64),
 3: array([ 790636, 1800241, 2278871, 2395417, 3344922], dtype=int64),
 4: array([ 267626, 1343092, 1477855, 1920849, 2024432, 2084970], dtype=int64),
 5: array([2884206, 3040964, 5022702], dtype=int64),
 6: array([1800241, 2378281, 2980516], dtype=int64),
 7: array([1001508, 1142988, 1292566, 1355644, 1441953, 1454029, 1878870,
        2250912, 2543164, 3783958], dtype=int64),
 11: array([ 481141, 1282140, 2388771, 4477536, 5213744, 6450804, 6957966,
        7286456, 8201170, 8579674], dtype=int64),
 12: array([1478839, 7343762, 8367814, 8579674], dtype=int64),
 15: array([1386697, 2277860, 3076658, 4196776], dtype=int64),
 18: array([ 110076, 1188982, 2574698], dtype=int64),
 19: array([1843866, 1872181, 2103281, 2235779, 3322940], dtype=int64),
 21: array([ 770828, 1210819, 1663662, 1670345, 2334879], dtype=int64),
 27: array([  73486,   75314,  105323,  119114,  120620,  146838, 

In [127]:
def compute_corr_pearson(user1, user2):
    """Distance between two users over the movies both of them rated.

    Note: despite the name, the value returned is the Euclidean norm of the
    difference between the two users' ratings, not a Pearson correlation.

    Parameters
    ----------
    user1, user2 : keys of the module-level ``movies_to_analyze`` dict (and
        row labels of ``user_by_movie``).

    Returns
    -------
    float or None
        ``np.linalg.norm`` of the rating difference on the shared movies, or
        ``None`` when the two users share no movie.
    """
    shared = np.intersect1d(
        movies_to_analyze[user1],
        movies_to_analyze[user2],
        assume_unique=True,
    )

    # No overlap: there is nothing to compare
    if len(shared) == 0:
        return None

    ratings1 = user_by_movie.loc[user1, shared]
    ratings2 = user_by_movie.loc[user2, shared]
    return np.linalg.norm(ratings1 - ratings2)

In [128]:
# Collect one record per ordered user pair that shares at least one movie.
# Pairs of a user with itself are included by this double loop.
corr_df = []
corr = 0
for user1 in movies_to_analyze:
    for user2 in movies_to_analyze:
        dist = compute_corr_pearson(user1, user2)
        if dist is None:
            # The helper returns None when the users have no movie in common
            continue
        corr_df.append(dict(user1=user1, user2=user2, euc_dist=dist))

In [132]:
# One row per recorded pair, with columns user1, user2 and euc_dist
df_dist = pd.DataFrame(corr_df)

In [183]:
# Number of movies user 61 has a (non-null) rating for
user_by_movie.loc[61].count()

121

In [141]:
def find_closest_distance(user):
    """Return the other users paired with ``user``, closest first.

    Parameters
    ----------
    user : value to look up in the ``user1`` column of the module-level
        ``df_dist`` frame.

    Returns
    -------
    numpy.ndarray
        ``user2`` values of the rows where ``user1 == user``, ordered by
        ascending ``euc_dist``; ``user`` itself is never included. Empty when
        ``user`` has no recorded pair.
    """
    # Drop the self-pair explicitly. Skipping "the first sorted row" is only
    # right when the self-pair sorts first, which fails as soon as another
    # user is also at distance 0: the user would then be returned as their
    # own neighbour and a genuine neighbour would be lost.
    is_neighbor = (df_dist['user1'] == user) & (df_dist['user2'] != user)
    # A stable sort keeps tied distances in a deterministic (original) order
    ranked = df_dist[is_neighbor].sort_values(by='euc_dist', kind='mergesort')
    return np.array(ranked['user2'])
# Example: print the neighbour ids computed for user 11
print(find_closest_distance(11))

[41 74 12 91  2 61 95]


In [178]:
def movies_liked(user_id, min_rating=7):
    """Return the ids of the movies ``user_id`` rated at least ``min_rating``.

    Parameters
    ----------
    user_id : value matched against the ``user_id`` column of the
        module-level ``user_items`` frame.
    min_rating : inclusive rating threshold (default 7).

    Returns
    -------
    numpy.ndarray
        ``movie_id`` values of the matching rows, in frame order.
    """
    # '@name' inside the query string refers to the local Python variables
    liked_rows = user_items.query('user_id == @user_id and rating >= @min_rating')
    liked_ids = liked_rows['movie_id']
    return np.array(liked_ids)
# Example: movies user 11 rated at or above the default min_rating of 7
movies_liked(11)

array([1282140, 2388771, 4477536, 6450804, 7286456, 8201170, 8579674],
      dtype=int64)

In [152]:
def movie_names(movie_ids):
    """Look up the titles of the given movie ids.

    Parameters
    ----------
    movie_ids : list-like of values matched against the ``movie_id`` column
        of the module-level ``movies`` frame.

    Returns
    -------
    list
        ``movie`` column values of the matching rows. The order follows the
        rows of ``movies``, not the order of ``movie_ids``; ids that are not
        present in ``movies`` are silently omitted.
    """
    is_requested = movies['movie_id'].isin(movie_ids)
    return movies.loc[is_requested, 'movie'].tolist()
# Example: titles of the movies returned by movies_liked for user 11
movie_names(movies_liked(11))

['Easy A (2010)',
 'Mowgli (2018)',
 'Fifty Shades Freed (2018)',
 'Terminator: Dark Fate (2019)',
 'Joker (2019)',
 'The Perfect Date (2019)',
 '1917 (2019)']

In [184]:
def make_recommendations(user, num_recs=10):
    """Recommend movies liked by the user's nearest neighbours.

    Neighbours are visited in the order returned by ``find_closest_distance``.
    Each neighbour contributes the movies from ``movies_liked`` that ``user``
    has not rated yet, and the walk stops once at least ``num_recs``
    candidate ids have been collected.

    Parameters
    ----------
    user : user id to recommend for.
    num_recs : int, default 10
        Maximum number of titles to return.

    Returns
    -------
    list
        At most ``num_recs`` titles as produced by ``movie_names``.
    """
    # Movies the user already rated (we don't want to recommend these).
    # Named so it does not shadow the module-level ``movies_seen`` dict.
    already_seen = movies_watched(user)
    closest_neighbors = find_closest_distance(user)

    # Candidate movie ids; an integer dtype keeps the ids from being upcast
    # to float on the first concatenation
    recs = np.array([], dtype=int)

    # Go through the neighbours and collect movies they like that the user
    # hasn't seen
    for neighbor in closest_neighbors:
        neighbs_likes = movies_liked(neighbor)

        # movies_liked reads the raw ratings and may repeat a movie id, so
        # uniqueness must not be assumed here
        new_recs = np.setdiff1d(neighbs_likes, already_seen)

        # Merge into the running candidate set (np.unique also sorts by id)
        recs = np.unique(np.concatenate([new_recs, recs], axis=0))

        # If we have enough recommendations exit the loop
        if len(recs) >= num_recs:
            break

    # Pull movie titles using movie ids and honour the requested size
    # (previously the result was always cut at 10, whatever num_recs was)
    recommendations = movie_names(recs)

    return recommendations[:num_recs]

In [185]:
# Example: recommendations for user 2 with the default num_recs
make_recommendations(2)

['The Deer Hunter (1978)',
 'Fahrenheit 451 (2018)',
 'A Walk Among the Tombstones (2014)',
 'Jurassic World (2015)',
 'Shazam! (2019)',
 'Ex Machina (2014)',
 'World War Z (2013)',
 'Lone Survivor (2013)',
 'Cold in July (2014)',
 'Broken City (2013)']

In [186]:
def all_recs(num_recs=10):
    """Build recommendations for every user listed in ``df_dist``.

    Parameters
    ----------
    num_recs : int, default 10
        Forwarded unchanged to ``make_recommendations``.

    Returns
    -------
    dict
        Maps each distinct ``user1`` value of the module-level ``df_dist``
        frame to the list returned by ``make_recommendations`` for that user.
    """
    users = np.unique(df_dist['user1'])
    total_users = users.shape[0]

    progress = progressbar.ProgressBar(
        maxval=total_users+1,
        widgets=[progressbar.Bar('=','[',']'),' ', progressbar.Percentage()],
    )
    progress.start()

    recs_by_user = {}
    for position, user in enumerate(users, start=1):
        # The bar is advanced before the recommendations are computed
        progress.update(position)
        recs_by_user[user] = make_recommendations(user, num_recs)

    progress.finish()
    return recs_by_user

In [187]:
# Compute recommendations for every user (default num_recs=10) and display the dict
all_recs()



{2: ['The Deer Hunter (1978)',
  'Fahrenheit 451 (2018)',
  'A Walk Among the Tombstones (2014)',
  'Jurassic World (2015)',
  'Shazam! (2019)',
  'Ex Machina (2014)',
  'World War Z (2013)',
  'Lone Survivor (2013)',
  'Cold in July (2014)',
  'Broken City (2013)'],
 3: ['The Godfather: Part II (1974)',
  'Alien (1979)',
  'Aliens (1986)',
  'The Fly (1986)',
  'A Few Good Men (1992)',
  'Heat (1995)',
  'Meet Joe Black (1998)',
  'Rush Hour (1998)',
  'Fight Club (1999)',
  'Sleepy Hollow (1999)'],
 4: ['The Godfather (1972)',
  'The Godfather: Part II (1974)',
  'The Godfather: Part III (1990)',
  'Beauty and the Beast (1991)',
  'The Prestige (2006)',
  'World War Z (2013)',
  'Prisoners (2013)',
  'Despicable Me 2 (2013)',
  "We're the Millers (2013)",
  'Pompeii (2014)'],
 5: ['Rebel Without a Cause (1955)',
  'What Ever Happened to Baby Jane? (1962)',
  'Westworld (1973)',
  'Star Wars (1977)',
  'The Shining (1980)',
  'An American Werewolf in London (1981)',
  'Vacation (1983)