In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import seaborn as sns

from collections import Counter

%matplotlib inline

In [3]:
df_movies = pd.read_csv('/content/drive/MyDrive/netflix_prize_data/movies_lens/movie.csv')
df_ratings = pd.read_csv('/content/drive/MyDrive/netflix_prize_data/movies_lens/rating.csv')
df_tags = pd.read_csv('/content/drive/MyDrive/netflix_prize_data/movies_lens/tag.csv')
df_links = pd.read_csv('/content/drive/MyDrive/netflix_prize_data/movies_lens/link.csv')
df_genome_scores = pd.read_csv('/content/drive/MyDrive/netflix_prize_data/movies_lens/genome_scores.csv')
df_genome_tags = pd.read_csv('/content/drive/MyDrive/netflix_prize_data/movies_lens/genome_tags.csv')

# User-User Collaborative Filtering

## Data Preprocessing

In [4]:
#drop the timestamp in rating dataframe
df_ratings.drop('timestamp', axis = 1, inplace = True)

In [5]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


Find the rating count for each user and sort in descending order

In [None]:
df_user_ratings_no = df_ratings.groupby(by=['userId'])['rating'].count().reset_index()

In [None]:
df_user_ratings_no.sort_values(by=['rating'], ascending=False)[:100]

Unnamed: 0,userId,rating
118204,118205,9254
8404,8405,7515
82417,82418,5646
121534,121535,5520
125793,125794,5491
...,...,...
2260,2261,2644
42203,42204,2639
902,903,2608
69792,69793,2608


This collaborative filtering will be worked on 1000 users **(userId 1 to 1000)** and the movies which has **at least 100 rating count** because of the limitation of the computing power

In [6]:
df_ratings_sample = df_ratings[df_ratings['userId'] <= 1000]

In [7]:
df_ratings_sample.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


In [8]:
# find the rating count for each movie
df_ratings_count = df_ratings_sample.groupby('movieId')['rating'].count()

In [9]:
df_ratings_count = pd.merge(df_ratings_sample, df_ratings_count, on = 'movieId', how = 'inner')

In [10]:
df_ratings_count.head()

Unnamed: 0,userId,movieId,rating_x,rating_y
0,1,2,3.5,165
1,5,2,3.0,165
2,13,2,3.0,165
3,29,2,3.0,165
4,34,2,3.0,165


In [16]:
#select the movie which has at lest 100 rating
df_ratings_count = df_ratings_count[df_ratings_count['rating_y'] > 100]

In [12]:
df_ratings_count.drop('rating_y', axis = 1, inplace = True)

In [13]:
df_ratings_count.shape

(52362, 3)

In [14]:
df_ratings_count.rename(columns={'rating_x':'rating'},inplace=True)

## Create user movie martrix

In [18]:
def create_user_movie_ratings_matrix(df):
  '''
    Create the user movie matrix which is further used for collaborative filtering

    INPUT:
    df - pandas dataframe with userid, movieid
    
    OUTPUT:
    user_movie - user movie ratings matrix 
  '''
  user_movie_ratings_matrix = df.groupby(by = ['userId','movieId'])['rating'].max().unstack().fillna(0)

  return user_movie_ratings_matrix.astype(float)

In [19]:
#create matrix
user_movie_ratings_matrix = create_user_movie_ratings_matrix(df_ratings_count)

In [20]:
user_movie_ratings_matrix.head()

movieId,1,2,6,7,10,11,16,17,19,21,25,32,34,36,39,47,48,50,62,70,95,104,110,111,141,150,153,160,161,163,165,172,173,185,186,196,208,223,231,235,...,3996,4011,4022,4027,4034,4226,4306,4878,4886,4896,4963,4973,4993,4995,5349,5378,5418,5445,5816,5952,5989,6016,6333,6365,6377,6539,6711,6874,7153,7361,7438,8360,8636,8961,32587,33794,44191,48516,58559,79132
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,3.5,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,4.0,4.0,0.0,4.0,0.0,3.5,4.0,3.5,0.0,4.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,4.0,5.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,5.0,0.0,4.0,0.0,4.5,4.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,5.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,3.0,0.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,3.0,0.0,4.0,0.0,0.0,0.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,3.0,0.0,0.0,0.0,5.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,2.0,4.0,0.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
def find_similarity(userId1, userId2, user_movie_ratings_matrix=user_movie_ratings_matrix):
  '''
    Calculate similarity of users

    INPUT:
    userId1 - user id 1
    userId 2 - user id 2
    user_moving_ratings_matrix - matrix of user moving with corresponding rating
    
    OUTPUT:
    similarity - similarity scores
  '''
  similarity = np.dot(user_movie_ratings_matrix.loc[userId1, :], user_movie_ratings_matrix.loc[userId2, :])
  
  return similarity

In [27]:
def get_similar_users(userId, user_movie_ratings_matrix=user_movie_ratings_matrix, m=10):
    '''
    INPUT:
    userId - user id
    user_moving_ratings_matrix - matrix of user moving with corresponding rating
    m - number of similar users
    
    OUTPUT:
    users[:m] - top m similar users
    '''
    users = []
    
    for i in user_movie_ratings_matrix.index:
        if i != userId:
            similarity = find_similarity(userId, i)
            users.append((i, similarity))
  
    users.sort(key=lambda x: x[1], reverse=True)
    return users[:m]

Get similar user of userId **156**

In [25]:
similar_users = get_similar_users(156)

In [26]:
similar_users

[(775, 3688.25),
 (741, 3482.75),
 (903, 3290.5),
 (982, 3138.25),
 (91, 3053.5),
 (586, 2996.75),
 (294, 2984.5),
 (58, 2972.5),
 (648, 2857.5),
 (359, 2854.0)]

In [57]:
def get_movies(userid):
    '''
    INPUT:
    userid - (int) a user id
       
    OUTPUT:
    movie_ids - set of movie_ids that the user has already watched
    
    '''
    movie_titles = []

    movie_ids = set(df_ratings[df_ratings['userId'] == userid].sort_values('rating', ascending = False)['movieId'].tolist())

    return movie_ids 

In [34]:
movie_dict = dict(zip(df_movies.movieId, df_movies.title))

def get_movie_titles(movieIds):
  '''
  Get movie titles according to movieIds and return titles

  INPUT :
  movieIds - movieId lists

  OUTPUT :
  movie_titles - list of movie titles according to the movieIds
  '''
  
  movie_titles = []
  for movieId in movieIds:
    if movieId in movie_dict:
      movie_titles.append(movie_dict[movieId])
  return movie_titles

In [60]:
def get_recommendations(userId, df_ratings=df_ratings_sample, m=10):
  '''
  INPUT:
  userId - user id
  df_ratings - userId movieId rating dataframe
  m - number of recommendations

  OUTPUT:
  movies - top m rated movies
  '''
  watched_movie_ids = get_movies(userId)

  similar_users = get_similar_users(userId)
 
  movies = []
  for (uId, _) in similar_users:
    movies.extend(list(get_movies(uId)))

  movies = list(set(movies))
  movies = [movie for movie in movies if movie not in watched_movie_ids]


  return get_movie_titles(movies[:m])

**List of recommended movies for user 755**

In [61]:
get_recommendations(775, 10)

['Waiting to Exhale (1995)',
 'Tom and Huck (1995)',
 'Sudden Death (1995)',
 'Money Train (1995)',
 'Assassins (1995)',
 'Now and Then (1995)',
 'Across the Sea of Time (1995)',
 'It Takes Two (1995)',
 'Cry, the Beloved Country (1995)',
 'Guardian Angel (1994)']

In [62]:
sample_movies_775 = get_movies(775)
get_movie_titles(sample_movies_775)

['Toy Story (1995)',
 'Jumanji (1995)',
 'Grumpier Old Men (1995)',
 'Beyond the Valley of the Dolls (1970)',
 'Hiroshima Mon Amour (1959)',
 'Heat (1995)',
 'Sabrina (1995)',
 'Father of the Bride Part II (1995)',
 'GoldenEye (1995)',
 'American President, The (1995)',
 'Dracula: Dead and Loving It (1995)',
 'Nixon (1995)',
 'Day of the Jackal, The (1973)',
 'Casino (1995)',
 'Sense and Sensibility (1995)',
 'Four Rooms (1995)',
 'Ace Ventura: When Nature Calls (1995)',
 'Cutthroat Island (1995)',
 'Get Shorty (1995)',
 'Copycat (1995)',
 'Powder (1995)',
 'Leaving Las Vegas (1995)',
 'Othello (1995)',
 'Persuasion (1995)',
 'City of Lost Children, The (Cité des enfants perdus, La) (1995)',
 'Dangerous Minds (1995)',
 'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)',
 'Night of the Living Dead (1990)',
 'Babe (1995)',
 'Carrington (1995)',
 'Dead Man Walking (1995)',
 'Clueless (1995)',
 'Richard III (1995)',
 'Dead Presidents (1995)',
 'Restoration (1995)',
 'Mortal Kombat (1995)',
 'To D