## Collaborative Filtering Recommendation System

In [15]:
# This is based on the idea that similar people based on the data have similar preferences.
#  It predicts which item a user will like based on the item preferences of other similar users. This information is gotten from explicit feedback, from ratings or from implicite feedback eg listening purchasing and watching.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Filtering warnings
warnings.simplefilter(action="ignore",category=FutureWarning)

In [16]:
# Loading the ratings data
ratings = pd.read_csv("https://s3-us-west-2.amazonaws.com/recommender-tutorial/ratings.csv")
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [17]:
# Loading the movies dataset
movies = pd.read_csv("https://s3-us-west-2.amazonaws.com/recommender-tutorial/movies.csv")
movies.head()  

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [18]:
# Getting all the ratings
n_ratings=len(ratings['rating'])
print(f"The number of ratings are: {n_ratings}")

The number of ratings are: 100836


In [19]:
# Getting the unique ratings
n_ratings_unique=ratings['rating'].unique()
sorted(n_ratings_unique.tolist())

[0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]

In [20]:
# The number of Unique movies
n_movies=len(ratings['movieId'].unique())
print(f"The number of unique movies is :{n_movies}")
# Number of unique users
n_users=len(ratings['userId'].unique())
print(f"The number of unique users is: {n_users}")

The number of unique movies is :9724
The number of unique users is: 610


In [21]:
# Getting the average ratings per user
avg_ratings=n_ratings/n_users
print(f"The average number of ratings per user is: {round(avg_ratings,2)}")
# Getting the average movie ratings per movie
avg_ratings_movie=n_ratings/n_movies
print(f"The average number of ratings per movie is: {round(avg_ratings_movie,2)}")

The average number of ratings per user is: 165.3
The average number of ratings per movie is: 10.37


In [22]:
# Assessing user frequency
user_freq=ratings[["userId","movieId"]].groupby("userId").count().reset_index()
user_freq.head()

Unnamed: 0,userId,movieId
0,1,232
1,2,29
2,3,39
3,4,216
4,5,44


In [23]:
# Assessing movie ratings
movie_ratings=ratings.groupby('movieId')[['rating']].mean()
movie_ratings
# Getting the lowest rated movie
lowest_rate=movie_ratings['rating'].idxmin()
l_movies=movies[movies['movieId']==lowest_rate]
l_movies

Unnamed: 0,movieId,title,genres
2689,3604,Gypsy (1962),Musical


In [24]:
# Highest rated movie
# Getting the lowest rated movie
highest_rate=movie_ratings['rating'].idxmax()
h_movies=movies[movies['movieId']==highest_rate]
h_movies

Unnamed: 0,movieId,title,genres
48,53,Lamerica (1994),Adventure|Drama


In [25]:
# Assessing the users who rated the movies
movie_ratings_user=ratings.groupby('movieId')[['rating','userId']].mean()
movie_ratings_user.head()
# User that rated the lowest rated movie
movie_lowest=movie_ratings_user['rating'].idxmin()
# Getting the nameof the user who rated the lowest rated movie
lowest_user=movie_ratings_user[movie_ratings_user.index==movie_lowest]
lowest_user


Unnamed: 0_level_0,rating,userId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
3604,0.5,89.0


In [26]:
# User that rated the highest rated movie
movie_highest=movie_ratings_user['rating'].idxmax()
# Getting the name of the user who rated the highest rated movie
highest_user=movie_ratings_user[movie_ratings_user.index==movie_highest]
highest_user 

Unnamed: 0_level_0,rating,userId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
53,5.0,344.0


In [27]:
def creating_similarity_matrix(df):
    # # Building the  user item matrix by virtue of movies and users
    N=len(df['userId'].unique())
    # Mapping users(Ids) to indices. This is like giving each user an index
    user_mapper= dict(zip(np.unique(df["userId"]),list(range(N))))
    user_mapper
    # Movie mapper. Unique movies devoid of duplication
    M=len(df['movieId'].unique())
    # mapping movie ids to indices
    movie_mapper=dict(zip(np.unique(df['movieId']),list(range(M))))
    movie_mapper
    # # A zip function creates an iterator that will aggregate elements from two or more iterables,you  can create a list or a dictionary from it
    # # HAVING AN INVERSE OF THE SAME
    # # Mapping indices to user Ids
    inv_user_mapper=dict(zip(list(range(N)),np.unique(df['userId'])))
    # Mapping indices to movie Ids
    inv_movie_mapper=dict(zip(list(range(M)),np.unique(df['movieId'])))
    # Iterating through each index for each movie to access the movie indexes and user indexes
    user_index=[user_mapper[i] for i in df['userId']]
    movie_index=[movie_mapper[i] for i in df['movieId']]
    # Forming a matrix using the csr format which will map the movie ID's with the user Ids combined with their ratings .
    # The shape of this matrix will be  9724 by 610 (as specified by the scipy csr function)
    from scipy.sparse import csr_matrix
    # Forming the matrix with the ratings as the data (for individual movie and user ratings), the row index as the movie index and the column index as the user index
    X=csr_matrix((df['rating'], (movie_index, user_index)), shape=(M, N))
    # Results
    return X,user_mapper,movie_mapper,inv_user_mapper,inv_movie_mapper
# Calling the function
X,user_mapper,movie_mapper,inv_user_mapper,inv_movie_mapper=creating_similarity_matrix(ratings)

### Building the mock recomender system (Using item based Collaborative Filtering)

In [28]:
def similar_movies(movie_id,k,X):
    # Using Knearest neighbours to look for similar movies based on the ratings
    from sklearn.neighbors import NearestNeighbors
    # Giving each movie an index(identifier)
    movie_index=movie_mapper[movie_id]
    # Granting the movie indexes a vector based on the Sparse matrix linking each movie to a rating to identify movies with similar ratings
    movie_vector=X[movie_index]
     # Creating an empty list so as to store each similar movie for every value of k
    neighbors_list=[]
    k+=1
    # Using KNN to indentify the movies neighbours based on the vectors from the sparse matrix X
    KNN=NearestNeighbors(n_neighbors=k,algorithm="brute",metric='cosine')
    KNN.fit(X)
    # Coverting the movie_vector array to a one dimensional array
    movie_vector.reshape(1,-1)
    # Identifying the neighbours from the nearest neighbours
    the_neighbours=KNN.kneighbors(movie_vector,return_distance=False)
    the_neighbours
       # Iterating through the values of K to add the movie indices to the list
    for i in range(0,k):
        the_items=the_neighbours.item(i)
        neighbors_list.append(inv_movie_mapper[the_items])
    neighbors_list.pop(0)
    return neighbors_list
# Getting the names of those movies based on their indexes
movie_title=dict(zip(movies['movieId'],movies['title']))
movie_title
# Calling the function to get the movie Ids Similar to the movie index
movie_id=2
similar_ids=similar_movies(movie_id,10,X)
print(f"Since you watched the movie:{movie_title[movie_id]}")
print("+++++++++++++++++++++++++++++")
print("You might also consider watching: ")
# # Iterating through the similar ids to get similar movies
for x in similar_ids:
    print(movie_title[x])

Since you watched the movie:Jumanji (1995)
+++++++++++++++++++++++++++++
You might also consider watching: 
Lion King, The (1994)
Mrs. Doubtfire (1993)
Mask, The (1994)
Jurassic Park (1993)
Home Alone (1990)
Nightmare Before Christmas, The (1993)
Aladdin (1992)
Beauty and the Beast (1991)
Ace Ventura: When Nature Calls (1995)
Santa Clause, The (1994)


### The Next Phase involves automating the recomender system to multiple movies