# Import libraries

In [99]:
import pandas as pd
import numpy as np 
import sklearn
from sklearn.metrics.pairwise import cosine_similarity

# Read in data files

In [62]:
ratings = pd.read_csv('ratings.dat', sep='::', engine='python', names=['UserId', 'MovieId', 'Rating', 'Timestamp'], header=None)
movies = pd.read_csv('movies.dat', sep='::', engine='python', names=['MovieId', 'Title', 'Genres'], header=None, encoding='latin1')
ratings = pd.merge(ratings, movies, on='MovieId')
ratings.head()

Unnamed: 0,UserId,MovieId,Rating,Timestamp,Title,Genres
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama


# Data Exploration

In [63]:
print(f"\n Size of the movie_df dataset is {ratings.shape}")


 Size of the movie_df dataset is (1000209, 6)


In [64]:
ratings.describe()

Unnamed: 0,UserId,MovieId,Rating,Timestamp
count,1000209.0,1000209.0,1000209.0,1000209.0
mean,3024.512,1865.54,3.581564,972243700.0
std,1728.413,1096.041,1.117102,12152560.0
min,1.0,1.0,1.0,956703900.0
25%,1506.0,1030.0,3.0,965302600.0
50%,3070.0,1835.0,4.0,973018000.0
75%,4476.0,2770.0,4.0,975220900.0
max,6040.0,3952.0,5.0,1046455000.0


Based on the plot, we can see that all users possess at least 20 ratings, which guarantees the accuracy of the CF method

In [65]:
ratings.groupby('UserId')['Rating'].count().sort_values(ascending = True).head()

UserId
947     20
4068    20
2530    20
341     20
5258    20
Name: Rating, dtype: int64

In [66]:
n_users = ratings.UserId.unique().shape[0]
n_movies = ratings.MovieId.unique().shape[0]
print( str(n_users) + ' unqiue users')
print( str(n_movies) + ' unique movies')

6040 unqiue users
3706 unique movies


Now I notice that althouth we have 3706 unique movies, the movies id ranges between 1 and 3952, which indicates that the movieId is not strictly corelated with indices, so further conversion is required before building the interaction matrix

In [67]:
user_to_index = {user_id: index for index, user_id in enumerate(ratings.UserId.unique())}
movie_to_index = {movie_id: index for index, movie_id in enumerate(ratings.MovieId.unique())}
index_to_movie = {index: movie_id for movie_id, index in movie_to_index.items()}

# Create Interaction Matrix

In [68]:
rows = ratings.UserId.map(user_to_index)
cols = ratings.MovieId.map(movie_to_index)
interaction = np.zeros((n_users, n_movies))
for row in ratings.itertuples():
    interaction[user_to_index[row[1]], movie_to_index[row[2]]] = row[3]
print(interaction)

[[5. 3. 3. ... 0. 0. 0.]
 [5. 0. 0. ... 0. 0. 0.]
 [4. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


# Sparsity Exploration

In [69]:
num_nonzero = len(interaction.nonzero()[0])
print(num_nonzero)
total_entries = interaction.shape[0] * interaction.shape[1]
sparsity = 1 - (num_nonzero / total_entries)

print(f"Sparsity of the interaction matrix: {sparsity:.4f}")

1000209
Sparsity of the interaction matrix: 0.9553


I noticed that the sparsity is ranther high for the dataset, however in the first trial we are going to use cosine similarity directly on this matrix without further transformation such as dimensionality reduction or embedding

# Create Similarity Matrix

In [70]:
rating_cosine_similarity = cosine_similarity(interaction)

In [71]:
rating_cosine_similarity[0]

array([1.        , 0.09638153, 0.11030329, ..., 0.        , 0.        ,
       0.        ])

# Design Recommender Function

In [93]:
def movie_recommender(user_item_m, X_user, user, k=10, top_n=10):
    # Get the location of the actual user in the User-Items matrix
    # Use it to index the User similarity matrix
    assert 6040 >= user >= 1
    user_index = user_to_index[user]
    user_similarities = X_user[user_index]
    # obtain the indices of the top k most similar users
    most_similar_users = user_item_m.index[user_similarities.argpartition(-k)[-k:]]
    # Obtain the mean ratings of those users for all movies
    rec_movies = user_item_m.loc[most_similar_users].mean(0).sort_values(ascending=False)
    # Discard already seen movies
    m_seen_movies = user_item_m.loc[user_index].gt(0)
    seen_movies = m_seen_movies.index[m_seen_movies].tolist()
    rec_movies = rec_movies.drop(seen_movies).head(top_n)
    # return recommendations - top similar users rated movies
    rec_movies_a=rec_movies.index.to_frame().reset_index(drop=True)
    rec_movies_a.rename(columns={rec_movies_a.columns[0]: 'MovieId'}, inplace=True)
    # since we use index to represent movie Id before, we need to convert indices back to movieId
    rec_movies_a['MovieId'] = rec_movies_a['MovieId'].map(index_to_movie)
    return rec_movies_a

# Recommend movie

In [73]:
interaction_df=pd.DataFrame(interaction)

In [98]:
user_ID=1
rec_movie_df = movie_recommender(interaction_df, rating_cosine_similarity,user_ID)
rec_movie_df

Unnamed: 0,MovieId
0,2081
1,2078
2,364
3,2096
4,1282
5,596
6,2085
7,593
8,2137
9,34


# Design a wrapper function

Use the wrapper function to automate the recommending sequence such that user only need to provide their name and the system is able to recommend movies based on user-based similarity

In [94]:
def movie_recommender_run(user_Id):
    rec_movie_df = movie_recommender(interaction_df, rating_cosine_similarity,user_Id)
    rec_movie_df = pd.merge(rec_movie_df, ratings, on='MovieId')
    rec_movies = rec_movie_df.drop_duplicates(subset=['Title'])['Title']
    return rec_movies

In [96]:
movie_recommender_run(6040)

0              Lawrence of Arabia (1962)
831         To Kill a Mockingbird (1962)
1759             Bonnie and Clyde (1967)
2445    Manchurian Candidate, The (1962)
3210                   Roger & Me (1989)
4008           African Queen, The (1951)
5065                  Being There (1979)
5656            American Graffiti (1973)
6646            Kramer Vs. Kramer (1979)
7098       Last Picture Show, The (1971)
Name: Title, dtype: object