In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tabulate import tabulate

In [4]:
#Working directory
working_directory = "C:/Users/Eric/Google Drive/Uni/Assignments/Social and Information Network Analysis\Assignment 3/ml-100k/"
#Get our data
train_set = pd.read_csv(working_directory + "u.data.csv", header=None)
movie_info = pd.read_csv(working_directory + "u.item.csv", header=None, encoding='latin-1', usecols=[0,1], names=['movieId', 'Title'])

In [5]:
#Transform training sets into user vs movie table
def pivot(data, val, ind, col):
    pivoted_data = pd.pivot_table(data, values=val, index=ind, columns=col, fill_value=0)
    return pivoted_data

In [6]:
#Calculate cosine value
def cosine(data):
    #Get cosine values
    cosined_table = pd.DataFrame(cosine_similarity(data), index=data.index.values.tolist(), columns=data.index.values.tolist())
    return cosined_table

In [7]:
def highest_n_values_for_all_users(data, n):    
    #Empty df to store the cosine
    n_cosines = pd.DataFrame()
    
    for user in data:
        #Find largest n values in each column
        temp = data.nlargest(n, user)
    
        #Renaming/cleaning stuff
        temp[str(user) + "'s Similar Users"] = temp.index
        temp = temp[[user, str(user) + "'s Similar Users"]]
        n_cosines = pd.concat([n_cosines.reset_index(drop=True), temp.reset_index(drop=True)], axis=1)
        n_cosines.rename(columns={user: 'Cosines of ' + str(user), 2*user: str(user) + "'s Similar Users"}, inplace=True)
    return n_cosines

In [8]:
def highest_n_values_for_single_user(data, n, user):
    #Remove the ratings for cells where user vs user is the same
    np.fill_diagonal(data.values, 0)
    
    #Empty df to store the cosine values
    n_cosines = pd.DataFrame()
    #Find largest n values in each column
    temp = data.nlargest(n, user)
    
    #Renaming/cleaning stuff
    temp[str(user) + "'s Similar Users"] = temp.index
    temp = temp[[str(user) + "'s Similar Users"]]
    return temp

In [9]:
def get_recommendations_by_average(pivot_data, user_cosines, user):
    #Remove any movies that the user has already rated
    pivot_data = pivot_data.loc[:, (pivot_data.loc[[user]] == 0).any(axis=0)]
    pivot_data = pivot_data.transpose()
    #This will store movies that similar users rated highly
    highly_rated_movies = pd.DataFrame()
    
    #Find ratings of similar users
    for user in user_cosines.index.values.tolist():
        highly_rated_movies[user] = pivot_data[user].round(0)
        
    #Calculate averages
    highly_rated_movies = highly_rated_movies.replace(0, np.NaN)
    highly_rated_movies['Avg Rating'] = highly_rated_movies.mean(axis=1).round(1)
    
    #Remove any movie with less than 10 ratings
    highly_rated_movies['counts'] = highly_rated_movies.count(axis=1)
    highly_rated_movies = highly_rated_movies[highly_rated_movies.counts >= 10]
    highly_rated_movies = highly_rated_movies.transpose()
    highly_rated_movies = highly_rated_movies.fillna(0)
    
    #Sort values by mean
    highly_rated_movies = highly_rated_movies.sort_values(by =['Avg Rating', 'counts'], axis=1, ascending=False)
    
    #Select 20 most highly rated movies
    highly_rated_movies = highly_rated_movies.iloc[:, : 20]
    highly_rated_movies = highly_rated_movies.loc[['Avg Rating']]
    
    #Get movie titles
    highly_rated_movies = highly_rated_movies.transpose()
    highly_rated_movies['movieId'] = highly_rated_movies.index.values.tolist()
    highly_rated_movies = pd.merge(highly_rated_movies, movie_info, on='movieId')
    highly_rated_movies = highly_rated_movies[['Title', 'Avg Rating']]
    
    #Return 20 most highly rated movies
    return highly_rated_movies

In [10]:
#Give the ratings pivot table, cosine similarity for each user, rating to recommend off, recieve 20 most frequent movies
def get_recommendations_by_frequency(pivot_data, user_cosines, rating, user):
    #Remove any movies that the user has already rated
    pivot_data = pivot_data.loc[:, (pivot_data.loc[[user]] == 0).any(axis=0)]
    pivot_data = pivot_data.transpose()
    #This will store movies that similar users rated highly
    highly_rated_movies = pd.DataFrame()
    
    #Find ratings of similar users
    for user in user_cosines.index.values.tolist():
        highly_rated_movies[user] = pivot_data[user].round(0)
    
    #Remove any ratings lower than the rating wanted
    crt_rat = 0
    while crt_rat < rating:
        highly_rated_movies = highly_rated_movies.replace(crt_rat, np.NaN)
        crt_rat = crt_rat + 1
    
    #Find frequency of ratings
    highly_rated_movies['Frequency'] = highly_rated_movies.count(axis=1)
    
    #Sort movies by frequency
    highly_rated_movies = highly_rated_movies.sort_values(by ='Frequency', axis=0, ascending=False)
    highly_rated_movies = highly_rated_movies.fillna(0)
    
    #Select 20 most frequent movies
    highly_rated_movies = highly_rated_movies.head(20)
    
    #Get Movie Titles
    highly_rated_movies = highly_rated_movies[['Frequency']]
    highly_rated_movies['movieId'] = highly_rated_movies.index.values.tolist()
    highly_rated_movies = pd.merge(highly_rated_movies, movie_info, on='movieId')
    highly_rated_movies = highly_rated_movies[['Title', 'Frequency']]
    
    #Return 20 most frequent high rated movies
    return highly_rated_movies

In [15]:
def recommender(user):
    #Pivot dataset and get cosine similarities
    pivot_data = pivot(train_set, 2, 0, 1)
    cos_sim = cosine(pivot_data)
    usr_sim = highest_n_values_for_single_user(cos_sim, 50, user)
    
    #Minimum rating for frequency recommendation
    frequency_rating = 4
    #Get recommendations
    freq_rec = get_recommendations_by_frequency(pivot_data, usr_sim, frequency_rating, user)
    avg_rec = get_recommendations_by_average(pivot_data, usr_sim, user)
    #Hybrid recommender that only outputs movies the other two systems recommend
    hybrid_rec = pd.merge(freq_rec, avg_rec, on='Title', left_index=False, right_index=False).sort_values(by="Avg Rating", ascending=False)
    
    print("Recommended movies for user " + str(user) + " based on the average rating of similar users are: \n")
    print(tabulate(avg_rec, headers=["Movie Title", "Average Rating"], showindex="never"))
    print("\nRecommended movies for user " + str(user) + " based on the most rated movies of similar users\nwith a rating of " + str(frequency_rating) + " or higher are: \n")
    print(tabulate(freq_rec, headers=["Movie Title", "Frequency"], showindex="never"))
    print("\nRecommended movies for user " + str(user) + " by both systems are: \n")
    print(tabulate(hybrid_rec, headers=["Movie Title", "Frequency", "Average Rating"], showindex="never"))

In [16]:
#Enter the number of the user you want to predict here
recommender(5)

Recommended movies for user 5 based on the average rating of similar users are: 

Movie Title                                               Average Rating
------------------------------------------------------  ----------------
Wallace & Gromit: The Best of Aardman Animation (1996)               4.8
Usual Suspects, The (1995)                                           4.5
Casablanca (1942)                                                    4.5
Schindler's List (1993)                                              4.5
Henry V (1989)                                                       4.5
Shawshank Redemption, The (1994)                                     4.4
Titanic (1997)                                                       4.4
Sling Blade (1996)                                                   4.4
City of Lost Children, The (1995)                                    4.4
Miller's Crossing (1990)                                             4.4
Terminator, The (1984)                    