In [384]:
import pandas as pd
import numpy as np
import math

# Input CSV files, relative to the notebook's working directory.
ratings_path = "./ml-latest-small/ratings.csv"
movies_path = "./ml-latest-small/movies.csv"
# NOTE(review): tags_path is not referenced by any cell visible in this notebook.
tags_path = "./ml-latest-small/tags.csv"
# Cache files written by user_similarity_matrix() and read back by load_precalculated().
similar_users_path = "./similar_users.csv"
user_coefficients_path = "./user_coefficients.csv"

# Empty placeholders; each name is re-bound by the data-loading cell further down.
Ratings = pd.DataFrame()
Movies = pd.DataFrame()
SimilarUsers = pd.DataFrame()
UserCoefficients = pd.DataFrame()

In [385]:
def load_dataset(dataset):
    """Read a CSV file into a DataFrame and report whether it succeeded.

    Parameters
    ----------
    dataset : sequence
        Pair ``(name, path)``. ``name`` is only used in the printed status
        message; ``path`` is handed to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame or None
        The parsed data, or ``None`` when the file could not be read.
    """
    dataset_name, dataset_path = dataset[0], dataset[1]
    try:
        data = pd.read_csv(dataset_path)
    except Exception:
        # Deliberately best-effort: callers treat None as "not available" and
        # fall back to recomputing. Catching Exception (rather than a bare
        # except) keeps KeyboardInterrupt/SystemExit working.
        print("CAUTION: {} cannot be read from memory.".format(dataset_name))
        return None
    # Format the message *before* printing: print() returns None, so
    # print(...).format(...) raises AttributeError on Python 3.
    print("{} is successfully read from memory.".format(dataset_name))
    return data

In [386]:
def unskewed_pearson_similarity(user1, user2):
    """Return the normalised dot product (cosine) of two rating vectors.

    Both arguments must support ``transpose()``, ``dot()`` and element-wise
    ``==`` (e.g. pandas Series or numpy arrays of equal length).

    Returns 0 when either vector has (near-)zero length or when the two
    vectors are element-wise identical, so a user never counts as their own
    neighbour; otherwise returns ``dot(user1, user2) / |user1| / |user2|``.
    """
    tolerance = 0.0000001
    numerator = user1.transpose().dot(user2)
    norm1 = math.sqrt(user1.transpose().dot(user1))
    norm2 = math.sqrt(user2.transpose().dot(user2))

    # Guard clauses are checked in this order so the element-wise comparison
    # only runs when both vectors are non-degenerate.
    if norm1 < tolerance:
        return 0
    if norm2 < tolerance:
        return 0
    if (user1 == user2).all():
        return 0
    return numerator / norm1 / norm2

In [387]:
def process_ratings(Ratings, min_rating=0.5, max_rating=5):
    """Mean-centre and "unskew" ratings, then pivot them to a movie x user matrix.

    Parameters
    ----------
    Ratings : pandas.DataFrame
        Must contain the columns ``userId``, ``movieId`` and ``rating``.
        The input frame is not modified.
    min_rating, max_rating : float, optional
        Bounds of the rating scale used to normalise the distance of a rating
        from the user's mean. Defaults (0.5 and 5) match the constants this
        function previously hard-coded.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        * the ratings with the added columns ``rating_mean``,
          ``rating_adjusted``, ``mean_skewness`` and ``rating_unskewed``;
        * a pivot table indexed by ``movieId`` with one column per ``userId``
          holding ``rating_unskewed`` (0 where the user has no rating).
    """
    # Average only the rating column; averaging every column is wasted work
    # and fails on non-numeric columns in recent pandas.
    user_means = (
        Ratings.groupby('userId', sort=False)['rating']
        .mean()
        .rename('rating_mean')
        .reset_index()
    )
    Ratings = pd.merge(Ratings, user_means, on='userId', how='left', sort=False)
    Ratings['rating_adjusted'] = Ratings['rating'] - Ratings['rating_mean']

    adjusted = Ratings['rating_adjusted']
    # Room between the user's mean and the scale bound on the side the rating
    # falls on. Masking with where() leaves NaN on rows where a term does not
    # apply, so 0/0 (a user whose mean sits on a bound) becomes 0, not NaN.
    headroom = (max_rating - Ratings['rating_mean']).where(adjusted > 0)
    footroom = (Ratings['rating_mean'] - min_rating).where(adjusted < 0)
    Ratings['mean_skewness'] = (adjusted / headroom).fillna(0) - (adjusted / footroom).fillna(0)
    Ratings['rating_unskewed'] = adjusted * np.sqrt(1 + (Ratings['mean_skewness'] ** 2) * 2)

    # With no NaN in rating_unskewed, pivot_table cannot silently drop a
    # user's column (it drops all-NaN columns by default), so column positions
    # stay aligned with the sorted unique user ids.
    PivotedRatingsMatrix = Ratings.pivot_table(
        index='movieId', columns='userId', values='rating_unskewed', fill_value=0
    )
    # The original returned the undefined name PivotedUserMatrix here.
    return Ratings, PivotedRatingsMatrix

In [388]:
def user_similarity_matrix():
    """Rank every user's neighbours by similarity and cache the result as CSV.

    Reads the module-level ``distinct_users`` (sorted unique user ids) and
    ``PivotedUserMatrix`` (one column per user, in the same order), and scores
    each pair of users with ``unskewed_pearson_similarity``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``SimilarUsers`` and ``UserCoefficients``, both indexed by user id with
        columns ``1..n``. Row *i* of ``SimilarUsers`` holds the ids of the other
        users ordered by descending similarity to user *i*; the same row of
        ``UserCoefficients`` holds the matching similarity values. Entries
        whose similarity is not positive are 0 in both frames.

    Side effects
    ------------
    Writes both frames (without the index) to ``similar_users_path`` and
    ``user_coefficients_path`` and prints one progress line per user.
    """
    user_ids = np.asarray(distinct_users)
    user_count = user_ids.size
    rank_labels = np.arange(user_count) + 1

    # Results are assembled in plain NumPy arrays addressed by position. The
    # previous chained `frame[col][row] = value` assignments and writes through
    # `frame.values` only work when pandas returns views, and addressed rows by
    # the label `position + 1`, which is only right for ids 1..n.
    similar = np.zeros((user_count, user_count), dtype=float)
    coefficients = np.zeros((user_count, user_count), dtype=float)

    # Extract each user's column once instead of once per pair.
    user_vectors = [PivotedUserMatrix.iloc[:, position] for position in range(user_count)]

    for userIndex, vector in enumerate(user_vectors):
        proximities = np.array(
            [unskewed_pearson_similarity(vector, other) for other in user_vectors],
            dtype=float,
        )
        # Most similar first.
        order = np.argsort(proximities)[::-1]
        ranked = proximities[order]
        is_positive = ranked > 0
        coefficients[userIndex] = np.where(is_positive, ranked, 0)
        similar[userIndex] = np.where(is_positive, user_ids[order], 0)
        # Format before printing; print() returns None on Python 3.
        print("Calculated for {} users out of {}.".format(userIndex + 1, user_count))

    SimilarUsers = pd.DataFrame(similar, index=distinct_users, columns=rank_labels)
    UserCoefficients = pd.DataFrame(coefficients, index=distinct_users, columns=rank_labels)

    UserCoefficients.to_csv(user_coefficients_path, index=False)
    SimilarUsers.to_csv(similar_users_path, index=False)
    return SimilarUsers, UserCoefficients

In [389]:
def load_precalculated(data, name, path, recalculator, arg):
    """Load a cached dataset, recomputing it when the cache cannot be read.

    Parameters
    ----------
    data : object
        Ignored; the value is overwritten immediately. Kept so existing
        callers that pass a placeholder keep working.
    name : str
        Dataset name used in status messages.
    path : str
        CSV file to try first (via ``load_dataset``).
    recalculator : callable
        Zero-argument function called when loading fails; its result must be
        indexable.
    arg : int
        Index of the wanted element in ``recalculator()``'s result.

    Returns
    -------
    The loaded DataFrame, or ``recalculator()[arg]`` when loading returned
    ``None``.
    """
    data = load_dataset((name, path))
    if data is None:
        # Format before printing; print() returns None on Python 3, so
        # print(...).format(...) raises AttributeError.
        print("{} data is being recalculated... It might take a while...".format(name))
        data = recalculator()[arg]
    return data

In [390]:
# Load the raw ratings, then derive the per-user adjusted columns and the
# pivoted matrix that the later cells read as the global PivotedUserMatrix.
# NOTE(review): load_dataset returns None on failure, which process_ratings
# does not handle.
Ratings = load_dataset(("Ratings", ratings_path))
Ratings, PivotedUserMatrix = process_ratings(Ratings)
# Sorted unique ids; later cells map an id to its position with np.searchsorted.
distinct_users = np.unique(Ratings['userId'])
distinct_movies = np.unique(Ratings['movieId'])
Movies = load_dataset(("Movies", movies_path))
# The first argument of load_precalculated is overwritten inside the function.
# user_similarity_matrix() writes both CSV caches, so if the first call has to
# recompute, the second call can read its file from disk.
SimilarUsers = load_precalculated(SimilarUsers, "SimilarUsers", similar_users_path, user_similarity_matrix, 0)
UserCoefficients = load_precalculated(UserCoefficients, "UserCoefficients", user_coefficients_path, user_similarity_matrix, 1)    

Ratings is successfully read from memory.
Movies is successfully read from memory.
CAUTION: SimilarUsers cannot be read from memory.
SimilarUsers data is being recalculated... It might take a while...
Calculated for 1 users out of 671.
Calculated for 2 users out of 671.
Calculated for 3 users out of 671.
Calculated for 4 users out of 671.
Calculated for 5 users out of 671.
Calculated for 6 users out of 671.
Calculated for 7 users out of 671.
Calculated for 8 users out of 671.
Calculated for 9 users out of 671.
Calculated for 10 users out of 671.
Calculated for 11 users out of 671.
Calculated for 12 users out of 671.
Calculated for 13 users out of 671.
Calculated for 14 users out of 671.
Calculated for 15 users out of 671.
Calculated for 16 users out of 671.
Calculated for 17 users out of 671.
Calculated for 18 users out of 671.
Calculated for 19 users out of 671.
Calculated for 20 users out of 671.
Calculated for 21 users out of 671.
Calculated for 22 users out of 671.
Calculated for 2

Calculated for 220 users out of 671.
Calculated for 221 users out of 671.
Calculated for 222 users out of 671.
Calculated for 223 users out of 671.
Calculated for 224 users out of 671.
Calculated for 225 users out of 671.
Calculated for 226 users out of 671.
Calculated for 227 users out of 671.
Calculated for 228 users out of 671.
Calculated for 229 users out of 671.
Calculated for 230 users out of 671.
Calculated for 231 users out of 671.
Calculated for 232 users out of 671.
Calculated for 233 users out of 671.
Calculated for 234 users out of 671.
Calculated for 235 users out of 671.
Calculated for 236 users out of 671.
Calculated for 237 users out of 671.
Calculated for 238 users out of 671.
Calculated for 239 users out of 671.
Calculated for 240 users out of 671.
Calculated for 241 users out of 671.
Calculated for 242 users out of 671.
Calculated for 243 users out of 671.
Calculated for 244 users out of 671.
Calculated for 245 users out of 671.
Calculated for 246 users out of 671.
C

Calculated for 442 users out of 671.
Calculated for 443 users out of 671.
Calculated for 444 users out of 671.
Calculated for 445 users out of 671.
Calculated for 446 users out of 671.
Calculated for 447 users out of 671.
Calculated for 448 users out of 671.
Calculated for 449 users out of 671.
Calculated for 450 users out of 671.
Calculated for 451 users out of 671.
Calculated for 452 users out of 671.
Calculated for 453 users out of 671.
Calculated for 454 users out of 671.
Calculated for 455 users out of 671.
Calculated for 456 users out of 671.
Calculated for 457 users out of 671.
Calculated for 458 users out of 671.
Calculated for 459 users out of 671.
Calculated for 460 users out of 671.
Calculated for 461 users out of 671.
Calculated for 462 users out of 671.
Calculated for 463 users out of 671.
Calculated for 464 users out of 671.
Calculated for 465 users out of 671.
Calculated for 466 users out of 671.
Calculated for 467 users out of 671.
Calculated for 468 users out of 671.
C

Calculated for 664 users out of 671.
Calculated for 665 users out of 671.
Calculated for 666 users out of 671.
Calculated for 667 users out of 671.
Calculated for 668 users out of 671.
Calculated for 669 users out of 671.
Calculated for 670 users out of 671.
Calculated for 671 users out of 671.
UserCoefficients is successfully read from memory.


In [391]:
def accumulate_user_recommendations(userId, recommenders):
    """Sum the recommenders' rating vectors, each weighted by its coefficient.

    Reads the module-level ``distinct_users``, ``distinct_movies``,
    ``UserCoefficients`` and ``PivotedUserMatrix``.

    Parameters
    ----------
    userId : user id whose row of ``UserCoefficients`` supplies the weights.
    recommenders : iterable of user ids; the k-th recommender is weighted by
        the k-th entry of that row.

    Returns
    -------
    numpy.ndarray of length ``distinct_movies.size`` with the accumulated
    weighted ratings.
    """
    row = np.searchsorted(distinct_users, userId)
    weights = UserCoefficients.values[row]
    totals = np.zeros(distinct_movies.size)

    for rank, neighbour in enumerate(recommenders):
        # Position of the neighbour's column in the pivoted matrix.
        column = np.searchsorted(distinct_users, neighbour)
        neighbour_ratings = PivotedUserMatrix.iloc[:, column]
        totals += (neighbour_ratings * weights[rank]).values

    return totals

In [392]:
def user_collaborative_recommendations(userId):
    """Return movie ids recommended for ``userId``, best first.

    Takes the positive entries of the user's row in the module-level
    ``SimilarUsers`` as recommenders, scores every movie with
    ``accumulate_user_recommendations`` and keeps the ids from
    ``distinct_movies`` whose score is strictly positive, ordered by
    descending score.
    """
    row = np.searchsorted(np.unique(Ratings['userId']), userId)
    neighbour_row = SimilarUsers.values[row]
    recommenders = neighbour_row[neighbour_row > 0]

    scores = accumulate_user_recommendations(userId, recommenders)

    # One descending ordering is applied to both the ids and the scores.
    descending = np.argsort(scores)[::-1]
    ranked_movies = distinct_movies[descending]
    ranked_scores = scores[descending]
    return ranked_movies[ranked_scores > 0]

In [393]:
# Example: the ten highest-scored recommended movie ids for user 6.
userId=6
res = user_collaborative_recommendations(userId)[:10]
res

array([4993, 5952, 7153,  318,  296,  260,  858, 1198, 1196,  527])

In [394]:
# Show the Movies rows for the recommended ids. The boolean mask keeps the
# row order of Movies, so this table is not in recommendation-rank order.
Movies[Movies['movieId'].isin(res)]

Unnamed: 0,movieId,title,genres
232,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
266,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
284,318,"Shawshank Redemption, The (1994)",Crime|Drama
472,527,Schindler's List (1993),Drama|War
695,858,"Godfather, The (1972)",Crime|Drama
953,1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
955,1198,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure
3871,4993,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy
4395,5952,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy
5026,7153,"Lord of the Rings: The Return of the King, The...",Action|Adventure|Drama|Fantasy
