In [262]:
import pandas as pd
import numpy as np
import math

# Input CSV locations, relative to the notebook's working directory.
ratings_path = "./ml-latest-small/ratings.csv"
movies_path = "./ml-latest-small/movies.csv"
# NOTE(review): tags_path is not referenced by any of the visible cells.
tags_path = "./ml-latest-small/tags.csv"
# Cache files: written by item_similarity_matrix() and read back through load_precalculated().
top_similar_items_path = "./top_similar_items.csv"
top_movie_coefficients_path = "./top_item_coefficients.csv"

# Empty placeholders; the loading cell further down rebinds each of these names.
Ratings = pd.DataFrame()
Movies = pd.DataFrame()
TopSimilarItems = pd.DataFrame()
TopMovieCoefficients = pd.DataFrame()

In [263]:
def load_dataset(dataset):
    """Read a CSV dataset into a DataFrame and report the outcome on stdout.

    Parameters
    ----------
    dataset : sequence
        Two items: ``dataset[0]`` is the display name used in the status
        message, ``dataset[1]`` is the path handed to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame or None
        The parsed data, or ``None`` when the file could not be read.
        Callers test for ``None`` to decide whether to recompute.
    """
    dataset_name = dataset[0]
    dataset_path = dataset[1]
    try:
        data = pd.read_csv(dataset_path)
    except Exception:
        # Best-effort load: a missing or unreadable file is reported, not raised.
        # ``Exception`` (rather than a bare ``except``) lets KeyboardInterrupt through.
        print("CAUTION: {} cannot be read from memory.".format(dataset_name))
        return None
    # ``.format`` is applied to the string itself; calling it on the result of
    # ``print(...)`` only works with the Python 2 print statement.
    print("{} is successfully read from memory.".format(dataset_name))
    return data

In [264]:
def unskewed_pearson_similarity(movie1, movie2):
    """Similarity of two rating vectors: their dot product over both vector lengths.

    Parameters
    ----------
    movie1, movie2 : array-like exposing ``transpose`` and ``dot``
        The two vectors to compare (numpy arrays or pandas Series).

    Returns
    -------
    number
        ``0`` when either vector's length is below 1e-7 or when the two
        vectors are element-wise identical; otherwise
        ``dot(movie1, movie2) / |movie1| / |movie2|``.
    """
    smallest_length = 0.0000001

    numerator = movie1.transpose().dot(movie2)
    length1 = math.sqrt(movie1.transpose().dot(movie1))
    length2 = math.sqrt(movie2.transpose().dot(movie2))

    # Guard clauses, checked in the same order as the combined condition they replace.
    if length1 < smallest_length:
        return 0
    if length2 < smallest_length:
        return 0
    if (movie1 == movie2).all():
        # Identical vectors (including a movie compared with itself) score 0.
        return 0

    return numerator / length1 / length2

In [265]:
def process_ratings(Ratings, min_rating=0.5, max_rating=5.0):
    """Derive per-user adjusted ratings and pivot them into a user x movie matrix.

    Parameters
    ----------
    Ratings : pandas.DataFrame
        Must contain ``userId``, ``movieId`` and ``rating`` columns. It is not
        modified.
    min_rating, max_rating : float, optional
        Bounds of the rating scale used to normalise the skewness. The
        defaults (0.5 and 5.0) are the values that were previously hard-coded.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The ratings with ``rating_mean``, ``rating_adjusted``,
        ``mean_skewness`` and ``rating_unskewed`` columns appended, and the
        pivot of ``rating_unskewed`` indexed by ``userId`` with one column per
        ``movieId`` (missing cells filled with 0).
    """
    # Average only the rating column; averaging every column is wasted work and
    # fails on frames that carry non-numeric columns.
    user_means = (
        Ratings.groupby(['userId'], as_index=False, sort=False)['rating']
        .mean()
        .rename(columns={'rating': 'rating_mean'})
    )
    ratings = pd.merge(Ratings, user_means, on='userId', how='left', sort=False)
    ratings['rating_adjusted'] = ratings['rating'] - ratings['rating_mean']

    adjusted = ratings['rating_adjusted']
    mean = ratings['rating_mean']
    # Distance from the user's mean, as a fraction of the room left towards the
    # relevant end of the scale. ``where`` keeps a ratio only on the side it
    # applies to, so a 0/0 (user mean sitting exactly on a scale bound) becomes
    # 0 instead of NaN. A NaN here would make pivot_table drop the user/movie
    # and shift the matrix out of step with the sorted id arrays.
    above_mean = (adjusted / (max_rating - mean)).where(adjusted > 0, 0.0)
    below_mean = (adjusted / (mean - min_rating)).where(adjusted < 0, 0.0)
    ratings['mean_skewness'] = above_mean - below_mean

    ratings['rating_unskewed'] = adjusted * np.sqrt(1 + (ratings['mean_skewness'] ** 2) * 2)
    PivotedMoviesMatrix = ratings.pivot_table(
        index='userId', columns='movieId', values='rating_unskewed', fill_value=0
    )
    return ratings, PivotedMoviesMatrix

In [266]:
def item_similarity_matrix(top_n=1000):
    """Compute, for every movie, its most similar movies and their similarity scores.

    Reads the module-level ``PivotedMoviesMatrix`` (one column per movie, in the
    same order as ``distinct_movies``) and ``distinct_movies``. Similarity is the
    same quantity as ``unskewed_pearson_similarity``: dot product divided by both
    vector lengths, with 0 for near-zero-length vectors and for identical vectors.
    It is evaluated one movie against all others at a time.

    Both result tables are also written to ``top_similar_items_path`` and
    ``top_movie_coefficients_path`` (without the index) so later runs can reload
    them instead of recomputing.

    Parameters
    ----------
    top_n : int, optional
        Number of neighbours kept per movie (default 1000, the width used
        before this became a parameter). Rows with fewer positive
        similarities are padded with zeros.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``TopSimilarItems``: neighbour movie ids per movie, best first, 0 where
        there is no neighbour with positive similarity.
        ``TopMovieCoefficients``: the matching similarity values, 0 elsewhere.
        Both are indexed by movie id with columns ``1..top_n``.

    Raises
    ------
    ValueError
        If ``PivotedMoviesMatrix`` does not have one column per entry of
        ``distinct_movies``; positions could then not be mapped back to ids.
    """
    min_vector_length = 0.0000001

    user_item = np.asarray(PivotedMoviesMatrix.values, dtype=float)
    movie_ids = np.asarray(distinct_movies)
    movie_count = movie_ids.size
    if user_item.shape[1] != movie_count:
        raise ValueError(
            "PivotedMoviesMatrix has {} columns but there are {} distinct movies.".format(
                user_item.shape[1], movie_count
            )
        )

    # Never keep more neighbours than there are movies; extra columns stay zero.
    kept = min(top_n, movie_count)

    vector_lengths = np.sqrt((user_item ** 2).sum(axis=0))
    degenerate = vector_lengths < min_vector_length

    # Results are collected in plain arrays addressed by position, so there is
    # no label-versus-position ambiguity and no write through ``DataFrame.values``.
    top_ids = np.zeros((movie_count, top_n), dtype=float)
    top_coefficients = np.zeros((movie_count, top_n), dtype=float)

    for movie_index in range(movie_count):
        movie_vector = user_item[:, movie_index]
        similarities = np.zeros(movie_count)

        if not degenerate[movie_index]:
            # Identical columns (including the movie itself) are scored 0.
            identical = (user_item == movie_vector[:, np.newaxis]).all(axis=0)
            comparable = ~degenerate & ~identical
            dot_products = movie_vector.dot(user_item)
            similarities[comparable] = (
                dot_products[comparable]
                / vector_lengths[movie_index]
                / vector_lengths[comparable]
            )

        # One ordering drives both tables, so ids and scores stay paired.
        ranking = np.argsort(similarities)[::-1][:kept]
        ranked_similarities = similarities[ranking]
        is_positive = ranked_similarities > 0
        top_coefficients[movie_index, :kept] = np.where(is_positive, ranked_similarities, 0)
        top_ids[movie_index, :kept] = np.where(is_positive, movie_ids[ranking], 0)

        print("Calculated for {} items out of {}.".format(movie_index + 1, movie_count))

    column_labels = np.arange(top_n) + 1
    top_similar_items = pd.DataFrame(top_ids, index=movie_ids, columns=column_labels)
    top_movie_coefficients = pd.DataFrame(top_coefficients, index=movie_ids, columns=column_labels)

    top_similar_items.to_csv(top_similar_items_path, index=False)
    top_movie_coefficients.to_csv(top_movie_coefficients_path, index=False)
    return top_similar_items, top_movie_coefficients

In [267]:
def load_precalculated(data, name, path, recalculator, arg):
    """Load a cached table from CSV, recomputing it when the file cannot be read.

    Parameters
    ----------
    data : any
        Ignored; kept only so existing call sites keep working.
    name : str
        Display name used in status messages.
    path : str
        CSV path passed to ``load_dataset``.
    recalculator : callable
        Called with no arguments when loading fails; must return an indexable
        collection of results.
    arg : int
        Position of the wanted result in the value returned by ``recalculator``.

    Returns
    -------
    The loaded table, or ``recalculator()[arg]`` when loading returned ``None``.
    """
    data = load_dataset((name, path))
    if data is None:
        # ``.format`` belongs on the string; on Python 3 ``print`` returns None.
        print("{} data is being recalculated... It might take a while...".format(name))
        data = recalculator()[arg]
    return data

In [268]:
# Load the raw ratings, then derive the adjusted-rating columns and the userId x movieId matrix.
Ratings = load_dataset(("Ratings", ratings_path))
Ratings, PivotedMoviesMatrix = process_ratings(Ratings)
# np.unique returns sorted ids; other cells use np.searchsorted on these arrays
# to turn an id into a row/column position.
distinct_users = np.unique(Ratings['userId'])
distinct_movies = np.unique(Ratings['movieId'])
Movies = load_dataset(("Movies", movies_path))

# Read each cached table, falling back to item_similarity_matrix when its CSV cannot be read.
# The last argument picks which element of the recalculator's returned pair to keep.
TopSimilarItems = load_precalculated(TopSimilarItems, "TopSimilarItems", top_similar_items_path, item_similarity_matrix, 0)
TopMovieCoefficients = load_precalculated(TopMovieCoefficients, "TopMovieCoefficients", top_movie_coefficients_path, item_similarity_matrix, 1)    

Ratings is successfully read from memory.
Movies is successfully read from memory.
TopSimilarItems is successfully read from memory.
TopMovieCoefficients is successfully read from memory.


In [269]:
def accumulate_item_recommendations(userId, user_preferences, user_rates):
    """Sum weighted neighbour similarities into one score per movie.

    For every (movie id, weight) pair, the movie's row of ``TopSimilarItems``
    (neighbour ids) and ``TopMovieCoefficients`` (similarities) is looked up,
    and ``weight * similarity`` is added to the score of each neighbour whose
    id is greater than 0.

    Parameters
    ----------
    userId : any
        Not used by the computation.
    user_preferences : iterable
        Movie ids to expand into neighbours.
    user_rates : iterable
        Weight for each entry of ``user_preferences`` (paired by position).

    Returns
    -------
    numpy.ndarray
        One accumulated score per entry of ``distinct_movies``, in that order.
    """
    scores = np.zeros(distinct_movies.size)

    for liked_movie, weight in zip(user_preferences, user_rates):
        row = np.searchsorted(distinct_movies, liked_movie)
        neighbour_ids = TopSimilarItems.values[row].astype(int)
        neighbour_weights = TopMovieCoefficients.values[row]

        # Pair the two rows position by position, stopping at the shorter one.
        pair_count = min(neighbour_ids.size, neighbour_weights.size)
        neighbour_ids = neighbour_ids[:pair_count]
        neighbour_weights = neighbour_weights[:pair_count]

        # Ids that are not greater than 0 are padding and contribute nothing.
        present = neighbour_ids > 0
        targets = np.searchsorted(distinct_movies, neighbour_ids[present])
        # Unbuffered add: repeated targets accumulate one after another.
        np.add.at(scores, targets, weight * neighbour_weights[present])

    return scores

In [270]:
def item_collaborative_recommendations(userId):
    """Rank movies for a user from the neighbours of the movies they scored positively.

    The user's row of ``PivotedMoviesMatrix`` is located through
    ``distinct_users``; entries that are not greater than 0 are ignored. The
    remaining movies, strongest first, are passed with their values to
    ``accumulate_item_recommendations``, and the movies whose accumulated
    score is greater than 0 are returned.

    Parameters
    ----------
    userId : value comparable with the entries of ``distinct_users``

    Returns
    -------
    numpy.ndarray
        Movie ids ordered by descending accumulated score.
    """
    user_row = np.searchsorted(distinct_users, userId)
    raw_scores = PivotedMoviesMatrix.values[user_row]
    liked_scores = np.where(raw_scores > 0, raw_scores, 0)

    # A single descending ordering yields both the sorted values and their ids.
    by_score_desc = np.argsort(liked_scores)[::-1]
    sorted_scores = liked_scores[by_score_desc]
    user_rates = sorted_scores[sorted_scores > 0]
    user_preferences = distinct_movies[by_score_desc][:user_rates.size]

    totals = accumulate_item_recommendations(userId, user_preferences, user_rates)

    by_total_desc = np.argsort(totals)[::-1]
    ranked_movies = distinct_movies[by_total_desc]
    ranked_totals = totals[by_total_desc]
    return ranked_movies[ranked_totals > 0]

In [273]:
# Example: recommendations for a single user.
userId=2
res = item_collaborative_recommendations(userId)
# Look up the first ten recommended ids in Movies. The boolean filter keeps Movies'
# own row order, so the displayed table is not in recommendation rank order.
Movies[Movies['movieId'].isin(res[:10])]

Unnamed: 0,movieId,title,genres
1,2,Jumanji (1995),Adventure|Children|Fantasy
258,288,Natural Born Killers (1994),Action|Crime|Thriller
383,434,Cliffhanger (1993),Action|Adventure|Thriller
447,500,Mrs. Doubtfire (1993),Comedy|Drama
2252,2808,Universal Soldier (1992),Action|Sci-Fi
2362,2943,Indochine (1992),Drama|Romance
2401,2990,Licence to Kill (1989),Action|Adventure|Thriller
2623,3267,"Mariachi, El (1992)",Action|Crime|Thriller|Western
4570,6300,Flickering Lights (Blinkende lygter) (2000),Action|Comedy|Crime
5330,8117,In China They Eat Dogs (I Kina spiser de hunde...,Action|Comedy
