# **Collaborative filtering implementation:**

**1. Library importing, data initialization and auxilary functions definition:**

In [1]:
import pandas as pd
import numpy as np
import math

ratings_path = "./ml-latest-small/ratings.csv"
movies_path = "./ml-latest-small/movies.csv"
tags_path = "./ml-latest-small/tags.csv"
similar_users_path = "./similar_users.csv"
user_coefficients_path = "./user_coefficients.csv"
top_similar_items_path = "./top_similar_items.csv"
top_movie_coefficients_path = "./top_item_coefficients.csv"

#Ratings = pd.DataFrame()
#Movies = pd.DataFrame()


# distinct_users = np.array([])
# distinct_movies = np.array([])

In [2]:
def load_dataset(dataset):
    dataset_name = dataset[0]
    dataset_path = dataset[1]
    try:
        data = pd.read_csv(dataset_path)
        print("{} is successfully read from memory.").format(dataset_name)
        return data
    except:
        print("CAUTION: {} cannot be read from memory.").format(dataset_name)
        return None

In [3]:
def load_precalculated(data, name, path, recalculator, arg):
    data = load_dataset((name, path))
    if data is None:
        print("{} data is being recalculated... It might take a while...").format(name)
        data = recalculator()[arg]
    return data

In [4]:
def process_ratings(Ratings):
    MeanUserRating = Ratings.groupby(['userId'], as_index = False, sort = False).mean().rename(columns = {'rating': 'rating_mean'})[['userId','rating_mean']]
    Ratings = pd.merge(Ratings, MeanUserRating, on = 'userId', how = 'left', sort = False)
    Ratings['rating_adjusted'] = Ratings['rating'] - Ratings['rating_mean']
    Ratings['mean_skewness'] = (Ratings['rating_adjusted'])/(5-Ratings['rating_mean'])*(Ratings['rating_adjusted']>0).astype(float) - (Ratings['rating_adjusted'])/(Ratings['rating_mean']-0.5)*(Ratings['rating_adjusted']<0).astype(float)
    Ratings['rating_unskewed'] = Ratings['rating_adjusted'] * np.sqrt(1+(Ratings['mean_skewness']**2)*2) 
    PivotedUserMatrix = Ratings.pivot_table(index='movieId', columns='userId', values='rating_unskewed', fill_value=0)
    PivotedMoviesMatrix = Ratings.pivot_table(index='userId', columns='movieId', values='rating_unskewed', fill_value=0)
    return Ratings, PivotedUserMatrix, PivotedMoviesMatrix

In [5]:
SimilarUsers = pd.DataFrame()
UserCoefficients = pd.DataFrame()
TopSimilarItems = pd.DataFrame()
TopMovieCoefficients = pd.DataFrame()

Ratings = load_dataset(("Ratings", ratings_path))
Ratings, PivotedUserMatrix, PivotedMoviesMatrix = process_ratings(Ratings)
distinct_users = np.unique(Ratings['userId'])
distinct_movies = np.unique(Ratings['movieId'])
Movies = load_dataset(("Movies", movies_path))

Ratings is successfully read from memory.
Movies is successfully read from memory.


In [6]:
def unskewed_pearson_similarity(v1, v2):
    dot_product = v1.transpose().dot(v2)
    v1_vector_length = math.sqrt(v1.transpose().dot(v1))
    v2_vector_length = math.sqrt(v2.transpose().dot(v2))
    if v1_vector_length < 0.0000001 or v2_vector_length < 0.0000001 or (v1==v2).all():
        return 0
    else:
        return dot_product / v1_vector_length / v2_vector_length

**2. Computation of User-User and Item-Item similarity matrices:** 

In [7]:
def user_similarity_matrix():
    SimilarUsers = pd.DataFrame(0, index=np.arange(distinct_users.size), columns=np.arange(distinct_users.size), dtype='float')
    UserCoefficients = pd.DataFrame(0, index=np.arange(distinct_users.size), columns=np.arange(distinct_users.size), dtype='float')
    
    for user in distinct_users:
        userIndex = np.searchsorted(distinct_users, user)
        
        for user2 in distinct_users:
            user2Index = np.searchsorted(distinct_users, user2)
            
            proximity = unskewed_pearson_similarity(PivotedUserMatrix.iloc[:,userIndex], PivotedUserMatrix.iloc[:,user2Index])
            SimilarUsers[user2Index][userIndex] = proximity
            
        similarity_values = np.copy(SimilarUsers.values[userIndex])
        SimilarUsers.values[userIndex] = np.argsort(SimilarUsers.values[userIndex])[::-1]
        similarity_values.sort()
        similarity_values = similarity_values[::-1]
        UserCoefficients.values[userIndex] = np.where(similarity_values > 0, similarity_values, 0)
        SimilarUsers.values[userIndex] = np.where(similarity_values > 0, SimilarUsers.values[userIndex], 0)
        print("Calculated for {} users out of {}.").format(userIndex+1, distinct_users.size)
    
    UserCoefficients.to_csv(user_coefficients_path, index=False)
    SimilarUsers.to_csv(similar_users_path, index=False)
    return SimilarUsers, UserCoefficients

In [8]:
def item_similarity_matrix():
    MoviesMatrix = pd.DataFrame(0, index=np.arange(distinct_movies.size), columns=np.arange(distinct_movies.size), dtype='float')
    TopSimilarItems = pd.DataFrame(0, index=np.arange(distinct_movies.size), columns=np.arange(1000), dtype='float')
    MovieCoefficients = pd.DataFrame(0, index=np.arange(distinct_movies.size), columns=np.arange(distinct_movies.size), dtype='float')
    TopMovieCoefficients = pd.DataFrame(0, index=np.arange(distinct_movies.size), columns=np.arange(1000), dtype='float')

    for movie in distinct_movies:
        movieIndex = np.searchsorted(distinct_movies, movie)

        for movie2 in distinct_movies:
            movie2Index = np.searchsorted(distinct_movies, movie2)
            proximity = unskewed_pearson_similarity(PivotedMoviesMatrix.iloc[:,movieIndex], PivotedMoviesMatrix.iloc[:,movie2Index])
            MoviesMatrix[movie2Index][movieIndex] = proximity

        similarity_values = np.copy(MoviesMatrix.values[movieIndex])
        MoviesMatrix.values[movieIndex] = np.argsort(MoviesMatrix.values[movieIndex])[::-1]
        similarity_values.sort()
        similarity_values = similarity_values[::-1]

        MovieCoefficients.values[movieIndex] = np.where(similarity_values > 0, similarity_values, 0)
        TopMovieCoefficients.values[movieIndex] = MovieCoefficients.values[movieIndex][:1000]
        MoviesMatrix.values[movieIndex] = np.where(similarity_values > 0, MoviesMatrix.values[movieIndex], 0)
        TopSimilarItems.values[movieIndex] = MoviesMatrix.values[movieIndex][:1000]
        print("Calculated for {} items out of {}.").format(movieIndex+1, distinct_movies.size)

    TopSimilarItems.to_csv(top_similar_items_path, index=False)
    TopMovieCoefficients.to_csv(top_movie_coefficients_path, index=False)
    return TopSimilarItems, TopMovieCoefficients

**3. Loading the similarity matrices from the memory, or recalculating if missing:**

In [9]:
SimilarUsers = load_precalculated(SimilarUsers, "SimilarUsers", similar_users_path, user_similarity_matrix, 0)
UserCoefficients = load_precalculated(UserCoefficients, "UserCoefficients", user_coefficients_path, user_similarity_matrix, 1)    
TopSimilarItems = load_precalculated(TopSimilarItems, "TopSimilarItems", top_similar_items_path, item_similarity_matrix, 0)
TopMovieCoefficients = load_precalculated(TopMovieCoefficients, "TopMovieCoefficients", top_movie_coefficients_path, item_similarity_matrix, 1)    

SimilarUsers is successfully read from memory.
UserCoefficients is successfully read from memory.
TopSimilarItems is successfully read from memory.
TopMovieCoefficients is successfully read from memory.


**4. Calculating top user-based and item-based collaborative recommendations for a particular user utilizing similarity matrices from the previous step:**

In [10]:
def accumulate_user_recommendations(userId, recommenders):
    recommendations = np.zeros(distinct_movies.size)
    userIndex = np.searchsorted(distinct_users, userId)
    recommendersProximity = UserCoefficients.values[userIndex]

    for recommender, proximity in zip(recommenders, recommendersProximity):
        recommenderIndex = np.searchsorted(distinct_users, recommender)
        recommenderRatings = PivotedUserMatrix.iloc[:,recommenderIndex]
        recommendation_vector = (recommenderRatings*proximity).values
        recommendations += recommendation_vector

    return recommendations

In [11]:
def accumulate_item_recommendations(userId, user_preferences, user_rates):
    recommendations = np.zeros(distinct_movies.size)
    
    for preference, rate in zip(user_preferences, user_rates):
        preferenceIndex = np.searchsorted(distinct_movies, preference)
        preferenceTwins = TopSimilarItems.values[preferenceIndex].astype(int)
        twinsProximity = TopMovieCoefficients.values[preferenceIndex]
        
        for twinId, twinProximity in zip(preferenceTwins, twinsProximity):
            if twinId > 0:
                twinIndex = np.searchsorted(distinct_movies, twinId)
                recommendations[twinIndex] += rate*twinProximity

    return recommendations

In [12]:
def user_collaborative_recommendations(userId):
    userIndex = np.searchsorted(np.unique(Ratings['userId']), userId)
    recommenders = np.extract(SimilarUsers.values[userIndex]>0, SimilarUsers.values[userIndex])
    acc_recommendations = accumulate_user_recommendations(userId, recommenders)
    user_recommendations = distinct_movies[np.argsort(acc_recommendations)][::-1]
    acc_recommendations.sort()
    sorted_recommendations = acc_recommendations[::-1]
    user_recommendations = np.extract(sorted_recommendations>0, user_recommendations)
    return user_recommendations

In [13]:
def item_collaborative_recommendations(userId):
    userIndex = np.searchsorted(distinct_users, userId)
    user_rates = PivotedMoviesMatrix.values[userIndex]
    user_rates = np.where(user_rates>0, user_rates, 0)
    user_preferences = distinct_movies[np.argsort(user_rates)][::-1]
    
    user_rates.sort()
    user_rates = user_rates[::-1]
    user_rates = np.extract(user_rates>0, user_rates)
    user_preferences = user_preferences[:user_rates.size]
    
    acc_recommendations = accumulate_item_recommendations(userId, user_preferences, user_rates)
    item_recommendations = distinct_movies[np.argsort(acc_recommendations)][::-1]
    acc_recommendations.sort()
    acc_recommendations = acc_recommendations[::-1]
    item_recommendations = np.extract(acc_recommendations>0, item_recommendations)

    return item_recommendations

**Examples:**

In [110]:
def user_profile(userId):
    userIndex = np.searchsorted(distinct_users, userId)
    user_rates = PivotedMoviesMatrix.values[userIndex]
    user_rates = np.where(user_rates!=0, user_rates, 0)
    userChoices = distinct_movies[np.argsort(user_rates)][::-1]
    user_rates.sort()
    user_rates = user_rates[::-1]
    user_rates = np.extract(user_rates!=0, user_rates)
    userChoices = userChoices[:user_rates.size]
    userPreferences = pd.DataFrame(data={'movieId':userChoices, 'rating':user_rates})
    UserProfile = Movies[Movies['movieId'].isin(userChoices)].set_index('movieId').join(userPreferences.set_index('movieId')).sort_values('rating', ascending=False)
    return UserProfile

In [113]:
userId=1
user_profile(userId)

Unnamed: 0_level_0,title,genres,rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1172,Cinema Paradiso (Nuovo cinema Paradiso) (1989),Drama,1.89087
1953,"French Connection, The (1971)",Action|Crime|Thriller,1.89087
2105,Tron (1982),Action|Adventure|Sci-Fi,1.89087
1339,Dracula (Bram Stoker's Dracula) (1992),Fantasy|Horror|Romance|Thriller,1.083462
1029,Dumbo (1941),Animation|Children|Drama|Musical,0.464933
2150,"Gods Must Be Crazy, The (1980)",Adventure|Comedy,0.464933
3671,Blazing Saddles (1974),Comedy|Western,0.464933
1061,Sleepers (1996),Thriller,0.464933
3792,Duel in the Sun (1946),Drama|Romance|Western,-0.05003
3785,Scary Movie (2000),Comedy|Horror,-0.05003


In [114]:
users_recommend = user_collaborative_recommendations(userId)[:20]
Movies[Movies['movieId'].isin(users_recommend)]

Unnamed: 0,movieId,title,genres
31,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
45,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
48,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
100,110,Braveheart (1995),Action|Drama|War
101,111,Taxi Driver (1976),Crime|Drama|Thriller
232,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
266,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
284,318,"Shawshank Redemption, The (1994)",Crime|Drama
321,356,Forrest Gump (1994),Comedy|Drama|Romance|War
427,480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller


In [115]:
movies_recommend = item_collaborative_recommendations(userId)[:20]
Movies[Movies['movieId'].isin(movies_recommend)]

Unnamed: 0,movieId,title,genres
641,775,Spirits of the Dead (1968),Horror|Mystery
1216,1513,Romy and Michele's High School Reunion (1997),Comedy
5175,7541,100 Girls (2000),Comedy|Romance
5192,7613,White Palace (1990),Drama
5204,7669,Pride and Prejudice (1995),Drama|Romance
5216,7728,"Postman Always Rings Twice, The (1946)",Crime|Drama|Film-Noir|Thriller
5262,7872,Getting It Right (1989),Comedy|Drama
5274,7914,Berlin: Symphony of a Great City (Berlin: Die ...,Documentary
5328,8093,Shiri (Swiri) (1999),Action|Drama|Romance|Thriller
5330,8117,In China They Eat Dogs (I Kina spiser de hunde...,Action|Comedy
