# **Collaborative filtering implementation:**

**1. Library importing, data initialization and auxilary functions definition:**

In [76]:
import pandas as pd
import numpy as np
import math

ratings_path = "./ml-latest-small/ratings.csv"
movies_path = "./ml-latest-small/movies.csv"
tags_path = "./ml-latest-small/tags.csv"
similar_users_path = "./similar_users.csv"
user_coefficients_path = "./user_coefficients.csv"
top_similar_items_path = "./top_similar_items.csv"
top_movie_coefficients_path = "./top_item_coefficients.csv"

user_recommendations_path = "./user_recommendations/"
item_recommendations_path = "./item_recommendations/"

In [77]:
def load_dataset(dataset):
    dataset_name = dataset[0]
    dataset_path = dataset[1]
    try:
        data = pd.read_csv(dataset_path)
        print("{} is successfully read from memory.").format(dataset_name)
        return data
    except:
        print("CAUTION: {} cannot be read from memory.").format(dataset_name)
        return None

In [78]:
def load_precalculated(data, name, path, recalculator, arg):
    data = load_dataset((name, path))
    if data is None:
        print("{} data is being recalculated... It might take a while...").format(name)
        data = recalculator()[arg]
    return data

In [79]:
def process_ratings(Ratings):
    MeanUserRating = Ratings.groupby(['userId'], as_index = False, sort = False).mean()\
    .rename(columns = {'rating': 'rating_mean'})[['userId','rating_mean']]
    
    Ratings = pd.merge(Ratings, MeanUserRating, on = 'userId', how = 'left', sort = False)
    Ratings['rating_adjusted'] = Ratings['rating'] - Ratings['rating_mean']
    Ratings['mean_skewness'] = (Ratings['rating_adjusted'])/(5-Ratings['rating_mean'])*(Ratings['rating_adjusted']>0).astype(float)\
    - (Ratings['rating_adjusted'])/(Ratings['rating_mean']-0.5)*(Ratings['rating_adjusted']<0).astype(float)
    
    Ratings['rating_unskewed'] = Ratings['rating_adjusted'] * np.sqrt(1+(Ratings['mean_skewness']**2)*2) 
    PivotedUserMatrix = Ratings.pivot_table(index='movieId', columns='userId', values='rating_unskewed', fill_value=0)
    PivotedMoviesMatrix = Ratings.pivot_table(index='userId', columns='movieId', values='rating_unskewed', fill_value=0)
    return Ratings, PivotedUserMatrix, PivotedMoviesMatrix

In [80]:
SimilarUsers = pd.DataFrame()
UserCoefficients = pd.DataFrame()
TopSimilarItems = pd.DataFrame()
TopMovieCoefficients = pd.DataFrame()

Ratings = load_dataset(("Ratings", ratings_path))
Ratings, PivotedUserMatrix, PivotedMoviesMatrix = process_ratings(Ratings)
distinct_users = np.unique(Ratings['userId'])
distinct_movies = np.unique(Ratings['movieId'])
Movies = load_dataset(("Movies", movies_path))

Ratings is successfully read from memory.
Movies is successfully read from memory.


In [81]:
def calc_vectors_length(matrix):
    vectors_length = pd.DataFrame(columns=['length'], dtype='float')
    for column in matrix:
        vector = matrix[column]
        vector_length = math.sqrt(vector*vector)
        vectors_length = vectors_length.append({'length': vector_length}, ignore_index=True)
    return vectors_length

In [82]:
def unskewed_pearson_similarity(v1, v2, v1_length, v2_length):
    dot_product = v1*v2
    if v1_length < 0.0000001 or v2_length < 0.0000001 or (v1==v2).all():
        return 0
    else:
        return dot_product / v1_length / v2_length

**2. Computation of User-User and Item-Item similarity matrices:** 

In [83]:
def user_similarity_matrix():
    SimilarUsers = pd.DataFrame(0, index=np.arange(distinct_users.size), columns=np.arange(distinct_users.size), dtype='float')
    UserCoefficients = pd.DataFrame(0, index=np.arange(distinct_users.size), columns=np.arange(distinct_users.size), dtype='float')
    user_vectors_length = calc_vectors_length(PivotedUserMatrix)
    
    for user in distinct_users:
        userIndex = np.searchsorted(distinct_users, user)
        
        for user2 in distinct_users:
            user2Index = np.searchsorted(distinct_users, user2)
            
            proximity = unskewed_pearson_similarity(PivotedUserMatrix.iloc[:,userIndex],\
                                                    PivotedUserMatrix.iloc[:,user2Index],\
                                                    user_vectors_length.values[userIndex][0],\
                                                    user_vectors_length.values[user2Index][0])
            SimilarUsers[user2Index][userIndex] = proximity
            
        similarity_values = np.copy(SimilarUsers.values[userIndex])
        SimilarUsers.values[userIndex] = np.argsort(SimilarUsers.values[userIndex])[::-1]
        similarity_values.sort()
        similarity_values = similarity_values[::-1]
        UserCoefficients.values[userIndex] = np.where(similarity_values > 0, similarity_values, 0)
        SimilarUsers.values[userIndex] = np.where(similarity_values > 0, SimilarUsers.values[userIndex], 0)
        print("Calculated for {} users out of {}.").format(userIndex+1, distinct_users.size)
    
    UserCoefficients.to_csv(user_coefficients_path, index=False)
    SimilarUsers.to_csv(similar_users_path, index=False)
    return SimilarUsers, UserCoefficients

In [84]:
def item_similarity_matrix():
    MoviesMatrix = pd.DataFrame(0, index=np.arange(distinct_movies.size), columns=np.arange(distinct_movies.size), dtype='float')
    TopSimilarItems = pd.DataFrame(0, index=np.arange(distinct_movies.size), columns=np.arange(1000), dtype='float')
    MovieCoefficients = pd.DataFrame(0, index=np.arange(distinct_movies.size), columns=np.arange(distinct_movies.size), dtype='float')
    TopMovieCoefficients = pd.DataFrame(0, index=np.arange(distinct_movies.size), columns=np.arange(1000), dtype='float')
    movie_vectors_length = calc_vectors_length(PivotedMoviesMatrix)
    
    for movie in distinct_movies:
        movieIndex = np.searchsorted(distinct_movies, movie)

        for movie2 in distinct_movies:
            movie2Index = np.searchsorted(distinct_movies, movie2)
            
            proximity = unskewed_pearson_similarity(PivotedMoviesMatrix.iloc[:,movieIndex],\
                                                    PivotedMoviesMatrix.iloc[:,movie2Index],\
                                                    movie_vectors_length.values[movieIndex][0],\
                                                    movie_vectors_length.values[movie2Index][0])
            MoviesMatrix[movie2Index][movieIndex] = proximity

        similarity_values = np.copy(MoviesMatrix.values[movieIndex])
        MoviesMatrix.values[movieIndex] = np.argsort(MoviesMatrix.values[movieIndex])[::-1]
        similarity_values.sort()
        similarity_values = similarity_values[::-1]

        MovieCoefficients.values[movieIndex] = np.where(similarity_values > 0, similarity_values, 0)
        TopMovieCoefficients.values[movieIndex] = MovieCoefficients.values[movieIndex][:1000]
        MoviesMatrix.values[movieIndex] = np.where(similarity_values > 0, MoviesMatrix.values[movieIndex], 0)
        TopSimilarItems.values[movieIndex] = MoviesMatrix.values[movieIndex][:1000]
        print("Calculated for {} items out of {}.").format(movieIndex+1, distinct_movies.size)

    TopSimilarItems.to_csv(top_similar_items_path, index=False)
    TopMovieCoefficients.to_csv(top_movie_coefficients_path, index=False)
    return TopSimilarItems, TopMovieCoefficients

**3. Loading the similarity matrices from the memory, or recalculating if missing:**

In [85]:
SimilarUsers = load_precalculated(SimilarUsers, "SimilarUsers", similar_users_path, user_similarity_matrix, 0)
UserCoefficients = load_precalculated(UserCoefficients, "UserCoefficients", user_coefficients_path, user_similarity_matrix, 1)    
TopSimilarItems = load_precalculated(TopSimilarItems, "TopSimilarItems", top_similar_items_path, item_similarity_matrix, 0)
TopMovieCoefficients = load_precalculated(TopMovieCoefficients, "TopMovieCoefficients", top_movie_coefficients_path, item_similarity_matrix, 1)    

SimilarUsers is successfully read from memory.
UserCoefficients is successfully read from memory.
TopSimilarItems is successfully read from memory.
TopMovieCoefficients is successfully read from memory.


**4. Calculating top user-based and item-based collaborative recommendations for a particular user utilizing similarity matrices from the previous step:**

In [183]:
def accumulate_user_recommendations(userId, recommenders):
    recommendations = np.zeros(distinct_movies.size)
    userIndex = np.searchsorted(distinct_users, userId)
    recommendersProximity = UserCoefficients.values[userIndex]

    for recommender, proximity in zip(recommenders, recommendersProximity):
        recommenderIndex = np.searchsorted(distinct_users, recommender)
        recommenderRatings = PivotedUserMatrix.iloc[:,recommenderIndex]
        recommendation_vector = (recommenderRatings*proximity).values
        recommendations += recommendation_vector

    return recommendations

In [184]:
def accumulate_item_recommendations(userId, user_preferences, user_rates):
    recommendations = np.zeros(distinct_movies.size)
    
    for preference, rate in zip(user_preferences, user_rates):
        preferenceIndex = np.searchsorted(distinct_movies, preference)
        preferenceTwins = TopSimilarItems.values[preferenceIndex].astype(int)
        twinsProximity = TopMovieCoefficients.values[preferenceIndex]
        
        for twinId, twinProximity in zip(preferenceTwins, twinsProximity):
            if twinId > 0:
                twinIndex = np.searchsorted(distinct_movies, twinId)
                recommendations[twinIndex] += rate*twinProximity

    return recommendations

In [193]:
def user_collaborative_recommendations(userId):
    userIndex = np.searchsorted(np.unique(Ratings['userId']), userId)
    recommenders = np.extract((SimilarUsers.values[userIndex])>0, SimilarUsers.values[userIndex])
    acc_recommendations = accumulate_user_recommendations(userId, recommenders)
    user_recommendations = distinct_movies[np.argsort(acc_recommendations)][::-1]
    acc_recommendations.sort()
    sorted_recommendations = acc_recommendations[::-1]
    user_recommendations = np.extract(sorted_recommendations>0, user_recommendations)
    return user_recommendations[:1000]

In [186]:
def item_collaborative_recommendations(userId):
    userIndex = np.searchsorted(distinct_users, userId)
    user_rates = PivotedMoviesMatrix.values[userIndex]
    user_rates = np.where(user_rates>0, user_rates, 0)
    user_preferences = distinct_movies[np.argsort(user_rates)][::-1]
    
    user_rates.sort()
    user_rates = user_rates[::-1]
    user_rates = np.extract(user_rates>0, user_rates)
    user_preferences = user_preferences[:user_rates.size]
    
    acc_recommendations = accumulate_item_recommendations(userId, user_preferences, user_rates)
    item_recommendations = distinct_movies[np.argsort(acc_recommendations)][::-1]
    acc_recommendations.sort()
    acc_recommendations = acc_recommendations[::-1]
    item_recommendations = np.extract(acc_recommendations>0, item_recommendations)

    return item_recommendations[:1000]

In [197]:
#def generate_recommendation_files():
for userId in PivotedMoviesMatrix.index[35:]:
    print(user_collaborative_recommendations(userId)[:20])
#         pd.DataFrame({'user_recommendation': user_collaborative_recommendations(userId)}, dtype='int')\
#         .to_csv(user_recommendations_path + '/' + str(userId), index=False)

#         pd.DataFrame({'item_recommendation': item_collaborative_recommendations(userId)}, dtype='int')\
#         .to_csv(item_recommendations_path + '/' + str(userId), index=False)

#generate_recommendation_files()

[ 318  296  593  356  858  260   50  527 2858  608 1196 2571  589 1210
 2762    1 1198 1221   47 2959]
[ 318  296  593  527  356  608   50 2959 2858  858  260 2571 1196 1198
 4993  589 1197 1704  111   47]
[ 318  296  260  858  356  593 2571  527  608 1196   50 2858  110 2762
 1198 1210 2959 1197 4993  912]
[ 318  296  593  527  608  858  356  260 1196 1198 2571   50   47 1213
 1221  110  912 2858 1270 1197]
[ 318  296  858  527  593 2858   50  260 2959 2571  608  111 1193 1270
 1198   47  356 4993 4226 1968]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]

**Examples:**

In [91]:
def user_profile(userId):
    userIndex = np.searchsorted(distinct_users, userId)
    user_rates = PivotedMoviesMatrix.values[userIndex]
    userChoices = distinct_movies[np.argsort(user_rates)]
    user_rates.sort()
    userChoices = np.extract(user_rates!=0, userChoices)
    user_rates = np.extract(user_rates!=0, user_rates)
    userPreferences = pd.DataFrame(data={'movieId':userChoices, 'rating':user_rates})
    UserProfile = Movies[Movies['movieId'].isin(userChoices)].set_index('movieId').join(userPreferences.set_index('movieId')).sort_values('rating', ascending=False)
    return UserProfile

In [19]:
userId=0
user_profile(userId)

Unnamed: 0_level_0,title,genres,rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1172,Cinema Paradiso (Nuovo cinema Paradiso) (1989),Drama,1.89087
1953,"French Connection, The (1971)",Action|Crime|Thriller,1.89087
2105,Tron (1982),Action|Adventure|Sci-Fi,1.89087
1339,Dracula (Bram Stoker's Dracula) (1992),Fantasy|Horror|Romance|Thriller,1.083462
1029,Dumbo (1941),Animation|Children|Drama|Musical,0.464933
2150,"Gods Must Be Crazy, The (1980)",Adventure|Comedy,0.464933
3671,Blazing Saddles (1974),Comedy|Western,0.464933
1061,Sleepers (1996),Thriller,0.464933
3792,Duel in the Sun (1946),Drama|Romance|Western,-0.05003
3785,Scary Movie (2000),Comedy|Horror,-0.05003


In [154]:
users_recommend = pd.read_csv(user_recommendations_path + '/' + str(userId))
users_recommend = users_recommend.as_matrix().flatten()[:]
Movies[Movies['movieId'].isin(users_recommend)]

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
5,6,Heat (1995),Action|Crime|Thriller
15,16,Casino (1995),Crime|Drama
16,17,Sense and Sensibility (1995),Drama|Romance
24,25,Leaving Las Vegas (1995),Drama|Romance
27,28,Persuasion (1995),Drama|Romance
28,29,"City of Lost Children, The (Cité des enfants p...",Adventure|Drama|Fantasy|Mystery|Sci-Fi
31,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
34,36,Dead Man Walking (1995),Crime|Drama
37,39,Clueless (1995),Comedy|Romance


In [155]:
movies_recommend = pd.read_csv(item_recommendations_path + '/' + str(1))
movies_recommend = movies_recommend.as_matrix().flatten()[:]
Movies[Movies['movieId'].isin(movies_recommend)]

Unnamed: 0,movieId,title,genres
33,35,Carrington (1995),Drama|Romance
72,78,"Crossing Guard, The (1995)",Action|Crime|Drama|Thriller
97,105,"Bridges of Madison County, The (1995)",Drama|Romance
103,113,Before and After (1996),Drama|Mystery
115,129,Pie in the Sky (1996),Comedy|Romance
176,200,"Tie That Binds, The (1995)",Thriller
187,213,Burnt by the Sun (Utomlyonnye solntsem) (1994),Drama
192,218,Boys on the Side (1995),Comedy|Drama
238,266,Legends of the Fall (1994),Drama|Romance|War|Western
278,312,Stuart Saves His Family (1995),Comedy


In [188]:
for userId in PivotedMoviesMatrix.index:
    users_recommend = pd.read_csv(user_recommendations_path + '/' + str(userId))
    users_recommend = users_recommend.as_matrix().flatten()[:]
    #print (userId, users_recommend[:10])
    movies_recommend = pd.read_csv(item_recommendations_path + '/' + str(userId))
    movies_recommend = movies_recommend.as_matrix().flatten()[:]
    in_common = np.intersect1d(users_recommend, movies_recommend, assume_unique=False)
    print (in_common.size)

107
149
109
113
106
110
119
110
129
138
106
120
113
139
103
129
115
138
118
121
116
124
104
153
133
117
134
138
115
110
126
124
96
115
224
153
120
102
133
87
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
