In [354]:
import pandas as pd
import numpy as np
import math

# Locations of the input CSV files, relative to the notebook's working directory.
ratings_path = "./ml-latest-small/ratings.csv"
# NOTE(review): movies_path and tags_path are not referenced in the visible cells;
# presumably they are used further down the notebook - confirm before removing.
movies_path = "./ml-latest-small/movies.csv"
tags_path = "./ml-latest-small/tags.csv"
# On-disk cache of the user-user similarity table: read by the loading cell and
# written by user_similarity_matrix().
similar_users_path = "./similar_users.csv"

# Empty placeholders; both names are rebound by the loading cell.
Ratings = pd.DataFrame()
SimilarUsers = pd.DataFrame()

In [362]:
def load_dataset(dataset):
    """Read one CSV file into a DataFrame and report the outcome on stdout.

    Parameters
    ----------
    dataset : sequence
        ``dataset[0]`` is a display name used only in the printed message,
        ``dataset[1]`` is the path handed to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame or None
        The parsed frame, or ``None`` when the file could not be read.
        Callers must check for ``None`` before using the result.
    """
    dataset_name = dataset[0]
    dataset_path = dataset[1]
    try:
        data = pd.read_csv(dataset_path)
    except Exception:
        # Best-effort load: a missing or unparsable file is reported, not raised.
        # ``Exception`` (rather than a bare ``except``) lets KeyboardInterrupt
        # and SystemExit propagate.
        print("CAUTION: {} cannot be read from memory.".format(dataset_name))
        return None
    # Format the string *before* printing: ``print(...).format(...)`` calls
    # ``.format`` on print's return value (None) under Python 3.
    print("{} is successfully read from memory.".format(dataset_name))
    return data

Ratings = load_dataset(("Ratings", ratings_path))
SimilarUsers = load_dataset(("SimilarUsers", similar_users_path))
# load_dataset returns None when the file cannot be read, so test for None
# before touching ``.empty``; either condition means there is no usable cache.
if SimilarUsers is None or SimilarUsers.empty:
    print("User similarity matrix is being recalculated...\n It might last a while...")
    # NOTE(review): user_similarity_matrix is defined in a later cell, so on a
    # fresh kernel that cell has to be executed before this branch can run.
    SimilarUsers = user_similarity_matrix()

Ratings is successfully read from memory.
SimilarUsers is successfully read from memory.


In [353]:
def _add_unskewed_ratings(ratings):
    """Return a new frame: `ratings` plus rating_mean, rating_adjusted, mean_skewness and rating_unskewed."""
    derived = ['rating_mean', 'rating_adjusted', 'mean_skewness', 'rating_unskewed']
    # Drop stale derived columns first so the step can be re-run on an already
    # enriched frame; ``drop`` returns a new object, the input is not mutated.
    enriched = ratings.drop(columns=[name for name in derived if name in ratings.columns])

    enriched['rating_mean'] = enriched.groupby('userId', sort=False)['rating'].transform('mean')
    enriched['rating_adjusted'] = enriched['rating'] - enriched['rating_mean']

    above_mean = enriched['rating_adjusted'] > 0
    below_mean = enriched['rating_adjusted'] < 0
    # 5 and 0.5 are the bounds used by the original formula (presumably the
    # limits of the rating scale - confirm against the dataset).
    # ``where`` zeroes the rows a term does not apply to, so a user whose mean
    # sits exactly on a bound contributes 0 instead of the NaN that 0/0 * 0 gave.
    share_of_room_above = (enriched['rating_adjusted'] / (5 - enriched['rating_mean'])).where(above_mean, 0.0)
    share_of_room_below = (enriched['rating_adjusted'] / (enriched['rating_mean'] - 0.5)).where(below_mean, 0.0)
    enriched['mean_skewness'] = share_of_room_above - share_of_room_below
    enriched['rating_unskewed'] = enriched['rating_adjusted'] * np.sqrt(1 + (enriched['mean_skewness'] ** 2) * 2)
    return enriched


def _pairwise_user_similarity(enriched):
    """Return one row per ordered user pair sharing at least one movie, with both similarity scores."""
    result_columns = ['userId', 'userId2', 'pearson', 'unskewed_pearson']
    base = enriched[['userId', 'movieId', 'rating_adjusted', 'rating_unskewed']]

    # Length of each user's mean-centred rating vector over ALL movies the user
    # rated (not only the shared ones); computed once instead of once per pair.
    norms = np.sqrt((base['rating_adjusted'] ** 2).groupby(base['userId']).sum())

    others = base.rename(columns={
        'userId': 'userId2',
        'rating_adjusted': 'rating_adjusted2',
        'rating_unskewed': 'rating_unskewed2',
    })

    frames = []
    for user1, user1_data in base.groupby('userId', sort=True):
        # A single merge against every other user's ratings replaces the inner
        # per-user loop; the inner join keeps only movies both users rated.
        shared = user1_data.merge(others, on='movieId', how='inner', sort=False)
        shared['vector_product'] = shared['rating_adjusted'] * shared['rating_adjusted2']
        shared['unskewed_product'] = shared['rating_unskewed'] * shared['rating_unskewed2']
        sums = shared.groupby('userId2', as_index=False, sort=True)[['vector_product', 'unskewed_product']].sum()

        # A zero-length vector gives a NaN/inf score here; NaN rows are removed
        # by the caller's ``pearson <`` filter because NaN comparisons are False.
        denominator = norms.loc[user1] * sums['userId2'].map(norms)
        frames.append(pd.DataFrame({
            'userId': user1,
            'userId2': sums['userId2'],
            'pearson': sums['vector_product'] / denominator,
            'unskewed_pearson': sums['unskewed_product'] / denominator,
        }, columns=result_columns))

    if not frames:
        return pd.DataFrame(columns=result_columns)
    # Concatenate once: growing a frame with append() inside the loop is
    # quadratic, and DataFrame.append no longer exists in pandas 2.x.
    return pd.concat(frames, ignore_index=True)


def user_similarity_matrix(ratings=None, output_path=None):
    """Compute user-user similarity scores and cache them as CSV.

    For every ordered pair of users with at least one movie in common, two
    scores are produced: ``pearson`` (dot product of mean-centred ratings over
    the shared movies, divided by the product of both users' full vector
    lengths) and ``unskewed_pearson`` (same, with the skew-corrected ratings in
    the numerator).

    Parameters
    ----------
    ratings : pandas.DataFrame, optional
        Frame with ``userId``, ``movieId`` and ``rating`` columns. When omitted,
        the module-level ``Ratings`` frame is used AND rebound to its enriched
        version, because user_collaborative_recommendations reads
        ``Ratings['rating_unskewed']``. When given, no global is modified.
    output_path : str, optional
        Destination CSV; defaults to the module-level ``similar_users_path``.

    Returns
    -------
    pandas.DataFrame
        Columns ``userId``, ``userId2``, ``pearson``, ``unskewed_pearson``.
    """
    # Without this declaration the assignment below makes ``Ratings`` local and
    # the first read of it raises UnboundLocalError.
    global Ratings

    use_global_ratings = ratings is None
    enriched = _add_unskewed_ratings(Ratings if use_global_ratings else ratings)
    if use_global_ratings:
        Ratings = enriched
    if output_path is None:
        output_path = similar_users_path

    similar_users = _pairwise_user_similarity(enriched)

    # Removes each user's pairing with themselves (score 1). Note that any other
    # pair scoring at or above the threshold is removed as well.
    self_match_threshold = 0.99999
    result = similar_users[similar_users['pearson'] < self_match_threshold]

    # index=False keeps the cache free of an "Unnamed: 0" column on reload.
    result.to_csv(output_path, index=False)
    return result

In [319]:
def user_twins(userId, top_n=30, similar_users=None):
    """Return the users most similar to `userId` as a 2-D array.

    The `top_n` rows with the highest ``unskewed_pearson`` for `userId` are
    taken first; rows whose ``pearson`` or ``unskewed_pearson`` is not strictly
    positive are then discarded, so fewer than `top_n` rows may come back.

    Parameters
    ----------
    userId : int
        User whose neighbours are wanted.
    top_n : int, optional
        Number of candidates taken before the positivity filter (default 30).
    similar_users : pandas.DataFrame, optional
        Similarity table with ``userId``, ``userId2``, ``pearson`` and
        ``unskewed_pearson`` columns; defaults to the module-level
        ``SimilarUsers``.

    Returns
    -------
    numpy.ndarray
        Shape ``(k, 2)``: column 0 is the twin's user id, column 1 its
        ``unskewed_pearson``, ordered by descending ``unskewed_pearson``.
    """
    if similar_users is None:
        similar_users = SimilarUsers

    candidates = similar_users[similar_users['userId'] == userId]
    candidates = candidates.sort_values(['unskewed_pearson'], ascending=False).head(top_n)
    candidates = candidates[(candidates['pearson'] > 0) & (candidates['unskewed_pearson'] > 0)]

    # Select the two output columns by name instead of dropping the others: a
    # cache reloaded from CSV may carry extra columns (e.g. a saved index),
    # which would otherwise shift the positions callers index into.
    # ``.values`` replaces ``as_matrix()``, which was removed in pandas 1.0.
    return candidates[['userId2', 'unskewed_pearson']].values

def user_collaborative_recommendations(userId):
    """Rank movies for `userId` from the ratings of that user's most similar users.

    Each twin returned by ``user_twins(userId)`` contributes the movies it rated
    above its own mean (``rating_unskewed > 0``), weighted by the twin's
    similarity score. Weighted scores are summed per movie.

    Parameters
    ----------
    userId : int
        User to produce recommendations for.

    Returns
    -------
    numpy.ndarray
        Movie ids ordered by descending summed weighted score; empty when the
        user has no twins.

    Raises
    ------
    KeyError
        If the module-level ``Ratings`` frame has no ``rating_unskewed`` column.

    Notes
    -----
    Movies that `userId` has already rated are not filtered out here.
    """
    top_twins = user_twins(userId)
    if len(top_twins) == 0:
        # Nothing to aggregate; sorting an empty, column-less frame by
        # 'proximity_rating' would raise KeyError.
        return np.array([], dtype=int)

    if 'rating_unskewed' not in Ratings.columns:
        raise KeyError(
            "Ratings has no 'rating_unskewed' column; it is added by "
            "user_similarity_matrix(), which is not run when the similarity "
            "table is loaded from its CSV cache."
        )

    weighted_frames = []
    for twin in top_twins:
        twin_id = int(twin[0])
        weight = twin[1]
        twin_data = Ratings[Ratings['userId'] == twin_id]
        twin_data = twin_data[twin_data['rating_unskewed'] > 0].sort_values(['rating_unskewed'], ascending=False)
        # assign() builds a new frame, avoiding a write to a filtered slice of
        # Ratings (SettingWithCopyWarning).
        weighted_frames.append(twin_data.assign(
            proximity_weight=weight,
            proximity_rating=twin_data['rating_unskewed'] * weight,
        ))

    # Concatenate once; DataFrame.append no longer exists in pandas 2.x.
    twin_recommendations = pd.concat(weighted_frames, ignore_index=True)
    twin_recommendations = twin_recommendations.sort_values(['proximity_rating'], ascending=False)

    movie_scores = twin_recommendations.groupby(['movieId'], as_index=False, sort=False)['proximity_rating'].agg('sum')
    movie_scores = movie_scores.sort_values(['proximity_rating'], ascending=False)
    # Pick the id column by name rather than by position.
    return movie_scores['movieId'].values