In [1]:
# importing packages

import numpy as np
import pandas as pd
from math import sqrt
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split


# column headers for the dataset
data_cols = ['userID', 'movieID', 'rating', 'timestamp']
item_cols = ['movieID', 'movie_title', 'release_date', 'video_release_date','IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama','Fantasy','Film-Noir','Horror', 'Musical','Mystery','Romance ','Sci-Fi','Thriller', 'War', 'Western']
user_cols = ['userID', 'age', 'gender', 'occupation', 'zip_code']

# importing the data files onto dataframes
data = pd.read_csv('./ml-100k/u.data', sep='\t', names=data_cols, encoding='latin-1')
item = pd.read_csv('./ml-100k/u.item', sep='|', names=item_cols, encoding='latin-1')
users = pd.read_csv('./ml-100k/u.user', sep='|', names=user_cols, encoding='latin-1')

item['movieID'] = item['movieID'].apply(pd.to_numeric)

# merging 3 data sets
dataset = pd.merge(pd.merge(item, data), users)

n_users = data.userID.unique().shape[0]
n_movies = data.movieID.unique().shape[0]

print(n_users, n_movies)
#########################################################################
# splitting data to testing and training data
########################################################################
train_data, test_data = train_test_split(data, test_size=0.05, random_state=42)
print("1")
print(train_data.shape)
print(test_data.shape)

train_data_matrix = np.zeros((n_users, n_movies))
for line in train_data.itertuples():
    # print(line)
    train_data_matrix[line[1]-1, line[2]-1] = line[3]
    # -1 bcoz there is no user or movie with id 0

test_data_matrix = np.zeros((n_users, n_movies))
for line in test_data.itertuples():
    # print(line)
    test_data_matrix[line[1]-1, line[2]-1] = line[3]
    # -1 bcoz there is no user or movie with id 0

# print(train_data_matrix)
#########################################################################
# using pairwise distance from from sklearn
#########################################################################
user_similarity = 1 - pairwise_distances(train_data_matrix, metric='cosine')
movie_similarity = 1 - pairwise_distances(train_data_matrix.T, metric='cosine')
# print(user_similarity)
#print(movie_similarity)
user_similarity_crr =1 - pairwise_distances(train_data_matrix, metric='correlation')
movie_similarity_crr = 1 - pairwise_distances(train_data_matrix.T, metric='correlation')
movie_similarity_crr[np.isnan(movie_similarity_crr)] = 0
print(user_similarity_crr.shape)
print(movie_similarity_crr.shape)

#########################################################################
# predicting the ratings
#########################################################################
def predict ( ratings, similarity, type='user' ):

    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        # here axis = 1 means along the row. Default is axis = 0 which means along the column
        #You use np.newaxis so that mean_user_rating has same format as ratings         
        #print(ratings.shape)  (943, 1682)
        #print(mean_user_rating[:, np.newaxis].shape) (943,1)


        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
        #print(pred.shape)
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
        #print(pred.shape)
    return pred

movie_prediction = predict (train_data_matrix, movie_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')


movie_prediction_crr = predict(train_data_matrix, movie_similarity_crr, type='item')
user_prediction_crr = predict(train_data_matrix, user_similarity_crr, type='user')

# print(movie_prediction[ :2])
# print(user_prediction)
# print(movie_prediction_crr[ :2])

#########################################################################
# Rmse
#########################################################################

def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

print ('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print ('User-based CF RMSE: ' + str(rmse(user_prediction, train_data_matrix)))
print ('Item-based CF RMSE: ' + str(rmse(movie_prediction, test_data_matrix)))

print ('User-based CRR CF RMSE: ' + str(rmse(user_prediction_crr, test_data_matrix)))
print ('Item-based CRR CF RMSE: ' + str(rmse(movie_prediction_crr, test_data_matrix)))


def predict_topk(ratings, similarity, kind='user', k=50):
    pred = np.zeros(ratings.shape)
    if kind == 'user':
        for i in range(ratings.shape[0]):
            # [:total no of elements required*-1 : -1 for descending order]
            top_k_users = [np.argsort(similarity[:,i])[:-k-1:-1]]
            for j in range(ratings.shape[1]):
                pred[i, j] = similarity[i, :][top_k_users].dot(ratings[:, j][top_k_users]) 
                pred[i, j] /= np.sum(np.abs(similarity[i, :][top_k_users]))
    if kind == 'item':
        for j in range(ratings.shape[1]):
            top_k_items = [np.argsort(similarity[:,j])[:-k-1:-1]]
            for i in range(ratings.shape[0]):
                pred[i, j] = similarity[j, :][top_k_items].dot(ratings[i, :][top_k_items].T) 
                pred[i, j] /= np.sum(np.abs(similarity[j, :][top_k_items]))        
    
    return pred

pred = predict_topk(train_data_matrix, user_similarity, kind='user', k=40)
print("user_pred")
print(pred)
print(pred.shape)
print ('Top-k User-based CF RMSE: ' + str(rmse(pred, test_data_matrix)))
'''
pred = predict_topk(train_data_matrix, movie_similarity_crr, kind='item', k=50)
print("item pred")
print(pred)
print(pred.shape)
print ('Top-k Item-based CF RMSE: ' + str(rmse(pred, test_data_matrix)))
'''

#########################################################################
# Recommending movies to a user with idx
######################################################################### 
idx_to_movie = {}
with open('./ml-100k/u.item', 'r') as f:
    for line in f.readlines():
        info = line.split('|')
        idx_to_movie[int(info[0])-1] = info[1]

def top_k_movies(similarity, mapper, movie_idx, k=5):
    return [mapper[x] for x in np.argsort(similarity[movie_idx,:])[:-k-1:-1]]

idx = 11
movies = top_k_movies(pred, idx_to_movie, idx)
# posters = tuple(Image(url=get_poster(movie, base_url)) for movie in movies)
print(movies[:])



















#########################################################################
# Model based approach SVD Singular Value Decomposition
#########################################################################
import scipy.sparse as sp
from scipy.sparse.linalg import svds

#get SVD components from train matrix. Choose k.
u, s, vt = svds(pred, k = 20)
s_diag_matrix=np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)
print ('Item-based CF RMSE: ' + str(rmse(X_pred, test_data_matrix)))
























'''
def recommend_movies(predictions_df, userID, movies_df, original_ratings_df, num_recommendations=5):
    
    # Get and sort the user's predictions
    user_row_number = userID - 1 # userID starts at 1, not 0
    predictions_df = pd.DataFrame(predictions_df)
    sorted_user_predictions = predictions_df.iloc[user_row_number].sort_values(ascending=False)
    
    # Get the user's data and merge in the movie information.
    user_data = original_ratings_df[original_ratings_df.userID == (userID)]
    user_full = (user_data.merge(movies_df, how = 'left', left_on = 'movieID', right_on = 'movieID').
                     sort_values(['rating'], ascending=False)
                 )

    print ('User {0} has already rated {1} movies.'.format(userID, user_full.shape[0]))
    print ('Recommending the highest {0} predicted ratings movies not already rated.'.format(num_recommendations))
    
    # Recommend the highest predicted rating movies that the user hasn't seen yet.
    recommendations = (movies_df[~movies_df['movieID'].isin(user_full['movieID'])].
         merge(pd.DataFrame(sorted_user_predictions).reset_index(), how = 'left',
               left_on = 'movieID',
               right_on = 'movieID').
         rename(columns = {user_row_number: 'Predictions'}).
         sort_values('Predictions', ascending = False).
                       iloc[:num_recommendations, :-1]
                      )

    return user_full, recommendations

predictions = recommend_movies(pred, 837, item, data, 10)

print(predictions[:])




'''

''' def pearson_correlation(user1, user2):
    df1= dataset.loc[(dataset.userID == user1 ),'movieID'].tolist()
    df2= dataset.loc[(dataset.userID == user2 ),'movieID'].tolist()

    print(len(df1))
    print(len(df2))

    both_rated = { }
    for i in df1:
        if i in df2:
            both_rated[i] = 1       
    
    number_of_rating = len(both_rated)
    print(df1)
    print(df2)
    print(number_of_rating)

    if number_of_rating == 0:
        return 0 
    # Add up all the preferences of each user
    #print(user1_preferences_sum = sum(df1[item] for item in both_rated))
    #print(user2_preferences_sum = sum(df2[item] for item in both_rated))

    # Sum up the squares of preferences of each user
    user1_square_preferences_sum = sum([pow(df1[item],2) for item in both_rated])
    user2_square_preferences_sum = sum([pow(df2[item],2) for item in both_rated])

    # Sum up the product value of both preferences for each item
    product_sum_of_both_users = sum( [dataset[person1][item] * dataset[person2][item] for item in both_rated] )
 
    # Calculate the pearson score
    numerator_value = product_sum_of_both_users - ( user1_preferences_sum*user2_preferences_sum/number_of_ratings )
    denominator_value = sqrt( (user1_square_preferences_sum - pow(user1_preferences_sum,2)/number_of_ratings) * (user2_square_preferences_sum - pow(user2_preferences_sum,2)/number_of_ratings) )
    if denominator_value == 0:
        return 0
    else:
        r = numerator_value/denominator_value
        return r

print( pearson_correlation(1, 2))
 

pearson_correlation(4, 2)


# check_both_rated(1, 2) '''

943 1682
(95000, 4)
(5000, 4)
(943, 943)
(1682, 1682)
User-based CF RMSE: 2.763148978623458
User-based CF RMSE: 2.732867708896345
Item-based CF RMSE: 3.002423579565451
User-based CRR CF RMSE: 2.7458228780557237
Item-based CRR CF RMSE: 3.0764570775868783
user_pred
[[3.79010024 1.87940521 1.47018281 ... 0.         0.07344591 0.08130924]
 [2.12586135 0.         0.33903937 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.06027481 0.         0.        ]
 ...
 [3.76242197 0.         0.42610321 ... 0.         0.         0.        ]
 [2.13469403 0.7544691  0.14160944 ... 0.         0.         0.        ]
 [3.13629225 2.3165088  1.54567795 ... 0.         0.07197124 0.0703887 ]]
(943, 1682)
Top-k User-based CF RMSE: 2.3201094526589223
['Star Wars (1977)', 'Raiders of the Lost Ark (1981)', 'Empire Strikes Back, The (1980)', 'Silence of the Lambs, The (1991)', 'Forrest Gump (1994)']
Item-based CF RMSE: 2.268194993696417


" def pearson_correlation(user1, user2):\n    df1= dataset.loc[(dataset.userID == user1 ),'movieID'].tolist()\n    df2= dataset.loc[(dataset.userID == user2 ),'movieID'].tolist()\n\n    print(len(df1))\n    print(len(df2))\n\n    both_rated = { }\n    for i in df1:\n        if i in df2:\n            both_rated[i] = 1       \n    \n    number_of_rating = len(both_rated)\n    print(df1)\n    print(df2)\n    print(number_of_rating)\n\n    if number_of_rating == 0:\n        return 0 \n    # Add up all the preferences of each user\n    #print(user1_preferences_sum = sum(df1[item] for item in both_rated))\n    #print(user2_preferences_sum = sum(df2[item] for item in both_rated))\n\n    # Sum up the squares of preferences of each user\n    user1_square_preferences_sum = sum([pow(df1[item],2) for item in both_rated])\n    user2_square_preferences_sum = sum([pow(df2[item],2) for item in both_rated])\n\n    # Sum up the product value of both preferences for each item\n    product_sum_of_both_use