# Recommender for MovieLens

### Getting MovieLens data

* Download the movielens 100k dataset from this link: [ml-100k.zip](http://files.grouplens.org/datasets/movielens/ml-100k.zip)

* Upload ml-100k.zip using "My Data" to /resources/data 

* Extract using the following cell:

In [None]:
!unzip /resources/data/ml-100k.zip -d /resources/data

Archive:  /resources/data/ml-100k.zip
replace /resources/data/ml-100k/allbut.pl? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

### Building the recommender

In [88]:
# import required libraries
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import pairwise_distances
from heapq import nlargest
from sklearn.metrics import mean_squared_error
from math import sqrt
import os.path
import scipy
import scipy.stats as st


In [89]:
# define constant for movie lends 100K directory
MOVIELENS_DIR = "/resources/data/ml-100k/"

## Loading the data

First, we inspect the directory content

In [3]:
!ls $MOVIELENS_DIR

allbut.pl  u1.base  u2.test  u4.base  u5.test  ub.base	u.genre  u.occupation
mku.sh	   u1.test  u3.base  u4.test  ua.base  ub.test	u.info	 u.user
README	   u2.base  u3.test  u5.base  ua.test  u.data	u.item


We then load the full MovieLens 100K dataset to find the number of users and items

In [90]:
fields = ['userID', 'itemID', 'rating', 'timestamp']
ratingDF = pd.read_csv(os.path.join(MOVIELENS_DIR, 'u.data'), sep='\t', names=fields)

ratingDF.head()

Unnamed: 0,userID,itemID,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [91]:
numUsers = len(ratingDF.userID.unique())
numItems = len(ratingDF.itemID.unique())

print("Number of users:", numUsers)
print("Number of items:", numItems)

Number of users: 943
Number of items: 1682


In [92]:
fieldsMovies = ['movieID', 'movieTitle', 'releaseDate', 'videoReleaseDate', 'IMDbURL', 'unknown', 'action', 'adventure',
          'animation', 'childrens', 'comedy', 'crime', 'documentary', 'drama', 'fantasy', 'filmNoir', 'horror',
          'musical', 'mystery', 'romance','sciFi', 'thriller', 'war', 'western']
moviesDF = pd.read_csv(os.path.join(MOVIELENS_DIR, 'u.item'), sep='|', names=fieldsMovies, encoding='latin-1')

moviesDF.head()

Unnamed: 0,movieID,movieTitle,releaseDate,videoReleaseDate,IMDbURL,unknown,action,adventure,animation,childrens,...,fantasy,filmNoir,horror,musical,mystery,romance,sciFi,thriller,war,western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


Then, we load a train-test split

In [93]:
trainDF = pd.read_csv(os.path.join(MOVIELENS_DIR, 'u1.base'), sep='\t', names=fields)
testDF = pd.read_csv(os.path.join(MOVIELENS_DIR, 'u1.test'), sep='\t', names=fields)

# test number of records (total should be 100K)
print("# of lines in train:", trainDF.shape[0])
print("# of lines in test:", testDF.shape[0])

# of lines in train: 80000
# of lines in test: 20000


## Building User-to-Item Rating Matrix

In [94]:
def buildUserItemMatrix(dataset, numUsers, numItems):
    # Initialize a of size (numUsers, numItems) to zeros
    matrix = np.zeros((numUsers, numItems), dtype=np.int8)
    
    # Populate the matrix based on the dataset
    for (index, userID, itemID, rating, timestamp) in dataset.itertuples():
        matrix[userID-1, itemID-1] = rating
    return matrix

In [95]:
trainUserItemMatrix = buildUserItemMatrix(trainDF, numUsers, numItems)
testUserItemMatrix = buildUserItemMatrix(testDF, numUsers, numItems)

## Baseline solution - Average User Rating

In [97]:
def predictByUserAverage(trainSet, numUsers, numItems):
    # Initialize the predicted rating matrix with zeros
    predictionMatrix = np.zeros((numUsers, numItems))
    
    for (user,item), rating in np.ndenumerate(trainSet):
        # Predict rating for every item that wasn't ranked by the user (rating == 0)
        if rating == 0:
            # Extract the items the user already rated
            userVector = trainSet[user, :]
            ratedItems = userVector[userVector.nonzero()]
            
            # If not empty, calculate average and set as rating for the current item
            if ratedItems.size == 0:
                itemAvg = 0
            else:
                itemAvg = ratedItems.mean()
            predictionMatrix[user, item] = itemAvg
            
        # report progress every 100 users
        if (user % 100 == 0 and item == 1):
            print ("calculated %d users" % (user,))
    
    return predictionMatrix

In [98]:
userAvgPreiction = predictByUserAverage(trainUserItemMatrix, numUsers, numItems)

calculated 0 users
calculated 100 users
calculated 200 users
calculated 300 users
calculated 400 users
calculated 500 users
calculated 600 users
calculated 700 users
calculated 800 users
calculated 900 users


## How well did we do?

In [99]:
def rmse(pred, test):
    # calculate RMSE for all the items in the test dataset
    predItems = pred[test.nonzero()].flatten() 
    testItems = test[test.nonzero()].flatten()
    return sqrt(mean_squared_error(predItems, testItems))

## User-User Similarity

In [100]:
userSimilarity = 1 - pairwise_distances(trainUserItemMatrix, metric='cosine')

In [101]:
def predictByUserSimilarity(trainSet, numUsers, numItems, similarity):
    # Initialize the predicted rating matrix with zeros
    predictionMatrix = np.zeros((numUsers, numItems))
    
    for (user,item), rating in np.ndenumerate(trainSet):
        # Predict rating for every item that wasn't ranked by the user (rating == 0)
        if rating == 0:
            # Extract the users that provided rating for this item
            itemVector = trainSet[:,item]
           
            usersRatings = itemVector[itemVector.nonzero()]
            
            # Get the similarity score for each of the users that provided rating for this item
           
            usersSim = similarity[user,:][itemVector.nonzero()]
            
            # If there no users that ranked this item, use user's average
            if len(usersSim) == 0:
                userVector = trainSet[user, :]
                ratedItems = userVector[userVector.nonzero()]
                
                # If the user didnt rated any item use 0, otherwise use average
                if len(ratedItems) == 0:
                    predictionMatrix[user,item] = 0
                else:
                    predictionMatrix[user,item] = ratedItems.mean()
            else:
                # predict score based on user-user similarity
                predictionMatrix[user,item] = (usersRatings*usersSim).sum() / usersSim.sum()
        
        # report progress every 100 users
        if (user % 100 == 0 and item == 1):
            print ("calculated %d users" % (user,))
    

    return predictionMatrix

In [32]:
userSimPreiction = predictByUserSimilarity(trainUserItemMatrix, numUsers, numItems, userSimilarity)

calculated 0 users
20


In [17]:
rmse(userSimPreiction, testUserItemMatrix)

print(st.t.interval(0.95, len(popprecision)-1, loc=rmse(userSimPreiction, testUserItemMatrix), scale=st.sem(popprecision)))

1.026449013124381

## Precision@k and Recall@k

In [102]:
def avgPrecisionAtK(testSet, prediction, k):
    # Initialize sum and count vars for average calculation
    sumPrecisions = 0
    countPrecisions = 0
    
    # Define function for converting 1-5 rating to 0/1 (like / don't like)
    vf = np.vectorize(lambda x: 1 if x >= 4 else 0)
    
    for userID in range(numUsers):
        # Pick top K based on predicted rating
        userVector = prediction[userID,:]
        topK = nlargest(k, range(len(userVector)), userVector.take)
        
        # Convert test set ratings to like / don't like
        userTestVector = vf(testSet[userID,:]).nonzero()[0]
        
        # Calculate precision
        precision = len([item for item in topK if item in userTestVector])/len(topK)
        
        # Update sum and count
        sumPrecisions += precision
        countPrecisions += 1
        
    # Return average P@k
    return sumPrecisions/countPrecisions

In [103]:
def avgRecallAtK(testSet, prediction, k):
    # Initialize sum and count vars for average calculation
    sumRecalls = 0
    countRecalls = 0
    
    # Define function for converting 1-5 rating to 0/1 (like / don't like)
    vf = np.vectorize(lambda x: 1 if x >= 4 else 0)
    
    for userID in range(numUsers):
        # Pick top K based on predicted rating
        userVector = prediction[userID,:]
        topK = nlargest(k, range(len(userVector)), userVector.take)
        
        # Convert test set ratings to like / don't like
        userTestVector = vf(testSet[userID,:]).nonzero()[0]
        
        # Ignore user if has no ratings in the test set
        if (len(userTestVector) == 0):
            continue
        
        # Calculate recall
        recall = len([item for item in topK if item in userTestVector])/len(userTestVector)
        
        # Update sum and count
        sumRecalls += recall
        countRecalls += 1
    
    # Return average R@k
    return sumRecalls/countRecalls

In [None]:
print("k\tP@k\tR@k")
for k in [25, 50, 100, 250, 500]:
    print("%d\t%.3lf\t%.3lf" % (k, avgPrecisionAtK(testUserItemMatrix, userSimPreiction, k), avgRecallAtK(testUserItemMatrix, userSimPreiction, k)))

## Popularity Based Recommendations

In [21]:
def predictByPopularity(trainSet, numUsers, numItems):
    # Initialize the predicted rating matrix with zeros
    predictionMatrix = np.zeros((numUsers, numItems))
    
    # Define function for converting 1-5 rating to 0/1 (like / don't like)
    vf = np.vectorize(lambda x: 1 if x >= 4 else 0)
    
    # For every item calculate the number of people liked (4-5) divided by the number of people that rated
    itemPopularity = np.zeros((numItems))
    for item in range(numItems):
        numOfUsersRated = len(trainSet[:, item].nonzero()[0])
        numOfUsersLiked = len(vf(trainSet[:, item]).nonzero()[0])
        if numOfUsersRated == 0:
            itemPopularity[item] = 0
        else:
            itemPopularity[item] = numOfUsersLiked/numOfUsersRated
    
    for (user,item), rating in np.ndenumerate(trainSet):
        # Predict rating for every item that wasn't ranked by the user (rating == 0)
        if rating == 0:
            predictionMatrix[user, item] = itemPopularity[item]
            
        # report progress every 100 users
        if (user % 100 == 0 and item == 1):
            print ("calculated %d users" % (user,))
    
    return predictionMatrix

In [63]:
popPreiction = predictByPopularity(trainUserItemMatrix, numUsers, numItems)

calculated 0 users
calculated 100 users
calculated 200 users
calculated 300 users
calculated 400 users
calculated 500 users
calculated 600 users
calculated 700 users
calculated 800 users
calculated 900 users


## Making Recommendations for a User

In [34]:
def userTopK(prediction, moviesDataset, userID, k):
    # Pick top K based on predicted rating
    userVector = prediction[userID+1,:]
    topK = nlargest(k, range(len(userVector)), userVector.take)
    namesTopK = list(map(lambda x: moviesDataset[moviesDataset.movieID == x+1]["movieTitle"].values[0], topK))
    return namesTopK

In [35]:
# recommend for userID 350 according to popularity recommender
userTopK(popPreiction, moviesDF, 350, 10)

['Loch Ness (1995)',
 'Perfect Candidate, A (1996)',
 'Love and Death on Long Island (1997)',
 'Crossfire (1947)',
 'Celestial Clockwork (1994)',
 'They Made Me a Criminal (1939)',
 'Last Time I Saw Paris, The (1954)',
 'Innocents, The (1961)',
 "Jupiter's Wife (1994)",
 'Prefontaine (1997)']

In [None]:
# recommend for userID 350 according to average rating recommender
userTopK(userAvgPreiction, moviesDF, 350, 10)

In [None]:
# recommend for userID 350 according to user similarity recommender
userTopK(userSimPreiction, moviesDF, 350, 10)

## Evaluating the other datasets

In [58]:
datasetsFileNames = [('u1.base', 'u1.test'),  
                     ('u2.base', 'u2.test'),
                     ('u3.base', 'u3.test'),
                     ('u4.base', 'u4.test'),
                     ('u5.base', 'u5.test')]

In [84]:
simrmseList = []
eucrmseList = []
manrsmeList = []
for trainFileName, testFileName in datasetsFileNames:
    curTrainDF = pd.read_csv(os.path.join(MOVIELENS_DIR, trainFileName), sep='\t', names=fields)
    curTestDF = pd.read_csv(os.path.join(MOVIELENS_DIR, testFileName), sep='\t', names=fields)
    curTrainUserItemMatrix = buildUserItemMatrix(curTrainDF, numUsers, numItems)
    curTestUserItemMatrix = buildUserItemMatrix(curTestDF, numUsers, numItems)
    
     
    curUserSimilarity = 1 - pairwise_distances(curTrainUserItemMatrix, metric='cosine')
    curUserSimPreiction = predictByUserSimilarity(curTrainUserItemMatrix, numUsers, numItems, curUserSimilarity)
    simRMSE = rmse(curUserSimPreiction, curTestUserItemMatrix)
    simrmseList.append(simRMSE)
    
    cureucSimilarity = 1/(1 + pairwise_distances(curTrainUserItemMatrix, metric='euclidean'))
    cureucSimPreiction = predictByUserSimilarity(curTrainUserItemMatrix, numUsers, numItems, cureucSimilarity)
    eucRMSE = rmse(cureucSimPreiction, curTestUserItemMatrix)
    eucrmseList.append(eucRMSE)
    
    curmanSimilarity = 1 - (pairwise_distances(curTrainUserItemMatrix, metric='manhattan') * .1)
    curmanSimPreiction = predictByUserSimilarity(curTrainUserItemMatrix, numUsers, numItems, curmanSimilarity)
    manRMSE = rmse(curmanSimPreiction, curTestUserItemMatrix)
    manrsmeList.append(manRMSE)    
    

calculated 0 users
calculated 100 users
calculated 200 users
calculated 300 users
calculated 400 users
calculated 500 users
calculated 600 users
calculated 700 users
calculated 800 users
calculated 900 users
calculated 0 users
calculated 100 users
calculated 200 users
calculated 300 users
calculated 400 users
calculated 500 users
calculated 600 users
calculated 700 users
calculated 800 users
calculated 900 users
calculated 0 users
calculated 100 users
calculated 200 users
calculated 300 users
calculated 400 users
calculated 500 users
calculated 600 users
calculated 700 users
calculated 800 users
calculated 900 users
calculated 0 users
calculated 100 users
calculated 200 users
calculated 300 users
calculated 400 users
calculated 500 users
calculated 600 users
calculated 700 users
calculated 800 users
calculated 900 users
calculated 0 users
calculated 100 users
calculated 200 users
calculated 300 users
calculated 400 users
calculated 500 users
calculated 600 users
calculated 700 users
ca

In [87]:
manavg = np.mean(manrsmeList)
eucavg = np.mean(eucrmseList)
cosavg = np.mean(simrmseList)

CIman = st.t.interval(0.95, len(manrsmeList)-1, loc=np.mean(manrsmeList), scale=st.sem(manrsmeList))
CIeuc = st.t.interval(0.95, len(eucrmseList)-1, loc=np.mean(eucrmseList), scale=st.sem(eucrmseList))
CIcos = st.t.interval(0.95, len(simrmseList)-1, loc=np.mean(simrmseList), scale=st.sem(simrmseList))

print(manavg)
print(eucavg)
print(cosavg)
print(CIman)
print(CIeuc)
print(CIcos)

1.02692517384
1.0224877621
1.01735412166
(1.0198755618891411, 1.0339747857902604)
(1.013706565892023, 1.031268958300787)
(1.0090130802261479, 1.0256951630950137)


## Item-Item

In [49]:
itemSimilarity = 1 - (pairwise_distances(trainUserItemMatrix.T, metric='cosine') )
def predictByItemSimilarity(trainSet, numUsers, numItems, similarity):
    # Initialize the predicted rating matrix with zeros
    predictionMatrix = np.zeros((numItems, numUsers))
    
    for (user,item), rating in np.ndenumerate(trainSet):
        # Predict rating for every user that wasn't ranked by the user (rating == 0)
        if rating == 0:
            # Extract the users that provided rating for this item
            itemVector = trainSet[:,item]
            
            usersRatings = itemVector[itemVector.nonzero()]
            
            # Get the similarity score for each of the items that provided rating for this item
           
            usersSim = similarity[user,:][itemVector.nonzero()]
             
            # If there no items that were ranked by this user, use item's average
            if len(usersSim) == 0:
                userVector = trainSet[user, :]
                ratedItems = userVector[userVector.nonzero()]
                
                # If the items werent rated use 0, otherwise use average
                if len(ratedItems) == 0:
                    predictionMatrix[user,item] = 0
                else:
                    predictionMatrix[user,item] = ratedItems.mean()
            else:
                # predict score based on item-item similarity
                if(usersSim.sum() == 0):
                    predictionMatrix[user,item] = 0
                else:
                    predictionMatrix[user,item] = (usersRatings*usersSim).sum() / usersSim.sum()
        
        # report progress every 100 users
        if (user % 100 == 0 and item == 1):
            print ("calculated %d users" % (user,))
    

    return predictionMatrix

In [50]:
itemSimPreiction = predictByItemSimilarity(trainUserItemMatrix.T, numUsers, numItems, itemSimilarity)

calculated 0 users
calculated 100 users
calculated 200 users
calculated 300 users
calculated 400 users
calculated 500 users
calculated 600 users
calculated 700 users
calculated 800 users
calculated 900 users
calculated 1000 users
calculated 1100 users
calculated 1200 users
calculated 1300 users
calculated 1400 users
calculated 1500 users
calculated 1600 users


In [51]:
rmse(itemSimPreiction, testUserItemMatrix.T)

1.0377631264364242

In [30]:
def userTopK(prediction, moviesDataset, userID, k):
    # Pick top K based on predicted rating
    userVector = prediction[userID+1,:]
    topK = nlargest(k, range(len(userVector)), userVector.take)
    namesTopK = list(map(lambda x: moviesDataset[moviesDataset.movieID == x+1]["movieTitle"].values[0], topK))
    return namesTopK

## Predicting Movies Using Item-Item

In [37]:
userTopK(itemSimPreiction, moviesDF, 3, 10)

['Venice/Venice (1992)',
 'Chairman of the Board (1998)',
 'Chairman of the Board (1998)',
 'Santa with Muscles (1996)',
 'Ballad of Narayama, The (Narayama Bushiko) (1958)',
 'Falling in Love Again (1980)',
 'Convent, The (Convento, O) (1995)',
 'Letter From Death Row, A (1998)',
 'Innocent Sleep, The (1995)',
 'Turning, The (1992)']

In [76]:
movie = userSimilarity[:,5]

movie = np.argsort(movie)

for a in range(5):
    
    print(moviesDF.movieTitle[movie[a]])

Scream of Stone (Schrei aus Stein) (1991)
Other Voices, Other Rooms (1997)
Big Bang Theory, The (1994)
Chairman of the Board (1998)
Marked for Death (1990)


In [60]:
rmseList2 = []
for trainFileName, testFileName in datasetsFileNames:
    curTrainDF = pd.read_csv(os.path.join(MOVIELENS_DIR, trainFileName), sep='\t', names=fields)
    curTestDF = pd.read_csv(os.path.join(MOVIELENS_DIR, testFileName), sep='\t', names=fields)
    curTrainUserItemMatrix = buildUserItemMatrix(curTrainDF, numUsers, numItems)
    curTestUserItemMatrix = buildUserItemMatrix(curTestDF, numUsers, numItems)
    
     
    curUserSimilarity = 1 - pairwise_distances(curTrainUserItemMatrix.T, metric='cosine')
    curUserSimPreiction = predictByItemSimilarity(curTrainUserItemMatrix.T, numUsers, numItems, curUserSimilarity)
    simRMSE = rmse(curUserSimPreiction, curTestUserItemMatrix.T)
    
    rmseList2.append(simRMSE)

calculated 0 users
calculated 100 users
calculated 200 users
calculated 300 users
calculated 400 users
calculated 500 users
calculated 600 users
calculated 700 users
calculated 800 users
calculated 900 users
calculated 1000 users
calculated 1100 users
calculated 1200 users
calculated 1300 users
calculated 1400 users
calculated 1500 users
calculated 1600 users
calculated 0 users
calculated 100 users
calculated 200 users
calculated 300 users
calculated 400 users
calculated 500 users
calculated 600 users
calculated 700 users
calculated 800 users
calculated 900 users
calculated 1000 users
calculated 1100 users
calculated 1200 users
calculated 1300 users
calculated 1400 users
calculated 1500 users
calculated 1600 users
calculated 0 users
calculated 100 users
calculated 200 users
calculated 300 users
calculated 400 users
calculated 500 users
calculated 600 users
calculated 700 users
calculated 800 users
calculated 900 users
calculated 1000 users
calculated 1100 users
calculated 1200 users
ca

## RMSE Stat for UserAvgPrediction

In [None]:
rmseList3 = []
for trainFileName, testFileName in datasetsFileNames:
    curTrainDF = pd.read_csv(os.path.join(MOVIELENS_DIR, trainFileName), sep='\t', names=fields)
    curTestDF = pd.read_csv(os.path.join(MOVIELENS_DIR, testFileName), sep='\t', names=fields)
    curTrainUserItemMatrix = buildUserItemMatrix(curTrainDF, numUsers, numItems)
    curTestUserItemMatrix = buildUserItemMatrix(curTestDF, numUsers, numItems)
    userAvgPreiction = predictByUserAverage(curTrainUserItemMatrix, numUsers, numItems)
    rmseList3.append(rmse(userAvgPreiction, curTestUserItemMatrix))

In [107]:
print(np.mean(rmseList3))

print(st.t.interval(0.95, len(rmseList3)-1, loc=np.mean(rmseList3), scale=st.sem(rmseList3)))

1.04371765616
(1.0289303496379316, 1.0585049626810734)
