In [21]:
import numpy as np
import pandas as pd

In [22]:
## Dataset is downloaded from Movielens

## we are intersered in 2 data files that are u.data that contains userID, movieID, and the rating
## and the date on which the rating was given 

## u.item has a bunch of movie related details, that are title , genre, imdb link etc.
## we will just use this file for movie title.

In [23]:
dataFile="u.data"
data=pd.read_csv(dataFile, sep ='\t', header=None, names = ['userId','itemId','rating','timestamp'])
data.head()

Unnamed: 0,userId,itemId,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [24]:
movieInfoFile="u.item"
movieInfo=pd.read_csv(movieInfoFile,sep='|', header=None, index_col= False,  names=['itemId','title'], usecols=[0,1])
movieInfo.head()

Unnamed: 0,itemId,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [25]:
data=pd.merge(data,movieInfo, left_on="itemId", right_on="itemId")
data.head()

Unnamed: 0,userId,itemId,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [26]:
## indexing data in data frames
userId=data.userId #panda series that containd=s only one column
userId2=data[["userId"]] #panda dataframe object of rows and column

In [27]:
userId.head()

0    196
1     63
2    226
3    154
4    306
Name: userId, dtype: int64

In [28]:
userId2.head()

Unnamed: 0,userId
0,196
1,63
2,226
3,154
4,306


In [29]:
type(userId)

pandas.core.series.Series

In [30]:
type(userId2)

pandas.core.frame.DataFrame

In [31]:
data=pd.DataFrame.sort_values(data,["userId","itemId"],ascending=[0,1])
data.head()

Unnamed: 0,userId,itemId,rating,timestamp,title
23781,943,2,5,888639953,GoldenEye (1995)
65410,943,9,3,875501960,Dead Man Walking (1995)
35098,943,11,4,888639000,Seven (Se7en) (1995)
43773,943,12,5,888639093,"Usual Suspects, The (1995)"
57040,943,22,4,888639042,Braveheart (1995)


In [33]:
numUser=max(data.userId)
numItem=max(data.itemId)

MoviePerUser=data.itemId.value_counts()
UserPerMovie=data.title.value_counts()

UserPerMovie.head()

Star Wars (1977)             583
Contact (1997)               509
Fargo (1996)                 508
Return of the Jedi (1983)    507
Liar Liar (1997)             485
Name: title, dtype: int64

In [35]:
def favouriteMovies(activeUser, N):
    topMovies=pd.DataFrame.sort_values(data[data.userId==activeUser],["rating"],ascending=[0])[ : N]
    return list(topMovies.title)
print ( favouriteMovies(1,5))

['Toy Story (1995)', 'Maya Lin: A Strong Clear Vision (1994)', 'Empire Strikes Back, The (1980)', 'Delicatessen (1991)', 'Cinema Paradiso (1988)']


In [36]:
## Now we will creste a table in which each row corresponds to each user and each column represent each movie
## we will do it using pivot_table function in panda

userIdRatingMatrix = pd.pivot_table(data, values='rating', index=["userId"], columns=["itemId"])

## this basically creates a table that consists of rating of every user corresponding to every movie that th
## user has rated and in case the user has not rated any particular movie that particular cell will consiste=s of NaN
## and in case of multiple rating by any particular user for a movie will simply giv avg of all those rating.

userIdRatingMatrix.head()

itemId,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


In [37]:
## function to compute similarity between 2 user
from scipy.spatial.distance import correlation
def similarity(user1, user2):
    user1=np.array(user1)-np.nanmean(user1)
    user2=np.array(user2)-np.nanmean(user2)
    
    commonId=[i for i in range (len(user1)) if user1[i]>0 and user2[i]>0]
    
    user1=[user1[i] for i in commonId]
    user2=[user2[i] for i in commonId]
    
    if len(user1)==0:
        return 0;
    else:
        return correlation(user1, user2)
similarity(userIdRatingMatrix.loc[5],userIdRatingMatrix.loc[5])
#for i in userIdRatingMatrix.index:
 #   print (i)

0.0

In [38]:
def NearestNeighbourRating(activeUser,K):
    similarityMatrix = pd.DataFrame(index=userIdRatingMatrix.index, columns=['similarity'])
    for i in userIdRatingMatrix.index:
        similarityMatrix.loc[i]=similarity(userIdRatingMatrix.loc[i],userIdRatingMatrix.loc[activeUser])
    similarityMatrix =  pd.DataFrame.sort_values(similarityMatrix,['similarity'], ascending=[0])
    nearestNeighbours =similarityMatrix[:K]
    neighbourItemRatingMatrix = userIdRatingMatrix.loc[nearestNeighbours.index]
    
    predictedItemRating = pd.DataFrame(index=userIdRatingMatrix.columns, columns=["Rating"])
    ## list of columns of user item rating matrix.
    ##predictedRating = np.nanmean(userIdRatingMatrix.loc[activeUser])
    for i in userIdRatingMatrix.columns:
        predictedRating = np.nanmean(userIdRatingMatrix.loc[activeUser])
        for j in neighbourItemRatingMatrix.index:
            if userIdRatingMatrix.loc[j,i]>0:
                predictedRating += ((userIdRatingMatrix.loc[j,i]-np.nanmean(userIdRatingMatrix.loc[j]))*nearestNeighbours.loc[j,'similarity'])
        predictedItemRating.loc[i, "Rating"]=predictedRating
    return predictedItemRating
    

NearestNeighbourRating(5,10)
    
    

  dist = 1.0 - uv / np.sqrt(uu * vv)


Unnamed: 0_level_0,Rating
itemId,Unnamed: 1_level_1
1,7.29552
2,2.87429
3,2.87429
4,2.87429
5,2.87429
6,2.87429
7,-2.34085
8,2.87429
9,-1.42448
10,2.87429


In [41]:
def topNRecommendation(activeUser, N):
    predictedRating=NearestNeighbourRating(activeUser,N)
    moviesAlreadyWatched = list( userIdRatingMatrix.loc[activeUser].loc[userIdRatingMatrix.loc[activeUser]>0].index )
    predictedRating = predictedRating.drop(moviesAlreadyWatched)
    topRecommendation = pd.DataFrame.sort_values(predictedRating,['Rating'],ascending=[0])[:N]
    topRecommendationTitle = (movieInfo.loc[movieInfo.itemId.isin(topRecommendation.index)])
    return list(topRecommendationTitle.title)
    

In [42]:
activeUser = 5
print (favouriteMovies(activeUser,5) ,"\n" ,topNRecommendation(activeUser,3) )

  dist = 1.0 - uv / np.sqrt(uu * vv)


['Men in Black (1997)', 'Blade Runner (1982)', 'Empire Strikes Back, The (1980)', 'Wrong Trousers, The (1993)', 'Blues Brothers, The (1980)'] 
 ['Truth About Cats & Dogs, The (1996)', 'Jerry Maguire (1996)', 'Scream (1996)']


In [43]:
## the method used til now is based on finding the nearest k neighbours, the finds the rating for any of the unrated movie
## by the user on the basis of matching the closest k neighbours that have the nearly same behavior of rating movies as the user
## for whom we are predicting...
## Now another method i.e Latent based filtening is used.

In [49]:
def xrange(x):
    return iter(range(x))


def matrixFactorization(R, K, steps=10, gamma=0.001, lamda=0.2):
    # R is user item rating matrix, having each row containing different
    # rating tha a user has given to different movies
    # K is number of factors on the basis of which we will be finding the recommendation for a user
    # SGD is used to minimize the error...
    N=len(R.index)
    M=len(R.columns)
    P=pd.DataFrame(np.random.rand(N,K),index = R.index)
    Q=pd.DataFrame(np.random.rand(M,K),index = R.columns)
    
    for step in xrange(steps):
        for i in R.index:
            for j in R.columns:
                if R.loc[i,j]>0:
                    eij = R.loc[i,j]-np.dot(P.loc[i],Q.loc[j])
                    P.loc[i] = P.loc[i]-gamma*(eij*Q.loc[j] - lamda*P.loc[i])
                    Q.loc[j] = Q.loc[j]-gamma*(eij*P.loc[i] - lamda*Q.loc[j])
        e=0
        for i in R.index:
            for j in R.columns:
                if R.loc[i,j]>0:
                    e = e + pow(R.loc[i,j] - np.dot(P.loc[i],Q.loc[j]),2) +lamda*(pow(np.linalg.norm(P.loc[i]) + np.linalg.norm(Q.loc[j]),2))
        if e<0.001:
            break
        print (step)
    return P,Q


(P,Q) = matrixFactorization(userIdRatingMatrix , K=2, gamma=0.001, lamda=0.02, steps=10)
                    
                    
                    
                    
                    
                    
                    
                    

0
1
2
3




4
5
6
7
8
9


In [54]:
activeUser = 1
predictedItemRating = pd.DataFrame(np.dot(P.loc[activeUser],Q.T),index=Q.index, columns=['Rating'])
topRecommendation = pd.DataFrame.sort_values(predictedItemRating,['Rating'], ascending=[0])[:3]
topRecommendedMovies = movieInfo.loc[movieInfo.itemId.isin(topRecommendation.index)]
topRecommendedMovies

Unnamed: 0,itemId,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
