In [12]:
# Sample preference data: each person maps to a dict of {movie title: rating}.
# Not every person has rated every movie.

critics = {
    'Lisa Rose': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'Superman Returns': 3.5,
        'You, Me and Dupree': 2.5,
        'The Night Listener': 3.0,
    },
    'Gene Seymour': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 1.5,
        'Superman Returns': 5.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 3.5,
    },
    'Michael Phillips': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.0,
        'Superman Returns': 3.5,
        'The Night Listener': 4.0,
    },
    'Claudia Puig': {
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'The Night Listener': 4.5,
        'Superman Returns': 4.0,
        'You, Me and Dupree': 2.5,
    },
    'Mick LaSalle': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'Just My Luck': 2.0,
        'Superman Returns': 3.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 2.0,
    },
    'Jack Matthews': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'The Night Listener': 3.0,
        'Superman Returns': 5.0,
        'You, Me and Dupree': 3.5,
    },
    'Toby': {
        'Snakes on a Plane': 4.5,
        'You, Me and Dupree': 1.0,
        'Superman Returns': 4.0,
    },
}

In [13]:
# Euclidean distance score

from math import sqrt

# Return a distance-based similarity score for person 1 and person 2

def sim_distance(prefs, person1, person2):
    """Score how close two people's ratings are, based on squared differences.

    prefs maps each person to a dict of {item: rating}. Only items present in
    both people's dicts are compared.

    Returns 0 when the two people share no items; otherwise
    1 / (1 + sum of squared rating differences), so identical ratings give 1.
    """
    # Items rated by both people, in person1's iteration order.
    shared = [item for item in prefs[person1] if item in prefs[person2]]
    if not shared:
        return 0

    mine, theirs = prefs[person1], prefs[person2]

    # Add up the squares of all the differences
    sum_of_squares = sum([(mine[item] - theirs[item]) ** 2 for item in shared])
    return 1 / (1 + sum_of_squares)

# Example: distance-based similarity between two people in the sample data
# (the recorded output for this cell is 0.14814814814814814).
sim_distance(critics, 'Lisa Rose', 'Gene Seymour')

0.14814814814814814

In [10]:
"""Pearson correlation score - works better than the Euclidean score when the data isn't well normalized,
e.g. if some critics are routinely harsher than average. The Pearson score corrects for grade inflation (the correlation can
still be perfect if the difference between two critics' scores is consistent), whereas the Euclidean score will say two critics
are dissimilar because one is consistently harsher than the other, even if their tastes are very similar.
Pearson correlation coefficient - how much two variables change together, divided by the product of how much they vary individually."""

# Returns the Pearson correlation coefficient for person 1 and person 2

def sim_pearson(prefs, p1, p2):
    """Return the Pearson correlation coefficient between p1's and p2's ratings.

    prefs maps each person to a dict of {item: rating}. Only items present in
    both dicts are used.

    Returns 0 when there are no shared items, or when either person's shared
    ratings have no spread (the correlation is undefined in that case).
    """
    # Items rated by both, in p1's iteration order.
    shared = [item for item in prefs[p1] if item in prefs[p2]]
    n = len(shared)
    if n == 0:
        return 0

    ratings1 = [prefs[p1][item] for item in shared]
    ratings2 = [prefs[p2][item] for item in shared]

    # Plain sums, sums of squares, and sum of products over the shared items.
    sum1 = sum(ratings1)
    sum2 = sum(ratings2)
    sum1Sq = sum([pow(r, 2) for r in ratings1])
    sum2Sq = sum([pow(r, 2) for r in ratings2])
    pSum = sum([a * b for a, b in zip(ratings1, ratings2)])

    num = pSum - (sum1 * sum2) / n
    spread1 = sum1Sq - pow(sum1, 2) / n
    spread2 = sum2Sq - pow(sum2, 2) / n

    # Each spread is a sum of squared deviations, so it is mathematically >= 0.
    # Floating-point rounding can push a near-zero spread slightly negative,
    # which would make sqrt() raise ValueError; treat that as "no spread".
    if spread1 <= 0 or spread2 <= 0:
        return 0

    den = sqrt(spread1 * spread2)
    # The product of two tiny positive spreads can still underflow to 0.
    if den == 0:
        return 0

    return num / den

# Example: Pearson similarity between two people in the sample data
# (the recorded output for this cell is 0.39605901719066977).
sim_pearson(critics, 'Lisa Rose', 'Gene Seymour')

0.39605901719066977

In [14]:
# To get an ordered list of people with similar tastes to the specified person

def topmatches(prefs, person, n=5, similarity = sim_pearson):
    """Return the best matches for person from the prefs dictionary.

    Every other key in prefs is scored against person with the given
    similarity function. The result is a list of at most n
    (score, other) tuples, ordered from highest to lowest.
    """
    scored = []
    for other in prefs:
        # Never compare a person with themselves.
        if other != person:
            scored.append((similarity(prefs, person, other), other))

    # Ascending tuple sort, then flip so the highest scores come first.
    ranked = sorted(scored)
    ranked.reverse()
    return ranked[:n]

# Example: the three people most similar to 'Toby' under the default similarity function.
topmatches(critics, 'Toby', n=3)

[(0.9912407071619299, 'Lisa Rose'),
 (0.9244734516419049, 'Mick LaSalle'),
 (0.8934051474415647, 'Claudia Puig')]

In [15]:
# Get recommendations for a person by using a weighted average of every other user's rankings

def getrecommendations(prefs, person, similarity = sim_pearson):
    """Recommend items for person using a similarity-weighted average.

    Every other key in prefs is scored against person; others whose
    similarity is <= 0 are ignored. For each item that person has not rated
    (or has rated 0), the other people's ratings are averaged, weighted by
    their similarity to person.

    Returns a list of (predicted rating, item) tuples, highest first.
    """
    weighted_totals = {}
    weight_sums = {}

    for other in prefs:
        if other == person:
            continue
        sim = similarity(prefs, person, other)
        if sim <= 0:
            continue

        for item, rating in prefs[other].items():
            # Skip items the person has already given a non-zero rating.
            if item in prefs[person] and prefs[person][item] != 0:
                continue
            # Rating weighted by similarity, and the running sum of weights.
            weighted_totals[item] = weighted_totals.get(item, 0) + rating * sim
            weight_sums[item] = weight_sums.get(item, 0) + sim

    # Normalise each total by the sum of the weights that contributed to it.
    rankings = sorted(
        (total / weight_sums[item], item) for item, total in weighted_totals.items()
    )
    rankings.reverse()
    return rankings

# Example: predicted ratings for the movies 'Toby' has not rated, using the default similarity function.
getrecommendations(critics, 'Toby')

[(3.3477895267131017, 'The Night Listener'),
 (2.8325499182641614, 'Lady in the Water'),
 (2.530980703765565, 'Just My Luck')]

In [16]:
def transformprefs(prefs):
    """Swap the two key levels of prefs.

    Given {person: {item: rating}}, return a new dict {item: {person: rating}}.
    The input is not modified.
    """
    flipped = {}
    for person, ratings in prefs.items():
        for item, rating in ratings.items():
            if item not in flipped:
                flipped[item] = {}
            flipped[item][person] = rating
    return flipped

# Flip the sample data so that movies are the outer keys, then reuse the
# person-based functions to query by movie instead.
movies = transformprefs(critics)
# NOTE: this result is neither assigned nor the cell's last expression, so it is not displayed.
topmatches(movies, 'Superman Returns')
# Last expression of the cell: the recorded output is this call's result.
getrecommendations(movies, 'Just My Luck')

[(4.0, 'Michael Phillips'), (3.0, 'Jack Matthews')]

In [17]:
# Use transformprefs to get a dictionary of items with their user ratings, then loop over every item and call topmatches to get
# its most similar items along with their similarity scores.
# Item-based filtering outperforms user-based filtering on sparse datasets; the two perform about equally on dense datasets.
# Item-based filtering is also significantly faster on a large dataset.
def calculateSimilarItems(prefs, n=10):
    """Build a dict mapping each item to its most similar items.

    prefs is person-keyed ({person: {item: rating}}); it is flipped with
    transformprefs so items become the keys. Each item is then scored against
    every other item with sim_distance via topmatches, keeping at most n
    (score, item) tuples per item.

    Prints "<items processed> <total items>" after every 100th item.
    """
    itemPrefs = transformprefs(prefs)
    total = len(itemPrefs)

    similar = {}
    for count, item in enumerate(itemPrefs, start=1):
        # Progress report for large datasets.
        if count % 100 == 0:
            print(count, total)
        similar[item] = topmatches(itemPrefs, item, n=n, similarity=sim_distance)
    return similar

# Build the item-to-item similarity table for the sample data and print it.
itemsim = calculateSimilarItems(critics)
print(itemsim)

{'Lady in the Water': [(0.4, 'You, Me and Dupree'), (0.2857142857142857, 'The Night Listener'), (0.2222222222222222, 'Snakes on a Plane'), (0.2222222222222222, 'Just My Luck'), (0.09090909090909091, 'Superman Returns')], 'Snakes on a Plane': [(0.2222222222222222, 'Lady in the Water'), (0.18181818181818182, 'The Night Listener'), (0.16666666666666666, 'Superman Returns'), (0.10526315789473684, 'Just My Luck'), (0.05128205128205128, 'You, Me and Dupree')], 'Just My Luck': [(0.2222222222222222, 'Lady in the Water'), (0.18181818181818182, 'You, Me and Dupree'), (0.15384615384615385, 'The Night Listener'), (0.10526315789473684, 'Snakes on a Plane'), (0.06451612903225806, 'Superman Returns')], 'Superman Returns': [(0.16666666666666666, 'Snakes on a Plane'), (0.10256410256410256, 'The Night Listener'), (0.09090909090909091, 'Lady in the Water'), (0.06451612903225806, 'Just My Luck'), (0.05333333333333334, 'You, Me and Dupree')], 'You, Me and Dupree': [(0.4, 'Lady in the Water'), (0.1818181818

In [18]:
# Give recommendations using the item similarity dictionary without going through the whole database

def getRecommendedItems(prefs, itemMatch, user):
    """Recommend items for user from a precomputed item-similarity table.

    prefs maps each user to a dict of {item: rating}. itemMatch maps each item
    to a list of (similarity, other_item) tuples, as built by
    calculateSimilarItems.

    For every item the user has not rated, the user's own ratings of similar
    items are averaged, weighted by similarity. Candidates whose similarity
    weights sum to 0 are left out, because no weighted average can be formed
    for them (dividing by that sum would raise ZeroDivisionError).

    Returns a list of (predicted rating, item) tuples, highest first.
    """
    userRatings = prefs[user]
    scores = dict()
    totalSim = dict()

    # Loop over items rated by this user
    for item, rating in userRatings.items():
        # Loop over items similar to this one
        for similarity, item2 in itemMatch[item]:
            # Ignore if this user has already rated this item
            if item2 in userRatings:
                continue

            # Weighted sum of rating times similarity
            scores[item2] = scores.get(item2, 0) + similarity * rating

            # Sum of all the similarities
            totalSim[item2] = totalSim.get(item2, 0) + similarity

    # Divide each total score by total weighting to get an average,
    # skipping candidates with zero total weight.
    rankings = [
        (score / totalSim[item], item)
        for item, score in scores.items()
        if totalSim[item] != 0
    ]

    # Return the rankings from highest to lowest
    rankings.sort(reverse=True)
    return rankings

# Example: item-based recommendations for 'Toby' using the precomputed itemsim table.
getRecommendedItems(critics, itemsim, 'Toby')    

[(3.182634730538922, 'The Night Listener'),
 (2.5983318700614575, 'Just My Luck'),
 (2.4730878186968837, 'Lady in the Water')]

In [26]:
# NOTE(review): machine-specific absolute path; pass `path` explicitly to
# loadMovieLens when running anywhere else.
new_path = '/Users/siyunhe/project/Summer2020/SomerCloud/Algorithm/recommendation-system/Non-Deeplearning/Franceshe_fork/CollectiveIntelligence/Ch2_RecommendationEngines/data/movielens'

def loadMovieLens(path = new_path):
    """Load ratings from a MovieLens directory into a prefs dictionary.

    Reads two files under `path`:
      * u.item - '|'-separated; the first two fields are used as movie id and
        title. Decoded as ISO-8859-1.
      * u.data - tab-separated lines of user, movie id, rating, timestamp.

    Blank lines in either file are skipped. Both files are closed before
    returning.

    Returns {user id (str): {movie title: rating (float)}}.
    """
    # Map movie id -> title.
    movies = dict()
    with open(path+'/u.item', encoding = "ISO-8859-1") as item_file:
        for line in item_file:
            if not line.strip():
                continue
            movie_id, title = line.split('|')[0:2]
            movies[movie_id] = title

    # Build user -> {title: rating}; the timestamp field is read but unused.
    prefs = dict()
    with open(path+'/u.data') as data_file:
        for line in data_file:
            if not line.strip():
                continue
            user, movieid, rating, ts = line.split('\t')
            prefs.setdefault(user, {})
            prefs[user][movies[movieid]] = float(rating)
    return prefs

# Load the MovieLens ratings from the default path.
prefs=loadMovieLens()
#prefs['87']
#getrecommendations(prefs, '87')[0:30]
# NOTE: this rebinds `itemsim`, replacing any table built earlier from `critics`.
itemsim = calculateSimilarItems(prefs, n=50)
# Show the first 30 item-based recommendations for user '87'.
getRecommendedItems(prefs, itemsim, '87')[0:30]

100 1664
200 1664
300 1664
400 1664
500 1664
600 1664
700 1664
800 1664
900 1664
1000 1664
1100 1664
1200 1664
1300 1664
1400 1664
1500 1664
1600 1664


[(5.0, "What's Eating Gilbert Grape (1993)"),
 (5.0, 'Vertigo (1958)'),
 (5.0, 'Usual Suspects, The (1995)'),
 (5.0, 'Toy Story (1995)'),
 (5.0, 'Titanic (1997)'),
 (5.0, 'Sword in the Stone, The (1963)'),
 (5.0, 'Stand by Me (1986)'),
 (5.0, 'Sling Blade (1996)'),
 (5.0, 'Silence of the Lambs, The (1991)'),
 (5.0, 'Shining, The (1980)'),
 (5.0, 'Shine (1996)'),
 (5.0, 'Sense and Sensibility (1995)'),
 (5.0, 'Scream (1996)'),
 (5.0, 'Rumble in the Bronx (1995)'),
 (5.0, 'Rock, The (1996)'),
 (5.0, 'Robin Hood: Prince of Thieves (1991)'),
 (5.0, 'Reservoir Dogs (1992)'),
 (5.0, 'Police Story 4: Project S (Chao ji ji hua) (1993)'),
 (5.0, 'House of the Spirits, The (1993)'),
 (5.0, 'Fresh (1994)'),
 (5.0, 'Denise Calls Up (1995)'),
 (5.0, 'Day the Sun Turned Cold, The (Tianguo niezi) (1994)'),
 (5.0, 'Before the Rain (Pred dozhdot) (1994)'),
 (5.0, 'Assignment, The (1997)'),
 (5.0, '1-900 (1994)'),
 (4.875, "Ed's Next Move (1996)"),
 (4.833333333333333, 'Anna (1996)'),
 (4.8, 'Dark City 