In [None]:
# Silence every warning raised during the rest of the session.
import warnings
warnings.filterwarnings("ignore")
import scikits.crab
from scikits.crab.datasets import load_sample_movies

# Load the sample-movies dataset through crab's loader.
data = load_sample_movies()
from scikits.crab.models import MatrixPreferenceDataModel
# Wrap the loaded preferences (data.data) in crab's matrix data model.
model = MatrixPreferenceDataModel(data.data)
from scikits.crab.metrics import pearson_correlation
# Alternative distance metric, currently disabled:
#from scikits.crab.metrics.pairwise import euclidean_distances
from scikits.crab.similarities import UserSimilarity
print 'Build the similarity'
# User similarity computed with the Pearson correlation metric.
similarity = UserSimilarity(model, pearson_correlation)

from scikits.crab.recommenders.knn import UserBasedRecommender
print 'Build the User based recommender'
# NOTE(review): with_preference=True presumably makes recommend() return
# (item, score) pairs, as the recorded output suggests -- confirm in crab docs.
recommender = UserBasedRecommender(model, similarity, with_preference=True)
print 'Build the User based metrics'
from scikits.crab.metrics.classes import CfEvaluator
evaluator = CfEvaluator()
# Evaluate the recommender at several cut-offs `at=k`. The recorded output
# shows a dict with 'precision' and 'recall' per cut-off, and nan for k=5, 10.
# Note that `k` stays defined at module level after this loop; a later cell
# assigns its own `k`.
for k in [1,3,5,10]:
    all_scores = evaluator.evaluate(recommender=recommender, permutation=False, at=k)
    print 'k :', k, ' ', all_scores
print 'Recommend items for the users'
# The loop over every user is disabled; only a single recommend() call runs.
# for user in data.data.keys():
# NOTE(review): 5 is presumably a user id -- confirm against the crab API.
recommender.recommend(5)

Build the similarity
Build the User based recommender
Build the User based metrics
metrics
k : 1   {'recall': 1.0, 'precision': 1.0}
metrics
k : 3   {'recall': 1.0, 'precision': 1.0}
metrics
k : 5   {'recall': nan, 'precision': nan}
metrics
k : 10   {'recall': nan, 'precision': nan}
Recommend items for the users


[(5, 3.3477895267131013), (1, 2.8572508984333034), (6, 2.4473604699719846)]

In [None]:
# Same pipeline as the sample-movies cell, run on the dataset returned by
# load_movielens_r100k(). The recorded output of this cell stops after the
# first "metrics" line, so no scores were captured for it.
# Silence every warning raised during the rest of the session.
import warnings
warnings.filterwarnings("ignore")
import scikits.crab
from scikits.crab.datasets import load_movielens_r100k

data = load_movielens_r100k()
from scikits.crab.models import MatrixPreferenceDataModel
# Wrap the loaded preferences (data.data) in crab's matrix data model.
model = MatrixPreferenceDataModel(data.data)
from scikits.crab.metrics import pearson_correlation
# Alternative distance metric, currently disabled:
#from scikits.crab.metrics.pairwise import euclidean_distances
from scikits.crab.similarities import UserSimilarity
print 'Build the similarity'
# User similarity computed with the Pearson correlation metric.
similarity = UserSimilarity(model, pearson_correlation)

from scikits.crab.recommenders.knn import UserBasedRecommender
print 'Build the User based recommender'
recommender = UserBasedRecommender(model, similarity, with_preference=True)
print 'Build the User based metrics'
from scikits.crab.metrics.classes import CfEvaluator
evaluator = CfEvaluator()
# Evaluate the recommender at several cut-offs `at=k`; `k` stays defined at
# module level after the loop.
for k in [1,3,5,10]:
    all_scores = evaluator.evaluate(recommender=recommender, permutation=False, at=k)
    print 'k :', k, ' ', all_scores
print 'Recommend items for the users'
# The loop over every user is disabled; only a single recommend() call runs.
# for user in data.data.keys():
# NOTE(review): 5 is presumably a user id -- confirm against the crab API.
recommender.recommend(5)

Build the similarity
Build the User based recommender
Build the User based metrics
metrics


In [None]:
# Displays whatever `k` currently holds. Above this cell, `k` is only ever
# bound as the loop variable of the evaluation loops.
# NOTE(review): looks like a debugging leftover that depends on kernel state.
k

In [5]:
from math import sqrt
import codecs, csv

def get_data():
    """Read 'movielens/ratings.dat' and split every line into its fields.

    Returns:
        A list with one entry per line of the file. Each entry is the list
        of '::'-separated fields of that line, after stripping surrounding
        whitespace from the line.

    The path is relative to the current working directory. The file is read
    inside a ``with`` block so the handle is closed as soon as the last line
    has been consumed instead of lingering until garbage collection.
    """
    with open('movielens/ratings.dat') as ratings_file:
        return [line.strip().split('::') for line in ratings_file]
def get_movies():
    """Read 'movielens/movies.dat' (UTF-8) and split every line into fields.

    Returns:
        A list with one entry per line of the file. Each entry is the list
        of '::'-separated fields of that line, decoded from UTF-8 and with
        surrounding whitespace stripped from the line.

    The path is relative to the current working directory. The file is read
    inside a ``with`` block so the handle is closed as soon as the last line
    has been consumed instead of lingering until garbage collection.
    """
    with codecs.open('movielens/movies.dat', 'r', 'utf-8') as movies_file:
        return [line.strip().split('::') for line in movies_file]
# --- Configuration

# Number of nearest neighbours to keep; the neighbour list is cut to this
# length and the neighbour loop runs this many iterations.
k = 50
# Id of the user to produce recommendations for. It is looked up in the
# `users` dict built from ratings.dat, so it must occur in that file.
theuser = 6041 #944

# --- Distance measures
def avg(numbers, n = 2, prior = 3.0):
    """Return the mean of ``numbers`` pulled towards a prior value.

    The result is the plain mean of ``numbers`` together with ``n`` extra
    pseudo-values equal to ``prior``. Short lists therefore stay close to
    ``prior`` and long lists approach their real mean.

    Args:
        numbers: sequence of numeric values (may be empty when ``n`` > 0).
        n: number of pseudo-values mixed in.
        prior: value of each pseudo-value; 3.0 by default.

    Returns:
        The damped mean as a float.

    Raises:
        ZeroDivisionError: if ``numbers`` is empty and ``n`` is 0.
    """
    return (sum(numbers) + (prior * n)) / float(len(numbers) + n)

def minkowski(rating1, rating2, r = 3):
    """Compute the Minkowski distance of order ``r`` between two rating sets.

    Both rating1 and rating2 are dictionaries of the form
    {'The Strokes': 3.0, 'Slightly Stoopid': 2.5}. Only keys present in both
    dictionaries contribute to the distance.

    Args:
        rating1: dict mapping item -> numeric rating.
        rating2: dict mapping item -> numeric rating.
        r: order of the distance (1 = Manhattan, 2 = Euclidean).

    Returns:
        The distance as a float, or 1000000 when the two dictionaries share
        no key.
    """
    distance = 0
    commonRatings = False
    for key in rating1:
        if key in rating2:
            distance += pow(abs(rating1[key] - rating2[key]), r)
            commonRatings = True
    if commonRatings:
        # 1.0 / r forces true division: with an integer r, Python 2 evaluates
        # 1/r as 0, which turns every distance into pow(x, 0) == 1.0.
        return pow(float(distance), 1.0 / r)
    else:
        return 1000000 #Indicates no ratings in common

def pearson(rating1, rating2):
    """Return a distance derived from the Pearson correlation of two raters.

    Only keys present in both dictionaries are used. The correlation ``sim``
    is computed from running sums in a single pass and the function returns
    ``1.0 - sim``, so identical tastes give about 0 and opposite tastes
    about 2.

    Args:
        rating1: dict mapping item -> numeric rating.
        rating2: dict mapping item -> numeric rating.

    Returns:
        ``1.0 - sim`` as a float; 1000000 when there is no common key;
        100000000 when the correlation is undefined because one side has no
        variance over the common keys.
    """
    sum_xy = 0
    sum_x = 0
    sum_y = 0
    sum_x2 = 0
    sum_y2 = 0
    n = 0
    for key in rating1:
        if key in rating2:
            n += 1
            x = rating1[key]
            y = rating2[key]
            sum_xy += x * y
            sum_x += x
            sum_y += y
            sum_x2 += x ** 2
            sum_y2 += y ** 2

    if n == 0:
        return 1000000

    # Divide by a float: with integer ratings, Python 2 would otherwise
    # truncate sum**2 / n and skew both the variances and the covariance.
    count = float(n)
    var_x = sum_x2 - (sum_x ** 2) / count
    var_y = sum_y2 - (sum_y ** 2) / count
    # Rounding can leave a variance marginally below zero for float ratings;
    # treat that like zero variance instead of letting sqrt() raise.
    if var_x <= 0 or var_y <= 0:
        return 100000000
    denominator = sqrt(var_x) * sqrt(var_y)
    if denominator == 0:
        return 100000000
    sim = (sum_xy - (sum_x * sum_y) / count) / denominator
    return 1.0 - sim

def lmg_rmse(rating1, rating2):
    """Root-mean-square rating difference plus a small-overlap penalty.

    The squared differences are taken over the keys found in both
    dictionaries. ``5.0 / overlap`` is then added to the RMSE, so pairs with
    few common ratings are pushed further apart than pairs with many.

    Args:
        rating1: dict mapping item -> numeric rating.
        rating2: dict mapping item -> numeric rating.

    Returns:
        The penalised RMSE as a float, or 1000000 when the dictionaries have
        no key in common.
    """
    squared_errors = [(rating2[item] - score) ** 2
                      for (item, score) in rating1.items()
                      if item in rating2]
    overlap = len(squared_errors)
    if overlap == 0:
        # nothing rated by both, so the distance is huge
        return 1000000

    top_rating = 5.0
    rmse = sqrt(sum(squared_errors) / float(overlap))
    return rmse + (top_rating / overlap)

# Distance function used by the neighbour search. `minkowski` and `pearson`
# take the same (rating1, rating2) arguments and can be assigned here instead.
distance = lmg_rmse
    
# --- Load user ratings
# users maps userid -> {movieid: rating}, with ids converted to int.
users = {}
for fields in get_data():
    # Each row must unpack into exactly four fields; the timestamp is unused.
    (raw_user, raw_movie, raw_rating, _timestamp) = fields
    user_id = int(raw_user)
    movie_id = int(raw_movie)
    # int(float(...)) accepts strings such as "3.5" and truncates them to 3.
    users.setdefault(user_id, {})[movie_id] = int(float(raw_rating))

# --- Find k nearest neighbours
# Score every other user against `theuser` and keep the k closest as
# (distance, userid, ratings) tuples, nearest first. User ids are unique, so
# tuple comparison is always settled by distance and then user id.
theratings = users[theuser]
neighbours = sorted(
    (distance(theratings, other_ratings), other_user, other_ratings)
    for (other_user, other_ratings) in users.items()
    if other_user != theuser
)[:k]

# --- Load movies
# movieid (int) -> title, taken from the first two fields of every row of
# movies.dat; a repeated id keeps the title of its last row.
movies = dict((int(fields[0]), fields[1]) for fields in get_movies())

# --- Go through neighbours
# For every neighbour, print the movies that `theuser` rated as well (both
# ratings side by side) and pool all of the neighbour's ratings per movie so
# they can be averaged afterwards.
neigh_ratings = {} # movie -> [r1, r2, r3]
# Walk the neighbours that were actually found. The list is shorter than k
# when fewer than k other users exist, and indexing it with range(k) would
# then raise IndexError.
for (ix, (dist, user, ratings)) in enumerate(neighbours):
    # Single-argument print(...) emits the same text under Python 2's print
    # statement and Python 3's print function.
    print("===== %s ==================================================" % ix)
    print("User # %s , distance: %s" % (user, dist))
    #print 'minkowski', minkowski(theratings, ratings)
    #print 'pearson', pearson(theratings, ratings)

    for (movie, rating) in ratings.items():
        if movie in theratings:
            common = '   YOUR: %s' % theratings[movie]
            print("%s %s %s" % (movies[movie], rating, common))

        neigh_ratings.setdefault(movie, []).append(rating)

# --- Find highest averages
# (damped average, movie) pairs ordered best first. Movie ids are unique, so
# no two pairs compare equal and the descending order is unambiguous.
averages = sorted(
    ((avg(scores), movie_id) for (movie_id, scores) in neigh_ratings.items()),
    reverse=True
)

def _print_unrated(ranked, limit = 11):
    """Print "<title> <average>" for the leading unrated entries of ``ranked``.

    Args:
        ranked: iterable of (average, movie) pairs in the order to report.
        limit: maximum number of lines to print (11 by default).

    Movies that appear in ``theratings`` are skipped and do not count towards
    ``limit``. Titles are looked up in ``movies``.
    """
    shown = 0
    for (average, movie) in ranked:
        if movie in theratings:
            continue

        # Single-argument print(...) emits the same text under Python 2's
        # print statement and Python 3's print function.
        print("%s %s" % (movies[movie], average))
        shown += 1
        if shown >= limit:
            break

print("===== RECOMMENDATIONS ==================================================")
_print_unrated(averages)

print("===== DON'T SEE THESE! =================================================")
# Flip the ranking in place so the lowest averages come first.
averages.reverse()
_print_unrated(averages)

User # 14361 , distance: 0.555555555556
Silence of the Lambs, The (1991) 4    YOUR: 4
Forrest Gump (1994) 4    YOUR: 4
Enemy at the Gates (2001) 4    YOUR: 4
Memento (2000) 4    YOUR: 4
Full Metal Jacket (1987) 4    YOUR: 4
Terminator, The (1984) 4    YOUR: 4
Die Hard 2 (1990) 4    YOUR: 4
Total Recall (1990) 4    YOUR: 4
Snatch (2000) 4    YOUR: 4
User # 54405 , distance: 0.601148555936
Ace Ventura: When Nature Calls (1995) 3    YOUR: 2
Enemy at the Gates (2001) 4    YOUR: 4
Memento (2000) 4    YOUR: 4
Along Came a Spider (2001) 4    YOUR: 4
Mummy Returns, The (2001) 4    YOUR: 3
Waterworld (1995) 3    YOUR: 3
Dumb & Dumber (1994) 3    YOUR: 2
Ace Ventura: Pet Detective (1994) 3    YOUR: 2
Forrest Gump (1994) 4    YOUR: 4
Crocodile Dundee (1986) 4    YOUR: 3
Silence of the Lambs, The (1991) 4    YOUR: 4
American Pie (1999) 4    YOUR: 4
Last Action Hero (1993) 4    YOUR: 4
Total Recall (1990) 4    YOUR: 4
Citizen Kane (1941) 4    YOUR: 4
RoboCop (1987) 4    YOUR: 4
Any Given Sunday (19