In [1]:
from engine.models.distance import DistanceMatrix
from engine.models.movie import Movie

In [2]:
folder = '/media/mariam/Files/ran/clacket-save'
matrix = DistanceMatrix(folder)
matrix.load()

In [3]:
movies = ['1'] + list(matrix.matrix['1'].keys())

In [4]:
import random
from math import ceil

training_size = ceil(0.7 * len(movies))
training_set = random.sample(movies, training_size)
# testing_set = [x for x in movies if x not in training_set]
testing_set = movies

In [8]:
import sys

def k_nearest(matrix, id, k=5, thresh=None):
    distances = matrix.get_all(id)
    nearest_k = sorted(distances, key=lambda x: x[1])[1:k+1] # start from 1 because 0 will always be itself
    return [x[0] for x in nearest_k] if thresh is None else [x[0] for x in nearest_k if x[1] >= thresh]

def k_nearest_all(training_set, matrix, k=5):
    i = 1
    n = len(testing_set)
    for movie_id in testing_set:
        sys.stdout.write('\rCalculating nearest K for {0} out of {1}..'.format(i, n))
        sys.stdout.flush()
        movie = Movie.load(folder, movie_id)
        nearest = k_nearest(matrix, movie_id, k=k)
        users = []
        if not hasattr(movie, 'predictions') or movie.predictions is None or type(movie.predictions) == list:
            movie.predictions = {}
        for near_movie_id in nearest:
            near_movie = Movie.load(folder, near_movie_id)
            positive_users = near_movie.ratings[near_movie.ratings['rating'] > 3]['user_id'].tolist()
            users.extend(positive_users)
        movie.predictions[k] = users
        movie.save(folder)
        i += 1

def measure_predictions(movie_id, min, k):
    movie = Movie.load(folder, movie_id)
    intersection = movie.ratings[movie.ratings['user_id'].isin(movie.predictions[k])]
    hits = intersection[intersection['rating'] >= min].shape[0]
    misses = intersection[intersection['rating'] < min].shape[0]
    return (hits, misses)

def all_predictions(testing_set, min=4, k=5):
    hits = 0
    misses = 0
    for movie_id in testing_set:
        m_hits, m_misses = measure_predictions(movie_id, min, k)
        hits += m_hits
        misses += m_misses
    return (hits, misses)

def print_perc(predictions):
    hits, misses = predictions
    total = hits + misses
    hits_p = (hits/total) * 100
    misses_p = (misses/total) * 100
    print('Total hits: {0} ({1}%)\nTotal misses: {2} ({3}%)'.format(hits, hits_p, misses, misses_p))

def predict_and_print(training_set, matrix, k=5):
    print('k={0}:'.format(k))
    print('-----')

    k_nearest_all(training_set, matrix, k=k)
    print('Done.')

    predictions_4 = all_predictions(training_set, min=4, k=k)
    print('Results when counting good as 4 or 5 stars:')
    print_perc(predictions_4)

    predictions_3 = all_predictions(training_set, min=3, k=k)
    print('Results when counting good as 3, 4, or 5 stars:')
    print_perc(predictions_3)

In [None]:
predict_and_print(training_set, matrix, k=6)

k=6:
-----
Calculating nearest K for 9399 out of 9399..Done.
Results when counting good as 4 or 5 stars:
Total hits: 11946801 (78.53188870464925%)
Total misses: 3265874 (21.468111295350752%)


In [6]:
predict_and_print(training_set, matrix, k=5)

k=5:
-----
Calculating nearest K for 9399 out of 9399..Done.
Results when counting good as 4 or 5 stars:
Total hits: 11198580 (80.8065749679565%)
Total misses: 2659921 (19.19342503204351%)
Results when counting good as 3, 4, or 5 stars:
Total hits: 13140942 (94.82224664846508%)
Total misses: 717559 (5.177753351534918%)


In [9]:
predict_and_print(training_set, matrix, k=4)

k=4:
-----
Calculating nearest K for 9399 out of 9399..Done.
Results when counting good as 4 or 5 stars:
Total hits: 10494093 (83.93752780495952%)
Total misses: 2008173 (16.062472195040485%)
Results when counting good as 3, 4, or 5 stars:
Total hits: 11963726 (95.69246087069337%)
Total misses: 538540 (4.307539129306639%)


In [10]:
predict_and_print(training_set, matrix, k=3)

k=3:
-----
Calculating nearest K for 9399 out of 9399..Done.
Results when counting good as 4 or 5 stars:
Total hits: 9447972 (87.95226359989644%)
Total misses: 1294187 (12.047736400103554%)
Results when counting good as 3, 4, or 5 stars:
Total hits: 10402355 (96.83672528027188%)
Total misses: 339804 (3.163274719728129%)
