In [40]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import lil_matrix
import numpy as np
import random
import math

In [41]:
top_n = 10
user_amount = 1000
song_amount = 384546

songs = dict()
users = dict()
neighbours_sim = dict()

user_song_matrix = lil_matrix((user_amount, song_amount), dtype=np.float64)
test_data = lil_matrix((user_amount, song_amount), dtype=np.float64)

In [42]:
def get_id(dict, key):
    if key not in dict:
        dict[key] = len(dict)
    return dict[key]


def load_data():
    with open('train_triplets.txt', 'r') as f:
        for line in f:
            user, song, play_count = line.split('\t')

            user = get_id(users, user)
            song = get_id(songs, song)

            if (user + 1) % (user_amount + 1) == 0:
                break

            user_song_matrix[user, song] = play_count
            if random.randint(1, 10) > 8:
                test_data[user, song] = play_count

load_data()
print("Amount of added records in matrix: ", user_song_matrix.nnz)

Amount of added records in matrix:  49631


In [43]:
# get top_n neighbours ordered by cos_sim
def get_n_neighbours(user):
    neighbours = []
    for neighbour in range(user_song_matrix.shape[0]):
        cos = cosine_similarity(user_song_matrix[user], user_song_matrix[neighbour])
        neighbours.append((cos, neighbour))
    return sorted(neighbours, reverse=True)[1:(top_n + 1)]

# save neighbours for each user for faster calculations
def calc_neighbours():
    for user in range(user_song_matrix.shape[0]):
        neighbours_sim[user] = get_n_neighbours(user)

print ("Start calculations ...\n")
calc_neighbours()
print("Top_n neighbors for first user (similarity, neighbour): ", neighbours_sim[0])


Start calculations ...

Top_n neighbors for first user (similarity, neighbour):  [(array([[ 0.24231188]]), 848), (array([[ 0.21761762]]), 920), (array([[ 0.10190103]]), 76), (array([[ 0.10073114]]), 471), (array([[ 0.08920516]]), 657), (array([[ 0.07323109]]), 335), (array([[ 0.0719195]]), 472), (array([[ 0.07163353]]), 414), (array([[ 0.0680847]]), 438), (array([[ 0.04950556]]), 101)]


In [44]:
# get probable rating for song (according to slide 11)
# r = sum(coefficient_of_nbhd_sim * nbhd_r)/(sum(abs(coefficients_of_similarities)))
# (mean-centering) we can also calculate rate as
# ra = user_avg_r + sum(coeff * (nbhd_r - avg_nbhd_r))/sum(abs(coefficients_of_similarities)))
def get_rate(user, song):
    neighbours = neighbours_sim[user]
    upper_sum = 0.0
    lower_sum = 0.0
    # user_avg_rate = user_song_matrix[user].sum() / user_song_matrix[user].nnz
    for (cos, neighbour) in neighbours:
        rate = user_song_matrix[neighbour, song]
        # ratings_sum = user_song_matrix[neighbour].sum()
        # ratings_amount = user_song_matrix[neighbour].nnz
        # avg_rate = ratings_sum / ratings_amount
        if rate != 0:
            # upper_sum += cos * (rate - avg_rate)
            upper_sum += cos * (rate)
            lower_sum += abs(cos)
    if lower_sum == 0:
        return 0
    # return user_avg_rate + upper_sum / lower_sum
    return upper_sum / lower_sum


In [45]:
# get top_n songs for user
def get_top_n(user):
    top_n_songs = []
    for song in songs:
        song_id = get_id(songs, song)
        if user_song_matrix[user, song_id] == 0:
            top_n_songs.append((get_rate(user, song_id), song))
    return sorted(top_n_songs, reverse=True)[:top_n]

top = get_top_n(1)
print("Top_N songs for first user: \n \t\tsong\t\t|\trating")
for (rating, song) in top:
    print(song, "\t\t", str(rating[0][0]))

Top_N songs for first user: 
 		song		|	rating
SOLPVAQ12AB017EB35 		 18.0
SOAFNSV12A8159E996 		 11.0
SOUFIYP12A6D4FB033 		 10.0
SOYIZSN12A6701E0BB 		 7.0
SOLRGVL12A8C143BC3 		 6.0
SOZFSQT12A67ADE650 		 5.0
SOYZKHZ12A6310ECA2 		 5.0
SOYLDJC12A6701E2FF 		 5.0
SOYDNWS12A6D4F8882 		 5.0
SOXXWJG12A8C13DAA5 		 5.0


In [46]:
# slide 5
# rmse = sqrt (sum((real - predected)^2)/amount_of_ratings)
def rmse():
    rates_am = 0
    sqr_sum = 0

    for user in range(user_song_matrix.shape[0]):
        rated_by_user = user_song_matrix[user].tocsr().indices
        for song in rated_by_user:
            if test_data[user, song] == 0:
                predicted = get_rate(user, song)
                real = user_song_matrix[user, song]
                rates_am += 1
                sqr_sum += math.pow((real - predicted), 2)

    return math.sqrt(sqr_sum / rates_am)


print("Calculated RMSE: ", str(rmse()))

Calculated RMSE:  11.621103735233664
