In [11]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder

from tqdm.notebook import tqdm
from typing import Callable, List

import matplotlib.pyplot as plt
import seaborn as sns
import scipy.sparse as scs

In [12]:
# Dataset locations, relative to the notebook's working directory.
# Forward slashes are used because they resolve on both Windows and POSIX and
# avoid the invalid "\D" / "\m" escape sequences that backslash-separated,
# non-raw literals produce (a SyntaxWarning on recent Python versions).
path_ratings = '../Data/music_dataset.csv'
path_track_info = '../Data/tracks_info.csv'

In [13]:
# Read the interactions file into a DataFrame and preview its first five rows
# (the preview below shows the columns userId and trackId).
ratings = pd.read_csv(filepath_or_buffer=path_ratings)
ratings.head(n=5)

Unnamed: 0,userId,trackId
0,0,14
1,0,95
2,0,219
3,0,220
4,0,404


In [14]:
# Read the track metadata file into a DataFrame and preview its first five rows
# (the preview below shows the columns id, name and artists).
tracks_info = pd.read_csv(filepath_or_buffer=path_track_info)
tracks_info.head(n=5)

Unnamed: 0,id,name,artists
0,0,What There Is,['a-ha']
1,1,I'll Play The Blues For You,['Albert King']
2,2,Breaking Up Somebody's Home,['Albert King']
3,3,Imma Be,['Black Eyed Peas']
4,4,Boom Boom Pow,['Black Eyed Peas']


Для оценки качества рекомендаций мы будем использовать метрику $MAP@k$.

$$
MAP@k = \frac{1}{N} \sum_{u = 1}^N AP_u@k
$$
$$
AP_u@k = \frac{1}{\min(k, n_u)} \sum_{i=1}^k r_u(i) p_u@i
$$
$$p_u@k = \dfrac{1}{k}\sum_{j=1}^k r_u(j)$$


*   $N$ - количество пользователей.
*   $n_u$ - число релевантных треков пользователя $u$ на тестовом промежутке.
*   $r_u(i)$ - бинарная величина: относится ли трек на позиции $i$ к релевантным.

In [8]:
def apk(relevant: List[int], predicted: List[int], k: int) -> float:
    """Average precision at ``k`` for a single user.

    Implements ``AP@k = 1 / min(k, n_u) * sum_{i=1..k} r(i) * p@i`` where
    ``r(i)`` is 1 when the item at position ``i`` is relevant and ``p@i`` is
    the precision over the first ``i`` positions.

    Args:
        relevant: Items that are relevant for the user.
        predicted: Recommended items, best first. Only the first ``k`` are used.
        k: Cut-off position.

    Returns:
        A score in ``[0, 1]``. Returns ``0.0`` when there are no relevant
        items or when ``k`` is not positive, instead of dividing by zero.
    """
    if k <= 0 or len(relevant) == 0:
        return 0.0

    # Set lookup keeps the loop linear in k instead of k * len(relevant).
    relevant_items = set(relevant)
    # A recommendation repeated in `predicted` must be credited only once,
    # otherwise the score can exceed 1.
    credited = set()

    score = 0.0
    num_hits = 0
    for position, item in enumerate(predicted[:k], start=1):
        if item in relevant_items and item not in credited:
            credited.add(item)
            num_hits += 1
            score += num_hits / position

    # n_u counts distinct relevant items, so duplicates in `relevant`
    # do not deflate the score.
    return score / min(len(relevant_items), k)

def mapk(relevant: List[List[int]], predicted: List[List[int]], k: int = 20):
    """Mean of the per-user AP@k scores.

    ``relevant`` and ``predicted`` are walked in parallel (one entry per user);
    each pair is scored with ``apk`` and the scores are averaged with
    ``np.mean``.
    """
    per_user_scores = []
    for user_relevant, user_predicted in zip(relevant, predicted):
        per_user_scores.append(apk(user_relevant, user_predicted, k))
    return np.mean(per_user_scores)


def jaccard(ratings: np.ndarray, user_vector: np.ndarray) -> np.ndarray:
    """Jaccard similarity between one user and every row of ``ratings``.

    Both inputs are treated as binary indicators (non-zero means "present").
    For each row the result is ``|row AND user| / |row OR user|``.

    Args:
        ratings: 2-D array with one row per user.
        user_vector: Vector with the same number of entries as a row of
            ``ratings``; it is reshaped to a single row before comparison.

    Returns:
        1-D float array with one similarity per row of ``ratings``. Rows whose
        union with ``user_vector`` is empty get ``0`` rather than NaN, and
        similarities equal to ``1`` are reset to ``0``.
    """
    ratings = np.asarray(ratings)
    user_row = np.asarray(user_vector).reshape(1, -1)

    intersection_size = np.sum(np.logical_and(ratings, user_row), axis=1)
    union_size = np.sum(np.logical_or(ratings, user_row), axis=1)

    # Divide only where the union is non-empty; the remaining entries keep the
    # initial 0, which avoids 0/0 -> NaN and the accompanying RuntimeWarning.
    similarity = np.zeros(union_size.shape, dtype=float)
    np.divide(intersection_size, union_size, out=similarity, where=union_size > 0)

    # Identical rows are zeroed out -- presumably so the user is never chosen
    # as their own neighbour; verify against the caller.
    similarity[similarity == 1.] = 0

    return similarity

NameError: name 'List' is not defined