In [32]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder

from tqdm import tqdm
from typing import Callable, List

import matplotlib.pyplot as plt
import seaborn as sns
import scipy.sparse as scs


In [33]:

def apk(relevant: List[int], predicted: List[int], k: int) -> float:
    """Average precision at k (AP@k) for a single user.

    Args:
        relevant: ids of the items that are actually relevant for the user.
        predicted: recommended item ids, best first; only the first ``k``
            entries are scored.
        k: cut-off rank.

    Returns:
        The sum of precision@i over the (1-based) ranks ``i`` that hold a
        relevant item, divided by ``min(len(relevant), k)``. Returns ``0.0``
        when there is nothing to normalise by (no relevant items or
        ``k <= 0``).
    """
    denominator = min(len(relevant), k)
    if denominator <= 0:
        # No hit is possible; this also avoids a ZeroDivisionError.
        return 0.0

    relevant_items = set(relevant)  # O(1) membership instead of a list scan
    already_hit = set()
    score = 0.0
    num_hits = 0

    for rank, item in enumerate(predicted[:k], start=1):
        # A repeated recommendation must not be rewarded twice, otherwise
        # the score could exceed 1.
        if item in relevant_items and item not in already_hit:
            already_hit.add(item)
            num_hits += 1
            score += num_hits / rank

    return score / denominator

def mapk(relevant: List[List[int]], predicted: List[List[int]], k: int = 20):
    """Mean average precision at k over paired per-user lists.

    ``relevant[i]`` and ``predicted[i]`` are scored together with ``apk``;
    the result is the mean of those per-user scores.
    """
    per_user_scores = []
    for user_relevant, user_predicted in zip(relevant, predicted):
        per_user_scores.append(apk(user_relevant, user_predicted, k))
    return np.mean(per_user_scores)


def jaccard(ratings: np.ndarray, user_vector: np.ndarray) -> np.ndarray:
    """Jaccard similarity between ``user_vector`` and every row of ``ratings``.

    Both inputs are interpreted as boolean masks (non-zero means "present").

    Args:
        ratings: array whose rows are compared with ``user_vector``; a 1-D
            array is broadcast as a single row.
        user_vector: 1-D vector with one entry per column of ``ratings``.

    Returns:
        1-D float array with ``|row AND user| / |row OR user|`` per row.
        Rows whose union with ``user_vector`` is empty get ``0`` (instead of
        NaN from 0/0), and rows with similarity exactly ``1`` are reset to
        ``0`` so that identical rows (e.g. the user itself) are not treated
        as neighbours.
    """
    user_vector = user_vector.reshape(1, -1)
    intersection = np.sum(np.logical_and(ratings, user_vector), axis=1)
    union = np.sum(np.logical_or(ratings, user_vector), axis=1)

    # Divide only where the union is non-empty; other entries stay at 0.
    similarity = np.zeros(intersection.shape, dtype=float)
    np.divide(intersection, union, out=similarity, where=union > 0)

    similarity[similarity == 1.] = 0

    return similarity


In [34]:
# Relative locations of the input CSV files (Windows-style separators).
# Raw strings keep the backslashes literal: sequences such as "\D" and "\m"
# are invalid escapes in a normal string literal and trigger a SyntaxWarning.
path_ratings = r'..\Data\music_dataset.csv'
path_track_info = r'..\Data\tracks_info.csv'

In [35]:
# Load only the real data columns: the file also carries the index it was
# saved with, which would otherwise show up as a spurious 'Unnamed: 0' column.
ratings = pd.read_csv(path_ratings, usecols=['userId', 'trackId'])
ratings.head()

Unnamed: 0,userId,trackId
0,0,14
1,0,95
2,0,219
3,0,220
4,0,404


In [36]:
# Load only the real data columns: the file also carries the index it was
# saved with, which would otherwise show up as a spurious 'Unnamed: 0' column.
tracks_info = pd.read_csv(path_track_info, usecols=['id', 'name', 'artists'])
tracks_info.head()

Unnamed: 0,id,name,artists
0,0,What There Is,['a-ha']
1,1,I'll Play The Blues For You,['Albert King']
2,2,Breaking Up Somebody's Home,['Albert King']
3,3,Imma Be,['Black Eyed Peas']
4,4,Boom Boom Pow,['Black Eyed Peas']


Для оценки качества рекомендаций мы будем использовать метрику $MAP@k$.

$$
MAP@k = \frac{1}{N} \sum_{u = 1}^N AP_u@k
$$
$$
AP_u@k = \frac{1}{\min(k, n_u)} \sum_{i=1}^k r_u(i) p_u@i
$$
$$p_u@k = \dfrac{1}{k}\sum_{j=1}^k r_u(j)$$


*   $N$ - количество пользователей.
*   $n_u$ - число релевантных треков пользователя $u$ на тестовом промежутке.
*   $r_u(i)$ - бинарная величина: относится ли трек на позиции $i$ к релевантным.

In [37]:
# Display the ratings frame (the rendered output is an abbreviated view with
# the middle rows elided).
ratings

Unnamed: 0,userId,trackId
0,0,14
1,0,95
2,0,219
3,0,220
4,0,404
...,...,...
141449,240,69609
141450,240,69843
141451,240,70180
141452,240,70233


In [38]:
def train_test_split(ratings, num_test_samples: int = 50):
    """Split every user's interactions into a train part and a held-out tail.

    For each ``userId`` group, the last ``num_test_samples`` rows (in the
    order they appear in ``ratings``) form the test part and all rows before
    them form the train part. Test rows whose ``trackId`` never occurs in the
    train part are dropped, because no representation can be learned for
    such tracks.

    Args:
        ratings: DataFrame with at least ``userId`` and ``trackId`` columns.
        num_test_samples: number of trailing rows held out per user.

    Returns:
        ``(train_ratings, test_ratings)``: two DataFrames, each with a fresh
        0..n-1 index.

    Raises:
        ValueError: if ``num_test_samples`` is negative.
    """
    if num_test_samples < 0:
        raise ValueError('num_test_samples must be non-negative')

    user_groups = ratings.groupby('userId')
    train_parts, test_parts = [], []

    # getting train samples
    for _, user_data in tqdm(user_groups):
        # Explicit split point: also correct for num_test_samples == 0,
        # where a plain ``[:-0]`` slice would return an empty frame.
        split_at = max(len(user_data) - num_test_samples, 0)
        train_parts.append(user_data.iloc[:split_at])

    train_ratings = pd.concat(train_parts).reset_index(drop=True)
    all_train_items = train_ratings['trackId'].unique()

    # getting test samples
    # we drop all tracks that are not present in the training samples,
    # because we won't be able to learn representations for them
    for _, user_data in tqdm(user_groups):
        split_at = max(len(user_data) - num_test_samples, 0)
        test_items = user_data.iloc[split_at:]
        test_items = test_items[test_items['trackId'].isin(all_train_items)]
        test_parts.append(test_items)

    test_ratings = pd.concat(test_parts).reset_index(drop=True)

    return train_ratings, test_ratings

In [39]:
class User2User:
    """User-based collaborative filtering on a binary user-item matrix.

    A track is scored for a user by the similarity-weighted share of
    "neighbour" users (similarity >= ``alpha``) who have that track.
    """

    def __init__(self, ratings: pd.DataFrame, alpha=0.02):
        """Build the dense interaction matrix ``R`` from ``ratings``.

        param ratings: DataFrame with ``userId`` and ``trackId`` columns.
            The ids are used directly as row / column indices of ``R``, so
            they must be integers in ``0..n_users-1`` / ``0..n_items-1``.
        param alpha: minimal similarity for a user to count as a neighbour.
        """
        self.ratings = ratings
        self.n_users = len(np.unique(self.ratings['userId']))
        self.n_items = len(np.unique(self.ratings['trackId']))

        self.R = np.zeros((self.n_users, self.n_items))
        self.R[self.ratings['userId'], self.ratings['trackId']] = 1.

        self.similarity_func = jaccard
        self.alpha = alpha

    def remove_train_items(self, preds: List[List[int]], k: int):
        """
        param preds: [n_users, n_items] - recommended items for each user
        param k: int
        return: np.array [n_users, k] - recommended items without training examples

        Rows of users that do not occur in ``self.ratings`` stay all-zero.
        Assumes every user has at least ``k`` recommended items left after
        the training items are removed.
        """
        new_preds = np.zeros((len(preds), k), dtype=int)
        for user_id, user_data in self.ratings.groupby('userId'):
            # asarray lets plain lists be masked just like ndarray rows
            user_preds = np.asarray(preds[user_id])
            unseen = ~np.isin(user_preds, user_data['trackId'])
            new_preds[user_id] = user_preds[unseen][:k]

        return new_preds

    def get_test_recommendations(self, k: int):
        """Top-``k`` recommendations for every user, training items excluded.

        return: list of ``n_users`` lists with ``k`` item ids each.
        """
        ranked_items = np.zeros((self.n_users, self.n_items), dtype=int)
        for uid in range(self.n_users):
            ranked_items[uid] = self.recommend(uid)

        return self.remove_train_items(ranked_items, k).tolist()

    def similarity(self, user_vector: np.ndarray):
        """
        user_vector: [n_items]
        return: np.array [n_similar, 1] - indices of the rows of ``R`` whose
            similarity to ``user_vector`` is at least ``alpha``
        """
        distance = self.similarity_func(self.R, user_vector)
        similar_objects = np.argwhere(distance >= self.alpha)  # similar users in our case
        return similar_objects

    def recommend(self, uid: int):
        """Rank all items for user ``uid``, best first.

        return: np.array [n_items] - item ids ordered by decreasing score
        """
        # Similarities are computed once and reused as weights. The boolean
        # mask keeps the neighbour matrix 2-D even for zero or one neighbour.
        similarities = self.similarity_func(self.R, self.R[uid])
        is_neighbour = similarities >= self.alpha

        weights = similarities[is_neighbour]
        neighbour_ratings = self.R[is_neighbour]

        weighted_ratings = np.dot(weights, neighbour_ratings)
        # The small constant keeps the division defined without neighbours.
        scores = weighted_ratings / (np.abs(weights).sum() + 1e-4)

        return np.argsort(-scores)

In [40]:
# Hold out the tail of every user's history as the test set; the remaining
# rows form the training set.
train_ratings, test_ratings = train_test_split(ratings)

100%|██████████| 241/241 [00:00<00:00, 30153.25it/s]
100%|██████████| 241/241 [00:00<00:00, 4228.38it/s]


In [41]:
# Keep only the tracks that occur in the training data. Boolean filtering
# builds a new frame (no in-place mutation) and does not depend on row
# positions coinciding with index labels.
train_track_ids = train_ratings['trackId'].unique()
tracks_info = tracks_info[tracks_info['id'].isin(train_track_ids)].reset_index(drop=True)

In [42]:
def ids_encoder(ratings):
    """Fit label encoders on the user and track ids found in ``ratings``.

    Returns a ``(uencoder, iencoder)`` pair of ``LabelEncoder`` objects: the
    first is fitted on the sorted unique ``userId`` values, the second on the
    sorted unique ``trackId`` values.
    """
    def fit_on(column):
        # fit() returns the encoder itself, so it can be returned directly
        return LabelEncoder().fit(sorted(ratings[column].unique()))

    return fit_on('userId'), fit_on('trackId')

In [43]:
# Fit the encoders on the training data, then re-map the track ids of all
# three frames with the same track encoder.
uencoder, iencoder = ids_encoder(train_ratings)
for frame, id_column in ((train_ratings, 'trackId'), (test_ratings, 'trackId'), (tracks_info, 'id')):
    frame[id_column] = iencoder.transform(frame[id_column].tolist())

In [44]:
# Per-user ground truth: test_users[i] is a user id and test_relevant[i] is
# the list of that user's held-out track ids.
test_users, test_relevant = [], []
for user_id, user_data in test_ratings.groupby('userId'):
    test_users.append(user_id)
    test_relevant.append(user_data['trackId'].tolist())

In [45]:
# Build the user-to-user model from the training interactions (default alpha).
model = User2User(train_ratings)

In [46]:
# Top-40 recommendations for every user, with training tracks removed.
pred_recs = model.get_test_recommendations(40)

In [47]:
# Pick the recommendations by the test user ids themselves so that they stay
# aligned one-to-one with test_relevant (both follow the order of test_users).
# An id without recommendations now fails loudly instead of being skipped.
filtered_pred_recs = [pred_recs[test_user] for test_user in test_users]

In [48]:
# MAP@25 over the test users; apk scores only the first 25 entries of each
# recommendation list.
mapk(test_relevant, filtered_pred_recs, 25)

0.0011257155633638825

In [49]:
# Pick an example user with a fixed seed so that the cells below show the
# same user on every run (the upper bound is exclusive).
user_id = int(np.random.default_rng(42).integers(0, model.n_users))

In [50]:
listened_tracks = train_ratings[train_ratings.userId == user_id].trackId[:15]

print('Already listened tracks:')

# Look the tracks up through the 'id' column: relying on the row index is
# only correct when row positions happen to coincide with the track ids.
tracks_info.set_index('id').loc[listened_tracks, ['name', 'artists']]

Already listened tracks:


Unnamed: 0,name,artists
81,Ain't No Mountain High Enough,"['Marvin Gaye', 'Tammi Terrell']"
117,Everybody Wants To Rule The World,['Tears For Fears']
149,Wouldn't It Be Nice,['The Beach Boys']
150,Nights in White Satin,['The Moody Blues']
257,The Lady In Red,['Chris De Burgh']
258,What A Wonderful World,['Louis Armstrong']
261,California Dreamin',['The Mamas & The Papas']
362,Don't Let Me Be Misunderstood,['Nina Simone']
381,Dream A Little Dream Of Me,['The Mamas & The Papas']
581,Gimme! Gimme! Gimme! (A Man After Midnight),['ABBA']


In [51]:
preds = model.get_test_recommendations(15)

print('Predicted tracks:')

# Look the tracks up through the 'id' column: relying on the row index is
# only correct when row positions happen to coincide with the track ids.
tracks_info.set_index('id').loc[preds[user_id], ['name', 'artists']]

Predicted tracks:


Unnamed: 0,name,artists
805,Zombie,['The Cranberries']
1073,Smells Like Teen Spirit,['Nirvana']
9693,Another One Bites The Dust,['Queen']
8263,Shape Of My Heart,['Sting']
1019,It's My Life,['Bon Jovi']
164,My Favourite Game,['The Cardigans']
2555,Californication,['Red Hot Chili Peppers']
3211,Crazy,['Gnarls Barkley']
7780,ИСКАЛА,['Земфира']
2550,Otherside,['Red Hot Chili Peppers']


In [52]:
test_tracks = test_ratings[test_ratings.userId == user_id].trackId[:15]

print('Test-time tracks:')

# Look the tracks up through the 'id' column: relying on the row index is
# only correct when row positions happen to coincide with the track ids.
tracks_info.set_index('id').loc[test_tracks, ['name', 'artists']]

Test-time tracks:


Unnamed: 0,name,artists
51929,Mr. Sandman,['The Chordettes']
52345,Time Waits For No One,['Freddie Mercury']
53102,Аugust,['Intelligency']
53347,All For Us,"['Labrinth', 'Zendaya']"
53887,Clandestina,"['FILV', 'Edmofo']"
53975,Love Me Like There's No Tomorrow,['Freddie Mercury']
54406,So So,['Rusowsky']
54483,When I R.I.P.,['Labrinth']
54805,Made In Heaven,['Freddie Mercury']
54809,Mr. Bad Guy,['Freddie Mercury']
