# Лабораторная работа


Создание рекомендательной модели.


# Цель лабораторной работы:


изучение разработки рекомендательных моделей.

# Требования к отчету:

Отчет по лабораторной работе должен содержать:

титульный лист;
описание задания;
текст программы;
экранные формы с примерами выполнения программы.

# Задание:

Выбрать произвольный набор данных (датасет), предназначенный для построения рекомендательных моделей.
Опираясь на материалы лекции, сформировать рекомендации для одного пользователя (объекта) двумя произвольными способами.
Сравнить полученные рекомендации (если это возможно, то с применением метрик).

In [25]:
import numpy as np
import pandas as pd
from typing import Dict, Tuple
from scipy import stats
from IPython.display import Image
from IPython.display import Image
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.datasets import load_iris, load_boston
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, median_absolute_error, r2_score 
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
from collections import defaultdict
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
%matplotlib inline 
sns.set(style="ticks")

# Чтение и обработка данных

In [26]:
data = pd.read_csv(r'C:\Users\asus\Desktop\iu5\MMO\lab4\rym_clean1.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,position,release_name,artist_name,release_date,release_type,primary_genres,secondary_genres,descriptors,avg_rating,rating_count,review_count
0,1,1,OK Computer,Radiohead,1997-06-16,album,"Alternative Rock, Art Rock",,"melancholic, anxious, futuristic, malevocals, ...",4.24,74027,1541
1,2,2,Kid A,Radiohead,2000-10-03,album,"Art Rock, Experimental Rock, Electronic","Ambient, Electronic, IDM","cold, melancholic, futuristic, anxious, atmosp...",4.23,61658,751
2,3,3,The Dark Side of the Moon,Pink Floyd,1973-03-23,album,"Art Rock, Progressive Rock","Psychedelic Rock, Space Rock","philosophical, atmospheric, introspective, exi...",4.21,60192,1557
3,4,4,Loveless,My Bloody Valentine,1991-11-11,album,"Shoegaze, Noise Pop","Dream Pop, Neo-Psychedelia","noisy, ethereal, atmospheric, romantic, love, ...",4.24,53174,1264
4,5,5,My Beautiful Dark Twisted Fantasy,Kanye West,2010-11-22,album,"Pop Rap, Hip Hop",Art Pop,"epic, boastful, passionate, sampling, hedonist...",4.09,52149,638


In [27]:
data.shape

(5000, 12)

In [63]:
description_data = data[data['descriptors'].notnull()]
description_data.shape

(5000, 12)

In [64]:
title = description_data['primary_genres'].values
title[0:5]

array(['Alternative Rock, Art Rock',
       'Art Rock, Experimental Rock, Electronic',
       'Art Rock, Progressive Rock', 'Shoegaze, Noise Pop',
       'Pop Rap, Hip Hop'], dtype=object)

In [65]:
descriptions = description_data['descriptors'].values
descriptions[0:5]

array(['melancholic, anxious, futuristic, malevocals, existential, alienation, atmospheric, lonely, cold, pessimistic',
       'cold, melancholic, futuristic, anxious, atmospheric, sombre, cryptic, abstract, introspective, malevocals',
       'philosophical, atmospheric, introspective, existential, mellow, conceptalbum, malevocals, psychedelic, progressive, epic',
       'noisy, ethereal, atmospheric, romantic, love, dense, hypnotic, psychedelic, femalevocals, lush',
       'epic, boastful, passionate, sampling, hedonistic, vulgar, anthemic, melodic, malevocals, introspective'],
      dtype=object)

In [66]:
description_data.keys()

Index(['Unnamed: 0', 'position', 'release_name', 'artist_name', 'release_date',
       'release_type', 'primary_genres', 'secondary_genres', 'descriptors',
       'avg_rating', 'rating_count', 'review_count'],
      dtype='object')

In [67]:
wine_ids = description_data['Unnamed: 0'].values
wine_ids

array([   1,    2,    3, ..., 4998, 4999, 5000], dtype=int64)

In [68]:
%%time
tfidf = TfidfVectorizer()
description_matrix = tfidf.fit_transform(descriptions)
description_matrix

Wall time: 65.8 ms


<5000x185 sparse matrix of type '<class 'numpy.float64'>'
	with 48545 stored elements in Compressed Sparse Row format>

In [69]:
description_matrix

<5000x185 sparse matrix of type '<class 'numpy.float64'>'
	with 48545 stored elements in Compressed Sparse Row format>

# Фильтрация на основе содержания. Метод k-ближайших соседей

In [70]:
class SimplerKnnRecomender:
  def __init__(self, X_matrix, X_ids, X_title, X_overview):
        """
        Входные параметры:
        X_matrix - обучающая выборка (матрица объект-признак)
        X_ids - массив идентификаторов объектов
        X_title - массив названий объектов
        X_overview - массив описаний объектов
        """
        #Сохраняем параметры в переменных объекта
        self._X_matrix = X_matrix
        self.df = pd.DataFrame(
            {'id': pd.Series(X_ids, dtype='int'),
            'title': pd.Series(X_title, dtype='str'),
            'overview': pd.Series(X_overview, dtype='str'),
            'dist': pd.Series([], dtype='float')})
  
  def recommend_for_single_object(self, K: int, \
                X_matrix_object, cos_flag = True, manh_flag = False):
        """
        Метод формирования рекомендаций для одного объекта.
        Входные параметры:
        K - количество рекомендуемых соседей 
        X_matrix_object - строка матрицы объект-признак, соответствующая объекту
        cos_flag - флаг вычисления косинусного расстояния
        manh_flag - флаг вычисления манхэттэнского расстояния
        Возвращаемое значение: K найденных соседей
        """
        
        scale = 1000000
        # Вычисляем косинусную близость
        if cos_flag:
            dist = cosine_similarity(self._X_matrix, X_matrix_object)
            self.df['dist'] = dist * scale
            res = self.df.sort_values(by='dist', ascending=False)
            # Не учитываем рекомендации с единичным расстоянием,
            # так как это искомый объект
            res = res[res['dist'] < scale]
        
        else:
            if manh_flag:
                dist = manhattan_distances(self._X_matrix, X_matrix_object)
            else:
                dist = euclidean_distances(self._X_matrix, X_matrix_object)
            self.df['dist'] = dist * scale
            res = self.df.sort_values(by='dist', ascending=True)
            # Не учитываем рекомендации с единичным расстоянием,
            # так как это искомый объект
            res = res[res['dist'] > 0.0]            
        
        # Оставляем К первых рекомендаций
        res = res.head(K)
        return res

In [71]:
test_id = 11
print(title[test_id])
print(descriptions[test_id])

Progressive Rock, Art Rock
fantasy, epic, progressive, complex, philosophical, surreal, poetic, malevocals, melancholic, abstract


In [72]:
test_matrix = description_matrix[test_id]
test_matrix

<1x185 sparse matrix of type '<class 'numpy.float64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [73]:
skr1 = SimplerKnnRecomender(description_matrix, wine_ids, title, descriptions)

In [76]:
# в порядке убывания схожести на основе косинусного сходства
rec1 = skr1.recommend_for_single_object(15, test_matrix)
rec1

Unnamed: 0,id,title,overview,dist
1327,1328,Progressive Rock,"fantasy, surreal, complex, uncommontimesignatu...",660513.690373
1602,1603,"Hard Rock, Progressive Rock","malevocals, energetic, progressive, fantasy, m...",645920.613596
568,569,"Progressive Rock, Symphonic Prog","epic, fantasy, uncommontimesignatures, complex...",636683.141562
960,961,"Symphonic Prog, Progressive Rock","epic, progressive, complex, technical, uncommo...",619207.945847
122,123,"Symphonic Prog, Progressive Rock","epic, uplifting, complex, technical, progressi...",609191.614561
3263,3264,Progressive Rock,"philosophical, malevocals, complex, fantasy, p...",596640.451178
2376,2377,Progressive Rock,"philosophical, complex, poetic, dark, malevoca...",595706.008204
965,966,"Progressive Rock, Hard Rock","progressive, technical, fantasy, sciencefictio...",592362.078492
2728,2729,Progressive Rock,"malevocals, progressive, complex, philosophica...",589774.660102
1261,1262,Progressive Rock,"dark, philosophical, complex, lonely, epic, ma...",568270.761513


In [77]:
# При поиске с помощью Евклидова расстояния получаем такой же результат
rec2 = skr1.recommend_for_single_object(15, test_matrix, cos_flag = False)
rec2

Unnamed: 0,id,title,overview,dist
1327,1328,Progressive Rock,"fantasy, surreal, complex, uncommontimesignatu...",823997.948574
1602,1603,"Hard Rock, Progressive Rock","malevocals, energetic, progressive, fantasy, m...",841521.700735
568,569,"Progressive Rock, Symphonic Prog","epic, fantasy, uncommontimesignatures, complex...",852428.130036
960,961,"Symphonic Prog, Progressive Rock","epic, progressive, complex, technical, uncommo...",872687.864191
122,123,"Symphonic Prog, Progressive Rock","epic, uplifting, complex, technical, progressi...",884090.929078
3263,3264,Progressive Rock,"philosophical, malevocals, complex, fantasy, p...",898175.426987
2376,2377,Progressive Rock,"philosophical, complex, poetic, dark, malevoca...",899215.204271
965,966,"Progressive Rock, Hard Rock","progressive, technical, fantasy, sciencefictio...",902926.266655
2728,2729,Progressive Rock,"malevocals, progressive, complex, philosophica...",905787.32592
1261,1262,Progressive Rock,"dark, philosophical, complex, lonely, epic, ma...",929224.664425


In [78]:
# Манхэттэнское расстояние дает несколько иные результаты поиска
rec3 = skr1.recommend_for_single_object(15, test_matrix, 
                                        cos_flag = False, manh_flag = True)
rec3

Unnamed: 0,id,title,overview,dist
1327,1328,Progressive Rock,"fantasy, surreal, complex, uncommontimesignatu...",2073638.0
1602,1603,"Hard Rock, Progressive Rock","malevocals, energetic, progressive, fantasy, m...",2347261.0
568,569,"Progressive Rock, Symphonic Prog","epic, fantasy, uncommontimesignatures, complex...",2397473.0
960,961,"Symphonic Prog, Progressive Rock","epic, progressive, complex, technical, uncommo...",2424417.0
122,123,"Symphonic Prog, Progressive Rock","epic, uplifting, complex, technical, progressi...",2507759.0
3263,3264,Progressive Rock,"philosophical, malevocals, complex, fantasy, p...",2508233.0
2728,2729,Progressive Rock,"malevocals, progressive, complex, philosophica...",2573049.0
965,966,"Progressive Rock, Hard Rock","progressive, technical, fantasy, sciencefictio...",2587397.0
2376,2377,Progressive Rock,"philosophical, complex, poetic, dark, malevoca...",2597139.0
1261,1262,Progressive Rock,"dark, philosophical, complex, lonely, epic, ma...",2623166.0


# Коллаборативная фильтрация. Метод на основе сингулярного разложения

In [79]:
data.head()

Unnamed: 0.1,Unnamed: 0,position,release_name,artist_name,release_date,release_type,primary_genres,secondary_genres,descriptors,avg_rating,rating_count,review_count
0,1,1,OK Computer,Radiohead,1997-06-16,album,"Alternative Rock, Art Rock",,"melancholic, anxious, futuristic, malevocals, ...",4.24,74027,1541
1,2,2,Kid A,Radiohead,2000-10-03,album,"Art Rock, Experimental Rock, Electronic","Ambient, Electronic, IDM","cold, melancholic, futuristic, anxious, atmosp...",4.23,61658,751
2,3,3,The Dark Side of the Moon,Pink Floyd,1973-03-23,album,"Art Rock, Progressive Rock","Psychedelic Rock, Space Rock","philosophical, atmospheric, introspective, exi...",4.21,60192,1557
3,4,4,Loveless,My Bloody Valentine,1991-11-11,album,"Shoegaze, Noise Pop","Dream Pop, Neo-Psychedelia","noisy, ethereal, atmospheric, romantic, love, ...",4.24,53174,1264
4,5,5,My Beautiful Dark Twisted Fantasy,Kanye West,2010-11-22,album,"Pop Rap, Hip Hop",Art Pop,"epic, boastful, passionate, sampling, hedonist...",4.09,52149,638


In [136]:
data3 = data[3000:4000]

In [144]:
# Количество уникальных дегустаторов
len(data3['artist_name'].unique())

792

In [145]:
len(data3['primary_genres'].unique())

673

In [147]:

def create_utility_matrix(data):
    itemField = 'primary_genres'
    userField = 'artist_name'
    valueField = 'review_count'  
    
    userList = data[userField].tolist()
    itemList = data[itemField].tolist()
    valueList = data[valueField].tolist()    
    
    users = list(set(userList))
    items = list(set(itemList))    
    
    users_index = {users[i]: i for i in range(len(users))}    
    pd_dict = {item: [0.0 for i in range(len(users))] for item in items}    
    
    for i in range(0,data.shape[0]):
        item = itemList[i]
        user = userList[i]
        value = valueList[i]    
        pd_dict[item][users_index[user]] = value    
    
    X = pd.DataFrame(pd_dict)
    X.index = users
        
    itemcols = list(X.columns)
    items_index = {itemcols[i]: i for i in range(len(itemcols))}
    
    return X, users_index, items_index

In [148]:
%%time
user_item_matrix, users_index, items_index = create_utility_matrix(data3)

Wall time: 128 ms


In [149]:
user_item_matrix

Unnamed: 0,"Nu Metal, Rap Metal","Djent, Progressive Metal","Singer-Songwriter, Ambient Pop",Bossanova,"Experimental Hip Hop, Pop Rap","Singer-Songwriter, Folk Rock, Roots Rock","Experimental, Ambient","Psychedelic Pop, Neo-Psychedelia",Screamo,"Art Rock, Glam Rock, Pop Rock",...,"Avant-Garde Jazz, Grindcore","Atmospheric Black Metal, Post-Metal","Post-Punk, Blackgaze","Country Rock, Singer-Songwriter, Country","Bedroom Pop, Neo-Soul","Power Pop, Indie Pop","Psychedelic Pop, Hypnagogic Pop",Symphonic Black Metal,"Industrial Rock, Alternative Metal","Indie Pop, Pop Rock, Soundtracks"
Travis,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Damon Albarn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Beck,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ben Folds Five,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Le Tigre,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Atlas Sound,0.0,0.0,0.0,0.0,0.0,0.0,0.0,61.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Julia Holter,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Rolling Stones,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Superorganism,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [150]:
user_item_matrix__test = user_item_matrix.loc[['Atlas Sound']]
user_item_matrix__test

Unnamed: 0,"Nu Metal, Rap Metal","Djent, Progressive Metal","Singer-Songwriter, Ambient Pop",Bossanova,"Experimental Hip Hop, Pop Rap","Singer-Songwriter, Folk Rock, Roots Rock","Experimental, Ambient","Psychedelic Pop, Neo-Psychedelia",Screamo,"Art Rock, Glam Rock, Pop Rock",...,"Avant-Garde Jazz, Grindcore","Atmospheric Black Metal, Post-Metal","Post-Punk, Blackgaze","Country Rock, Singer-Songwriter, Country","Bedroom Pop, Neo-Soul","Power Pop, Indie Pop","Psychedelic Pop, Hypnagogic Pop",Symphonic Black Metal,"Industrial Rock, Alternative Metal","Indie Pop, Pop Rock, Soundtracks"
Atlas Sound,0.0,0.0,0.0,0.0,0.0,0.0,0.0,61.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [151]:
artist_name = np.delete(data3['artist_name'].unique(), 0)
artist_name = np.delete(artist_name, 6)
artist_name

array(['Ozzy Osbourne', "death's dynamic shroud.wmv", 'Neil Young',
       'Future', 'Fugazi', 'MIKE', 'Nightwish', 'Dire Straits',
       'Stereolab', 'Amy Winehouse', 'Iron & Wine', 'Mos Def',
       'Depeche Mode', 'The Black Keys', 'Unknown Mortal Orchestra',
       'Lemon Demon', 'Summoning', 'Alex Cameron', 'Camel', 'Iceage',
       'The Horace Silver Quintet', 'Jeff Rosenstock', 'John Cale',
       'Marina and the Diamonds', 'Katatonia', 'WHY?', 'Jorge Ben',
       'Okkervil River', 'Sparks', 'Built to Spill', '070 Shake', 'Japan',
       'The Chemical Brothers', 'The Fall',
       'Elvis Costello & The Attractions',
       'King Gizzard & The Lizard Wizard', 'The Decemberists', 'Yes',
       'A Winged Victory for the Sullen', 'Happy Mondays', 'Funkadelic',
       'Rush', 'Nas', 'Sun Kil Moon', 'Type O Negative',
       'The New Pornographers', 'Kaiser Chiefs', 'Ryoji Ikeda', 'yeule',
       'Smashing Pumpkins', 'Julia Holter', 'Naked City', 'AIR',
       'A Perfect Circle', 'Ge

In [152]:
user_item_matrix__train = user_item_matrix.loc[artist_name]
user_item_matrix__train

Unnamed: 0,"Nu Metal, Rap Metal","Djent, Progressive Metal","Singer-Songwriter, Ambient Pop",Bossanova,"Experimental Hip Hop, Pop Rap","Singer-Songwriter, Folk Rock, Roots Rock","Experimental, Ambient","Psychedelic Pop, Neo-Psychedelia",Screamo,"Art Rock, Glam Rock, Pop Rock",...,"Avant-Garde Jazz, Grindcore","Atmospheric Black Metal, Post-Metal","Post-Punk, Blackgaze","Country Rock, Singer-Songwriter, Country","Bedroom Pop, Neo-Soul","Power Pop, Indie Pop","Psychedelic Pop, Hypnagogic Pop",Symphonic Black Metal,"Industrial Rock, Alternative Metal","Indie Pop, Pop Rock, Soundtracks"
Ozzy Osbourne,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
death's dynamic shroud.wmv,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Neil Young,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Future,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Fugazi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Stone Temple Pilots,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bullet for My Valentine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Jordaan Mason & The Horse Museum,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Buffalo Springfield,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [153]:
%%time
U, S, VT = np.linalg.svd(user_item_matrix__train.T)
V = VT.T

Wall time: 383 ms


In [154]:
U.shape

(673, 673)

In [155]:
# Матрица соотношения между объектами и латентными факторами
V.shape

(790, 790)

In [156]:
S.shape

(673,)

In [157]:
Sigma = np.diag(S)
Sigma.shape

(673, 673)

In [158]:
Sigma

array([[3.31160846e+02, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 3.19292687e+02, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 2.67145687e+02, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        8.05901878e-15, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 4.06049289e-15, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 3.80919034e-15]])

In [159]:
# Используем 3 первых сингулярных значения
r=3
Ur = U[:, :r]
Sr = Sigma[:r, :r]
Vr = V[:, :r]
# Матрица соотношения между новым дегустатором и латентными факторами
test_user = np.mat(user_item_matrix__test.values)
test_user.shape, test_user

((1, 673),
 matrix([[ 0.,  0.,  0.,  0.,  0.,  0.,  0., 61.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,

In [160]:
tmp = test_user * Ur * np.linalg.inv(Sr)
tmp

matrix([[ 3.12825654e-17, -9.54975222e-19, -4.10890695e-18]])

In [161]:
test_user_result = np.array([tmp[0,0], tmp[0,1], tmp[0,2]])
test_user_result

array([ 3.12825654e-17, -9.54975222e-19, -4.10890695e-18])

In [162]:
# Вычисляем косинусную близость между текущим дегустатором 
# и остальными дегустаторами
cos_sim = cosine_similarity(Vr, test_user_result.reshape(1, -1))
cos_sim[:10]

array([[ 0.        ],
       [ 0.        ],
       [ 0.        ],
       [ 0.        ],
       [ 0.11737584],
       [-0.85471808],
       [ 0.98545249],
       [ 0.83682442],
       [ 0.6394072 ],
       [-0.96589831]])

In [163]:
# Преобразуем размерность массива
cos_sim_list = cos_sim.reshape(-1, cos_sim.shape[0])[0]
cos_sim_list[:10]

array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.11737584,
       -0.85471808,  0.98545249,  0.83682442,  0.6394072 , -0.96589831])

In [164]:
# Находим наиболее близкого дегустатора
recommended_user_id = np.argsort(-cos_sim_list)[0]
recommended_user_id

63

In [165]:
test_user

matrix([[ 0.,  0.,  0.,  0.,  0.,  0.,  0., 61.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,

In [166]:
# Получение названия вина
wine_list = list(user_item_matrix.columns)
def film_name_by_movieid(ind):
    try:
        wine = wine_list[ind]
        #print(wineId)
        #flt_links = data3[data['movieId'] == wineId]
        #tmdbId = int(flt_links['tmdbId'].values[0])
        #md_links = df_md[df_md['id'] == tmdbId]
        #res = md_links['title'].values[0]
        return wine
    except:
        return ''

In [167]:
# Вина, которые оценивал текущий дегустатор:
i=1
for idx, item in enumerate(np.ndarray.flatten(np.array(test_user))):
    if item > 0:
        film_title = film_name_by_movieid(idx)
        print('{} - {} - {}'.format(idx, film_title, item))
        if i==20:
            break
        else:
            i+=1

7 - Psychedelic Pop, Neo-Psychedelia - 61.0


In [168]:
# Вина, которые оценивал наиболее схожий дегустатор:
i=1
recommended_user_item_matrix = user_item_matrix.loc[['Atlas Sound']]
for idx, item in enumerate(np.ndarray.flatten(np.array(recommended_user_item_matrix))):
    if item > 0:
        film_title = film_name_by_movieid(idx)
        print('{} - {} - {}'.format(idx, film_title, item))
        if i==20:
            break
        else:
            i+=1

7 - Psychedelic Pop, Neo-Psychedelia - 61.0


Как видно, фильтрация на основе содержания и коллаборативная фильтрация показывают различные результаты работы в рамках рекомендательных систем