In [2]:
import numpy as np
import pandas as pd
from typing import Dict, Tuple
from scipy import stats
from IPython.display import Image
from IPython.display import Image
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.datasets import load_iris, load_boston
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, median_absolute_error, r2_score 
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
from surprise import SVD, Dataset, Reader
from surprise.model_selection import PredefinedKFold
from collections import defaultdict
from surprise.accuracy import rmse
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
%matplotlib inline 
sns.set(style="ticks")

# Лабораторная работа 5 [Введение в рекомендательные системы](https://ru.wikipedia.org/wiki/%D0%A0%D0%B5%D0%BA%D0%BE%D0%BC%D0%B5%D0%BD%D0%B4%D0%B0%D1%82%D0%B5%D0%BB%D1%8C%D0%BD%D0%B0%D1%8F_%D1%81%D0%B8%D1%81%D1%82%D0%B5%D0%BC%D0%B0) с использованием методов машинного обучения

## Набор данных

В качестве набора данных будем использовать набор данных о рекомендациях игр - https://www.kaggle.com/datasets/tamber/steam-video-games?select=steam-200k.csv

Steam is the world's most popular PC Gaming hub, with over 6,000 games and a community of millions of gamers. With a massive collection that includes everything from AAA blockbusters to small indie titles, great discovery tools are a highly valuable asset for Steam. 

This dataset is a list of user behaviors, with columns: user-id, game-title, behavior-name, value. The behaviors included are 'purchase' and 'play'. The value indicates the degree to which the behavior was performed - in the case of 'purchase' the value is always 1, and in the case of 'play' the value represents the number of hours the user has played the game.

- User ID, 
- Name of the steam game, 
- behavior name (purchase/play), 
- Hours if behavior is play, 1.0 if behavior is purchase

### Чтение данных

In [3]:
df_steam = pd.read_csv("data/steam-200k.csv", delimiter=None, header=None, error_bad_lines=False)
df_steam.columns = ['user-id','game-title','behavior','value','unknown']
df_steam['user-id'] = pd.to_numeric(df_steam['user-id'], errors='coerce')
#df_md_all['imdbId'] = df_md_all['imdb_id'].str.replace(r"[a-zA-Z_]{1,}",'',regex=True)
#df_md_all['imdbId'] = pd.to_numeric(df_md_all['imdbId'], errors='coerce')



  df_steam = pd.read_csv("data/steam-200k.csv", delimiter=None, header=None, error_bad_lines=False)


In [4]:
df_steam.head()

Unnamed: 0,user-id,game-title,behavior,value,unknown
0,151603712,The Elder Scrolls V Skyrim,purchase,1.0,0
1,151603712,The Elder Scrolls V Skyrim,play,273.0,0
2,151603712,Fallout 4,purchase,1.0,0
3,151603712,Fallout 4,play,87.0,0
4,151603712,Spore,purchase,1.0,0


In [5]:
df_steam.shape

(200000, 5)

In [6]:
df_steam.value_counts(['user-id'])

user-id  
62990992     1573
33865373      949
11403772      906
30246419      901
47457723      855
             ... 
159983691       1
239444411       1
159966899       1
211153613       1
281302904       1
Length: 12393, dtype: int64

### Векторизация описания фильмов

In [7]:
%%time
tfidfv = TfidfVectorizer()
overview_matrix = tfidfv.fit_transform(df_steam['game-title'])
overview_matrix

CPU times: user 398 ms, sys: 5.78 ms, total: 403 ms
Wall time: 403 ms


<200000x5034 sparse matrix of type '<class 'numpy.float64'>'
	with 578509 stored elements in Compressed Sparse Row format>

В результате векторизации только по описанию фильма получилось около 30000 признаков, поэтому в рекомендательных системах часто применяют методы понижения размерности.

## Фильтрация на основе содержания

### Пример реализации

Реализуем класс для формирования рекомендаций на основе метода ближайших соседей:

In [8]:
class SimpleKNNRecommender:
    
    def __init__(self, X_matrix, X_title):
        """
        Входные параметры:
        X_matrix - обучающая выборка (матрица объект-признак)
        X_ids - массив идентификаторов объектов
        X_title - массив названий объектов
        X_overview - массив описаний объектов
        """
        #Сохраняем параметры в переменных объекта
        self._X_matrix = X_matrix
        self.df = pd.DataFrame(
            {'title': pd.Series(X_title, dtype='str'),
            'dist': pd.Series([], dtype='float')})
            
            
    def recommend_for_single_object(self, K: int, \
                X_matrix_object, cos_flag = True, manh_flag = False):
        """
        Метод формирования рекомендаций для одного объекта.
        Входные параметры:
        K - количество рекомендуемых соседей 
        X_matrix_object - строка матрицы объект-признак, соответствующая объекту
        cos_flag - флаг вычисления косинусного расстояния
        manh_flag - флаг вычисления манхэттэнского расстояния
        Возвращаемое значение: K найденных соседей
        """
        
        scale = 1000000
        # Вычисляем косинусную близость
        if cos_flag:
            dist = cosine_similarity(self._X_matrix, X_matrix_object)
            self.df['dist'] = dist * scale
            res = self.df.sort_values(by='dist', ascending=False)
            # Не учитываем рекомендации с единичным расстоянием,
            # так как это искомый объект
            res = res[res['dist'] < scale]
        
        else:
            if manh_flag:
                dist = manhattan_distances(self._X_matrix, X_matrix_object)
            else:
                dist = euclidean_distances(self._X_matrix, X_matrix_object)
            self.df['dist'] = dist * scale
            res = self.df.sort_values(by='dist', ascending=True)
            # Не учитываем рекомендации с единичным расстоянием,
            # так как это искомый объект
            res = res[res['dist'] > 0.0]            
        
        # Оставляем К первых рекомендаций
        res = res.head(K)
        return res

In [9]:
title = df_steam['game-title'].values
title

array(['The Elder Scrolls V Skyrim', 'The Elder Scrolls V Skyrim',
       'Fallout 4', ..., 'Grand Theft Auto Vice City', 'RUSH', 'RUSH'],
      dtype=object)

In [10]:
skr1 = SimpleKNNRecommender(overview_matrix,title)

In [11]:
# Тестовый пример - Gratuitous Space Battles
gsbid = 245
title[gsbid]

'Gratuitous Space Battles'

In [12]:
gsbid_matrix = overview_matrix[gsbid]
gsbid_matrix

<1x5034 sparse matrix of type '<class 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [13]:
# 15 фильмов похожих на звездные войны
# в порядке убывания схожести на основе косинусного сходства
rec1 = skr1.recommend_for_single_object(15, gsbid_matrix)
rec1

Unnamed: 0,title,dist
124584,Gratuitous Tank Battles,694016.147397
151281,Gratuitous Tank Battles,694016.147397
122898,Gratuitous Tank Battles,694016.147397
21952,Gratuitous Tank Battles,694016.147397
121130,Gratuitous Tank Battles,694016.147397
191208,World of Battles,467103.432724
3496,World of Battles,467103.432724
37988,World of Battles,467103.432724
49492,World of Battles,467103.432724
11108,World of Battles,467103.432724


In [14]:
# 15 фильмов похожих на звездные войны
# в порядке убывания схожести на основе косинусного сходства
rec2 = skr1.recommend_for_single_object(15, gsbid_matrix,cos_flag = False)
rec2

Unnamed: 0,title,dist
121130,Gratuitous Tank Battles,782283.647538
122898,Gratuitous Tank Battles,782283.647538
21952,Gratuitous Tank Battles,782283.647538
124584,Gratuitous Tank Battles,782283.647538
151281,Gratuitous Tank Battles,782283.647538
117448,F.E.A.R. 3,1000000.0
151275,F.E.A.R. 3,1000000.0
89987,F.E.A.R. 3,1000000.0
89543,F.E.A.R.,1000000.0
29690,F.E.A.R.,1000000.0


In [15]:
# Манхэттэнское расстояние дает нерелевантные результаты поиска
rec3 = skr1.recommend_for_single_object(15, gsbid_matrix, 
                                        cos_flag = False, manh_flag = True)
rec3

Unnamed: 0,title,dist
122898,Gratuitous Tank Battles,1254699.0
151281,Gratuitous Tank Battles,1254699.0
21952,Gratuitous Tank Battles,1254699.0
121130,Gratuitous Tank Battles,1254699.0
124584,Gratuitous Tank Battles,1254699.0
8329,F.E.A.R. 3,1707084.0
39213,F.E.A.R. 3,1707084.0
39212,F.E.A.R. 3,1707084.0
184505,F.E.A.R.,1707084.0
18826,F.E.A.R. 3,1707084.0


### Выводы
- Фильтрация на основе содержания позволяет получать рекомендации на основе известных объектов предметной области.
- Для фильтрации на основе содержания очень важны и способы векторизации объектов и выбор метрики для вычисления расстояния между объектами.
- В отличие коллаборативной фильтрации, фильтрация на основе содержания не использует данные о предпочтениях других пользователях рекомендательной системы.

## Коллаборативная фильтрация

Приведенный на рисунке пример коллаборативной фильтрации **не требует машинного обучения**. Эта задача решается путем запросов к базе данных:
1. На основе корзины текущего пользователя получить список пользователей, которые покупали такие же товары.
2. Для найденных пользователей получить список других товаров и порекомендовать их текущему пользователю.

Но в настоящее время для решения задачи используются методы машинного обучения. Результаты могут быть получены проще, быстрее и т.д.

#### Подготовка данных

In [16]:
df_steam.head()

Unnamed: 0,user-id,game-title,behavior,value,unknown
0,151603712,The Elder Scrolls V Skyrim,purchase,1.0,0
1,151603712,The Elder Scrolls V Skyrim,play,273.0,0
2,151603712,Fallout 4,purchase,1.0,0
3,151603712,Fallout 4,play,87.0,0
4,151603712,Spore,purchase,1.0,0


In [17]:
# Количество уникальных пользователей
len(df_steam['user-id'].unique())

12393

In [18]:
# Количество уникальных фильмов
len(df_steam['game-title'].unique())

5155

In [19]:
# Сформируем матрицу взаимодействий на основе рейтингов
# Используется идея из статьи - https://towardsdatascience.com/beginners-guide-to-creating-an-svd-recommender-system-1fd7326d1f65
def create_utility_matrix(data):
    itemField = 'game-title'
    userField = 'user-id'
    valueField = 'value'  
    
    userList = data[userField].tolist()
    itemList = data[itemField].tolist()
    valueList = data[valueField].tolist()    
    
    users = list(set(userList))
    items = list(set(itemList))    
    
    users_index = {users[i]: i for i in range(len(users))}    
    pd_dict = {item: [0.0 for i in range(len(users))] for item in items}    
    
    for i in range(0,data.shape[0]):
        item = itemList[i]
        user = userList[i]
        value = valueList[i]    
        pd_dict[item][users_index[user]] = value    
    
    X = pd.DataFrame(pd_dict)
    X.index = users
        
    itemcols = list(X.columns)
    items_index = {itemcols[i]: i for i in range(len(itemcols))}
    
    return X, users_index, items_index

In [20]:
%%time
user_item_matrix, users_index, items_index = create_utility_matrix(df_steam)

CPU times: user 4.27 s, sys: 189 ms, total: 4.46 s
Wall time: 4.43 s


In [91]:
user_item_matrix

Unnamed: 0,Bejeweled Deluxe,F1 2014,Summoner,Adventure Time Explore the Dungeon Because I DONT KNOW!,The Blue Flamingo,Greed Black Border,Sonic Generations,Puzzle Chronicles,Mushroom 11,Franchise Hockey Manager 2014,...,Mount & Blade,Runaway The Dream of the Turtle,"On the Rain-Slick Precipice of Darkness, Episode One",Eador. Masters of the Broken World,MLB 2K11,Waves,The Movies Stunts and Effects,Vanguard Saga of Heroes F2P,Tom Clancy's Ghost Recon Phantoms - EU Substance with Style pack (Assault),Super Laser Racer
179044354,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
284164106,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
234749963,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27099151,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
240058387,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31817706,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
143917036,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
74874862,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
122486775,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [167]:
# Выделение тестовой строки
user_item_matrix__test = user_item_matrix.iloc[[2]]
user_item_matrix__test

Unnamed: 0,Bejeweled Deluxe,F1 2014,Summoner,Adventure Time Explore the Dungeon Because I DONT KNOW!,The Blue Flamingo,Greed Black Border,Sonic Generations,Puzzle Chronicles,Mushroom 11,Franchise Hockey Manager 2014,...,Mount & Blade,Runaway The Dream of the Turtle,"On the Rain-Slick Precipice of Darkness, Episode One",Eador. Masters of the Broken World,MLB 2K11,Waves,The Movies Stunts and Effects,Vanguard Saga of Heroes F2P,Tom Clancy's Ghost Recon Phantoms - EU Substance with Style pack (Assault),Super Laser Racer
234749963,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [168]:
# Оставшаяся часть матрицы для обучения
user_item_matrix__train = user_item_matrix[:2000]
user_item_matrix__train

Unnamed: 0,Bejeweled Deluxe,F1 2014,Summoner,Adventure Time Explore the Dungeon Because I DONT KNOW!,The Blue Flamingo,Greed Black Border,Sonic Generations,Puzzle Chronicles,Mushroom 11,Franchise Hockey Manager 2014,...,Mount & Blade,Runaway The Dream of the Turtle,"On the Rain-Slick Precipice of Darkness, Episode One",Eador. Masters of the Broken World,MLB 2K11,Waves,The Movies Stunts and Effects,Vanguard Saga of Heroes F2P,Tom Clancy's Ghost Recon Phantoms - EU Substance with Style pack (Assault),Super Laser Racer
179044354,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
284164106,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
234749963,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27099151,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
240058387,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182654153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
149689548,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
276567244,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
74126541,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Построение модели на основе SVD

Используется идея из [статьи.](https://www.kaggle.com/vincentman0403/recommendation-example-by-svd)

In [169]:
%%time
U, S, VT = np.linalg.svd(user_item_matrix__train.T)
V = VT.T

CPU times: user 45.6 s, sys: 3.86 s, total: 49.5 s
Wall time: 7.04 s


In [170]:
# Матрица соотношения между пользователями и латентными факторами
U.shape

(5155, 5155)

In [171]:
# Матрица соотношения между объектами и латентными факторами
V.shape

(2000, 2000)

In [172]:
S.shape

(2000,)

In [173]:
Sigma = np.diag(S)
Sigma.shape

(2000, 2000)

In [174]:
# Диагональная матрица сингулярных значений
Sigma

array([[1.78021766e+04, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 8.73331312e+03, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 8.31335053e+03, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        3.04526152e-14, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 6.29406094e-15, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 5.13988714e-16]])

In [175]:
# Используем 3 первых сингулярных значения
r=3
Ur = U[:, :r]
Sr = Sigma[:r, :r]
Vr = V[:, :r]

In [176]:
# Матрица соотношения между новым пользователем и латентными факторами
test_user = np.mat(user_item_matrix__test.values)
test_user.shape, test_user

((1, 5155), matrix([[0., 0., 0., ..., 0., 0., 0.]]))

In [177]:
tmp = test_user * Ur * np.linalg.inv(Sr)
tmp

matrix([[-3.52052377e-06,  1.01926410e-04, -3.45713892e-04]])

In [178]:
test_user_result = np.array([tmp[0,0], tmp[0,1], tmp[0,2]])
test_user_result

array([-3.52052377e-06,  1.01926410e-04, -3.45713892e-04])

In [179]:
# Вычисляем косинусную близость между текущим пользователем 
# и остальными пользователями
cos_sim = cosine_similarity(Vr, test_user_result.reshape(1, -1))
cos_sim[:10]

array([[-0.0269311 ],
       [ 0.96862239],
       [ 1.        ],
       [ 0.82459459],
       [ 0.97767118],
       [ 0.88622784],
       [ 0.97831841],
       [ 0.98312858],
       [ 0.97036242],
       [-0.0269311 ]])

In [180]:
# Преобразуем размерность массива
cos_sim_list = cos_sim.reshape(-1, cos_sim.shape[0])[0]
cos_sim_list[:10]

array([-0.0269311 ,  0.96862239,  1.        ,  0.82459459,  0.97767118,
        0.88622784,  0.97831841,  0.98312858,  0.97036242, -0.0269311 ])

In [181]:
# Находим наиболее близкого пользователя
recommended_user_id = np.argsort(-cos_sim_list)[0]
recommended_user_id

2

In [182]:
# Получение названия игры
gameId_list = list(user_item_matrix.columns)
gameId_list[:10]

['Bejeweled Deluxe',
 'F1 2014',
 'Summoner',
 'Adventure Time  Explore the Dungeon Because I DONT KNOW!',
 'The Blue Flamingo',
 'Greed Black Border',
 'Sonic Generations',
 'Puzzle Chronicles',
 'Mushroom 11',
 'Franchise Hockey Manager 2014']

In [183]:
# Игры, которые оценивал текущий пользователь:
i=1
for idx, item in enumerate(np.ndarray.flatten(np.array(test_user))):
    if item > 0:
        film_title = gameId_list[idx]
        print('{} - {} - {}'.format(idx, film_title, item))
        if i==20:
            break
        else:
            i+=1

1358 - Kerbal Space Program - 263.0
1435 - Arma 3 Karts - 1.0
1999 - Stranded Deep - 161.0
2485 - Arma 3 Marksmen - 1.0
3167 - Arma 3 Zeus - 1.0
4075 - Arma 3 Helicopters - 1.0
4241 - Arma 3 - 769.0


In [184]:
# Игры, которые оценивал наиболее схожий пользователь:
i=1
recommended_user_item_matrix = user_item_matrix.iloc[[recommended_user_id+1]]
for idx, item in enumerate(np.ndarray.flatten(np.array(recommended_user_item_matrix))):
    if item > 0:
        film_title = gameId_list[idx]
        print('{} - {} - {}'.format(idx, film_title, item))
        if i==20:
            break
        else:
            i+=1

1714 - Half-Life 2 Lost Coast - 1.0
1955 - Half-Life 2 Episode One - 1.0
2219 - Half-Life 2 - 1.0
2783 - Half-Life Deathmatch Source - 1.0
2889 - Half-Life Source - 1.0
3829 - Half-Life 2 Deathmatch - 0.4
