In [396]:
%reset

# Построение коллаборативной рекомендательной системы для кинотеатра

In [397]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from dataclasses import dataclass
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

## Data processing

In [398]:
# Load the movie catalogue and show which columns it provides
movies_data = pd.read_csv("../datasets/films_rating/movies.csv")
print(movies_data.columns.tolist())

# Load the user ratings and show which columns they provide
ratings_data = pd.read_csv("../datasets/films_rating/ratings.csv")
print(ratings_data.columns.tolist())

['movieId', 'title', 'genres']
['userId', 'movieId', 'rating', 'timestamp']


In [399]:
# Drop the columns the recommender does not use: genres (movies) and timestamp (ratings).
# Rebinding the names instead of mutating in place, together with errors="ignore",
# keeps the cell safe to re-run: a second run no longer raises KeyError
# because the column is already gone.
movies_data = movies_data.drop(columns="genres", errors="ignore")
ratings_data = ratings_data.drop(columns="timestamp", errors="ignore")

In [400]:
# Build the preference matrix as a pivot table of ratings_data:
# one row per movieId, one column per userId, each cell holds the rating.
# Movie/user pairs without a rating are filled with 0.
user_item_matrix = (
    ratings_data
    .pivot(index='movieId', columns='userId', values='rating')
    .fillna(0)
)
user_item_matrix.head()


userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [401]:
# Size of the full preference matrix: (number of movies, number of users)
print(user_item_matrix.shape)

(9724, 610)


In [402]:
# Some users rate very few movies and some movies receive very few ratings,
# so count the ratings given by every user and received by every movie.
user_votes = ratings_data.groupby('userId')['rating'].count()
movies_votes = ratings_data.groupby('movieId')['rating'].count()


In [403]:
# Activity thresholds, named so they are easy to find and tune:
# a user is kept with more than MIN_USER_VOTES ratings,
# a movie is kept with more than MIN_MOVIE_VOTES ratings.
MIN_USER_VOTES = 50
MIN_MOVIE_VOTES = 10

# Labels (userId / movieId values) of the users and movies that pass the thresholds
user_mask = user_votes[user_votes > MIN_USER_VOTES].index
movies_mask = movies_votes[movies_votes > MIN_MOVIE_VOTES].index

In [404]:
# Keep only the frequently rated movies (rows) and the active users (columns),
# selecting both axes in a single label-based lookup.
user_item_matrix = user_item_matrix.loc[movies_mask, user_mask]

In [405]:
# Inspect the matrix after restricting it to the selected users and movies
user_item_matrix

userId,1,4,6,7,10,11,15,16,17,18,...,600,601,602,603,604,605,606,607,608,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,4.5,0.0,0.0,2.5,0.0,4.5,3.5,...,2.5,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,5.0
2,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,4.0,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0
3,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
5,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.5,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
6,4.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0,0.0,4.0,...,0.0,0.0,3.0,4.0,3.0,0.0,0.0,0.0,0.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174055,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
176371,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
177765,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
179819,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [406]:
# The matrix is sparse and high-dimensional, so keep its values in scipy.sparse CSR format
csr_data = csr_matrix(user_item_matrix.values)


In [407]:
# Move movieId from the index into a regular column, so that the default
# 0..n-1 row index can be used to look a movie up by its row position.
# The guard keeps the cell safe to re-run: calling reset_index() a second time
# would insert a stray "index" column into the matrix.
if 'movieId' not in user_item_matrix.columns:
    user_item_matrix = user_item_matrix.rename_axis(None, axis=1).reset_index()
user_item_matrix

Unnamed: 0,movieId,1,4,6,7,10,11,15,16,17,...,600,601,602,603,604,605,606,607,608,610
0,1,4.0,0.0,0.0,4.5,0.0,0.0,2.5,0.0,4.5,...,2.5,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,5.0
1,2,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0
2,3,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
3,5,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.5,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
4,6,4.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,3.0,4.0,3.0,0.0,0.0,0.0,0.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2116,174055,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2117,176371,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2118,177765,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2119,179819,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Составление рекомендаций

In [408]:
# Bundle everything MovieRecommender needs into one object that is easy to pass around
@dataclass
class Data:
    """Container for the tables used by ``MovieRecommender``."""

    # Movie catalogue; the recommender reads its movieId and title columns
    movie_data: pd.DataFrame
    # Raw ratings table (stored for convenience; the recommender does not read it)
    ratings_data: pd.DataFrame
    # Preference matrix with a movieId column, one row per movie
    user_item_matrix: pd.DataFrame
    # Sparse matrix the model is fitted on and queried with (a scipy CSR matrix, not a DataFrame)
    csr_data: csr_matrix

In [414]:
class MovieRecommender(NearestNeighbors):
    """Item-based movie recommender on top of ``sklearn.neighbors.NearestNeighbors``.

    The constructor is inherited as-is, so the estimator is configured with the
    usual ``NearestNeighbors`` keywords (``metric``, ``algorithm``,
    ``n_neighbors``, ...). An ``__init__(self, **kwargs)`` override is
    deliberately not defined: scikit-learn reads an estimator's parameters from
    the explicit arguments of ``__init__``, so a ``**kwargs``-only signature
    hides them from ``get_params()``, ``clone()`` and the estimator repr.
    """

    def get_recommendation(self, data, film_name='Matrix', n=10) -> pd.DataFrame:
        """Recommend the movies whose rating vectors are closest to a given movie.

        The first title in ``data.movie_data`` that contains ``film_name``
        (case-insensitive, literal substring) is used as the query movie.
        The model must already be fitted on ``data.csr_data``.

        Args:
            data: object exposing ``movie_data`` (``movieId`` and ``title`` columns),
                ``user_item_matrix`` (a ``movieId`` column whose row order matches
                ``csr_data``) and ``csr_data`` (the matrix the neighbours are
                searched in).
            film_name: text to look for in the movie titles.
            n: maximum number of recommendations to return.

        Returns:
            DataFrame with ``Title`` and ``Distance`` columns, indexed from 1 and
            ordered by decreasing distance. An empty DataFrame is returned when
            the movie is not found or when ``n`` is not positive.
        """
        # Search data.movie_data rather than a notebook-level global, so the method
        # depends only on its arguments.
        # regex=False: the query is plain text, so a title such as "Matrix (1999)" is
        # not misread as a regular expression; na=False: rows without a title never match.
        title_hits = data.movie_data['title'].str.lower().str.contains(
            film_name.lower(), regex=False, na=False
        )
        matched_ids = data.movie_data.loc[title_hits, 'movieId']

        # Row position of the first matching movie inside the preference matrix.
        # Positions (not index labels) are used because csr_data is addressed by position.
        row_positions = []
        if not matched_ids.empty:
            in_matrix = data.user_item_matrix['movieId'] == matched_ids.iloc[0]
            row_positions = in_matrix.to_numpy().nonzero()[0]

        if len(row_positions) == 0:
            print("Фильма нет в базе данных. К сожалению, мы не можем вам ничего порекомендовать")
            return pd.DataFrame()

        if n <= 0:
            return pd.DataFrame()

        query_row = int(row_positions[0])

        # + 1 because the query movie is normally returned as its own nearest neighbour.
        # Never request more neighbours than there are fitted rows: kneighbors rejects that.
        n_neighbors = min(n + 1, self.n_samples_fit_)
        distances, indices = self.kneighbors(data.csr_data[query_row], n_neighbors=n_neighbors)

        # ravel() (unlike squeeze()) still gives a list when a single neighbour is returned.
        # The query movie is removed by its row position, not by assuming it is the closest hit.
        neighbours = [
            (index, distance)
            for index, distance in zip(indices.ravel().tolist(), distances.ravel().tolist())
            if index != query_row
        ]

        # Sort by decreasing distance; if the query row was not among the hits there is
        # one row too many, so drop the farthest ones from the front.
        # NOTE(review): rows are ordered farthest-first, so rank 1 is the least similar
        # of the returned neighbours - confirm this is the intended presentation.
        neighbours.sort(key=lambda pair: pair[1], reverse=True)
        neighbours = neighbours[max(len(neighbours) - n, 0):]

        recom_list = self.find_indices(data, neighbours)

        # Size the 1-based rank index from the rows actually found, which may be fewer than n
        return pd.DataFrame(recom_list, index=range(1, len(recom_list) + 1))

    @staticmethod
    def find_indices(data, nearest_films_data):
        """Translate (row position, distance) pairs into titles and distances.

        Args:
            data: object exposing ``user_item_matrix`` (with a ``movieId`` column)
                and ``movie_data`` (with ``movieId`` and ``title`` columns).
            nearest_films_data: iterable of ``(row_position, distance)`` pairs, where
                ``row_position`` is a row number of ``data.user_item_matrix``.

        Returns:
            List of ``{'Title': ..., 'Distance': ...}`` dicts in input order. A pair
            that cannot be resolved to a title yields ``{'Title': '', 'Distance': 0}``.
        """
        # movieId -> title lookup built once; the first title wins if a movieId repeats
        titles = data.movie_data.drop_duplicates('movieId').set_index('movieId')['title']
        matrix_movie_ids = data.user_item_matrix['movieId']

        recom_list = []
        for row_position, dist in nearest_films_data:
            try:
                # Row position in the preference matrix -> movieId -> title
                title = titles.loc[matrix_movie_ids.iloc[row_position]]
            except (IndexError, KeyError):
                # Unknown row position or movieId missing from the catalogue
                title = ''
                dist = 0

            recom_list.append({'Title': title, 'Distance': dist})
        return recom_list





In [416]:
# How many recommendations we want to receive:
recommendations_n = 10

# The movie the recommendations are based on (matched against the titles):
search_word = 'Lord of the rings'

data = Data(movie_data=movies_data,
            ratings_data=ratings_data,
            user_item_matrix=user_item_matrix,
            csr_data=csr_data)

# metric='cosine' - cosine distance between rating vectors
# algorithm='brute' - exhaustive (brute-force) neighbour search
# n_jobs is not passed here, so the NearestNeighbors default applies (n_jobs=-1 would use all CPU cores)
# MovieRecommender - a class that inherits the methods and attributes of sklearn.neighbors.NearestNeighbors
Movie_Recommender = MovieRecommender(metric='cosine', algorithm='brute', n_neighbors=20)
Movie_Recommender.fit(data.csr_data)
recommendation = Movie_Recommender.get_recommendation(data, film_name=search_word, n=recommendations_n)
print(f"Рекомендации по фильму {search_word}:\n", recommendation)


Рекомендации по фильму Lord of the rings:
                                     Title  Distance
1                    Bottle Rocket (1996)  0.578306
2   Invasion of the Body Snatchers (1956)  0.575582
3                      Logan's Run (1976)  0.575343
4      Big Trouble in Little China (1986)  0.573421
5                      Superman II (1980)  0.572544
6               Planet of the Apes (1968)  0.562688
7              Conan the Barbarian (1982)  0.552975
8                        Ladyhawke (1985)  0.528053
9            Young Sherlock Holmes (1985)  0.518432
10                       Excalibur (1981)  0.434872
