# Фаза 2 • Неделя 11 • Понедельник
## Рекомендательные системы
### Классические подходы 

### Задание

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import plotly.express as px
import scipy

### Загрузка данных

Запусти ячейку ниже, чтобы загрузить данные. 

In [2]:
CURR_PATH = os.path.dirname("__name__")

In [3]:
# 1. Users dataset
u_cols = ["user_id", "age", "sex", "occupation", "zip_code"]
users = pd.read_csv(
    os.path.join(CURR_PATH, "data", "ml-100k", "u.user"),
    sep="|",
    names=u_cols,
    encoding="latin-1",
    parse_dates=True,
    header=None,
)
# 2. Rating dataset
r_cols = ["user_id", "movie_id", "rating", "unix_timestamp"]
ratings = pd.read_csv(
    os.path.join(CURR_PATH, "data", "ml-100k", "u.data"),
    sep="\t",
    names=r_cols,
    encoding="latin-1",
)

# 3.Movies Dataset
m_cols = [
    "movie_id",
    "title",
    "release_date",
    "video_release_date",
    "imdb_url",
    "unknown",
    "Action",
    "Adventure",
    "Animation",
    "Children's",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
]
movies = pd.read_csv(
    os.path.join(CURR_PATH, "data", "ml-100k", "u.item"),
    sep="|",
    names=m_cols,
    encoding="latin-1",
).drop(["video_release_date", "unknown"], axis=1)

ratings = ratings.merge(movies[["movie_id", "title"]], how="left", on="movie_id")

In [4]:
users.shape, ratings.shape, movies.shape

((943, 5), (100000, 5), (1682, 22))

In [5]:
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [6]:
users["user_id"].nunique()

943

In [7]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,186,302,3,891717742,L.A. Confidential (1997)
2,22,377,1,878887116,Heavyweights (1994)
3,244,51,2,880606923,Legends of the Fall (1994)
4,166,346,1,886397596,Jackie Brown (1997)


## 📊 Exploratory data analysis / Разведывательный анализ данных

Чтобы что-то рекомендовать, всегда стоит узнать о пользователях побольше. В этом разделе тебе необходимо визуализировать распределения разных признаков твоей рекомендательной системы. 

In [8]:
def configure_plotly_theme(fig):
    """Функция для применения единого стиля ко всем графикам"""
    fig.update_layout(
        plot_bgcolor="white",
        paper_bgcolor="white",
        font=dict(size=14),
        xaxis=dict(
            title_font=dict(
                size=20,
                family="Arial",
            ),
            tickfont=dict(size=16),
            gridcolor="lightgray",
            gridwidth=1,
            griddash="dash",
            showline=True,
            linecolor="black",
            linewidth=1,
        ),
        yaxis=dict(
            title_font=dict(
                size=20,
                family="Arial",
            ),
            tickfont=dict(size=16),
            gridcolor="lightgray",
            gridwidth=1,
            griddash="dash",
            showline=True,
            linecolor="black",
            linewidth=1,
        ),
        title_font=dict(size=24, family="Arial", weight="bold"),
    )
    fig.update_xaxes(
        # title_text="Название оси X",
        title_font=dict(size=20, family="Arial")
    )
    fig.update_yaxes(
        # title_text="Название оси Y",
        title_font=dict(size=20, family="Arial")
    )

    return fig

### Пользователь 👨

Визуализируй следующие распределения: 
- пола (`barplot`)

In [9]:
sex_distrib = users.groupby("sex").count()["user_id"]
fig_1 = px.bar(x=sex_distrib.index, y=sex_distrib, title="Распределение пола")
fig_1.update_layout(
    width=800,  # ширина в пикселях
    height=600,  # высота в пикселях
    # autosize=False,  # отключаем авторазмер
    xaxis_title="Пол",
    yaxis_title="Количество пользователей",
    # title_font=dict(size=24, family="Arial", weight="bold"),
)
fig_1 = configure_plotly_theme(fig_1)
fig_1.update_traces(marker=dict(line=dict(width=2, color="black")))
fig_1.update_traces(
    texttemplate="%{y}",  # Формат текста (значение y)
    textposition="outside",  # Положение: 'outside', 'inside', 'auto', 'none'
    textfont=dict(
        size=18, family="Arial", color="black"
    ),  # Размер шрифта  # Цвет текста
)
fig_1.update_layout(
    xaxis=dict(
        showticklabels=True,
        # tickmode="linear",
        # ticks="outside",  # Ticks снаружи: 'outside', 'inside', ''
        # ticklen=6,  # Длина ticks
        # tickwidth=2,  # Толщина ticks
        # tickcolor="black",  # Цвет ticks
        # dtick=100,
        # tickmode='array',  # Режим ручной установки
        # tickvals=[0, 1, 2, 3],  # Позиции ticks
        # ticktext=['Категория A', 'Категория B', 'Категория C', 'Категория D']  # Подписи
    ),
    yaxis=dict(
        showticklabels=True,
        tickmode="linear",
        dtick=100,
        ticks="inside",  # Ticks снаружи: 'outside', 'inside', ''
        ticklen=6,  # Длина ticks
        tickwidth=2,  # Толщина ticks
        # tickmode="linear",
        # dtick=5
    ),
)
fig_1.show()

- возраста (`hist`)

In [10]:
fig_2 = px.histogram(users["age"].astype(int), nbins=70, title="Распределение возраста")
fig_2.update_layout(
    width=800,  # ширина в пикселях
    height=600,  # высота в пикселях
    # autosize=False,  # отключаем авторазмер
    xaxis_title="Возраст",
    yaxis_title="Количество пользователей",
    # title_font=dict(size=24, family="Arial", weight="bold"),
    showlegend=False,
)
fig_2.update_layout(
    xaxis=dict(
        showticklabels=True,
        tickmode="linear",
        ticks="outside",  # Ticks снаружи: 'outside', 'inside', ''
        ticklen=6,  # Длина ticks
        tickwidth=2,  # Толщина ticks
        tickcolor="black",  # Цвет ticks
        dtick=10,
        # tickmode='array',  # Режим ручной установки
        # tickvals=[0, 1, 2, 3],  # Позиции ticks
        # ticktext=['Категория A', 'Категория B', 'Категория C', 'Категория D']  # Подписи
    ),
    yaxis=dict(
        showticklabels=True,
        tickmode="linear",
        dtick=5,
        ticks="outside",  # Ticks снаружи: 'outside', 'inside', ''
        ticklen=6,  # Длина ticks
        tickwidth=2,  # Толщина ticks
        # tickmode="linear",
        # dtick=5
    ),
)
fig_2 = configure_plotly_theme(fig_2)
fig_2.update_traces(marker=dict(line=dict(width=1, color="black")))
fig_2.show()

- профессии (`barplot`)

In [11]:
occupations_distrib = users.groupby("occupation").count()["user_id"]
occupations_distrib = occupations_distrib.sort_values(ascending=False)
fig_3 = px.bar(
    x=occupations_distrib.index, y=occupations_distrib, title="Распределение профессий"
)
fig_3.update_layout(
    width=800,  # ширина в пикселях
    height=600,  # высота в пикселях
    # autosize=False,  # отключаем авторазмер
    xaxis_title="Профессия",
    yaxis_title="Количество пользователей",
    # title_font=dict(size=24, family="Arial", weight="bold"),
)
fig_3.update_traces(
    marker=dict(line=dict(width=1.5, color="black"))  # Толщина рамки  # Цвет рамки
)
fig_3.update_layout(
    xaxis=dict(tickangle=-45)  # Поворот на 45 градусов против часовой стрелки
)
fig_3.update_traces(
    texttemplate="%{y}",  # Формат текста (значение y)
    textposition="outside",  # Положение: 'outside', 'inside', 'auto', 'none'
    textfont=dict(
        size=14, family="Arial", color="black"
    ),  # Размер шрифта  # Цвет текста
)
fig_3.update_layout(
    xaxis=dict(
        showticklabels=True,
        # tickmode="linear",
        # ticks="outside",  # Ticks снаружи: 'outside', 'inside', ''
        # ticklen=6,  # Длина ticks
        # tickwidth=2,  # Толщина ticks
        # tickcolor="black",  # Цвет ticks
        # dtick=10,
        # tickmode='array',  # Режим ручной установки
        # tickvals=[0, 1, 2, 3],  # Позиции ticks
        # ticktext=['Категория A', 'Категория B', 'Категория C', 'Категория D']  # Подписи
    ),
    yaxis=dict(
        showticklabels=True,
        tickmode="linear",
        dtick=50,
        ticks="outside",  # Ticks снаружи: 'outside', 'inside', ''
        ticklen=6,  # Длина ticks
        tickwidth=2,  # Толщина ticks
        # tickmode="linear",
        # dtick=5
    ),
)
fig_3 = configure_plotly_theme(fig_3)
fig_3.show()

### Фильмы 🎥

Визуализируй следующие распределения: 

- количество фильмов каждого жанра (у нас есть много жанров фильмов: сколько фильмов у каждого жанра?)

In [12]:
movies.head()

Unnamed: 0,movie_id,title,release_date,imdb_url,Action,Adventure,Animation,Children's,Comedy,Crime,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Get%20Shorty%...,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [13]:
genres = movies.iloc[:, 4:]
print(genres.shape)
genres_film_count = genres.copy().apply(sum)
genres_film_count = genres_film_count.sort_values(ascending=False)

(1682, 18)


In [14]:
fig_4 = px.bar(
    x=genres_film_count.index,
    y=genres_film_count,
    title="Распределение фильмов по жанрам",
)
fig_4.update_layout(
    width=1000,  # ширина в пикселях
    height=600,  # высота в пикселях
    # autosize=False,  # отключаем авторазмер
    xaxis_title="Жанры",
    yaxis_title="Количество фильмов",
    title_font=dict(size=24, family="Arial", weight="bold"),
)
fig_4.update_traces(
    marker=dict(line=dict(width=1.5, color="black"))  # Толщина рамки  # Цвет рамки
)
fig_4.update_layout(
    xaxis=dict(tickangle=-45)  # Поворот на 45 градусов против часовой стрелки
)
fig_4.update_traces(
    texttemplate="%{y}",  # Формат текста (значение y)
    textposition="outside",  # Положение: 'outside', 'inside', 'auto', 'none'
    textfont=dict(
        size=14, family="Arial", color="black"
    ),  # Размер шрифта  # Цвет текста
)
fig_4.update_layout(
    xaxis=dict(
        showticklabels=True,
        tickmode="linear",
        ticks="outside",  # Ticks снаружи: 'outside', 'inside', ''
        ticklen=6,  # Длина ticks
        tickwidth=2,  # Толщина ticks
        tickcolor="black",  # Цвет ticks
        dtick=1,
        # tickmode='array',  # Режим ручной установки
        # tickvals=[0, 1, 2, 3],  # Позиции ticks
        # ticktext=['Категория A', 'Категория B', 'Категория C', 'Категория D']  # Подписи
    ),
    yaxis=dict(
        showticklabels=True,
        tickmode="linear",
        dtick=100,
        ticks="outside",  # Ticks снаружи: 'outside', 'inside', ''
        ticklen=6,  # Длина ticks
        tickwidth=2,  # Толщина ticks
        # tickmode="linear",
        # dtick=5
    ),
)
fig_4 = configure_plotly_theme(fig_4)
fig_4.show()

- распределение количества жанров у разных фильмов (какие-то фильмы принадлежат одному жанру, какие-то сразу нескольким)

In [15]:
genres_distrib = genres.copy()
genres_distrib["id"] = movies["movie_id"]
genres_distrib.set_index("id", inplace=True)
genres_distrib = genres_distrib.T
# genres_distrib.reset_index(inplace=True)
genres_distrib.head()
genres_sum = genres_distrib.apply(sum)
genres_sum = genres_sum.reset_index()
genres_sum.head()

Unnamed: 0,id,0
0,1,3
1,2,3
2,3,1
3,4,3
4,5,3


In [16]:
for_barplot = genres_sum.groupby(0).count()
for_barplot

Unnamed: 0_level_0,id
0,Unnamed: 1_level_1
0,2
1,831
2,569
3,215
4,51
5,11
6,3


In [17]:
# genres_sum.head()
fig_5 = px.bar(
    x=for_barplot.index,
    y=for_barplot["id"],
    title="Распределение количества жанров",
)
fig_5.update_layout(
    width=1000,  # ширина в пикселях
    height=600,  # высота в пикселях
    # autosize=False,  # отключаем авторазмер
    xaxis_title="Количество жанров",
    yaxis_title="Количество фильмов",
    # title_font=dict(size=24, family="Arial", weight="bold"),
)
fig_5.update_traces(
    marker=dict(line=dict(width=1.5, color="black"))  # Толщина рамки  # Цвет рамки
)
# fig_5.update_layout(
#     xaxis=dict(tickangle=-45)  # Поворот на 45 градусов против часовой стрелки
# )
fig_5.update_traces(
    texttemplate="%{y}",  # Формат текста (значение y)
    textposition="outside",  # Положение: 'outside', 'inside', 'auto', 'none'
    textfont=dict(
        size=14, family="Arial", color="black"
    ),  # Размер шрифта  # Цвет текста
)
fig_5.update_layout(
    xaxis=dict(
        showticklabels=True,
        tickmode="linear",
        ticks="outside",  # Ticks снаружи: 'outside', 'inside', ''
        ticklen=6,  # Длина ticks
        tickwidth=2,  # Толщина ticks
        tickcolor="black",  # Цвет ticks
        dtick=1,
        # tickmode='array',  # Режим ручной установки
        # tickvals=[0, 1, 2, 3],  # Позиции ticks
        # ticktext=['Категория A', 'Категория B', 'Категория C', 'Категория D']  # Подписи
    ),
    yaxis=dict(
        showticklabels=True,
        tickmode="linear",
        dtick=100,
        ticks="outside",  # Ticks снаружи: 'outside', 'inside', ''
        ticklen=6,  # Длина ticks
        tickwidth=2,  # Толщина ticks
        # tickmode="linear",
        # dtick=5
    ),
)
fig_5 = configure_plotly_theme(fig_5)
fig_5.show()

- корреляцонная матрица жанров (какие-то жанры близки между собой, а какие-то противоречивы, корреляционная матрица поможет понять, какие жанры связаны между собой больше всего)

In [18]:
genres.head()

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


In [19]:
corr_matrix = genres.corr()

fig_6 = px.imshow(
    corr_matrix,
    text_auto=".2f",  # Формат чисел с 2 знаками после запятой
    color_continuous_scale=["blue", "white", "red"],
    zmin=-1,  # Минимальное значение шкалы
    zmax=1,  # Максимальное значение шкалы
    title="Корреляционная матрица",
)

# Настройка внешнего вида
fig_6.update_layout(
    width=1000,
    height=1000,
    # xaxis_title="Переменные", yaxis_title="Переменные"
)
fig_6.update_layout(
    plot_bgcolor="white",
    paper_bgcolor="white",
    font=dict(size=16),
    xaxis=dict(
        title_font=dict(size=20),
        tickfont=dict(size=16),
        # gridcolor="lightgray",
        # gridwidth=1,
        # griddash="dash",
        # showline=True,
        # linecolor="black",
        # linewidth=1,
    ),
    yaxis=dict(
        title_font=dict(size=20),
        tickfont=dict(size=16),
        # gridcolor="lightgray",
        # gridwidth=1,
        # griddash="dash",
        # showline=True,
        # linecolor="black",
        # linewidth=1,
    ),
    title_font=dict(size=24, family="Arial", weight="bold"),
)
fig_6.update_xaxes(
    # title_text="Название оси X",
    title_font=dict(size=20, family="Arial")
)
fig_6.update_yaxes(
    # title_text="Название оси Y",
    title_font=dict(size=20, family="Arial")
)
fig_6.show()

# ❓

Найди два жанра, которые больше всего близки друг к другу и запиши ответ в ячейку ниже

> Children и Animation

<img src="https://icons.iconarchive.com/icons/icons8/windows-8/256/Programming-Github-icon.png" width=32 /> Пора сохранить изменения для __github__. После пуша распечатай результат команды `!git status` в ячейке ниже.

In [20]:
# code

### Анализ рейтинга 🔢
Рейтинг - самое информативное, что у тебя есть. 

1. Построй распределение количества оценок, поставленных пользователем. Это должна быть функция, которая принимает на вход `user_id`, а на выходе возвращает словарь вида: 
    
`{1: <число оценок 1, 2: <число оценок 2>, ..., 5 <число оценок 5>}`

На основе этой результата данной функции должен строиться словарь. 

In [21]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,186,302,3,891717742,L.A. Confidential (1997)
2,22,377,1,878887116,Heavyweights (1994)
3,244,51,2,880606923,Legends of the Fall (1994)
4,166,346,1,886397596,Jackie Brown (1997)


In [22]:
ratings.sort_values("user_id")

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,title
41842,1,46,4,876893230,Exotica (1994)
38751,1,257,4,874965954,Men in Black (1997)
8976,1,12,5,878542960,"Usual Suspects, The (1995)"
3248,1,74,1,889751736,Faster Pussycat! Kill! Kill! (1965)
3260,1,134,4,875073067,Citizen Kane (1941)
...,...,...,...,...,...
95594,943,217,3,888640067,Bram Stoker's Dracula (1992)
77956,943,94,4,888639929,Home Alone (1990)
76855,943,943,5,888639614,Killing Zoe (1994)
94966,943,566,4,888639886,Clear and Present Danger (1994)


In [23]:
ratings["rating"].nunique()

5

In [24]:
temp = ratings.groupby(["user_id", "rating"]).count()["movie_id"]
# user_ratings = {}
# user_ratings[1] = temp[1][1]
# user_ratings
temp[8][3]
# temp

10

In [25]:
def get_user_ratings(user_id: int) -> dict:
    # temp = ratings.groupby(["user_id", "rating"]).count()["movie_id"]
    user_ratings = {}
    user_series = temp.loc[user_id]
    for i in range(1, 6):
        user_ratings[i] = user_series.get(i, 0)
    return user_ratings

In [26]:
get_user_ratings(456)

{1: 5, 2: 19, 3: 81, 4: 96, 5: 16}

2. Построй распределение средней оценки по пользователям. Для этого нужно узнать, какая средняя оценка у каждого пользователя, а после этого построить распределение. 

In [27]:
def get_mean_rating(user_id: int) -> np.float64:
    user_ratings = get_user_ratings(user_id)
    sum_for_mean = 0
    for i in user_ratings.keys():
        sum_for_mean += user_ratings[i] * i
    return sum_for_mean / sum(user_ratings.values())

In [28]:
get_mean_rating(1)

3.610294117647059

In [29]:
aver_rating_distrib = users["user_id"].copy().reset_index()
# aver_rating_distrib["nunique_ratings"] = aver_rating_distrib["user_id"].map(
#     lambda x: len(get_user_ratings(x).keys())
# )
aver_rating_distrib["mean_rating"] = aver_rating_distrib["user_id"].map(get_mean_rating)
# get_mean_rating(aver_rating_distrib.loc[0, "user_id"])

In [30]:
aver_rating_distrib

Unnamed: 0,index,user_id,mean_rating
0,0,1,3.610294
1,1,2,3.709677
2,2,3,2.796296
3,3,4,4.333333
4,4,5,2.874286
...,...,...,...
938,938,939,4.265306
939,939,940,3.457944
940,940,941,4.045455
941,941,942,4.265823


In [31]:
kde = scipy.stats.gaussian_kde(aver_rating_distrib["mean_rating"])
x = np.linspace(
    min(aver_rating_distrib["mean_rating"]),
    max(aver_rating_distrib["mean_rating"]),
    200,
)
y_vals = kde(x)
df_kde = pd.DataFrame({"x": x, "density": y_vals})
fig_7 = px.line(df_kde, x="x", y="density", title="Распределение средних оценок")
fig_7.update_layout(
    width=800,  # ширина в пикселях
    height=600,  # высота в пикселях
    # autosize=False,  # отключаем авторазмер
    xaxis_title="Оценка",
    yaxis_title="Нормированное кол-во оценок",
    # title_font=dict(size=24, family="Arial", weight="bold"),
    showlegend=False,
)
fig_7.update_layout(
    xaxis=dict(
        showticklabels=True,
        tickmode="linear",
        ticks="outside",  # Ticks снаружи: 'outside', 'inside', ''
        ticklen=6,  # Длина ticks
        tickwidth=2,  # Толщина ticks
        tickcolor="black",  # Цвет ticks
        dtick=1,
        # tickmode='array',  # Режим ручной установки
        # tickvals=[0, 1, 2, 3],  # Позиции ticks
        # ticktext=['Категория A', 'Категория B', 'Категория C', 'Категория D']  # Подписи
    ),
    yaxis=dict(
        showticklabels=True,
        tickmode="linear",
        dtick=0.1,
        ticks="inside",  # Ticks снаружи: 'outside', 'inside', ''
        ticklen=6,  # Длина ticks
        tickwidth=2,  # Толщина ticks
        # tickmode="linear",
        # dtick=5
    ),
)
fig_7 = configure_plotly_theme(fig_7)
fig_7.update_traces(marker=dict(line=dict(width=1, color="black")))
fig_7.show()

3. Построй распределение количества оценок, поставленных фильму. Каждый фильм оценен разное количество раз. Узнай, сколько оценок у каждого фильма, а после визуализируй это на гистограмме.  

In [33]:
print(ratings["title"].nunique())
print(ratings["title"].isna().sum())
print(ratings["movie_id"].nunique())
ratings.head(2)

1664
0
1682


Unnamed: 0,user_id,movie_id,rating,unix_timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,186,302,3,891717742,L.A. Confidential (1997)


In [34]:
movies_ratings_count = ratings.groupby("movie_id").count()["user_id"]
movies_ratings_count = movies_ratings_count.sort_index()
movies_ratings_count.head(5)

movie_id
1    452
2    131
3     90
4    209
5     86
Name: user_id, dtype: int64

In [35]:
movies_ratings_count[movies_ratings_count > 500]

movie_id
50     583
100    508
181    507
258    509
Name: user_id, dtype: int64

In [36]:
fig_8 = px.histogram(
    movies_ratings_count, nbins=200, title="Распределение количества оценок у фильма"
)
fig_8.update_layout(
    width=800,  # ширина в пикселях
    height=600,  # высота в пикселях
    # autosize=False,  # отключаем авторазмер
    xaxis_title="Количество оценок",
    yaxis_title="Количество фильмов",
    # title_font=dict(size=24, family="Arial", weight="bold"),
    showlegend=False,
)
fig_8.update_layout(
    xaxis=dict(
        showticklabels=True,
        tickmode="linear",
        ticks="outside",  # Ticks снаружи: 'outside', 'inside', ''
        ticklen=6,  # Длина ticks
        tickwidth=2,  # Толщина ticks
        tickcolor="black",  # Цвет ticks
        dtick=100,
        # tickmode='array',  # Режим ручной установки
        # tickvals=[0, 1, 2, 3],  # Позиции ticks
        # ticktext=['Категория A', 'Категория B', 'Категория C', 'Категория D']  # Подписи
    ),
    yaxis=dict(
        showticklabels=True,
        tickmode="linear",
        dtick=50,
        ticks="outside",  # Ticks снаружи: 'outside', 'inside', ''
        ticklen=6,  # Длина ticks
        tickwidth=2,  # Толщина ticks
        # tickmode="linear",
        # dtick=5
    ),
)
fig_8 = configure_plotly_theme(fig_8)
fig_8.update_traces(marker=dict(line=dict(width=1, color="black")))
fig_8.show()

4. Построй распределение средней оценки по фильмам. 

In [37]:
movies_rating = ratings.groupby(["movie_id", "rating"]).count()["user_id"]
movies_rating

movie_id  rating
1         1           8
          2          27
          3          96
          4         202
          5         119
                   ... 
1678      1           1
1679      3           1
1680      2           1
1681      3           1
1682      3           1
Name: user_id, Length: 6714, dtype: int64

In [38]:
def mean_movie_rating(id: int):
    movie_dict = {}
    movie_series = movies_rating[id]
    for i in range(1, 6):
        movie_dict[i] = movie_series.get(i, 0)
    sum_for_mean = 0
    for i in range(1, 6):
        sum_for_mean += movie_dict[i] * i
    return sum_for_mean / sum(movie_dict.values())

In [39]:
mean_rating = movies_ratings_count.reset_index()
mean_rating["mean_rating"] = mean_rating["movie_id"].map(mean_movie_rating)
mean_rating.head()
# mean_rating.shape

Unnamed: 0,movie_id,user_id,mean_rating
0,1,452,3.878319
1,2,131,3.206107
2,3,90,3.033333
3,4,209,3.550239
4,5,86,3.302326


In [40]:
kde_2 = scipy.stats.gaussian_kde(mean_rating["mean_rating"])
x_2 = np.linspace(
    min(mean_rating["mean_rating"]),
    max(mean_rating["mean_rating"]),
    200,
)
y_2 = kde_2(x_2)
df_kde_2 = pd.DataFrame({"x": x_2, "density": y_2})
fig_9 = px.line(
    df_kde_2, x="x", y="density", title="Распределение средних оценок фильмов"
)
fig_9.update_layout(
    width=800,  # ширина в пикселях
    height=600,  # высота в пикселях
    # autosize=False,  # отключаем авторазмер
    xaxis_title="Оценка",
    yaxis_title="Нормированное кол-во оценок",
    # title_font=dict(size=24, family="Arial", weight="bold"),
    showlegend=False,
)
fig_9.update_layout(
    xaxis=dict(
        showticklabels=True,
        tickmode="linear",
        ticks="outside",  # Ticks снаружи: 'outside', 'inside', ''
        ticklen=6,  # Длина ticks
        tickwidth=2,  # Толщина ticks
        tickcolor="black",  # Цвет ticks
        dtick=1,
        # tickmode='array',  # Режим ручной установки
        # tickvals=[0, 1, 2, 3],  # Позиции ticks
        # ticktext=['Категория A', 'Категория B', 'Категория C', 'Категория D']  # Подписи
    ),
    yaxis=dict(
        showticklabels=True,
        tickmode="linear",
        dtick=0.1,
        ticks="inside",  # Ticks снаружи: 'outside', 'inside', ''
        ticklen=6,  # Длина ticks
        tickwidth=2,  # Толщина ticks
        # tickmode="linear",
        # dtick=5
    ),
)
fig_9 = configure_plotly_theme(fig_9)
fig_9.update_traces(marker=dict(line=dict(width=1, color="black")))
fig_9.show()

In [41]:
fig_10 = px.histogram(
    mean_rating["mean_rating"],
    nbins=200,
    title="Распределение средних оценок у фильмов",
)
fig_10.update_layout(
    width=800,  # ширина в пикселях
    height=600,  # высота в пикселях
    # autosize=False,  # отключаем авторазмер
    xaxis_title="Количество оценок",
    yaxis_title="Количество фильмов",
    # title_font=dict(size=24, family="Arial", weight="bold"),
    showlegend=False,
)
fig_10.update_layout(
    xaxis=dict(
        showticklabels=True,
        tickmode="linear",
        ticks="outside",  # Ticks снаружи: 'outside', 'inside', ''
        ticklen=6,  # Длина ticks
        tickwidth=2,  # Толщина ticks
        tickcolor="black",  # Цвет ticks
        dtick=100,
        # tickmode='array',  # Режим ручной установки
        # tickvals=[0, 1, 2, 3],  # Позиции ticks
        # ticktext=['Категория A', 'Категория B', 'Категория C', 'Категория D']  # Подписи
    ),
    yaxis=dict(
        showticklabels=True,
        tickmode="linear",
        dtick=50,
        ticks="outside",  # Ticks снаружи: 'outside', 'inside', ''
        ticklen=6,  # Длина ticks
        tickwidth=2,  # Толщина ticks
        # tickmode="linear",
        # dtick=5
    ),
)
fig_10 = configure_plotly_theme(fig_10)
fig_10.update_traces(marker=dict(line=dict(width=1, color="black")))
fig_10.show()

<img src="https://icons.iconarchive.com/icons/icons8/windows-8/256/Programming-Github-icon.png" width=32 /> Пора сохранить изменения для __github__. После пуша распечатай результат команды `!git status` в ячейке ниже.

In [42]:
# code

### Рекомендации по популярности 🔝

Мы хотим рекомендовать фильмы с самой большой оценкой, которые при этом смотрели достаточно часто. Для этого нужно оценить величину, которую мы назовем __score__, значение которой будет вычисляться как логарифм числа оценок фильма, умноженный на среднюю оценку: 
$$score_{film} = \log n * \bar{r}_{film},$$
где $n$ - число оценок для фильма, $\bar{r}_{film}$ - средний рейтинг фильма. 

# ❓
Зачем мы берем логарифм от числа оценок фильма?

> Чтобы приращение score при росте количества просмотров было плавным. В противном случае количество просмотров будет влиять слишком сильно

Добавь колонки `num_ratings, mean_rating, score` в датафрейм `ratings` и найди топ-10 фильмов, которые всем точно стоит посмотреть.

In [43]:
print(movies.shape)
movies.head()

(1682, 22)


Unnamed: 0,movie_id,title,release_date,imdb_url,Action,Adventure,Animation,Children's,Comedy,Crime,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Get%20Shorty%...,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [44]:
movies_short = movies[["movie_id", "title"]]
movies_short = pd.merge(
    movies_short,
    mean_rating[["movie_id", "mean_rating"]],
    how="left",
    on="movie_id",
)
movies_short = pd.merge(
    movies_short,
    movies_ratings_count.reset_index()[["movie_id", "user_id"]],
    how="left",
    on="movie_id",
)
# movies_short["num_ratings"] = movies_ratings_count.copy()
movies_short = movies_short.rename(columns={"user_id": "ratings_count"})
movies_short.head()

Unnamed: 0,movie_id,title,mean_rating,ratings_count
0,1,Toy Story (1995),3.878319,452
1,2,GoldenEye (1995),3.206107,131
2,3,Four Rooms (1995),3.033333,90
3,4,Get Shorty (1995),3.550239,209
4,5,Copycat (1995),3.302326,86


In [45]:
movies_short["score"] = (
    np.log(movies_short["ratings_count"]) * movies_short["mean_rating"]
)
movies_short.sort_values("score", ascending=False).head(10)

Unnamed: 0,movie_id,title,mean_rating,ratings_count,score
49,50,Star Wars (1977),4.358491,583,27.755684
99,100,Fargo (1996),4.155512,508,25.890839
126,127,"Godfather, The (1972)",4.283293,413,25.800191
173,174,Raiders of the Lost Ark (1981),4.252381,420,25.685464
97,98,"Silence of the Lambs, The (1991)",4.289744,390,25.59324
317,318,Schindler's List (1993),4.466443,298,25.445743
63,64,"Shawshank Redemption, The (1994)",4.44523,283,25.095308
180,181,Return of the Jedi (1983),4.00789,507,24.963184
312,313,Titanic (1997),4.245714,350,24.87111
171,172,"Empire Strikes Back, The (1980)",4.20436,367,24.828265


In [46]:
movies_short.head(6)

Unnamed: 0,movie_id,title,mean_rating,ratings_count,score
0,1,Toy Story (1995),3.878319,452,23.710807
1,2,GoldenEye (1995),3.206107,131,15.630404
2,3,Four Rooms (1995),3.033333,90,13.649423
3,4,Get Shorty (1995),3.550239,209,18.966565
4,5,Copycat (1995),3.302326,86,14.709705
5,6,Shanghai Triad (Yao a yao yao dao waipo qiao) ...,3.576923,26,11.653961


## Контентный подход

### Похожесть фильмов между собой 🎥 - 🎥

Посчитай меры похожести каждой пары фильмов по жанрам. Будем рекомендовать фильмы, похожие на какой-нибудь из фильмов, который понравился пользователю. В результате должна получиться функция `recommend(user_id, top=10)`, которая должна выдавать `list` или `pd.DataFrame` из `<top>` фильмов, которые мы будем рекомендовать пользователю.

<details>
<summary>Что такое похожесть в текущем контексте?</summary>
Каждый фильм может быть представлен вектором, описывающим его принадлежность к жанрам. Близость этих векторов можно использовать как меру сходства двух фильмов. 
</details>


In [47]:
genres.head(2)

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [48]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity_matrix = cosine_similarity(genres)
# n = movies_short.shape[0]

# cosine_similarity_matrix = np.zeros((n, n))
# for j in range(0, n):
#     for k in range(j, n):
#         cosine_similarity_matrix[j, k] = cosine_similarity(
#             genres.iloc[j, :], genres.iloc[k, :]
#         )
cosine_similarity_matrix = pd.DataFrame(cosine_similarity_matrix)
# cosine_similarity_matrix.columns = movies_short["title"]
# cosine_similarity_matrix.index = movies_short["title"]
cosine_similarity_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,1.000000,0.000000,0.000000,0.333333,0.000000,0.000000,0.000000,0.666667,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.57735,0.000000
1,0.000000,1.000000,0.577350,0.333333,0.333333,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.816497,0.000000,0.000000,0.000000,0.000000,0.000000,0.408248,0.000000,0.00000,0.000000
2,0.000000,0.577350,1.000000,0.000000,0.577350,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.707107,0.000000,0.000000,0.000000,0.000000,0.000000,0.707107,0.000000,0.00000,0.000000
3,0.333333,0.333333,0.000000,1.000000,0.333333,0.577350,0.408248,0.666667,0.577350,0.408248,...,0.408248,0.577350,0.577350,0.577350,0.577350,0.577350,0.000000,0.408248,0.57735,0.577350
4,0.000000,0.333333,0.577350,0.333333,1.000000,0.577350,0.408248,0.333333,0.577350,0.408248,...,0.408248,0.577350,0.577350,0.577350,0.577350,0.577350,0.408248,0.408248,0.00000,0.577350
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,0.000000,0.000000,0.000000,0.577350,0.577350,1.000000,0.707107,0.577350,1.000000,0.707107,...,0.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.000000,0.707107,0.00000,1.000000
1678,0.000000,0.408248,0.707107,0.000000,0.408248,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.500000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.500000,0.00000,0.000000
1679,0.000000,0.000000,0.000000,0.408248,0.408248,0.707107,0.500000,0.408248,0.707107,0.500000,...,0.000000,0.707107,0.707107,0.707107,0.707107,0.707107,0.500000,1.000000,0.00000,0.707107
1680,0.577350,0.000000,0.000000,0.577350,0.000000,0.000000,0.000000,0.577350,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.00000,0.000000


In [49]:
id_name_table = movies[["movie_id", "title"]]
id_name_table.loc[33, :]

movie_id                             34
title       Doom Generation, The (1995)
Name: 33, dtype: object

In [50]:
r = cosine_similarity_matrix[5]
r[r > 0.9]

5       1.0
8       1.0
14      1.0
17      1.0
18      1.0
       ... 
1674    1.0
1675    1.0
1676    1.0
1677    1.0
1681    1.0
Name: 5, Length: 376, dtype: float64

In [51]:
cosine_similarity(genres.iloc[[323, 1069], :])

array([[1., 0.],
       [0., 1.]])

In [52]:
cosine_similarity(genres.iloc[[0, 3], :])

array([[1.        , 0.33333333],
       [0.33333333, 1.        ]])

In [53]:
id_name_dict = id_name_table.set_index("movie_id")["title"].to_dict()

In [54]:
id_name_dict[324]

'Lost Highway (1997)'

In [55]:
def choose_the_movie(user_id):
    # Выбирает случайный фильм из тех, которые пользователь лучше всего оценил.
    # Возвращает movie_id
    ratings_for_user = ratings[ratings["user_id"] == user_id]
    max_rating_user = ratings_for_user["rating"].max()
    top_ratings_user = ratings_for_user[ratings_for_user["rating"] == max_rating_user]
    the_movie = np.random.choice(top_ratings_user["movie_id"])
    return the_movie

In [56]:
film = choose_the_movie(34)
film

242

In [57]:
def title_from_id(movie_id):
    return id_name_dict.get(movie_id, None)

In [58]:
title_from_id(324)

'Lost Highway (1997)'

In [59]:
np.random.seed(42)
# the_movie_id = choose_the_movie(34)
# similarity_series = cosine_similarity_matrix.iloc[:, the_movie_id - 1]
# similarity_series
# top = similarity_series.sort_values(ascending=False)
# # top
# top_films = top.reset_index()
# # top_films
# top_films = top_films[top_films["index"] + 1 != the_movie_id]
# # top_films
# films = top_films.iloc[0:9, 0].map(lambda x: title_from_id(x + 1))
# films

In [60]:
def recommend(user_id, top=10):
    the_movie_id = choose_the_movie(user_id)
    similarity_series = cosine_similarity_matrix.iloc[:, the_movie_id - 1].drop(
        the_movie_id - 1
    )
    top = similarity_series.sort_values(ascending=False)
    top_films = top.reset_index()
    # top_films = top_films[top_films["index"] + 1 != the_movie_id]
    films = top_films.head(10).copy()
    films["title"] = films.iloc[:, 0].apply(lambda x: title_from_id(x + 1))
    return the_movie_id, title_from_id(the_movie_id), films

In [61]:
np.random.seed(42)
result = recommend(34)
print(result[0], result[1])
result[2]

324 Lost Highway (1997)


Unnamed: 0,index,323,title
0,532,1.0,"Daytrippers, The (1996)"
1,1580,1.0,"Woman in Question, The (1950)"
2,492,1.0,"Thin Man, The (1934)"
3,1283,0.707107,Before and After (1996)
4,478,0.707107,Vertigo (1958)
5,602,0.707107,Rear Window (1954)
6,483,0.707107,"Maltese Falcon, The (1941)"
7,1041,0.707107,Just Cause (1995)
8,190,0.707107,Amadeus (1984)
9,804,0.707107,Manhattan Murder Mystery (1993)


In [62]:
movies[movies["title"] == "Lost Highway (1997)"]

Unnamed: 0,movie_id,title,release_date,imdb_url,Action,Adventure,Animation,Children's,Comedy,Crime,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
323,324,Lost Highway (1997),21-Feb-1997,http://us.imdb.com/Title?Lost+Highway+(1997),0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [63]:
movies[movies["movie_id"] == 533]

Unnamed: 0,movie_id,title,release_date,imdb_url,Action,Adventure,Animation,Children's,Comedy,Crime,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
532,533,"Daytrippers, The (1996)",21-Mar-1997,http://us.imdb.com/M/title-exact?Daytrippers%2...,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [64]:
genres.iloc[[323, 532], :]

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
323,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
532,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [65]:
genres.iloc[[33, 4], :]

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
33,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


In [66]:
cosine_similarity(genres.iloc[[323, 4], :])

array([[1., 0.],
       [0., 1.]])

In [67]:
movies[movies["title"] == "Amateur (1994)"]

Unnamed: 0,movie_id,title,release_date,imdb_url,Action,Adventure,Animation,Children's,Comedy,Crime,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
1068,1069,Amateur (1994),01-Jan-1994,http://us.imdb.com/M/title-exact?Amateur%20(1994),0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


### Похожесть пользователей между собой 🥸 - 🥴

Найди 10 самых похожих пользователей и рекомендуй текущему пользователю то, что понравилось наиболее близким к нему пользователям. В результате должна получиться функция `recommend_by_user(user_id, top=10)`, которая должна выдавать `list` или `pd.DataFrame` из `<top>` фильмов, которые мы будем рекомендовать пользователю. 

In [68]:
ratings

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,186,302,3,891717742,L.A. Confidential (1997)
2,22,377,1,878887116,Heavyweights (1994)
3,244,51,2,880606923,Legends of the Fall (1994)
4,166,346,1,886397596,Jackie Brown (1997)
...,...,...,...,...,...
99995,880,476,3,880175444,"First Wives Club, The (1996)"
99996,716,204,5,879795543,Back to the Future (1985)
99997,276,1090,1,874795795,Sliver (1993)
99998,13,225,2,882399156,101 Dalmatians (1996)


In [69]:
user_movie_rating = ratings[["user_id", "movie_id", "rating"]].copy()
user_item_matrix = user_movie_rating.pivot(
    index="user_id", columns="movie_id", values="rating"
)
user_item_matrix = user_item_matrix.fillna(0)
user_item_matrix

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,2.0,0.0,0.0,4.0,5.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,5.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [70]:
users_sim_matrix = cosine_similarity(user_item_matrix)
users_sim_matrix = pd.DataFrame(users_sim_matrix)
users_sim_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,933,934,935,936,937,938,939,940,941,942
0,1.000000,0.166931,0.047460,0.064358,0.378475,0.430239,0.440367,0.319072,0.078138,0.376544,...,0.369527,0.119482,0.274876,0.189705,0.197326,0.118095,0.314072,0.148617,0.179508,0.398175
1,0.166931,1.000000,0.110591,0.178121,0.072979,0.245843,0.107328,0.103344,0.161048,0.159862,...,0.156986,0.307942,0.358789,0.424046,0.319889,0.228583,0.226790,0.161485,0.172268,0.105798
2,0.047460,0.110591,1.000000,0.344151,0.021245,0.072415,0.066137,0.083060,0.061040,0.065151,...,0.031875,0.042753,0.163829,0.069038,0.124245,0.026271,0.161890,0.101243,0.133416,0.026556
3,0.064358,0.178121,0.344151,1.000000,0.031804,0.068044,0.091230,0.188060,0.101284,0.060859,...,0.052107,0.036784,0.133115,0.193471,0.146058,0.030138,0.196858,0.152041,0.170086,0.058752
4,0.378475,0.072979,0.021245,0.031804,1.000000,0.237286,0.373600,0.248930,0.056847,0.201427,...,0.338794,0.080580,0.094924,0.079779,0.148607,0.071459,0.239955,0.139595,0.152497,0.313941
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938,0.118095,0.228583,0.026271,0.030138,0.071459,0.111852,0.107027,0.095898,0.039852,0.071460,...,0.066039,0.431154,0.258021,0.226449,0.432666,1.000000,0.087687,0.180029,0.043264,0.144250
939,0.314072,0.226790,0.161890,0.196858,0.239955,0.352449,0.329925,0.246883,0.120495,0.342961,...,0.327153,0.107024,0.187536,0.181317,0.175158,0.087687,1.000000,0.145152,0.261376,0.241028
940,0.148617,0.161485,0.101243,0.152041,0.139595,0.144446,0.059993,0.146145,0.143245,0.090305,...,0.046952,0.203301,0.288318,0.234211,0.313400,0.180029,0.145152,1.000000,0.101642,0.095120
941,0.179508,0.172268,0.133416,0.170086,0.152497,0.317328,0.282003,0.175322,0.092497,0.212330,...,0.226440,0.073513,0.089588,0.129554,0.099385,0.043264,0.261376,0.101642,1.000000,0.182465


In [71]:
temp_2 = user_movie_rating.groupby(["user_id", "movie_id"]).sum()
temp_2

Unnamed: 0_level_0,Unnamed: 1_level_0,rating
user_id,movie_id,Unnamed: 2_level_1
1,1,5
1,2,3
1,3,4
1,4,3
1,5,3
...,...,...
943,1067,2
943,1074,4
943,1188,3
943,1228,3


In [72]:
user_rate = temp_2.loc[1, :].sort_values(by="rating", ascending=False)
user_rate

Unnamed: 0_level_0,rating
movie_id,Unnamed: 1_level_1
16,5
270,5
1,5
32,5
19,5
...,...
260,1
243,1
219,1
231,1


In [73]:
def top_film_for_user(user_id):
    temp = user_movie_rating.groupby(["user_id", "movie_id"]).sum()
    user_rate = temp.loc[user_id, :].copy().sort_values(by="rating", ascending=False)
    max_value = user_rate["rating"].max()
    film_id = np.random.choice(user_rate[user_rate["rating"] == max_value].index)
    return film_id

In [74]:
def recommend_by_user(user_id, top=10):
    similarity_series = users_sim_matrix.iloc[:, user_id - 1]
    similarity_series = similarity_series.drop(similarity_series.index[user_id - 1])
    top_similar_users = similarity_series.sort_values(ascending=False).head(top)
    sim_users = pd.DataFrame(
        {
            "user_id": top_similar_users.index + 1,  # если индексы с 0
            "similarity": top_similar_users.values,
        }
    )
    sim_users["top_films_id"] = sim_users["user_id"].apply(top_film_for_user)
    sim_users["top_films"] = sim_users["top_films_id"].apply(title_from_id)
    return sim_users

In [75]:
np.random.seed(41)
recommend_by_user(54)

Unnamed: 0,user_id,similarity,top_films_id,top_films
0,294,0.55212,475,Trainspotting (1996)
1,938,0.549014,118,Twister (1996)
2,624,0.529494,50,Star Wars (1977)
3,190,0.515259,302,L.A. Confidential (1997)
4,104,0.495357,302,L.A. Confidential (1997)
5,689,0.489779,50,Star Wars (1977)
6,403,0.487078,235,Mars Attacks! (1996)
7,730,0.481857,1012,Private Parts (1997)
8,569,0.481626,50,Star Wars (1977)
9,526,0.481022,676,"Crucible, The (1996)"


### scikit-surprice


Создай `user-item` матрицу (по строкам – пользователи, по столбцам – фильмы, на пересечении – оценки). Примени `SVD` для решения задачи прогнозирования оценки. 

> https://surpriselib.com/

In [153]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.prediction_algorithms.matrix_factorization import NMF

In [82]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(
    user_movie_rating[["user_id", "movie_id", "rating"]], reader
)

In [83]:
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

In [149]:
model_svd = SVD(n_factors=20, n_epochs=20, lr_all=0.005, reg_all=0.02)
model_svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x73f8a757db80>

In [150]:
predictions = model_svd.test(testset)
rmse = accuracy.rmse(predictions)
print(f"RMSE: {rmse}")

RMSE: 0.9375
RMSE: 0.9375033581421104


In [151]:
# test_user_id = np.random.choice(users["user_id"])
# test_movie_id = np.random.choice(movies["movie_id"])
test_user_id = 45
test_movie_id = 127
prediction = model_svd.predict(test_user_id, test_movie_id)
print(
    f"Movie: {title_from_id(test_movie_id)}, \n\
Pred_Rating: {prediction.est}, \n\
True_Rating: {user_item_matrix.loc[test_user_id, test_movie_id]}"
)
print(prediction)

Movie: Godfather, The (1972), 
Pred_Rating: 4.539385594379782, 
True_Rating: 5.0
user: 45         item: 127        r_ui = None   est = 4.54   {'was_impossible': False}


In [146]:
# ratings[ratings["user_id"] == 45]

In [128]:
movies[movies["movie_id"] == 10]

Unnamed: 0,movie_id,title,release_date,imdb_url,Action,Adventure,Animation,Children's,Comedy,Crime,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
9,10,Richard III (1995),22-Jan-1996,http://us.imdb.com/M/title-exact?Richard%20III...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0



Реализуй предсказание рейтинга пользователя с помощью алгоритма [Nonnegative Matrix Factorization](https://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.NMF). Для этого понадобится библиотека [scikit-surpice](https://surprise.readthedocs.io/en/stable/index.html)

In [147]:
trainset_2, testset_2 = train_test_split(data, test_size=0.25, random_state=42)

In [154]:
model_NMF = NMF(
    n_factors=20,
    n_epochs=20,
    lr_bu=0.005,
    # reg_all=0.02
)

In [155]:
model_NMF.fit(trainset_2)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x73f8a7582a20>

In [None]:
# test_user_id = np.random.choice(users["user_id"])
# test_movie_id = np.random.choice(movies["movie_id"])
# test_user_id = 45
# test_movie_id = 127
prediction_2 = model_NMF.predict(test_user_id, test_movie_id)
print(
    f"Movie: {title_from_id(test_movie_id)}, \n\
Pred_Rating: {prediction_2.est}, \n\
True_Rating: {user_item_matrix.loc[test_user_id, test_movie_id]}"
)
print(prediction_2)

Movie: Godfather, The (1972), 
Pred_Rating: 5, 
True_Rating: 5.0
user: 45         item: 127        r_ui = None   est = 5.00   {'was_impossible': False}


### Slope One

Реализуй подход Slope One (пример есть в слайдах лекции или в [статье](https://www.researchgate.net/publication/1960789_Slope_One_Predictors_for_Online_Rating-Based_Collaborative_Filtering))

In [158]:
from surprise import SlopeOne

In [157]:
trainset_3, testset_3 = train_test_split(data, test_size=0.25, random_state=42)

In [159]:
model_SO = SlopeOne()
model_SO.fit(trainset_3)

<surprise.prediction_algorithms.slope_one.SlopeOne at 0x73f8a7577f20>

In [161]:
prediction_3 = model_SO.predict(test_user_id, test_movie_id)
print(
    f"Movie: {title_from_id(test_movie_id)}, \n\
Pred_Rating: {prediction_3.est}, \n\
True_Rating: {user_item_matrix.loc[test_user_id, test_movie_id]}"
)
print(prediction_3)

Movie: Godfather, The (1972), 
Pred_Rating: 4.7007286241712025, 
True_Rating: 5.0
user: 45         item: 127        r_ui = None   est = 4.70   {'was_impossible': False}


In [None]:
# def slope_one(user_id, item_id):

#     return pred_rating

In [167]:
def compare_models_on_sample(
    testset, model_svd=model_svd, model_NMF=model_NMF, model_SO=model_SO, n=15, seed=42
):
    """
    Выбирает n случайных наблюдений из testset, делает предсказания
    с тремя моделями и возвращает таблицу с результатами.

    Parameters
    ----------
    testset : list
        Список троек (user, item, real_rating), полученных от Surprise (train_test_split).
    model_svd : surprise.AlgoBase
        Обученная модель SVD.
    model_NMF : surprise.AlgoBase
        Обученная модель NMF.
    model_SO : surprise.AlgoBase
        Обученная модель SlopeOne.
    n : int
        Количество случайных примеров (по умолчанию 15).
    seed : int
        Фиксированный seed для воспроизводимости.
    """

    np.random.seed(seed)
    idx = np.random.choice(len(testset), size=min(n, len(testset)), replace=False)
    sample = [testset[i] for i in idx]

    results = []
    for uid, iid, true_r in sample:
        svd_pred = model_svd.predict(uid, iid).est
        nmf_pred = model_NMF.predict(uid, iid).est
        so_pred = model_SO.predict(uid, iid).est

        results.append(
            {
                "user_id": uid,
                "item_id": iid,
                "title": title_from_id(iid),
                "real_rating": true_r,
                "svd_pred": svd_pred,
                "nmf_pred": nmf_pred,
                "slopeone_pred": so_pred,
            }
        )
    df = pd.DataFrame(results)
    metrics = {}
    for model in ["svd_pred", "nmf_pred", "slopeone_pred"]:
        errors = df[model] - df["real_rating"]
        mae = np.mean(np.abs(errors))
        rmse = np.sqrt(np.mean(errors**2))
        metrics[model] = {"MAE": mae, "RMSE": rmse}

    return df, metrics

In [169]:
results, metrics = compare_models_on_sample(testset=testset)
results

Unnamed: 0,user_id,item_id,title,real_rating,svd_pred,nmf_pred,slopeone_pred
0,293,657,"Manchurian Candidate, The (1962)",4.0,3.936547,4.55729,3.838454
1,221,358,Spawn (1997),3.0,2.750319,3.636582,2.821254
2,587,875,She's So Lovely (1997),1.0,2.50102,2.806274,2.593047
3,648,177,"Good, The Bad and The Ugly, The (1966)",5.0,3.512336,4.01632,3.775982
4,343,89,Blade Runner (1982),3.0,4.572941,5.0,4.449186
5,943,24,Rumble in the Bronx (1995),4.0,3.461032,3.68458,3.533147
6,815,87,Searching for Bobby Fischer (1993),5.0,4.162396,4.620847,4.012277
7,894,16,French Twist (Gazon maudit) (1995),3.0,3.291812,4.063164,3.383193
8,883,319,Everyone Says I Love You (1996),3.0,3.750421,4.003301,3.690117
9,64,381,Muriel's Wedding (1994),4.0,3.499042,4.315577,3.516758


In [170]:
metrics

{'svd_pred': {'MAE': 0.8588507270516116, 'RMSE': 1.0864563505791502},
 'nmf_pred': {'MAE': 0.9856063643460412, 'RMSE': 1.2515926172415197},
 'slopeone_pred': {'MAE': 0.8341885904854729, 'RMSE': 1.055687941650403}}

<img src="https://icons.iconarchive.com/icons/icons8/windows-8/256/Programming-Github-icon.png" width=32 /> Сделай `commit + push` на  __github__. 

In [123]:
# code