In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Загружаем датасет

In [None]:
# The prepared dataset is semicolon-separated; its first column is used as the row index.
df = pd.read_csv(
    '/content/drive/MyDrive/prepared_data.csv',
    index_col=0,
    sep=';',
)
df.head()

Unnamed: 0,DeviceType,UserID,Act,EventID,Type,AgeRestriction,SUP прогулки,Аттракцион,Балет,Баскетбол,...,Хобби,Хоккей,Цирк,Шансон,Шоу,Экологические акции,Экскурсии и туры,Экскурсия,Эстрадный концерт,Ярмарка
0,0,0,1,0,1,12,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,1,1,1,6,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,2,1,2,1,12,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,3,1,3,1,12,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,4,4,4,1,16,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Возьмем первые 20000 строк для модели, т.к. весь датасет слишком велик для обработки

In [None]:
# Keep the first 20000 rows and only the three columns that describe an interaction.
interaction_df = df.head(20000)[['UserID', 'EventID', 'Act']]

### Сделаем groupby, чтобы для каждой пары (UserID, EventID) осталась одна строка с максимальным значением Act

In [None]:
# Collapse repeated (UserID, EventID) pairs into a single row holding the largest Act.
interaction_df = (
    interaction_df
    .groupby(['UserID', 'EventID'], as_index=False)['Act']
    .max()
)

### Создаем матрицу взаимодействий

In [None]:
# Rows are users, columns are events, cells hold Act; pairs with no interaction become 0.
user_event_matrix = pd.pivot_table(
    interaction_df,
    values='Act',
    index='UserID',
    columns='EventID',
    fill_value=0,
)

In [None]:
# Preview the first rows of the user-by-event matrix.
user_event_matrix.head()

EventID,0,1,2,3,4,5,6,7,8,9,...,4343,4344,4345,4346,4347,4348,4349,4350,4351,4352
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Нормализуем матрицу взаимодействий

In [None]:
# Z-score every event column: centre on the column mean first, then scale by the column std.
# The parentheses are required: `/` binds tighter than `-`, so the unparenthesised form
# only subtracts the ratio mean/std from each cell instead of standardising it.
event_std = user_event_matrix.std()
# Columns with zero or undefined spread are divided by 1 so they stay at their centred value
# instead of turning into NaN/inf, which cosine_similarity would reject.
event_std = event_std.where(event_std > 0, 1.0)
user_event_matrix = (user_event_matrix - user_event_matrix.mean()) / event_std

### Вычисляем косинусное сходство между пользователями в матрице взаимодействий

Формула: $$\text{cosine\_similarity}(A, B) = \frac{A \cdot B}{\|A\| \, \|B\|}$$

In [None]:
# Pairwise cosine similarity between the rows (users) of the interaction matrix.
user_similarity = cosine_similarity(user_event_matrix)

In [None]:
# Label both axes of the similarity array with the user ids so it can be looked up by UserID.
user_similarity_df = pd.DataFrame(
    data=user_similarity,
    columns=user_event_matrix.index,
    index=user_event_matrix.index,
)

In [None]:
# Preview the first rows of the user-by-user similarity table.
user_similarity_df.head()

UserID,0,1,2,3,4,5,6,7,8,9,...,17325,17326,17327,17328,17329,17330,17331,17332,17333,17334
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.50656,0.508399,0.401157,0.156186,0.50006,0.501964,0.497639,0.168955,0.31014,...,0.50293,0.508399,0.507638,0.502408,0.4952,0.508399,0.505248,0.503616,0.290891,0.31703
1,0.50656,1.0,0.50656,0.399536,0.155468,0.498125,0.50005,0.495676,0.167942,0.308897,...,0.501028,0.50656,0.50579,0.5005,0.493208,0.50656,0.503373,0.501722,0.289538,0.315826
2,0.508399,0.50656,1.0,0.401157,0.156186,0.50006,0.501964,0.497639,0.168955,0.31014,...,0.50293,0.508399,0.507638,0.502408,0.4952,0.508399,0.505248,0.503616,0.290891,0.31703
3,0.401157,0.399536,0.401157,1.0,0.122298,0.393801,0.395481,0.391663,0.130845,0.243974,...,0.396334,0.401157,0.400486,0.395873,0.389508,0.401157,0.398379,0.39694,0.227682,0.249805
4,0.156186,0.155468,0.156186,0.122298,1.0,0.152925,0.15367,0.151976,0.049679,0.094608,...,0.154049,0.156186,0.155889,0.846709,0.151019,0.156186,0.154955,0.154317,0.0877,0.097079


In [None]:
# Summary statistics of the similarity column for user 0 (includes the user's own entry).
user_similarity_df[0].describe()

Unnamed: 0,0
count,17335.0
mean,0.453083
std,0.09299
min,0.001246
25%,0.412227
50%,0.502408
75%,0.506124
max,1.0


### Функция для предсказания лучших событий для пользователя

In [None]:
def get_recommendations(user_id, n=5, n_neighbors=None, similarity_df=None, event_matrix=None):
    """Recommend events for a user from the interactions of their most similar users.

    The rows of the ``n_neighbors`` most similar users are summed per event, events
    the target user already has a positive value for are removed, and the ``n``
    best-scoring remaining events are returned.

    Args:
        user_id: Label of the target user; must be present in both frames.
        n: Maximum number of recommendations to return.
        n_neighbors: Number of most similar users to aggregate over. Defaults to
            ``n``, which matches the behaviour before this parameter existed.
        similarity_df: User-by-user similarity frame. Defaults to the
            notebook-level ``user_similarity_df``.
        event_matrix: User-by-event interaction frame. Defaults to the
            notebook-level ``user_event_matrix``.

    Returns:
        A list of ``(event_id, score)`` tuples in descending score order, where
        ``score`` is the sum of the neighbours' ``event_matrix`` values for that
        event (an aggregate weight, not a similarity).

    Raises:
        KeyError: If ``user_id`` is missing from ``similarity_df`` or ``event_matrix``.
    """
    if similarity_df is None:
        similarity_df = user_similarity_df
    if event_matrix is None:
        event_matrix = user_event_matrix
    if n_neighbors is None:
        n_neighbors = n

    # Remove the user by label, not by position: after sorting, the user's own entry is
    # not guaranteed to come first when another user ties with (or, through rounding,
    # exceeds) the self-similarity, so a positional [1:] slice can keep the user and
    # drop a real neighbour.
    similar_users = (
        similarity_df[user_id]
        .drop(user_id, errors='ignore')
        .sort_values(ascending=False)
        .head(n_neighbors)
    )

    # Sum the neighbours' values per event.
    neighbour_rows = event_matrix.loc[similar_users.index]
    scores = neighbour_rows.sum(axis=0)

    # Exclude events the user already interacted with. Positive cells are treated as
    # interactions; NOTE(review): this assumes "no interaction" maps to a non-positive
    # value in event_matrix - confirm whenever the normalisation step changes.
    own_row = event_matrix.loc[user_id]
    already_rated = own_row[own_row > 0].index
    scores = scores.drop(already_rated, errors='ignore')

    # Best n events with their aggregated scores.
    top_recommendations = scores.sort_values(ascending=False).head(n)
    return list(top_recommendations.items())


### Вывод лучших событий и их весов

In [None]:
# Example: top-5 recommendations for user 0.
user_id_example = 0
get_recommendations(user_id_example, n=5)

[(4352, 0.9620240994549253),
 (4349, 0.9620240994549253),
 (4347, 0.9620240994549253),
 (4346, 0.9620240994549253),
 (4345, 0.9620240994549253)]