In [1]:
from torch import nn
import torch

import dgl
import networkx as nx

import collections
import math
import ml_metrics as metrics
import datetime
from tqdm.notebook import trange, tqdm
from sklearn.neighbors import NearestNeighbors
import collections
import pandas as pd
import numpy as np

Using backend: pytorch


## Data preprocession

In [2]:
def create_category_vector(category_map, category_ids_str):
    result = []
    
    category_ids = category_ids_str.split(':')
    
    for category in category_map:
        if str(category) in category_ids:
            result.append(1)
        else:
            result.append(0)
            
    return result

In [3]:
cold_start_df = pd.read_csv('2021-04-17_09 15 39_cold_start_data.csv',encoding='cp1251')
cold_start_df.columns = ['user_id', 'event_id', 'is_favorite', 'survey_time']

events_df = pd.read_csv('2021-04-17_11 59 03_cold_start_events.csv', sep=';', encoding='cp1251')
events_df.columns = ['id', 'short_title', 'start_date', 'description', 'is_free', 'place_id', 'place_short_title', 'category_ids', 
                    'category_names', 'category_slags']

actual_events_df = events_df[events_df['id'].isin(cold_start_df['event_id'].unique())]

event_id_name_dict = {k: v for k, v in actual_events_df[['id', 'short_title']].values}

cold_start_df['event_name'] = [event_id_name_dict[event_id] for event_id in cold_start_df['event_id']]

In [4]:
events_df

Unnamed: 0,id,short_title,start_date,description,is_free,place_id,place_short_title,category_ids,category_names,category_slags
0,1,«Ночь музеев — 2021»,2019-05-18,"Только один раз в году бывает так, что музеи, ...",False,,,12:6:10:8:20,Вечеринки:Выставки:Детям:Фестивали:Экскурсии,party:exhibition:kids:festival:tour
1,2,«Ночь музеев — 2021»,2021-05-22,"Только один раз в году бывает так, что музеи, ...",False,,,12:6:10:8:20,Вечеринки:Выставки:Детям:Фестивали:Экскурсии,party:exhibition:kids:festival:tour
2,3,День Победы в Петербурге,2019-05-09,76-ю годовщину Великой Победы Петербург отгуля...,False,636.0,Дворцовая площадь,17:3:9:8,Благотворительность:Концерты:Праздники:Фестивали,social-activity:concert:holiday:festival
3,4,День Победы в Петербурге,2021-05-09,76-ю годовщину Великой Победы Петербург отгуля...,False,636.0,Дворцовая площадь,17:3:9:8,Благотворительность:Концерты:Праздники:Фестивали,social-activity:concert:holiday:festival
4,5,«Библионочь – 2021»,2019-04-20,Бессонные ночи можно коротать не только в м...,False,,,12:10:4:8,Вечеринки:Детям:Обучение:Фестивали,party:kids:education:festival
...,...,...,...,...,...,...,...,...,...,...
3398,3399,Организация движения поездов,,Экспозиция рассказывает о работе дежурных по с...,False,346.0,музей железнодорожного транспорта,6,Выставки,exhibition
3399,3400,История инженерных войск после 1917 года,,"Любая армия нуждается в сапёрах, строителях ф...",False,310.0,Музей артиллерии,6,Выставки,exhibition
3400,3401,Железные дороги в Великой Отечественной войне ...,2012-11-27,Экспозиция посвящена подвигу железнодорожников...,False,346.0,музей железнодорожного транспорта,6,Выставки,exhibition
3401,3402,Железные дороги в Великой Отечественной войне ...,,Экспозиция посвящена подвигу железнодорожников...,False,346.0,музей железнодорожного транспорта,6,Выставки,exhibition


In [5]:
uniqual_events_and_dates_df = actual_events_df.groupby(['short_title'])['start_date'].max().reset_index(name='date')
uniqual_events_and_dates_map = {name: (idx, date) for idx, (name, date) in enumerate(uniqual_events_and_dates_df[['short_title', 'date']].values)}

new_ids = []
new_dates = []

for short_title in actual_events_df['short_title'].values:
    new_id, new_date = uniqual_events_and_dates_map[short_title]
    new_ids.append(new_id)
    new_dates.append(new_date)
    
actual_events_df['id'] = new_ids
actual_events_df['start_date'] = new_dates

actual_events_df = actual_events_df.drop_duplicates(subset=['id', 'short_title', 'start_date'], keep='last')

new_ids = []
for event_name in cold_start_df['event_name'].values:
    new_id, new_date = uniqual_events_and_dates_map[event_name]
    new_ids.append(new_id)
    
cold_start_df['event_id'] = new_ids

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  actual_events_df['id'] = new_ids
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  actual_events_df['start_date'] = new_dates


In [6]:
category_map = {}

for category_ids_str, category_slags_str in actual_events_df.loc[:, ['category_ids', 'category_slags']].values:
    
    category_ids = category_ids_str.split(':')
    category_slags = category_slags_str.split(':')
    
    for category_id, category_slag in zip(category_ids, category_slags):
        category_map[int(category_id)] = category_slag 
        
category_map = {k: v for k, v in sorted(category_map.items())}

In [7]:
category_vectors = []

for category_ids_str in actual_events_df.loc[:, ['category_ids']].values:
    category_vectors.append(create_category_vector(category_map, category_ids_str[0]))

category_vectors_columns = [f'category_{v}' for v in category_map.values()]

category_feactures_dict = collections.defaultdict(list)
for idx, v in enumerate(category_map.values()):
    category_feactures_dict[f'category_{v}'] = [cat_vector[idx] for cat_vector in category_vectors]

In [8]:
event_features_df = pd.DataFrame()

event_features_df['id'] = actual_events_df['id']

for category, values in category_feactures_dict.items():
    event_features_df[category] = values
    
event_features_df['is_free'] = [1 if is_free else 0 for is_free in actual_events_df['is_free']]

event_features_df['weekday'] = [datetime.datetime.strptime(weekday, "%Y-%m-%d").date().weekday() + 1 for weekday in actual_events_df['start_date']]

## Graph building

In [12]:
def create_graph(aggregated_group, key_column, G=None):
    for group in aggregated_group.groups:
        value = aggregated_group.get_group(group)[key_column].values
        
        for i in range(len(value)):       
            for j in range(i+1, len(value)):
                if G.has_edge(value[i], value[j]):
                    G[value[i]][value[j]]['weight'] += 1
                else:
                    G.add_edge(value[i], value[j], weight=1)
    return G

In [10]:
short_titles = []
for event_id in event_features_df['id']:
    short_titles.append(list(actual_events_df[actual_events_df['id'] == event_id]['short_title'].values)[0])

In [13]:
G = nx.Graph(name='G')

for event_id, short_title in zip(event_features_df['id'].unique(), short_titles):
    G.add_node(event_id, name=short_title)
    
negative_group = cold_start_df[cold_start_df['is_favorite'] == False].groupby('user_id')
G = create_graph(negative_group, 'event_id', G)

positive_group = cold_start_df[cold_start_df['is_favorite'] == True].groupby('user_id')
G = create_graph(positive_group, 'event_id', G)

#### Создана матрица с лейблами, если -1, то значение для пользователя неопределено

In [17]:
label_matrix = []
for idx, node in enumerate(list(G.nodes())):
    pos_users_list = cold_start_df[(cold_start_df['is_favorite'] == True) & (cold_start_df['event_id'] == node)]['user_id'].values
    neg_users_list = cold_start_df[(cold_start_df['is_favorite'] == False) & (cold_start_df['event_id'] == node)]['user_id'].values
    
    sublist = []
    for user_id in cold_start_df['user_id'].unique():
        if user_id in pos_users_list:
            sublist.append(1)
        elif user_id in neg_users_list:
            sublist.append(0)
        else:
            sublist.append(-1)
    
    label_matrix.append(sublist)

In [18]:
adjacency_matrix = np.array(nx.attr_matrix(G, node_attr='name')[0]) # матрица смежностей

nodes = np.array(nx.attr_matrix(G, node_attr='name')[1]) # Список нод

In [19]:
cold_start_df['user_id'].nunique()

44

## Model training

In [206]:
class GraphLayer(nn.Module):
    def __init__(self, in_features, out_features):
        super(GraphLayer, self).__init__()
        
        self.in_features = in_features
        self.out_features = out_features
        self.weight = nn.Parameter(torch.FloatTensor(self.in_features, self.out_features))
        self.bias = nn.Parameter(torch.FloatTensor(self.out_features))
        self.reset_parameters()
        
    def reset_parameters(self):
        stdv = 0.01
        self.weight.data.uniform_(-stdv, stdv)
        self.bias.data.uniform_(-stdv, stdv)
        
    def forward(self, input_data, adj):
        support = torch.mm(input_data, self.weight)
        output = torch.spmm(adj, support)
        
        return output + self.bias

In [207]:
class GCN(nn.Module):
    def __init__(self, n_feats, n_hidden, n_class, dropout):
        super(GCN, self).__init__()
        
        self.gc1 = GraphLayer(n_feats, n_hidden)
        self.gc2 = GraphLayer(n_hidden, n_hidden // 2)
        self.gc3 = GraphLayer(n_hidden // 2, n_class)
        self.dropout = dropout
    
    def calculate_score(self, result_labels, real_labels):
        getting_indexes = [index for index, val in enumerate(real_labels) if val != -1.]
        real_labels = real_labels[getting_indexes]
        result_labels = result_labels[getting_indexes]
        
        predicts = nn.functional.sigmoid(result_labels).round().type_as(real_labels)
        correct_result = predicts.eq(real_labels).double()
        correct_result = correct_result.sum()        
        
        return correct_result / len(real_labels)
    
    def forward(self, x, adj):
        x = nn.functional.relu(self.gc1(x, adj))
        x = nn.functional.dropout(x, self.dropout // 2, training=self.training)
        x_embed = nn.functional.relu(self.gc2(x, adj))
        x = nn.functional.dropout(x_embed, self.dropout, training=self.training)
        x = self.gc3(x, adj)
        
        return x, x_embed
    
    def __fit(self, adjacency_matrix, features_matrix, label_matrix, epoch, verbose=False):
        self.train()
        self.optimizer.zero_grad()
        
        output, _ = self(features_matrix, adjacency_matrix)

        loss_train = self.loss(output, label_matrix)
        score_train = self.calculate_score(output.view(-1), label_matrix.view(-1))
        
        loss_train.backward()
        self.optimizer.step()
        
        self.eval()
        
        output, _ = self(features_matrix, adjacency_matrix)
        
        loss_val = self.loss(output, label_matrix)
        score_val = self.calculate_score(output.view(-1), label_matrix.view(-1))
    
        if verbose:
            print(f'{epoch+1}) train loss: {round(loss_train.item(), 3)}\ttrain accuracy: {round(score_train.item(), 3)}\t val loss: {round(loss_val.item(), 3)}\tval accuracy: {round(score_val.item(),3)}\n')
    
    def predict_at_num(self, features_matrix, adjacency_matrix, num):
        predicts_values, _ = self(features_matrix, adjacency_matrix)
        predicts_values = predicts_values.detach().numpy()
        _, predict = self.knn.kneighbors([predicts_values.T[num]])

        return [self.node_map[pred][0] for pred in predict[0]]
    
    def predict(self, features_matrix, adjacency_matrix):
        predicts_values, _ = self(features_matrix, adjacency_matrix)
        predicts_values = predicts_values.detach().numpy().T
        
        results = []
        for predict_vector in predicts_values:
            _, predict = self.knn.kneighbors([predict_vector])
            results.append([self.node_map[pred] for pred in predict[0]])

        return results
        
    def fit(self, adjacency_matrix, features_matrix, label_matrix, nodes, nodes_name, n_epoch=1, learing_rait=0.01, weight_decay=0.01, verbose=False, k_elements=10):
        self.optimizer = torch.optim.Adam(self.parameters(), lr=learing_rait, weight_decay=weight_decay)
        self.loss = nn.modules.BCEWithLogitsLoss()
        
        for epoch in range(n_epoch):
            self.__fit(adjacency_matrix, features_matrix, label_matrix, epoch, verbose)
            learing_rait /= 2
            
            self.optimizer = torch.optim.Adam(self.parameters(), lr=learing_rait, weight_decay=weight_decay)
            
        self.node_map = {idx: (node_id, name) for idx, (node_id, name) in enumerate(zip(nodes, nodes_name))}
        
        vectors, _ = self(features_matrix, adjacency_matrix)
        self.knn = NearestNeighbors(n_neighbors=k_elements, algorithm='ball_tree').fit(vectors.detach().numpy().T)

## Нужно отсортировать фичи в том порядке, в котором они были заданы в нодах

In [198]:
event_features_without_id_tensor = torch.FloatTensor(event_features_df.drop(['id', 'weekday'], axis=1).values)

adjacency_matrix_tensor = torch.FloatTensor(adjacency_matrix)

label_matrix_tensor = torch.FloatTensor(label_matrix)

In [208]:
model = GCN(13, 400, 44, 0.33)

model.fit(adjacency_matrix_tensor, event_features_without_id_tensor, label_matrix_tensor, nodes, np.array(G.nodes), n_epoch=100, learing_rait=0.001, verbose=False)



In [209]:
model.predict_at_num(event_features_without_id_tensor, adjacency_matrix_tensor, 32)

['GSPD',
 'Ползком по клубам и барам',
 'экскурсия в Петергоф',
 'Опера-гала',
 'Экскурсия в особняк Матильды Кшесинской',
 'Мари Краймбрери',
 'Дворец Шувалова',
 'Григорий Лепс',
 'Орф. Кармина Бурана',
 'По рок-местам']

In [201]:
cold_start_df['user_id'].unique()

array([90217929, 48809944, 23573591, 19986396, 70081526, 90432167,
       12198841, 91041459, 79290094, 70208099, 93117857, 97480214,
       59771642, 11957260, 44434193,  9235193, 44227856, 90800104,
       29771783, 95320616, 89294496, 12455543, 94173382,  8901930,
       84514373, 58912127, 72147781, 88008390, 57692221, 30762396,
       77912307, 95130035, 38089885, 90441422, 93458070, 95927257,
       99414675, 57670421, 30532386, 72898140, 26578059, 31206380,
       99356768, 89810864], dtype=int64)

In [202]:
cold_start_df[(cold_start_df['user_id'] == 38089885)  & (cold_start_df['is_favorite'] == True)]

Unnamed: 0,user_id,event_id,is_favorite,survey_time,event_name
966,38089885,115,True,09:19:36,Ползком по клубам и барам
972,38089885,43,True,09:19:36,Вечеринки в Comedy Place


In [210]:
result = model.predict(event_features_without_id_tensor, adjacency_matrix_tensor)

In [211]:
real_user_event_ids = []
prediction_user_element_ids = []

for idx, user_id in enumerate(cold_start_df['user_id'].unique()):
    real_user_event_ids.append(list(cold_start_df[(cold_start_df['is_favorite'] == True) & (cold_start_df['user_id'] == user_id)]['event_id'].values))
    prediction_user_element_ids.append([v for k, v in result[idx]])
    
print(f'5 topics map@10 {round(metrics.mapk(real_user_event_ids, prediction_user_element_ids, 10), 4)}')
print(f'5 topics map@5 {round(metrics.mapk(real_user_event_ids, prediction_user_element_ids, 5), 4)}')

5 topics map@10 0.0528
5 topics map@5 0.058


## Функции вычислений метрик качества

In [182]:
def calculate_pk(k):
    pk = 0
    for idx, user_id in enumerate(cold_start_df['user_id'].unique()):
        real_event_ids = cold_start_df[(cold_start_df['is_favorite'] == True) & (cold_start_df['user_id'] == user_id)]['event_id'].values
    
        for idx2, (name, prediction_element_id) in enumerate(result[idx]):
            if idx2 == k:
                break
                
            if prediction_element_id in real_event_ids:
                pk += 1
                
    return pk / (k * 44)

In [212]:
round(calculate_pk(10), 4)

0.0636

In [68]:
def calculate_apk(k, df, predictions):
    apk_dict = collections.defaultdict(int)
    
    for idx, user_id in enumerate(df['user_id'].unique()):
        real_event_ids = df[(df['is_favorite'] == True) & (df['user_id'] == user_id)]['event_id'].values
        
        for idx2, (name, prediction_element_id) in enumerate(predictions[idx]):
            if idx2 == k:
                break
                
            if idx2 >= len(real_event_ids):
                break
            
            if prediction_element_id == real_event_ids[idx2]:
                apk_dict[idx2] += 1
    
    return sum([value * calculate_pk(idx + 1) for idx, value in enumerate(apk_dict.values())]) / k / 44

In [69]:
def calculate_user_pk(k, real_user_event_ids, prediction_user_element_ids):
    pk = 0
    
    for idx, prediction_element_id in enumerate(prediction_user_element_ids):
        if idx == k:
            break
            
        if prediction_element_id[1] in real_user_event_ids:
            pk += 1
            
    return pk / k

In [70]:
def calculate_user_apk(k, real_user_event_ids, prediction_user_element_ids):
    apk = 0
    
    for idx, prediction_element_id in enumerate(prediction_user_element_ids):
        if idx == k:
            break
        
        if idx >= len(real_user_event_ids):
            break
        
        if prediction_element_id[1] == real_user_event_ids[idx]:
            apk += 1 * calculate_user_pk(idx + 1, real_user_event_ids, prediction_user_element_ids)
            
    return apk / k

In [71]:
def calculate_mapk(k, df, predictions):
    mapk = 0
    
    for idx, user_id in enumerate(df['user_id'].unique()):
        real_user_event_ids = df[(df['is_favorite'] == True) & (df['user_id'] == user_id)]['event_id'].values
        prediction_user_element_ids = predictions[idx]
        
        mapk += calculate_user_apk(k, real_user_event_ids, prediction_user_element_ids)
    
    return mapk / 44

In [73]:
real_user_event_ids = []
prediction_user_element_ids = []

for idx, user_id in enumerate(cold_start_df['user_id'].unique()):
    real_user_event_ids.append(list(cold_start_df[(cold_start_df['is_favorite'] == True) & (cold_start_df['user_id'] == user_id)]['event_id'].values))
    prediction_user_element_ids.append([v for k, v in result[idx]])
    
print(f'5 topics map@10 {round(metrics.mapk(real_user_event_ids, prediction_user_element_ids, 10), 5)}')
print(f'5 topics map@5 {round(metrics.mapk(real_user_event_ids, prediction_user_element_ids, 5), 5)}')

In [74]:
print(f'5 topics map@10 {round(metrics.mapk(real_user_event_ids, prediction_user_element_ids, 10), 5)}')
print(f'5 topics map@5 {round(metrics.mapk(real_user_event_ids, prediction_user_element_ids, 5), 5)}')

5 topics map@10 0.04541
5 topics map@5 0.04881
