In [None]:
import pandas as pd
import numpy as np
import scipy.sparse as sparse
import heapq
import os
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

In [None]:
# Folder that holds every input CSV, followed by the three file names.
# NOTE: data_2 and data_3 are file names here but are rebound to DataFrames by
# later cells, so re-running the loading cells after those cells will fail.
data_path = 'data/'
data_1 = 'data_learn_match.csv'
data_2 = 'data_test_match.csv'
data_3 = 'data_target_thread.csv'

# Training interactions: the file has no header row. The third column is
# discarded and a placeholder weight column 'values' (all ones) is appended.
learn_match_path = os.path.join(data_path, data_1)
raw_data = pd.read_csv(learn_match_path, header=None)
raw_data = raw_data.drop(columns=raw_data.columns[2:3])
raw_data['values'] = 1

In [None]:
# Give the four remaining columns readable names: the user and thread ids,
# 'match_date' (presumably the interaction timestamp -- confirm) and the
# 'values' weight column that was appended when the file was loaded.
raw_data.columns = ['user_uid', 'thread_uid', 'match_date', 'values']

In [None]:
# Time-decay weighting: w = T / (T + t_ref - t). An interaction whose
# match_date equals t_ref gets weight 1, one that is T older gets weight 0.5.
# Assumes match_date is expressed in seconds -- TODO confirm.
DECAY_SECONDS = 15552000      # 180 * 24 * 3600, i.e. 180 days in seconds
REFERENCE_TS = 1570136400     # presumably the Unix timestamp closing the training window -- confirm
decay_denominator = DECAY_SECONDS + REFERENCE_TS - raw_data['match_date']
raw_data['values'] = DECAY_SECONDS / decay_denominator

In [None]:
# Distinct user ids in order of first appearance. A user's position in this
# array is the row index later used for that user in the user-item matrix.
input_list_1 = raw_data['user_uid'].unique()


def scale_user_uid(input_uid):
    """Return the position of ``input_uid`` inside ``input_list_1``.

    Kept for single-id lookups. Each call scans the whole array, so it must not
    be applied row by row to the full frame (see the vectorised line below).

    Raises:
        IndexError: if ``input_uid`` does not occur in ``input_list_1``.
    """
    return np.where(input_list_1 == input_uid)[0][0]


# pd.factorize numbers values in order of first appearance, which is exactly
# the position of each id in input_list_1, but it does so in a single pass
# instead of one full array scan per row.
# NOTE(review): a missing user id would be coded -1 here rather than raising.
raw_data['user_uid_renumber'] = pd.factorize(raw_data['user_uid'])[0]

In [None]:
# Work on an independent (deep) copy so later cells do not alter raw_data.
data_mini = raw_data.copy(deep=True)

In [None]:
# Number of distinct users and distinct threads in the training interactions.
n_users = len(data_mini['user_uid'].unique())
n_items = len(data_mini['thread_uid'].unique())

In [None]:
# Held-out (user, thread) pairs that the recommendations are scored against;
# the file has no header row. data_2 must still be the file name at this point.
test_match_path = os.path.join(data_path, data_2)
raw_data_2 = pd.read_csv(test_match_path, header=None)

In [None]:
# Name the two columns of the held-out pairs.
raw_data_2.columns = ['user_uid_test', 'thread_uid_test']
# NOTE: from here on data_2 is a DataFrame, no longer the CSV file name.
data_2 = raw_data_2.copy()

In [None]:
# Threads that are allowed to appear in a recommendation; the file has no
# header row. data_3 must still be the file name at this point.
target_thread_path = os.path.join(data_path, data_3)
raw_data_3 = pd.read_csv(target_thread_path, header=None)

In [None]:
# The target-thread file has a single column of thread ids.
raw_data_3.columns = ['thread_uid_test']
# NOTE: from here on data_3 is a DataFrame, no longer the CSV file name.
data_3 = raw_data_3.copy()
# Distinct target thread ids as a plain Python list; used later to restrict
# which neighbour threads may be recommended.
list_test_threads = data_3['thread_uid_test'].unique().tolist()

In [None]:
# Distinct user ids of the held-out set and of the training set (NumPy arrays,
# each in order of first appearance).
list_test_user = data_2['user_uid_test'].unique()
list_match_user = data_mini['user_uid'].unique()

In [None]:
# Held-out users with no training interactions: no similarity row can be
# computed for them, so later cells exclude them from the evaluation.
list_drop = list(set(list_test_user).difference(list_match_user))

In [None]:
# Held-out users that also occur in the training data, in the order they
# appear in list_test_user. A set is used for the membership test so the
# comprehension stays linear instead of rescanning the list_drop list for
# every user.
users_to_drop = set(list_drop)
list_test = [x for x in list_test_user if x not in users_to_drop]

In [None]:
# Sort the evaluated user ids in place. Later cells attach per-user lists
# positionally to a frame indexed by user_uid_test, so this order presumably
# has to match that frame's index order -- verify.
list_test.sort()

In [None]:
# Training user ids as a Python list; position i holds the id of the user
# that was renumbered to i (both follow order of first appearance).
list_user = list_match_user.tolist()

In [None]:
# Row index (position in list_user) of every evaluated user.
# A dict built once replaces list.index(), which rescanned list_user for every
# user. setdefault keeps the first position of an id, matching list.index().
position_of_user = {}
for position, uid in enumerate(list_user):
    position_of_user.setdefault(uid, position)

# Every id in list_test was taken from the training users, so the lookup
# cannot miss; a miss would raise KeyError.
list_of_idx = [position_of_user[user] for user in tqdm(list_test)]

In [None]:
# User-item matrix in CSR form: rows are the renumbered users, columns are the
# raw thread ids (used directly as column indices, so they must be
# non-negative integers), entries are the time-decayed weights.
# NOTE(review): repeated (user, thread) pairs are presumably summed by the
# constructor -- confirm this is the intended aggregation.
interaction_weights = data_mini['values'].astype(float)
user_rows = data_mini['user_uid_renumber']
thread_cols = data_mini['thread_uid']
sparse_user_item = sparse.csr_matrix((interaction_weights, (user_rows, thread_cols)))

In [None]:
# For each evaluated user, the cosine similarity between that user's row and
# every row of the user-item matrix, flattened to a 1-D array of length
# n_rows. sim_cos[k] belongs to the user at list_of_idx[k].
n_rows = sparse_user_item.shape[0]
sim_cos = [
    cosine_similarity(sparse_user_item, sparse_user_item[i:i+1]).reshape(n_rows,)
    for i in tqdm(list_of_idx)
]

In [None]:
# Optional filtering step: keep neighbours by similarity quantile (or by similarity value).
# If this cell is uncommented, replace the "25" in the cell below with rec_len[user].
#rec_len = []
#for scores in range(len(sim_cos)):
    #quantile = np.quantile(sim_cos[scores], 0.9995)
    #condition = lambda x: x >= quantile                                 
    #filtered_scores = list(filter(condition, sim_cos[scores]))         
    #len_filtered_scores = len(filtered_scores)                         
    #rec_len.append(len_filtered_scores)

In [None]:
# For every evaluated user, collect the ids of the training users with the
# highest cosine similarity, leaving out the user itself.
similarity_users = []
for position, scores in enumerate(tqdm(sim_cos)):
    own_row = list_of_idx[position]
    # Row indices of the 25 largest scores, best first. Use rec_len[user]
    # instead of the fixed 25 when the optional quantile filter is enabled.
    top_rows = heapq.nlargest(25, range(len(scores)), key=scores.take)
    # Translate row indices back to user ids, skipping the user's own row.
    neighbours = [list_user[row] for row in top_rows if row != own_row]
    similarity_users.append(neighbours)

In [None]:
# Recommendation list per evaluated user: the distinct target threads that the
# user's neighbours interacted with in the training data.

# Set membership is O(1); the original scanned the list_test_threads list for
# every candidate thread.
target_threads = set(list_test_threads)

# Group the training threads by user once, instead of running a boolean mask
# over the whole frame for every neighbour. Row order inside each user's list
# is preserved.
threads_by_user = {
    uid: threads.tolist()
    for uid, threads in data_mini.groupby('user_uid', sort=False)['thread_uid']
}

similarity_threads = []
for sim in tqdm(similarity_users):
    list_sim = []
    for user in sim:
        list_sim.extend(
            thread for thread in threads_by_user.get(user, []) if thread in target_threads
        )
    # Several neighbours can share a thread, so deduplicate.
    similarity_threads.append(list(set(list_sim)))

In [None]:
# Number each user's held-out rows 0, 1, 2, ... in row order; this running
# counter becomes the column label when the frame is pivoted to one row per user.
held_out_by_user = data_2.groupby('user_uid_test')
data_2['columns'] = held_out_by_user.cumcount()

In [None]:
# Reshape to one row per held-out user, spreading that user's threads across
# the numbered columns. No values= is given; data_2 has only thread_uid_test
# left besides the index and column keys, so that is what fills the cells.
# Users with fewer threads than the widest row are presumably padded with NaN.
result = data_2.pivot(index='user_uid_test', columns='columns')

In [None]:
# Collapse each row into a list of its non-null values (the user's held-out
# threads). NOTE: re-running this cell without re-creating `result` would also
# pick up the previously added concat_col column.
result['concat_col']=result.apply(lambda row: row.dropna().tolist(), axis=1)

In [None]:
# Keep only the per-user list of held-out threads (still indexed by user_uid_test).
df_test_threads = result[['concat_col']]

In [None]:
# Remove the rows of held-out users that have no training interactions;
# list_drop holds their user ids, which are the index labels of this frame.
df_test_threads_f = df_test_threads.drop(index=list_drop)

In [None]:
# Attach the recommended threads. similarity_threads is a plain list ordered
# like list_test, so the assignment is positional.
# NOTE(review): this relies on the frame's rows being in the same (sorted)
# user order as list_test -- verify.
df_test_threads_f['sim_threads'] = similarity_threads

In [None]:
# Training interactions of the evaluated users only. An explicit copy is taken
# because a new column is assigned to this frame in the next cell; writing to
# a bare slice of raw_data triggers SettingWithCopyWarning and leaves it
# ambiguous which object is modified.
raw_data_seen = raw_data.loc[raw_data['user_uid'].isin(list_test)].copy()

In [None]:
# Number each user's training rows 0, 1, 2, ... in row order; this running
# counter becomes the column label when the frame is pivoted to one row per user.
seen_by_user = raw_data_seen.groupby('user_uid')
raw_data_seen['columns'] = seen_by_user.cumcount()

In [None]:
# Reshape to one row per user with that user's already-seen threads spread
# across the numbered columns. values='thread_uid' is required here:
# raw_data_seen also carries match_date, values and user_uid_renumber, and
# without an explicit values= the pivot would spread all of them too, so the
# "seen threads" lists built from these rows would be polluted with
# timestamps, weights and renumbered user ids.
result_2 = raw_data_seen.pivot(index='user_uid', columns='columns', values='thread_uid')

In [None]:
# Collapse each row into a list of its non-null values. The list contains
# whatever value columns result_2 holds. NOTE: re-running this cell without
# re-creating result_2 would also pick up the previously added seen_threads column.
result_2['seen_threads']=result_2.apply(lambda row: row.dropna().tolist(), axis=1)

In [None]:
# Attach each user's seen list. The right-hand side is a one-column DataFrame
# indexed by user_uid, so rows are presumably matched by user id rather than
# by position -- verify.
df_test_threads_f['seen_threads'] = result_2[['seen_threads']]

In [None]:
# Hits per user: held-out threads that were recommended and are not in the
# user's seen list.
wanted_lists = df_test_threads_f['concat_col']
predicted_lists = df_test_threads_f['sim_threads']
seen_lists = df_test_threads_f['seen_threads']
relev = [
    len(set(wanted) & (set(predicted) - set(seen)))
    for wanted, predicted, seen in zip(wanted_lists, predicted_lists, seen_lists)
]

In [None]:
# Per-user counts feeding the metrics: hits, number of held-out threads and
# number of recommended threads.
df_test_threads_f['relevant'] = relev
df_test_threads_f['len_concat'] = df_test_threads_f['concat_col'].apply(len)
df_test_threads_f['len_predict'] = df_test_threads_f['sim_threads'].apply(len)

In [None]:
# Per-user precision (hits / recommended) and recall (hits / held-out).
# NOTE(review): hits exclude already-seen threads but len_predict still counts
# them, and a user with no recommendations yields 0/0 -- confirm both are intended.
hits = df_test_threads_f['relevant']
df_test_threads_f['precision'] = hits.div(df_test_threads_f['len_predict'])
df_test_threads_f['recall'] = hits.div(df_test_threads_f['len_concat'])

In [None]:
# Mean of the per-user precision column; the bare name on the last line
# displays the result as the cell output.
Precision = df_test_threads_f[['precision']].mean()
Precision

In [None]:
# Mean of the per-user recall column; the bare name on the last line
# displays the result as the cell output.
Recall = df_test_threads_f[['recall']].mean()
Recall