In [None]:
import pandas as pd
import numpy as np
import scipy.sparse as sparse
import heapq
import os
from tqdm import tqdm
from sklearn.metrics import jaccard_score
from scipy.spatial import distance

In [None]:
# Location of the input CSVs and their file names.
# NOTE: `data_2` and `data_3` are rebound to DataFrames in later cells, so the
# read_csv cells that use them as file names cannot be re-run afterwards
# without re-running this cell first.
data_path = 'data/'
data_1 = 'data_learn_match.csv'
data_2 = 'data_test_match.csv'
data_3 = 'data_target_thread.csv'

# Load the learning set (the file has no header row), discard its third column
# and attach two constant columns. `values_pessimize` is only a placeholder
# here; a later cell overwrites it with a time-dependent weight.
learn_csv = os.path.join(data_path, data_1)
raw_data = pd.read_csv(learn_csv, header=None)
raw_data = raw_data.drop(columns=raw_data.columns[2:3])
raw_data['values'] = 1
raw_data['values_pessimize'] = 1

In [None]:
# Name the five remaining columns (three from the CSV plus the two added above).
learn_columns = ['user_uid', 'thread_uid', 'match_date', 'values', 'values_pessimize']
raw_data.columns = learn_columns

In [None]:
# Distinct user ids in order of first appearance; a user's position in this
# array is its dense 0-based index (used as a matrix coordinate later on).
input_list_1 = raw_data['user_uid'].unique()

# Built once so that each lookup is O(1). The previous implementation scanned
# the whole array with np.where for every row of the frame.
_user_uid_position = {uid: position for position, uid in enumerate(input_list_1)}


def scale_user_uid(input_uid):
    """Return the dense 0-based index of ``input_uid``.

    The index is the position of the id in ``input_list_1``.

    Raises:
        KeyError: if ``input_uid`` does not occur in ``input_list_1``
            (the array-scan version raised IndexError in that case).
    """
    return _user_uid_position[input_uid]


raw_data['user_uid_renumber'] = raw_data['user_uid'].apply(scale_user_uid)

In [None]:
# Distinct thread ids in order of first appearance; a thread's position in
# this array is its dense 0-based index (used as a matrix coordinate later on).
input_list_2 = raw_data['thread_uid'].unique()

# Built once so that each lookup is O(1). The previous implementation scanned
# the whole array with np.where for every row of the frame.
_thread_uid_position = {uid: position for position, uid in enumerate(input_list_2)}


def scale_thread_uid(input_uid):
    """Return the dense 0-based index of ``input_uid``.

    The index is the position of the id in ``input_list_2``.

    Raises:
        KeyError: if ``input_uid`` does not occur in ``input_list_2``
            (the array-scan version raised IndexError in that case).
    """
    return _thread_uid_position[input_uid]


raw_data['thread_uid_renumber'] = raw_data['thread_uid'].apply(scale_thread_uid)

In [None]:
# Recency weight ("pessimization"): 1.0 for an interaction dated exactly at the
# reference moment, 0.5 for one `decay_span` older, decaying hyperbolically
# beyond that. Dates later than reference_moment + decay_span would make the
# denominator zero or negative.
decay_span = 15552000          # = 180 * 24 * 3600, i.e. 180 days if match_date is in seconds
reference_moment = 1570136400  # presumably a Unix timestamp (early October 2019) — TODO confirm
weight_denominator = (decay_span + reference_moment) - raw_data['match_date']
raw_data['values_pessimize'] = decay_span / weight_denominator

In [None]:
# Independent copy of the prepared learning frame; the matrix-building cells
# below read their coordinates and weights from this copy.
data_mini = raw_data.copy()

In [None]:
# Number of distinct users and threads in the learning data.
n_users = len(data_mini['user_uid'].unique())
n_items = len(data_mini['thread_uid'].unique())

In [None]:
# Load the test interactions (no header row). `data_2` must still hold the
# file name at this point; a later cell rebinds it to a DataFrame.
raw_data_2 = pd.read_csv(os.path.join(data_path, data_2), header=None)

In [None]:
# Name the test columns, then keep a working copy.
# NOTE: this rebinds `data_2`, which held the test CSV file name until now.
test_columns = ['user_uid_test', 'thread_uid_test']
raw_data_2.columns = test_columns
data_2 = raw_data_2.copy()

In [None]:
# Load the list of target threads (no header row). `data_3` must still hold
# the file name at this point; a later cell rebinds it to a DataFrame.
raw_data_3 = pd.read_csv(os.path.join(data_path, data_3), header=None)

In [None]:
# Name the single column, keep a working copy and collect the distinct target
# thread ids as a plain Python list.
# NOTE: this rebinds `data_3`, which held the target CSV file name until now.
raw_data_3.columns = ['thread_uid_test']
data_3 = raw_data_3.copy()
unique_target_threads = data_3['thread_uid_test'].unique()
list_test_threads = unique_target_threads.tolist()

In [None]:
# Distinct ids (in order of first appearance) of the test users and of the
# threads and users present in the learning data.
list_test_user = data_2['user_uid_test'].unique()
list_match_thread = data_mini['thread_uid'].unique()
list_match_user = data_mini['user_uid'].unique()

In [None]:
# Test users that never occur in the learning data; they are excluded from
# the evaluation further down.
unknown_test_users = set(list_test_user).difference(list_match_user)
list_drop = list(unknown_test_users)

In [None]:
# Alias, not a copy: both names refer to the same list object, so an in-place
# change through one name is visible through the other.
list_test = list_test_threads

In [None]:
# Test users that do occur in the learning data (order is arbitrary because
# the list is produced from a set).
known_test_users = set(list_test_user) - set(list_drop)
list_test_user_filtered = list(known_test_users)

In [None]:
# Sort the target thread ids ascending, in place. The resulting order fixes
# the column order of the similarity and prediction matrices built below.
list_test.sort()

In [None]:
# Learning-data thread ids as a Python list; an id's position in this list is
# looked up in the next cell.
list_thread = list_match_thread.tolist()

In [None]:
# Position of every target thread inside list_thread.
# A dict built once gives O(1) lookups; calling list.index() per thread
# rescanned the whole list each time. list_thread comes from unique(), so each
# id has exactly one position and the result is the same.
thread_position = {thread: position for position, thread in enumerate(list_thread)}

missing_threads = [thread for thread in list_test if thread not in thread_position]
if missing_threads:
    # list.index() raised ValueError for an unknown thread; keep that exception
    # type but report what is missing.
    raise ValueError(
        '{} target thread(s) are absent from the learning data, e.g. {}'.format(
            len(missing_threads), missing_threads[:5]
        )
    )

list_of_idx = [thread_position[thread] for thread in tqdm(list_test)]

In [None]:
# Thread x user interaction matrix: rows are dense thread indices, columns are
# dense user indices, entries come from the constant `values` column. The
# constructor sums entries that share the same (row, column) coordinates.
item_rows = data_mini['thread_uid_renumber']
user_cols = data_mini['user_uid_renumber']
interaction_flags = data_mini['values'].astype(float)
sparse_item_user = sparse.csr_matrix((interaction_flags, (item_rows, user_cols)))

In [None]:
# Dice similarity between every target thread (rows of the result) and every
# learning thread (columns).
#
# The previous version densified two matrix rows for every (target, thread)
# pair inside a Python double loop and passed the resulting 2-D arrays to
# scipy.spatial.distance.jaccard, which is documented for 1-D input. The same
# numbers are obtained here from a few sparse matrix products.
#
# For two rows u and v the Jaccard distance on non-boolean data is
#     (#positions where u or v is non-zero and u != v) / (#positions where u or v is non-zero)
# and Dice follows as 2 * (1 - J) / (2 - J).
#
# `dice_sim` is now a 2-D ndarray of shape (len(list_of_idx), n_threads)
# instead of a list of lists; np.array(dice_sim) yields the same array.
target_rows = np.asarray(list_of_idx, dtype=np.intp)

item_user = sparse.csr_matrix(sparse_item_user, copy=True)
item_user.sum_duplicates()
item_user.eliminate_zeros()  # every stored entry is now a real non-zero


def _shared_user_counts(entry_mask):
    """Count, per (target, thread) pair, the users whose stored entry is selected in both rows.

    ``entry_mask`` is a boolean array parallel to ``item_user.data``.
    Returns a dense integer array of shape (len(target_rows), n_threads).
    """
    selected = sparse.csr_matrix(
        (entry_mask.astype(np.int64), item_user.indices, item_user.indptr),
        shape=item_user.shape,
    )
    return selected[target_rows].dot(selected.T).toarray()


row_sizes = np.diff(item_user.indptr)  # number of users per thread
in_both = _shared_user_counts(np.ones(item_user.data.shape, dtype=bool))
in_either = row_sizes[target_rows][:, np.newaxis] + row_sizes[np.newaxis, :] - in_both

# Positions where both rows are non-zero AND hold the same value. Values other
# than 1 appear when a (thread, user) pair occurs more than once in the data.
identical = np.zeros_like(in_both)
for stored_value in np.unique(item_user.data):
    identical += _shared_user_counts(item_user.data == stored_value)

mismatched = in_either - identical
with np.errstate(divide='ignore', invalid='ignore'):
    # Two empty rows are given distance 0; rows built from the learning data
    # always contain at least one user, so this branch should not trigger.
    score = np.where(in_either != 0, mismatched / in_either, 0.0)

dice_sim = 2 * (1 - score) / (2 - score)

In [None]:
# Similarity matrix transposed so that rows are learning threads and columns
# are target threads (dice_sim holds one row per target thread).
A = np.array(dice_sim).T

In [None]:
# Same thread x user layout as sparse_item_user, but the entries are the
# time-dependent `values_pessimize` weights (summed when a pair repeats).
pessim_rows = data_mini['thread_uid_renumber']
pessim_cols = data_mini['user_uid_renumber']
pessim_weights = data_mini['values_pessimize'].astype(float)
sparse_item_user_pessim = sparse.csr_matrix((pessim_weights, (pessim_rows, pessim_cols)))

In [None]:
# Score every (user, target thread) pair: the similarity-weighted sum of the
# user's interactions, divided by the target thread's total absolute similarity.
# The recency-weighted matrix is used here; to score without the recency
# penalty, put sparse_item_user in place of sparse_item_user_pessim.
weighted_sum = sparse_item_user_pessim.T.dot(A)
similarity_mass = np.abs(A.T).sum(axis=1)
pred = weighted_sum / similarity_mass[np.newaxis, :]

In [None]:
# Dense (renumbered) indices of the test users that occur in the learning data.
# One vectorised membership test replaces a full-frame boolean scan per user;
# the result is the same sorted list of distinct indices.
# NOTE: the list is ordered by dense index, not by user id.
test_user_rows = data_mini['user_uid'].isin(list_test_user_filtered)
user_test = sorted(data_mini.loc[test_user_rows, 'user_uid_renumber'].unique().tolist())

In [None]:
# Pull the prediction rows of the test users, in the order given by user_test.
predict_test = [pred[row].tolist() for row in user_test]
predict_test_users = np.array(predict_test)

In [None]:
# Filtering step: decide how many recommendations each test user receives by
# counting the target threads whose score is at or above a similarity cut-off.
# To cut by quantile instead, compute np.quantile(user_scores, 0.95) for each
# row and compare against that value rather than the fixed 0.004.
rec_len = [
    int(np.count_nonzero(user_scores >= 0.004))
    for user_scores in predict_test_users
]

In [None]:
# For every test user keep the rec_len[user] best-scoring target threads,
# best first, translated from column positions back to thread ids.
similarity_threads = []
for row_no in tqdm(range(len(predict_test_users))):
    user_scores = predict_test_users[row_no]
    best_columns = heapq.nlargest(
        rec_len[row_no],
        range(len(user_scores)),
        key=user_scores.take,
    )
    similarity_threads.append([list_test[column] for column in best_columns])

In [None]:
# Number each user's test rows 0, 1, 2, ... in order of appearance; the
# counter becomes the column key of the pivot in the next cell.
data_2['columns'] = data_2.groupby('user_uid_test')['thread_uid_test'].cumcount()

In [None]:
# Reshape to one row per test user with one column per row counter. Users
# with fewer threads presumably get NaN in the unused columns (the next cell
# drops them).
result = data_2.pivot(index='user_uid_test', columns='columns')

In [None]:
def _present_values(row):
    """Return the non-missing entries of ``row`` as a plain list."""
    return row.dropna().tolist()


# Collapse each user's row into a single list of the values actually present.
result['concat_col'] = result.apply(_present_values, axis=1)

In [None]:
# Keep only the list column; the double brackets make the result a
# one-column DataFrame rather than a Series.
df_test_threads = result[['concat_col']]

In [None]:
# Remove the test users that have no history in the learning data.
df_test_threads_f = df_test_threads.drop(index=list_drop)

In [None]:
# similarity_threads[k] belongs to the user whose dense index is user_test[k],
# whereas the rows of df_test_threads_f are keyed by user id. Assigning the
# bare list would pair recommendations with users purely by position, which is
# only right if both orders happen to coincide. Label every recommendation
# list with its user id (input_list_1 maps dense index -> user id) and let
# pandas align the assignment on the index.
sim_threads_by_user = pd.Series(
    similarity_threads,
    index=input_list_1[user_test],
    dtype=object,
)
df_test_threads_f['sim_threads'] = sim_threads_by_user

In [None]:
# Learning-data rows of the evaluated test users. Take an explicit copy: the
# next cell adds a column to this frame, and writing into a bare .loc slice
# triggers pandas' SettingWithCopyWarning.
raw_data_seen = raw_data.loc[raw_data['user_uid'].isin(list_test_user_filtered)].copy()

In [None]:
# Number each user's learning rows 0, 1, 2, ... in order of appearance; the
# counter becomes the column key of the pivot in the next cell.
raw_data_seen['columns'] = raw_data_seen.groupby('user_uid')['thread_uid'].cumcount()

In [None]:
# One row per user, one column per row counter, holding only the thread ids.
# Without `values=` the pivot spreads every remaining column (dates, weights,
# dense indices) across the row, and all of those numbers end up in the
# per-user "seen" list built from it, where they can collide with real thread
# ids and wrongly exclude recommendations.
result_2 = raw_data_seen.pivot(index='user_uid', columns='columns', values='thread_uid')

In [None]:
def _present_seen_values(row):
    """Return the non-missing entries of ``row`` as a plain list."""
    return row.dropna().tolist()


# Collapse each user's row into a single list of the values actually present.
result_2['seen_threads'] = result_2.apply(_present_seen_values, axis=1)

In [None]:
# Attach each user's "seen" list. The right-hand side is a Series, so pandas
# matches rows by index label (user id) rather than by position.
df_test_threads_f['seen_threads'] = result_2['seen_threads']

In [None]:
# Per user: number of recommended threads that appear in the user's test
# threads, after removing recommendations the user had already seen in the
# learning data.
# The loop used to store its intermediate list in `A`, overwriting the
# similarity matrix of the same name so that the scoring cell could not be
# re-run afterwards; a dedicated local name is used instead.
relev = []
for wanted, recommended, seen in zip(
    df_test_threads_f['concat_col'],
    df_test_threads_f['sim_threads'],
    df_test_threads_f['seen_threads'],
):
    unseen_hits = set(wanted) & (set(recommended) - set(seen))
    relev.append(len(unseen_hits))

In [None]:
# Store the hit counts plus the sizes of the test list and of the
# recommendation list for every user.
df_test_threads_f['relevant'] = relev
for source_col, length_col in (('concat_col', 'len_concat'), ('sim_threads', 'len_predict')):
    df_test_threads_f[length_col] = df_test_threads_f[source_col].map(len)

In [None]:
# Per-user precision (hits / recommendations made) and recall (hits / test
# threads). A user with an empty recommendation list has 0 / 0 here, which
# presumably evaluates to NaN — TODO confirm how such users should count.
hit_counts = df_test_threads_f['relevant']
df_test_threads_f['precision'] = hit_counts.div(df_test_threads_f['len_predict'])
df_test_threads_f['recall'] = hit_counts.div(df_test_threads_f['len_concat'])

In [None]:
# Mean per-user precision; the bare name on the last line displays the value.
# NOTE(review): Series.mean() skips NaN by default, so users without any
# recommendation would not lower this figure — confirm that is intended.
Precision = df_test_threads_f['precision'].mean()
Precision

In [None]:
# Mean per-user recall; the bare name on the last line displays the value.
Recall = df_test_threads_f['recall'].mean()
Recall