In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sparse
import heapq
import os
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

In [2]:
data_path = 'data/'                        
data_1 = 'data_learn_match.csv'
data_2 = 'data_test_match.csv'
data_3 = 'data_target_thread.csv'
raw_data = pd.read_csv(os.path.join(data_path, data_1), header=None)
raw_data.drop(raw_data.columns[2:3], axis=1, inplace=True)
raw_data['values'] = 1

In [3]:
raw_data.columns = ['user_uid', 'thread_uid', 'match_date', 'values']

In [4]:
input_list_1 = raw_data['user_uid'].unique()

def scale_user_uid(input_uid):
    return np.where(input_list_1 == input_uid)[0][0]

raw_data['user_uid_renumber'] = raw_data['user_uid'].apply(scale_user_uid)

In [5]:
input_list_2 = raw_data['thread_uid'].unique()

def scale_thread_uid(input_uid):
    return np.where(input_list_2 == input_uid)[0][0]

raw_data['thread_uid_renumber'] = raw_data['thread_uid'].apply(scale_thread_uid)

In [6]:
raw_data['values'] = 15552000/(15552000 + 1570136400 - raw_data['match_date'])

In [7]:
data_mini = raw_data.copy()

In [8]:
n_users = data_mini['user_uid'].unique().shape[0]
n_items = data_mini['thread_uid'].unique().shape[0]

In [11]:
raw_data_2 = pd.read_csv(os.path.join(data_path, data_2), header=None)

In [12]:
raw_data_2.columns = ['user_uid_test', 'thread_uid_test']
data_2 = raw_data_2.copy()

In [13]:
raw_data_3 = pd.read_csv(os.path.join(data_path, data_3), header=None)

In [14]:
raw_data_3.columns = ['thread_uid_test']
data_3 = raw_data_3.copy()
list_test_threads = data_3['thread_uid_test'].unique().tolist()

In [15]:
list_test_user = data_2['user_uid_test'].unique()
list_match_thread = data_mini['thread_uid'].unique()
list_match_user = data_mini['user_uid'].unique()

In [16]:
list_drop = list(set(list_test_user).difference(list_match_user))

In [17]:
list_test = list_test_threads

In [18]:
list_test_user_filtered = list(set(list_test_user) - set(list_drop))

In [19]:
list_test.sort()

In [21]:
list_thread = list_match_thread.tolist()

In [22]:
list_of_idx = []
for thread in tqdm(list_test):
    idx = list_thread.index(thread)
    list_of_idx.append(idx)

100%|██████████████████████████████████████| 397/397 [00:00<00:00, 1327.76it/s]


In [23]:
sparse_item_user = sparse.csr_matrix((data_mini['values'].astype(float), (data_mini['thread_uid_renumber'], data_mini['user_uid_renumber'])))

In [28]:
sim_cos = []
#i= 0
for idx in tqdm(list_of_idx):
    #similarities = cosine_similarity(sparse.vstack([sparse_user_item[:i, :], sparse_user_item[i+1:, :]]), sparse_user_item[i:i+1])
    similarities = cosine_similarity(sparse_item_user, sparse_item_user[idx:idx+1])
    sim_cos.append(similarities)
    #sim_cos[i:] = similarities
    #i+=1

100%|████████████████████████████████████████| 397/397 [00:52<00:00,  7.51it/s]


In [30]:
array = np.array(sim_cos)

In [31]:
A = array[:,:,0].T

In [39]:
pred = sparse_item_user.T.dot(A) / np.array([np.abs(A.T).sum(axis=1)])

In [43]:
user_test = []                                       #для тестовых юзеров находим их перенумерованный индекс
for user in list_test_user_filtered:
    user_renumber = data_mini['user_uid_renumber'].loc[data_mini['user_uid'] == user].tolist()
    user_test.extend(list(set(user_renumber)))
user_test.sort()

In [44]:
predict_test = []
for i in user_test:
    predict = pred[i].tolist()
    predict_test.append(predict)
predict_test_users = np.array(predict_test)

In [172]:
rec_len = []
for scores in range(len(predict_test_users)):
    #quantile = np.quantile(predict_test_users[scores], 0.95)
    condition = lambda x: x >= 0.005                                       #модуль фильтрации по квантилю 
    filtered_scores = list(filter(condition, predict_test_users[scores]))  #или по значению сходства
    len_filtered_scores = len(filtered_scores)
    rec_len.append(len_filtered_scores)

In [173]:
similarity_threads = []
for user in tqdm(range(len(predict_test_users))):
    a = heapq.nlargest(rec_len[user], range(len(predict_test_users[user])), predict_test_users[user].take)  #rec_len[user] вместо числа пользователей, 
    sim = []                                                               #если фильтрация есть
    for idx in a:
        sim_threads = list_test[idx]
        sim.append(sim_threads)
    similarity_threads.append(sim)

100%|████████████████████████████████████| 2713/2713 [00:01<00:00, 1710.59it/s]


[[103578,
  103259,
  103542,
  103506,
  103280,
  103269,
  103311,
  103314,
  103523,
  103233,
  103494,
  103625,
  103509,
  103461,
  103505,
  103579,
  103474,
  103608,
  103609],
 [103578,
  103506,
  103461,
  103314,
  103311,
  103259,
  103542,
  103523,
  103448,
  103233,
  103280,
  103625,
  103269,
  103505,
  103474,
  103494,
  103608,
  103609,
  103509,
  103520,
  103303,
  103472,
  103579,
  103495,
  103226,
  103564,
  103254,
  103265,
  103297,
  103591,
  103333,
  103544,
  103322,
  103329,
  103554,
  103330,
  103217,
  103291,
  103559,
  103581,
  103343,
  103458,
  103563,
  103510,
  103238,
  103611,
  103601,
  103317,
  103296,
  103455,
  103466,
  103335,
  103597,
  103310,
  103527,
  103537,
  103485,
  103593,
  103504,
  103573,
  103239,
  103484,
  103540,
  103486,
  103234,
  103415,
  103230,
  103325,
  103219,
  103646,
  103508,
  103599,
  103279,
  103639,
  103370,
  103222,
  103514,
  103490,
  103302,
  103633,
  103453,

In [178]:
data_2['columns'] = data_2.groupby('user_uid_test')['thread_uid_test'].cumcount()

In [179]:
result = data_2.pivot(index='user_uid_test', columns='columns')

In [180]:
result['concat_col']=result.apply(lambda row: row.dropna().tolist(), axis=1)

In [181]:
df_test_threads = result[['concat_col']]

In [183]:
df_test_threads_f = df_test_threads.drop(list_drop, axis = 0)

Unnamed: 0_level_0,concat_col
columns,Unnamed: 1_level_1
user_uid_test,Unnamed: 1_level_2
47,[103578.0]
51,[103474.0]
52,"[103233.0, 103506.0]"
115,[103442.0]
156,"[103613.0, 103625.0]"
...,...
57232,[103634.0]
57242,[103594.0]
57261,"[103224.0, 103285.0, 103438.0, 103541.0]"
57274,[103345.0]


In [184]:
df_test_threads_f['sim_threads'] = similarity_threads

In [186]:
raw_data_seen = raw_data.loc[raw_data['user_uid'].isin(list_test_user_filtered)]

Unnamed: 0,user_uid,thread_uid,match_date,values,user_uid_renumber,thread_uid_renumber
9361,47,182,1463076644,0.126839,46,7710
9362,47,551,1558122978,0.564185,46,13
9363,47,1037,1420920939,0.094388,46,3222
9364,47,4469,1556642889,0.535436,46,234
9365,47,5985,1565599253,0.774149,46,7711
...,...,...,...,...,...,...
9801120,57287,101109,1566277450,0.801197,57133,102916
9801121,57287,101791,1566810198,0.823807,57133,92283
9801122,57287,102205,1568115882,0.885018,57133,102917
9801123,57287,102589,1568616892,0.910992,57133,63849


In [187]:
raw_data_seen['columns'] = raw_data_seen.groupby('user_uid')['thread_uid'].cumcount()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [188]:
result_2 = raw_data_seen.pivot(index='user_uid', columns='columns')

In [189]:
result_2['seen_threads']=result_2.apply(lambda row: row.dropna().tolist(), axis=1)

In [190]:
df_test_threads_f['seen_threads'] = result_2['seen_threads']

In [191]:
relev = []
for x in range(df_test_threads_f.shape[0]):
    A = list(set(df_test_threads_f['concat_col'].iloc[x]) & (set(df_test_threads_f['sim_threads'].iloc[x]) - set(df_test_threads_f['seen_threads'].iloc[x])))
    relev.append(len(A))

In [192]:
df_test_threads_f['relevant'] = relev
df_test_threads_f['len_concat'] = df_test_threads_f['concat_col'].apply(lambda x: len(x))
df_test_threads_f['len_predict'] = df_test_threads_f['sim_threads'].apply(lambda x: len(x))

In [193]:
df_test_threads_f['precision'] = df_test_threads_f['relevant']/df_test_threads_f['len_predict']
df_test_threads_f['recall'] = df_test_threads_f['relevant']/df_test_threads_f['len_concat']

In [195]:
Precision = df_test_threads_f['precision'].mean()
Precision

0.0371608147678524

In [196]:
Recall = df_test_threads_f['recall'].mean()
Recall

0.5120165107448555