In [201]:
import pandas as pd
import numpy as np
import math

In [202]:
articles_df = pd.read_csv('data/shared_articles.zip')
articles_df = articles_df[articles_df['eventType'] == 'CONTENT SHARED']
articles_df.head()

Unnamed: 0,timestamp,eventType,contentId,authorPersonId,authorSessionId,authorUserAgent,authorRegion,authorCountry,contentType,url,title,text,lang
1,1459193988,CONTENT SHARED,-4110354420726924665,4340306774493623681,8940341205206233829,,,,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
2,1459194146,CONTENT SHARED,-7292285110016212249,4340306774493623681,8940341205206233829,,,,HTML,http://cointelegraph.com/news/bitcoin-future-w...,Bitcoin Future: When GBPcoin of Branson Wins O...,The alarm clock wakes me at 8:00 with stream o...,en
3,1459194474,CONTENT SHARED,-6151852268067518688,3891637997717104548,-1457532940883382585,,,,HTML,https://cloudplatform.googleblog.com/2016/03/G...,Google Data Center 360° Tour,We're excited to share the Google Data Center ...,en
4,1459194497,CONTENT SHARED,2448026894306402386,4340306774493623681,8940341205206233829,,,,HTML,https://bitcoinmagazine.com/articles/ibm-wants...,"IBM Wants to ""Evolve the Internet"" With Blockc...",The Aite Group projects the blockchain market ...,en
5,1459194522,CONTENT SHARED,-2826566343807132236,4340306774493623681,8940341205206233829,,,,HTML,http://www.coindesk.com/ieee-blockchain-oxford...,IEEE to Talk Blockchain at Cloud Computing Oxf...,One of the largest and oldest organizations fo...,en


In [203]:
interactions_df = pd.read_csv('data/users_interactions.zip')

In [204]:
interactions_df.personId = interactions_df.personId.astype(str)
interactions_df.contentId = interactions_df.contentId.astype(str)
articles_df.contentId = articles_df.contentId.astype(str)
interactions_df.head()

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry
0,1465413032,VIEW,-3499919498720038879,-8845298781299428018,1264196770339959068,,,
1,1465412560,VIEW,8890720798209849691,-1032019229384696495,3621737643587579081,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US
2,1465416190,VIEW,310515487419366995,-1130272294246983140,2631864456530402479,,,
3,1465413895,FOLLOW,310515487419366995,344280948527967603,-3167637573980064150,,,
4,1465412290,VIEW,-7820640624231356730,-445337111692715325,5611481178424124714,,,


In [205]:
event_type_strength = {
   'VIEW': 1.0,
   'LIKE': 2.0, 
   'BOOKMARK': 2.5, 
   'FOLLOW': 3.0,
   'COMMENT CREATED': 4.0,  
}

In [206]:
interactions_df['eventStrength'] = interactions_df.eventType.apply(lambda x: event_type_strength[x])

In [207]:
users_interactions_count_df = (
    interactions_df
    .groupby(['personId', 'contentId'])
    .first()
    .reset_index()
    .groupby('personId').size())

users_with_enough_interactions_df = \
    users_interactions_count_df[users_interactions_count_df >= 5].reset_index()[['personId']]

In [208]:
interactions_from_selected_users_df = interactions_df.loc[np.in1d(interactions_df.personId,
            users_with_enough_interactions_df)]

In [209]:
def smooth_user_preference(x):
    return math.log(1+x, 2)
    
interactions_full_df = (
    interactions_from_selected_users_df
    .groupby(['personId', 'contentId']).eventStrength.sum()
    .apply(smooth_user_preference)
    .reset_index().set_index(['personId', 'contentId'])
)
interactions_full_df['last_timestamp'] = (
    interactions_from_selected_users_df
    .groupby(['personId', 'contentId'])['timestamp'].max()
)
        
interactions_full_df = interactions_full_df.reset_index()

In [210]:
from sklearn.model_selection import train_test_split

split_ts = 1475519545
interactions_train_df = interactions_full_df.loc[interactions_full_df.last_timestamp < split_ts].copy()
interactions_test_df = interactions_full_df.loc[interactions_full_df.last_timestamp >= split_ts].copy()
interactions_train_df.head()


Unnamed: 0,personId,contentId,eventStrength,last_timestamp
0,-1007001694607905623,-5065077552540450930,1.0,1470395911
2,-1007001694607905623,-793729620925729327,1.0,1472834892
6,-1032019229384696495,-1006791494035379303,1.0,1469129122
7,-1032019229384696495,-1039912738963181810,1.0,1459376415
8,-1032019229384696495,-1081723567492738167,2.0,1464054096


In [211]:
#Задание 6.1
pivot_ratings = pd.pivot_table(data=interactions_train_df, values='eventStrength', columns='contentId', index='personId', aggfunc='sum').fillna(0)
display(pivot_ratings.head())

print('Сумма коэффициента полезного взаимодействия запрашиваемого пользователя:', 
      round(pivot_ratings.loc['-1032019229384696495']['943818026930898372'], 2))

contentId,-1006791494035379303,-1021685224930603833,-1022885988494278200,-1024046541613287684,-1033806831489252007,-1038011342017850,-1039912738963181810,-1046621686880462790,-1051830303851697653,-1055630159212837930,...,9217155070834564627,921770761777842242,9220445660318725468,9222265156747237864,943818026930898372,957332268361319692,966067567430037498,972258375127367383,980458131533897249,98528655405030624
personId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-1007001694607905623,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-1032019229384696495,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,3.0,0.0,0.0,0.0,2.321928,0.0,0.0,0.0,0.0,0.0
-108842214936804958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
-1130272294246983140,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
-1160159014793528221,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Сумма коэффициента полезного взаимодействия запрашиваемого пользователя: 2.32


In [218]:
#Задание 6.2

interactions_train_array = np.array(pivot_ratings)
print('Среднее арифметическое всех значений', round(interactions_train_array.mean(), 3))

Среднее арифметическое всех значений 0.0166686207


In [213]:
similarity_array = np.zeros(shape=(len(interactions_train_array), len(interactions_train_array)))

for i in range(len(interactions_train_array)-1):
    for j in range(i+1, len(interactions_train_array)):
        
        mask_uc = ((interactions_train_array[i] != 0) & (interactions_train_array[j] != 0))
        ratings_u = interactions_train_array[i, mask_uc]
        ratings_c = interactions_train_array[j, mask_uc]
        
        similarity_array[i,j] = np.corrcoef(ratings_u, ratings_c)[0, 1]
        similarity_array[j,i] = similarity_array[i,j]
        
similarity_array

  avg = a.mean(axis, **keepdims_kw)
  ret = um.true_divide(
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /= stddev[:, None]
  c /= stddev[None, :]


array([[ 0.        ,         nan,         nan, ...,         nan,
                nan,         nan],
       [        nan,  0.        ,  0.41742881, ..., -0.82922103,
                nan,         nan],
       [        nan,  0.41742881,  0.        , ..., -1.        ,
                nan,         nan],
       ...,
       [        nan, -0.82922103, -1.        , ...,  0.        ,
                nan,         nan],
       [        nan,         nan,         nan, ...,         nan,
         0.        ,         nan],
       [        nan,         nan,         nan, ...,         nan,
                nan,  0.        ]])

In [214]:
#Задание 6.3

print(round(similarity_array[3, 40], 2))

-0.33


In [None]:
#Задание 6.4

interactions = (
    interactions_train_df
    .groupby('personId')['contentId'].agg(lambda x: list(x))
    .reset_index()
    .rename(columns={'contentId': 'true_train'})
    .set_index('personId')
)
 
interactions['true_test'] = (
    interactions_test_df
    .groupby('personId')['contentId'].agg(lambda x: list(x))
)

interactions['true_test'] = [[] if x is np.NaN else x for x in interactions['true_test']]

recommendations_user_based = []
for i in range(len(similarity_array)):
    similar_users = similarity_array[i] > 0
    if not any(similar_users):
        recommendations_user_based.append([])
    else:
        temp_recommend = np.argsort(interactions_train_array[similar_users].sum(axis=0))[::-1]
        temp_recommend = pivot_ratings.columns[temp_recommend]
        recommendation = np.array(temp_recommend)[~np.in1d(temp_recommend, interactions.iloc[i]["true_train"])][:10]
        recommendations_user_based.append(list(recommendation))
interactions['prediction_user_based'] = recommendations_user_based

print('Главная рекомендация для необходимого пользователя:', recommendations_user_based[35][0])

['-5148591903395022444',
 '2581138407738454418',
 '-8208801367848627943',
 '-820343972901090172',
 '-2447632164766022033',
 '3306277069425849869',
 '6062146090334604102',
 '2372438485070148864',
 '-9019233957195913605',
 '1356221992133852808']

In [221]:
#Функция для подсчёта метрики качества

def calc_precision(column):
    return ( interactions.apply(  lambda row:len(set(row['true_test']).intersection(
                set(row[column]))) /min(len(row['true_test']) + 0.001, 10.0), axis=1)).mean()

In [223]:
#Задание 6.5

print(round(calc_precision('prediction_user_based'), 3))

0.005


In [227]:
#Задание 6.6

from scipy import linalg

U, sigma, V = linalg.svd(pivot_ratings)

print('Максимальное U в векторе:', round(U.max(), 2))

Максимальное U в векторе: 0.71


In [236]:
#Задание 6.7

k = 100
U = U[:, 0:k]
s = np.diag(sigma[0:k])
V = V[0:k, :]

print('Сумма всех значений в новой сингулярной матрице:', round(s.sum(), 2))

Сумма всех значений в новой сингулярной матрице: 2096.43


In [None]:
#Создаем дата-фрейм с новым массивом
new_ratings = pd.DataFrame(data=(U@s@V), columns=pivot_ratings.columns, index=pivot_ratings.index)
new_ratings

contentId,-1006791494035379303,-1021685224930603833,-1022885988494278200,-1024046541613287684,-1033806831489252007,-1038011342017850,-1039912738963181810,-1046621686880462790,-1051830303851697653,-1055630159212837930,...,9217155070834564627,921770761777842242,9220445660318725468,9222265156747237864,943818026930898372,957332268361319692,966067567430037498,972258375127367383,980458131533897249,98528655405030624
personId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-1007001694607905623,-0.003787,0.000872,-0.001285,0.002931,0.003889,0.013863,-0.001591,0.032587,-0.001272,0.000256,...,-0.003759,-0.000227,-0.008608,-0.000033,0.010733,0.009452,0.003438,-0.000179,0.009597,-0.002503
-1032019229384696495,0.992435,0.015898,0.030610,0.001633,0.012849,0.033984,1.013460,0.002882,0.010873,-0.004411,...,2.974838,0.007639,-0.020085,-0.004258,2.353326,-0.031199,0.033007,0.002056,-0.003534,0.009510
-108842214936804958,-0.015866,-0.029223,-0.032987,-0.012270,0.045642,0.015761,0.177709,0.014010,0.021468,-0.029695,...,0.134796,0.029271,-0.050596,-0.000277,-0.178115,-0.080439,1.891358,0.016009,0.003305,-0.017166
-1130272294246983140,0.166369,0.106842,0.223735,0.055399,-0.009947,-0.088267,0.004609,-0.070600,-0.039416,0.062143,...,0.042020,-0.058595,0.214094,-0.005800,1.156069,-0.176958,-0.020508,0.041146,-0.055612,0.017217
-1160159014793528221,0.011422,-0.002571,0.004961,-0.005116,-0.004820,-0.051882,-0.016217,0.000016,0.019629,-0.003028,...,-0.004893,0.008450,-0.024916,0.002783,-0.007486,0.035766,0.003191,0.000242,-0.034082,0.003778
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
953707509720613429,-0.008445,0.008741,-0.034499,0.012429,-0.004019,-0.068137,-0.026000,-0.008958,-0.006899,0.010585,...,-0.008839,-0.023717,-0.067747,0.000863,0.157679,-0.019117,-0.008760,-0.001366,0.075268,0.004712
983095443598229476,-0.033461,-0.017441,0.003829,-0.024063,-0.079712,-0.018086,-0.057891,-0.020028,-0.005829,0.058693,...,0.020201,-0.012622,-0.050680,0.035532,-0.107726,0.088145,0.043583,0.010233,-0.078222,0.011099
989049974880576288,-0.012545,-0.017248,0.010352,-0.005367,-0.035086,0.094543,-0.045640,0.004211,-0.007016,0.050335,...,-0.000913,-0.053048,0.207418,0.032478,-0.205465,0.077313,0.040472,0.013610,0.181102,0.004919
997469202936578234,-0.028194,-0.004740,0.051152,0.001024,0.023937,-0.103786,-0.003911,-0.002672,-0.006405,-0.014027,...,0.005487,0.045981,0.016940,0.009955,-0.035695,0.039257,-0.038895,-0.008983,0.099922,0.008412


In [241]:
top_k = 10
predictions_SVD = []

for personId in interactions.index:
    prediction = (
        new_ratings.loc[personId].sort_values(ascending=False).index.values
    )

    predictions_SVD.append(
        list(
            prediction[
                ~np.in1d(prediction, interactions.loc[personId, "true_train"])
            ]
        )[:top_k]
    )

interactions["prediction_svd"] = predictions_SVD

print('Оценка предсказания:', round(calc_precision("prediction_svd"), 3))

Оценка предсказания: 0.012


In [None]:
# ratings = pd.pivot_table(
#     interactions_train_df,
#     values="eventStrength",
#     index="personId",
#     columns="contentId",
# ).fillna(0)
# print(round(ratings.loc["-1032019229384696495", "943818026930898372"], 2))

# ratings_m = ratings.values
# print(ratings_m.mean())

# similarity_users = np.zeros((len(ratings_m), len(ratings_m)))
# for i in (range(len(ratings_m)-1)):
#     for j in range(i+1, len(ratings_m)):
     
#         mask_uv = (ratings_m[i] != 0) & (ratings_m[j] != 0)
#         ratings_v = ratings_m[i, mask_uv]
#         ratings_u = ratings_m[j, mask_uv]

#         similarity_users[i,j] = np.corrcoef(ratings_v, ratings_u)[0, 1]
#         similarity_users[j,i] = similarity_users[i,j]

# print(similarity_users[3,40])

# interactions = (
#     interactions_train_df
#     .groupby('personId')['contentId'].agg(lambda x: list(x))
#     .reset_index()
#     .rename(columns={'contentId': 'true_train'})
#     .set_index('personId')
# )
 
# interactions['true_test'] = (
#     interactions_test_df
#     .groupby('personId')['contentId'].agg(lambda x: list(x))
# )

# interactions['true_test'] = [ [] if x is np.NaN else x for x in interactions['true_test'] ]

# prediction_user_based = []
# for i in range(len(similarity_users)):
#     users_sim = similarity_users[i] > 0
#     if not any(users_sim):
#         prediction_user_based.append([])
#     else:
#         tmp_recommend = np.argsort(ratings_m[users_sim].sum(axis=0))[::-1]
#         tmp_recommend = ratings.columns[tmp_recommend]
#         recommend = np.array(tmp_recommend)[~np.in1d(tmp_recommend, interactions.iloc[i]["true_train"])][:10]
#         prediction_user_based.append(list(recommend))
# interactions['prediction_user_based'] = prediction_user_based
# print(prediction_user_based[35][0])

# def calc_precision(column):
#     return ( interactions.apply(  lambda row:len(set(row['true_test']).intersection(
#                 set(row[column]))) /min(len(row['true_test']) + 0.001, 10.0), axis=1)).mean()

2.32
0.016668620737604063


  avg = a.mean(axis, **keepdims_kw)
  ret = um.true_divide(
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /= stddev[:, None]
  c /= stddev[None, :]


-0.3333333333333333


['-5148591903395022444',
 '2581138407738454418',
 '-8208801367848627943',
 '-820343972901090172',
 '-2447632164766022033',
 '3306277069425849869',
 '6062146090334604102',
 '2372438485070148864',
 '-9019233957195913605',
 '1356221992133852808']