In [1]:
import pandas as pd 
import numpy as np 
import implicit
from sklearn.model_selection import train_test_split
from implicit.nearest_neighbours import CosineRecommender
import annoy

In [4]:
from implicit.als import AlternatingLeastSquares
from implicit.approximate_als import AnnoyAlternatingLeastSquares

In [5]:
ratings = pd.read_csv('data/ratings.csv')
movies = pd.read_csv('data/movies.csv')

In [6]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [7]:
ratings['ratingMoreThanFour'] = np.where(ratings['rating'] >= 4, 1, 0)

In [8]:
from scipy.sparse import coo_matrix

user_item_matrix = coo_matrix((
    (ratings["rating"]>=4).astype(np.float32), # по колонке оценок пораждается булевская колонка "нравится"
    (ratings["userId"], ratings["movieId"])    # назначение матрицы строк и столбцов
))

In [9]:
total_len = user_item_matrix.data.size
train_len = int(total_len * 0.8)
all_indices = np.arange(total_len)
np.random.seed(42)
train_indices = np.random.choice(all_indices, train_len, replace=False)
train_mask = np.in1d(all_indices, train_indices)

In [10]:
def get_masked(arr, mask):
    return coo_matrix(
        (
            [np.float32(item) for item in arr.data[mask]],
            (arr.row[mask], arr.col[mask])
        ), arr.shape
    )

In [11]:
train_csr = get_masked(user_item_matrix, train_mask).tocsr()
train = train_csr.T
test_coo = get_masked(user_item_matrix, ~train_mask)
test_csr = test_coo.tocsr()

In [60]:
from implicit.als import AlternatingLeastSquares
import os

os.environ['OPENBLAS_NUMTHREADS'] = '1'
os.environ ['MKL_NUM_THREADS'] = '1'

model = AlternatingLeastSquares(random_state=42)
modelAnnoy = AnnoyAlternatingLeastSquares(random_state=42, n_trees = 130)

In [61]:
%%time
model.fit(train)


HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))


CPU times: user 7.92 s, sys: 67.9 ms, total: 7.99 s
Wall time: 8.06 s


In [62]:
%%time
modelAnnoy.fit(train)

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))


CPU times: user 11.1 s, sys: 76.3 ms, total: 11.1 s
Wall time: 8.91 s


In [63]:
users = list(set(test_coo.row))
small_user = users[:100000]

In [64]:
def get_recs(users, model):
    return{
        user:model.recommend(userid=user, user_items=train_csr, N=N_test)
        for user in users
    }

In [65]:
%%time

N_test = 50

# посчитаем по N_test рекомендаций для каждого пользователя из тестовой выборки
recs = get_recs(users, model)

CPU times: user 3.66 s, sys: 45.1 ms, total: 3.71 s
Wall time: 3.72 s


In [66]:
def hitrate (k, recs, users):
    hits = 0
    for user in users:
        if recs[user]:
            rec_items, _ = zip(*recs[user])
            hits += len(set(rec_items[:k]).intersection(set(test_csr[user].indices))) > 0
    return hits / len(users) 

In [67]:
%%time
print('hitrate=50  ', hitrate(50, recs, users))

hitrate=50   0.9540858610972982
CPU times: user 766 ms, sys: 10.7 ms, total: 776 ms
Wall time: 780 ms


In [68]:
%%time
recsAnnoy = get_recs(users,modelAnnoy)

CPU times: user 11.6 s, sys: 108 ms, total: 11.7 s
Wall time: 11.7 s


In [69]:
%%time
print('hitrate=50  ', hitrate(50, recsAnnoy, users))

hitrate=50   0.9540858610972982
CPU times: user 818 ms, sys: 13.5 ms, total: 831 ms
Wall time: 831 ms


In [70]:
from implicit.nearest_neighbours import CosineRecommender
os.environ['OPENBLAS_NUMTHREADS'] = '1'
os.environ ['MKL_NUM_THREADS'] = '1'
cos_model = CosineRecommender()

In [20]:
cos_model.fit(train)

  X.data = X.data / sqrt(bincount(X.row, X.data ** 2))[X.row]


HBox(children=(FloatProgress(value=0.0, max=3953.0), HTML(value='')))




In [21]:
recs_cos = get_recs(users,cos_model)

In [22]:
print('hitrate=50  ', hitrate(50, recs_cos, small_user))

hitrate=50   0.6363334990883475


In [186]:
from sklearn.preprocessing import minmax_scale
def normalize(alg, users):
    for user in users:
        if alg[user]:
            rec_items, rec_us = zip(*alg[user])
            rec_us = list(minmax_scale(list(rec_us)))
            for i in range(len(alg[user])):
                alg[user][i] = (rec_items[i], rec_us[i])

In [190]:
alg_recs = normalize(recs, users)
cos_recs = normalize(recs_cos, users)

In [191]:
users = alg_recs.keys()
new_recs = dict()

AttributeError: 'NoneType' object has no attribute 'keys'