In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
from lightfm import LightFM
import multiprocessing as mp

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from torch.nn import functional as F
import random
from pathlib import Path
import os

# Datasets

In [2]:
datasets_path = '/home/worker/persistent/code/KERN/xgb_data/recsys-seminar/SASRec/data/'
datasets_path = Path(datasets_path)

!ls '/home/worker/persistent/code/KERN/xgb_data/recsys-seminar/SASRec/data/'

Beauty.txt	   Steam.txt  beer.txt	     ml-1m.txt
DataProcessing.py  Video.txt  goodreads.txt


In [3]:
def evaluate(model, dataset, args):
    [train, valid, test, usernum, itemnum] = copy.deepcopy(dataset)

    NDCG = 0.0
    HT = 0.0
    valid_user = 0.0

    if usernum>10000:
        users = random.sample(range(1, usernum + 1), 10000)
    else:
        users = range(1, usernum + 1)
    for u in users:

        if len(train[u]) < 1 or len(test[u]) < 1: continue

        seq = np.zeros([args.maxlen], dtype=np.int32)
        idx = args.maxlen - 1
        seq[idx] = valid[u][0]
        idx -= 1
        for i in reversed(train[u]):
            seq[idx] = i
            idx -= 1
            if idx == -1: break
        rated = set(train[u])
        rated.add(0)
        item_idx = [test[u][0]]
        for _ in range(100):
            t = np.random.randint(1, itemnum + 1)
            while t in rated: t = np.random.randint(1, itemnum + 1)
            item_idx.append(t)

        predictions = -model.predict(*[np.array(l) for l in [[u], [seq], item_idx]])
        predictions = predictions[0] # - for 1st argsort DESC

        rank = predictions.argsort().argsort()[0].item()

        valid_user += 1

        if rank < 10:
            NDCG += 1 / np.log2(rank + 2)
            HT += 1
        if valid_user % 100 == 0:
            print('.', end="")
            sys.stdout.flush()

    return NDCG / valid_user, HT / valid_user


# evaluate on val set
def evaluate_valid(model, dataset, args):
    [train, valid, test, usernum, itemnum] = copy.deepcopy(dataset)

    NDCG = 0.0
    valid_user = 0.0
    HT = 0.0
    if usernum>10000:
        users = random.sample(range(1, usernum + 1), 10000)
    else:
        users = range(1, usernum + 1)
    for u in users:
        if len(train[u]) < 1 or len(valid[u]) < 1: continue

        seq = np.zeros([args.maxlen], dtype=np.int32)
        idx = args.maxlen - 1
        for i in reversed(train[u]):
            seq[idx] = i
            idx -= 1
            if idx == -1: break

        rated = set(train[u])
        rated.add(0)
        item_idx = [valid[u][0]]
        for _ in range(100):
            t = np.random.randint(1, itemnum + 1)
            while t in rated: t = np.random.randint(1, itemnum + 1)
            item_idx.append(t)

        predictions = -model.predict(*[np.array(l) for l in [[u], [seq], item_idx]])
        predictions = predictions[0]

        rank = predictions.argsort().argsort()[0].item()

        valid_user += 1

        if rank < 10:
            NDCG += 1 / np.log2(rank + 2)
            HT += 1
        if valid_user % 100 == 0:
            print('.', end="")
            sys.stdout.flush()

    return NDCG / valid_user, HT / valid_user

In [4]:
# ratings = pd.read_csv('ml-1m/ratings.dat', delimiter='::', header=None, 
#         names=['user_id', 'movie_id', 'rating', 'timestamp'], 
#         usecols=['user_id', 'movie_id', 'rating'], engine='python')

# movie_info = pd.read_csv('ml-1m/movies.dat', delimiter='::', header=None, 
#         names=['movie_id', 'name', 'category'], engine='python')
file = 'goodreads.txt'#'Beauty.txt'#   Steam.txt  Video.txt  ml-1m.txt
        
print('dataset', file)
users = []
movies = []
with open(datasets_path / file, 'r') as data:
    lines = data.readlines()

    for line in lines:
        user_movie = line.split(' ')
        user = int(user_movie[0])
        movie = int(user_movie[1])

        users.append(user)
        movies.append(movie)

df_data = {
    'user_id': users,
    'movie_id': movies
}

interactions = pd.DataFrame.from_dict(df_data)
del df_data
del lines
ratings = interactions

dataset goodreads.txt


In [5]:
#ratings = ratings.loc[(ratings['rating'] >= 4)]
users = ratings["user_id"]
movies = ratings["movie_id"]
#user_item = sp.coo_matrix((np.ones_like(users), (users, movies)))
#user_item_csr = user_item.tocsr()

In [6]:
%%time

unique_users = np.unique(users)
unique_movies = set(np.unique(movies))

grouped_interactions = ratings.groupby('user_id')['movie_id'].apply(list)
del ratings
train_dataset = {}
test_dataset = {}
#negative_dataset = {}
#all_dataset = {}
print('grouped')
grouped_len = len(grouped_interactions)
counter = 0
prcnt = 0

for user_id, user_movies in grouped_interactions.iteritems():
    if len(user_movies) < 2:
        continue
        
    if counter >= grouped_len / 100:
        counter = 0
        prcnt += 1
        
        #print('percentage processed', prcnt)
    
    train_dataset[user_id] = user_movies[:-1]
    test_dataset[user_id] = user_movies[-1]
    #negative_dataset[user_id] =  list(unique_movies - set(user_movies))
    counter += 1

del grouped_interactions
total_users = list(test_dataset.keys())

grouped
CPU times: user 54.1 s, sys: 1.26 s, total: 55.4 s
Wall time: 55.5 s


In [7]:
len(train_dataset), len(total_users)

(416603, 416603)

In [8]:
max_users = max(users)
max_movies = max(movies)

max_users, max_movies

(542145, 124082)

In [9]:
def extract_csr_data(interactions):
    coo_users = []
    for user_id in interactions:
        coo_users.append(np.full(len(interactions[user_id]), user_id))
    
    coo_users = np.hstack(coo_users)
    coo_movies = []
    for user_id in interactions:
        coo_movies.append(np.array(interactions[user_id]))
    
    coo_movies = np.hstack(coo_movies)
    
    user_item = sp.coo_matrix((np.ones_like(coo_users), (coo_users, coo_movies)), shape=(max_users + 1, max_movies + 1))
    return user_item.tocsr()

train_data = extract_csr_data(train_dataset)

# Evaluation 

Аналогично статье из NCF будем сравнивать все модели по метрикам Hit rate(HR@K) и NDCG@K. K = 10
Помимо одного позитива, также добавим 99 случайных негативных фильмов для пользователя, тем самым будем оценивать эти метрики относительно ранжирования этих 1 + 99 фильмов.

In [10]:
def evaluate_metrics_for_user(args):
    k = 10
    model, user_id = args
    if user_id not in test_dataset:
        return None
    
    last_user_movie = test_dataset[user_id]
    seen_movies = train_dataset[user_id] + [last_user_movie]
    negative_samples = list(unique_movies - set(seen_movies))
    #print(len(negative_samples))
    np.random.shuffle(negative_samples)
    random_negative_movies = negative_samples[:999]
    #print(last_user_movie)
    input_movies = np.array([last_user_movie] + list(random_negative_movies))
    input_user = np.full(len(input_movies), user_id)
    #print('input_movies', input_movies)
    pred = model.predict(input_user, input_movies)
    #print(pred)
    top_movies = input_movies[np.argsort(pred)[-k:]]
    #print('top', top_movies)
    hit_rate = 1 if last_user_movie in top_movies else 0

    ndcg = 0 # TODO? 1 / np.log2(rank + 2)
    for position, movie in enumerate(top_movies):
        if movie == last_user_movie:
            ndcg = 1 / np.log2(position + 2)
            break
    return hit_rate, ndcg

def evaluate_model(model):
    with mp.Pool(mp.cpu_count()) as pool:
        users_len = len(total_users)
        metrics = pool.map(evaluate_metrics_for_user, zip([model] * users_len, total_users))
        hrs = [metric[0] for metric in metrics if metric is not None]
        ndcgs = [metric[1] for metric in metrics if metric is not None]

    print('Mean HR', np.mean(hrs))
    print('Mean NDCG', np.mean(ndcgs))
    
#evaluate_metrics_for_user((baseline_warp, 15))

In [11]:
baseline_warp = LightFM(
    no_components=64, 
    learning_rate=0.01,
    loss='warp',
    max_sampled=200
)

baseline_warp.fit(train_data, epochs=40, verbose=True, num_threads=mp.cpu_count())

Epoch: 100%|██████████| 40/40 [21:33<00:00, 32.33s/it]


<lightfm.lightfm.LightFM at 0x7f7d7c9fd410>

In [12]:
evaluate_model(baseline_warp)

Mean HR 0.7783933385021231
Mean NDCG 0.2474113564557018


In [13]:
baseline_bpr = LightFM(
    no_components=64, 
    learning_rate=0.01,
    loss='bpr',
    max_sampled=200
)

baseline_bpr.fit(train_data, epochs=40, verbose=True, num_threads=mp.cpu_count())

Epoch: 100%|██████████| 40/40 [04:37<00:00,  6.93s/it]


<lightfm.lightfm.LightFM at 0x7f7e17a30fd0>

In [14]:
evaluate_model(baseline_bpr)

Mean HR 0.6467284201025917
Mean NDCG 0.20998324031703727


In [7]:
class Recommender:
    
    def __init__(self, model, user_emb, item_emb, bias_u=None, bias_i=None):
        self.model = model
        self.user_emb = user_emb
        self.user_bias = bias_u
        self.item_emb = item_emb
        self.item_bias = bias_i
        
    def predict(self, users, movies):
        return self.model.predict(users, movies)
    
    def similars(self, toy_movie_id=1, top=10):
        input_vector = self.item_emb[toy_movie_id]

        data = []
        for item_idx, column in enumerate(self.item_emb):
            dst = np.linalg.norm(column - input_vector)
            data.append((item_idx, dst))

        sorted_by_dst = list(sorted(data, key=lambda val: val[1]))

        similars = []
        for item in sorted_by_dst:
            search = movie_info[movie_info["movie_id"] == item[0]]
            movie_name = search["name"].to_string()
            if len(search) > 0:
                similars.append((item[0], movie_name))

        return similars[:top]

    def recommend(self, user_id=4, top=10):
        new_movie_ids = negative_dataset[user_id]

        data = []
        for movie_id in new_movie_ids:
            bias_w = self.user_bias[user_id] if self.user_bias is not None else 0
            bias_h = self.item_bias[movie_id] if self.item_bias is not None else 0

            dot = np.dot(self.user_emb[user_id], self.item_emb[movie_id])
            data.append((movie_id, dot + bias_w + bias_h))

        data = list(sorted(data, key=lambda val: val[1], reverse=True))
        recommendations = [movie_info[movie_info["movie_id"] == x[0]]["name"].to_string() for x in data]
        return recommendations[:top]

# Baseline MF model: LightFM warp

In [16]:
baseline_recommender = Recommender(
    baseline, 
    baseline.user_embeddings, 
    baseline.item_embeddings,
    baseline.user_biases,
    baseline.item_biases
)

In [18]:
baseline_recommender.similars()

[(1, '0    Toy Story (1995)'),
 (588, '584    Aladdin (1992)'),
 (3114, '3045    Toy Story 2 (1999)'),
 (2355, "2286    Bug's Life, A (1998)"),
 (1197, '1179    Princess Bride, The (1987)'),
 (1265, '1245    Groundhog Day (1993)'),
 (364, '360    Lion King, The (1994)'),
 (595, '591    Beauty and the Beast (1991)'),
 (1073, '1058    Willy Wonka and the Chocolate Factory (1971)'),
 (2321, '2252    Pleasantville (1998)')]

In [19]:
baseline_recommender.recommend()

['1178    Star Wars: Episode V - The Empire Strikes Back...',
 '847    Godfather, The (1972)',
 '585    Terminator 2: Judgment Day (1991)',
 '1192    Star Wars: Episode VI - Return of the Jedi (1983)',
 '1182    Aliens (1986)',
 '1203    Godfather: Part II, The (1974)',
 '108    Braveheart (1995)',
 '2502    Matrix, The (1999)',
 '453    Fugitive, The (1993)',
 '537    Blade Runner (1982)']