Копируем считывание из условия

In [31]:
import datetime
import implicit
from abc import abstractmethod
import pandas as pd
import numpy as np
import scipy.sparse as sp

In [32]:
from pandas import DataFrame
from scipy.special import expit
# Explicit ratings table: one row per (user_id, movie_id, rating).
# The timestamp column is named so the file parses, but it is not loaded.
ratings = pd.read_csv(
    'ml-1m/ratings.dat',
    sep='::',
    header=None,
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
    usecols=['user_id', 'movie_id', 'rating'],
    engine='python',
)

In [33]:
# Movie catalogue: one row per movie with its id, title and '|'-separated genres.
# The encoding is pinned so that loading does not depend on the platform's
# default codec: the stock MovieLens-1M movies.dat contains accented titles
# stored as Latin-1 bytes, which are not valid UTF-8.
# NOTE(review): if a re-encoded (UTF-8) copy of the file is used, adjust `encoding`.
movie_info = pd.read_csv(
    'ml-1m/movies.dat',
    delimiter='::',
    header=None,
    names=['movie_id', 'name', 'category'],
    engine='python',
    encoding='latin-1',
)

In [34]:
# Implicit feedback: a rating of 4 or 5 counts as a "like", everything else is dropped.
implicit_ratings = ratings[ratings['rating'] >= 4].reset_index(drop=True)
users = implicit_ratings["user_id"]
movies = implicit_ratings["movie_id"]

# Binary user x item interaction matrix; raw ids are used directly as row/column
# indices, so the shape is inferred from the largest ids present.
user_item = sp.coo_matrix((np.ones_like(users), (users, movies)))
user_item_csr = user_item.tocsr()
# Transposed (item x user) copy, kept separately in CSR form.
user_item_t_csr = user_item.T.tocsr()

Также скопируем вспомогательные функции

In [35]:
def format_movie(movie):
    """Render the first row of a movie frame as ``"<id>. <name> (<category>)"``.

    ``movie`` is a DataFrame with ``movie_id``, ``name`` and ``category``
    columns; only its first row is used. An empty frame raises ``IndexError``.
    """
    movie_id, name, category = (
        movie[column].values[0] for column in ("movie_id", "name", "category")
    )
    return f"{movie_id}. {name} ({category})"

def get_user_history(user_id, implicit_ratings):
    """Return formatted descriptions of every movie ``user_id`` has in ``implicit_ratings``.

    Movies are listed in the order their rows appear in ``implicit_ratings``;
    titles and genres are looked up in the module-level ``movie_info`` table.
    """
    watched_ids = implicit_ratings.loc[implicit_ratings["user_id"] == user_id, "movie_id"]
    history = []
    for watched_id in watched_ids:
        history.append(format_movie(movie_info[movie_info["movie_id"] == watched_id]))
    return history

def get_recommendations(user_id, model):
    """Return formatted descriptions of the movies ``model`` recommends to ``user_id``.

    ``model.recommend`` is called with the module-level ``user_item_csr``
    matrix; the first element of every returned entry is taken as the movie id.
    """
    recommended = model.recommend(user_id, user_item_csr)
    titles = []
    for entry in recommended:
        item_id = entry[0]
        titles.append(format_movie(movie_info[movie_info["movie_id"] == item_id]))
    return titles

Крутые модели умеют делать __.fit(...)__ и __.recommend(...)__.
Естественно, хотим так же, поэтому придётся вспомнить ООП в Python.

In [36]:
class AbstractModel:
    """Base class for latent-factor recommenders exposing ``fit`` / ``recommend``.

    Subclasses implement ``__fit_impl__`` and must leave ``user_vectors`` and
    ``item_vectors`` (2-D, indexable by raw user / item id) populated;
    ``item_bias`` is optional. Scores are ``item_vectors @ user_vectors[id]``
    plus ``item_bias`` when it is set.
    """

    def __init__(self):
        self.user_vectors = None
        self.item_vectors = None
        self.item_bias = None
        # Number of items returned by recommend().
        self.cnt_rec = 10

        self.fitted = False

    @abstractmethod
    def __fit_impl__(self, **kwargs):
        """Train the model; must be overridden by subclasses.

        The class does not use an ABC metaclass, so ``@abstractmethod`` alone
        does not prevent instantiation. Raising here keeps a base-class
        ``fit()`` from silently marking an untrained model as fitted.
        """
        raise NotImplementedError(
            f'{type(self).__name__} must implement __fit_impl__')

    def fit(self, data, **kwargs):
        """Train on ``data``; extra keyword arguments go to ``__fit_impl__``.

        ``fitted`` is only set once training finishes without raising.
        """
        self.fitted = False
        self.__fit_impl__(data=data, **kwargs)
        self.fitted = True

    def recommend(self, id, user_item_csr):
        """Return the top ``cnt_rec`` items the user has not interacted with.

        Args:
            id: row index of the user in ``user_vectors`` / ``user_item_csr``.
            user_item_csr: sparse user x item matrix; non-zero entries mark
                items that must not be recommended.

        Returns:
            Integer array of shape ``(k, 1)`` with ``k <= cnt_rec``, item ids
            ordered from the highest score to the lowest.

        Raises:
            RuntimeError: if the model has not been fitted.
        """
        if not self.fitted:
            raise RuntimeError('Using unfitted model')

        scores = np.asarray(np.dot(self.item_vectors, self.user_vectors[id]))
        if self.item_bias is not None:
            scores = scores + self.item_bias
        ranking = np.flip(scores.argsort())

        # Densify the user's row once instead of one sparse lookup per item.
        interactions = np.asarray(user_item_csr[id].toarray()).ravel()
        seen = np.zeros(scores.shape[0], dtype=bool)
        # The matrix may have fewer columns than the model has items; items
        # beyond its width have no recorded interaction and count as unseen.
        known = min(seen.shape[0], interactions.shape[0])
        seen[:known] = interactions[:known] != 0

        unseen_ranked = ranking[~seen[ranking]]
        return unseen_ranked[:self.cnt_rec, np.newaxis]

Половина работы сделана, осталось только написать 4 варианта __.\_\_fit_impl\_\_()__

### Задание 1. Не используя готовые решения, реализовать SVD разложение используя SGD на explicit данных

In [37]:
import random

class Svd(AbstractModel):
    """Biased matrix factorisation trained with SGD on explicit ratings.

    A rating is predicted as
    ``user_vector . item_vector + user_bias + item_bias + global_bias``.
    """

    def __fit_impl__(self, data: DataFrame, alpha=1e-2, learning_rate=1e-2, cnt_epoch=100, factors=128):
        """Fit factors and biases by stochastic gradient descent.

        Args:
            data: frame with integer ``user_id`` / ``movie_id`` columns and a
                numeric ``rating`` column. Raw ids are used as array indices.
            alpha: L2 regularisation weight applied to every updated parameter.
            learning_rate: SGD step size.
            cnt_epoch: number of epochs; each epoch draws ``len(data)`` samples
                uniformly with replacement.
            factors: dimensionality of the latent vectors.

        Raises:
            ValueError: if ``data`` has no rows.
        """
        if len(data) == 0:
            raise ValueError('Cannot fit Svd on empty data')

        # Read columns by name so the result does not depend on column order
        # or on pandas upcasting the ids to float in a mixed-dtype frame.
        user_ids = data['user_id'].to_numpy(dtype=np.int64)
        movie_ids = data['movie_id'].to_numpy(dtype=np.int64)
        targets = data['rating'].to_numpy(dtype=np.float64)

        cnt_users = int(user_ids.max()) + 1
        cnt_items = int(movie_ids.max()) + 1
        cnt_samples = len(user_ids)

        limit = 0.1
        self.user_vectors = np.random.uniform(high=limit, size=[cnt_users, factors])
        self.item_vectors = np.random.uniform(high=limit, size=[cnt_items, factors])

        self.user_bias = np.random.uniform(high=limit, size=[cnt_users])
        self.item_bias = np.random.uniform(high=limit, size=[cnt_items])

        bias = 0.0

        def correct(x, error):
            # One regularised gradient step: x <- x - lr * (grad + alpha * x).
            return x - learning_rate * (error + x * alpha)

        for epoch in range(1, cnt_epoch + 1):
            squared_error_sum = 0.0
            epoch_start = datetime.datetime.now()
            # Draw the whole epoch's sample order in one call.
            for sample_ind in np.random.randint(0, cnt_samples, size=cnt_samples):
                user_id = user_ids[sample_ind]
                movie_id = movie_ids[sample_ind]

                prediction = \
                    np.dot(self.user_vectors[user_id], self.item_vectors[movie_id]) + \
                    self.user_bias[user_id] + self.item_bias[movie_id] + bias
                error = prediction - targets[sample_ind]
                squared_error_sum += error ** 2

                # Row indexing returns a view, so the pre-update user vector
                # has to be copied: it is overwritten on the next statement
                # but is still needed for the item-vector gradient.
                saved_user_vector = self.user_vectors[user_id].copy()
                self.user_vectors[user_id] = \
                    correct(saved_user_vector, error * self.item_vectors[movie_id])
                self.user_bias[user_id] = correct(self.user_bias[user_id], error)
                self.item_vectors[movie_id] = \
                    correct(self.item_vectors[movie_id], error * saved_user_vector)
                self.item_bias[movie_id] = correct(self.item_bias[movie_id], error)
                bias = correct(bias, error)
            epoch_time = datetime.datetime.now() - epoch_start
            print(f"Average error in {epoch} epoch is {squared_error_sum / cnt_samples}. Time = {epoch_time.seconds} s.")

        # Keep the learned global offset so full predictions can be rebuilt.
        self.global_bias = bias

### Задание 2. Не используя готовые решения, реализовать матричное разложение используя ALS на implicit данных

In [38]:
import tensorflow as tf

class Als(AbstractModel):
    """Confidence-weighted alternating least squares on implicit feedback.

    The preference matrix ``p`` is the densified input and the confidence is
    ``C = 1 + alpha * p``. Each half-step solves, for every user (or item) row,
    ``(Y^T C_u Y + regularization * I) x_u = Y^T C_u p_u`` with the other side
    held fixed.
    """

    def __fit_impl__(self, data: sp.csr_matrix, alpha=1e1, cnt_epoch=20, factors=64, regularization=0.01):
        """Fit user and item factors by alternating per-row solves.

        Args:
            data: sparse user x item matrix. It is converted to a dense
                float32 tensor, so memory grows with users * items.
            alpha: confidence scale in ``C = 1 + alpha * p``.
            cnt_epoch: number of half-steps. Odd steps update users and even
                steps update items, so an odd value ends on a user update.
            factors: dimensionality of the latent vectors.
            regularization: ridge term added to the diagonal of every system.
        """
        p = tf.cast(tf.convert_to_tensor(data.toarray()), tf.float32)
        p_t = tf.transpose(p)
        C = p * alpha + 1.0
        C_t = tf.transpose(C)

        cnt_users = p.shape[0]
        cnt_items = p.shape[1]

        limit = 0.125
        self.user_vectors = tf.random.uniform(shape=(cnt_users, factors), maxval=limit)
        self.item_vectors = tf.random.uniform(shape=(cnt_items, factors), maxval=limit)

        ridge = regularization * tf.eye(factors, dtype=tf.float32)

        def calculate_row(vec, confidence, preference):
            # vec: (n, factors) fixed side; confidence, preference: (n,).
            weighted_t = tf.transpose(vec) * confidence        # Y^T C_u, (factors, n)
            gram = tf.linalg.matmul(weighted_t, vec) + ridge   # (factors, factors)
            rhs = tf.linalg.matvec(weighted_t, preference)     # (factors,)
            # Solve the linear system directly rather than forming an explicit
            # inverse and multiplying it by the full (factors, n) matrix.
            return tf.linalg.solve(gram, rhs[:, tf.newaxis])[:, 0]

        for epoch in range(1, cnt_epoch + 1, 2):
            epoch_start = datetime.datetime.utcnow()
            self.user_vectors = tf.stack(
                [calculate_row(self.item_vectors, C[i], p[i]) for i in range(cnt_users)])
            if epoch < cnt_epoch:
                self.item_vectors = tf.stack(
                    [calculate_row(self.user_vectors, C_t[i], p_t[i]) for i in range(cnt_items)])
            # Plain (unweighted) mean squared reconstruction error against p.
            error = tf.reduce_mean((tf.linalg.matmul(self.user_vectors, tf.transpose(self.item_vectors)) - p) ** 2)
            epoch_time = datetime.datetime.utcnow() - epoch_start
            print(f"Error in {epoch}-{epoch + 1} epochs is {error}. Time = {epoch_time.seconds} s.")

        # Store NumPy arrays, like the other models, so recommend() does not
        # convert tensors on every call.
        self.user_vectors = np.asarray(self.user_vectors)
        self.item_vectors = np.asarray(self.item_vectors)

### Задание 3. Не используя готовые решения, реализовать матричное разложение BPR на implicit данных

In [39]:
class Bpr(AbstractModel):
    """Bayesian Personalized Ranking matrix factorisation on implicit feedback.

    For a sampled triple (user, liked item i, negative item j) the model
    increases ``x_uij = user . (item_i - item_j)`` by gradient ascent on
    ``ln sigmoid(x_uij)`` with L2 regularisation.
    """

    def __fit_impl__(self, data: DataFrame, learning_rate=1e-2, alpha=1e-2, cnt_epoch=30, factors=128):
        """Fit user and item factors with SGD over sampled triples.

        Args:
            data: frame with integer ``user_id`` / ``movie_id`` columns, one
                row per positive interaction. Raw ids are used as indices.
            learning_rate: SGD step size.
            alpha: L2 regularisation weight.
            cnt_epoch: number of epochs; each epoch draws ``len(data)``
                positive samples uniformly with replacement.
            factors: dimensionality of the latent vectors.

        Raises:
            ValueError: if ``data`` has no rows.
        """
        if len(data) == 0:
            raise ValueError('Cannot fit Bpr on empty data')

        user_ids = data['user_id'].to_numpy(dtype=np.int64)
        movie_ids = data['movie_id'].to_numpy(dtype=np.int64)

        cnt_users = int(user_ids.max()) + 1
        cnt_items = int(movie_ids.max()) + 1
        cnt_samples = len(user_ids)

        # Known positives, used to reject "negative" draws the user liked.
        positives = set(zip(user_ids.tolist(), movie_ids.tolist()))
        max_resample = 20

        limit = 0.125
        self.user_vectors = np.random.uniform(high=limit, size=[cnt_users, factors])
        self.item_vectors = np.random.uniform(high=limit, size=[cnt_items, factors])

        def correct(x, error):
            # One regularised gradient step: x <- x - lr * (grad + alpha * x).
            return x - learning_rate * (error + x * alpha)

        for epoch in range(1, cnt_epoch + 1):
            epoch_start = datetime.datetime.now()
            negative_movies = np.random.randint(low=0, high=cnt_items, size=[cnt_samples])
            log_error_sum = 0.0
            cnt_updates = 0

            for sample_ind in np.random.randint(0, cnt_samples, size=cnt_samples):
                user_id = user_ids[sample_ind]
                movie_id = movie_ids[sample_ind]
                movie_2_id = negative_movies[sample_ind]

                # A uniformly drawn item may be one the user liked; redraw a
                # bounded number of times, then skip the sample.
                attempts = 0
                while (user_id, movie_2_id) in positives and attempts < max_resample:
                    movie_2_id = np.random.randint(cnt_items)
                    attempts += 1
                if (user_id, movie_2_id) in positives:
                    continue

                difference = self.item_vectors[movie_id] - self.item_vectors[movie_2_id]
                # Row indexing returns a view; copy so the item updates below
                # use the user vector from before this step's user update.
                user_vector = self.user_vectors[user_id].copy()
                error = expit(-np.dot(user_vector, difference))

                self.user_vectors[user_id] = correct(user_vector, -error * difference)
                self.item_vectors[movie_id] = correct(self.item_vectors[movie_id], -error * user_vector)
                self.item_vectors[movie_2_id] = correct(self.item_vectors[movie_2_id], error * user_vector)
                # The logged value is ln sigmoid(-x_uij): it becomes more
                # negative as positives are ranked further above negatives.
                log_error_sum += np.log(error)
                cnt_updates += 1
            epoch_time = datetime.datetime.now() - epoch_start
            average_error = log_error_sum / cnt_updates if cnt_updates else float('nan')
            print(f"Average error in {epoch} epoch is {average_error}. Time = {epoch_time.seconds} s.")

Всё, теперь посмотрим на рекомендации библиотечной модели и наших трёх.

In [40]:
# Reference model from the `implicit` library, used for comparison below.
library_model = implicit.als.AlternatingLeastSquares(
    factors=64,
    iterations=100,
    calculate_training_loss=True,
)
# Fitted on the transposed (item x user) interaction matrix.
library_model.fit(user_item_t_csr)

  0%|          | 0/100 [00:00<?, ?it/s]

In [41]:
# Train the hand-written SGD factorisation on the full explicit ratings table
# with its default hyper-parameters.
svd_model = Svd()
svd_model.fit(data=ratings)

Average error in 1 epoch is 0.8833346415916736. Time = 28 s.
Average error in 2 epoch is 0.8206529662474858. Time = 27 s.
Average error in 3 epoch is 0.8007058637644624. Time = 27 s.
Average error in 4 epoch is 0.7716102980583966. Time = 27 s.
Average error in 5 epoch is 0.7264685801558614. Time = 27 s.
Average error in 6 epoch is 0.6748900481122385. Time = 27 s.
Average error in 7 epoch is 0.6202030108967974. Time = 27 s.
Average error in 8 epoch is 0.5656413859254246. Time = 27 s.
Average error in 9 epoch is 0.5137615104262576. Time = 28 s.
Average error in 10 epoch is 0.4661430469523228. Time = 27 s.
Average error in 11 epoch is 0.4241671116110198. Time = 27 s.
Average error in 12 epoch is 0.3868902628885745. Time = 27 s.
Average error in 13 epoch is 0.35550728397012993. Time = 27 s.
Average error in 14 epoch is 0.3303208003386003. Time = 28 s.
Average error in 15 epoch is 0.3056956133891929. Time = 27 s.
Average error in 16 epoch is 0.28689996026998177. Time = 28 s.
Average error i

In [42]:
# Train the hand-written ALS model on the binary user x item matrix
# (built from ratings >= 4) with its default hyper-parameters.
als_model = Als()
als_model.fit(data=user_item_csr)

Error in 1-2 epochs is 0.039610426872968674. Time = 23 s.
Error in 3-4 epochs is 0.03528401255607605. Time = 23 s.
Error in 5-6 epochs is 0.034002240747213364. Time = 23 s.
Error in 7-8 epochs is 0.033531904220581055. Time = 23 s.
Error in 9-10 epochs is 0.033298809081315994. Time = 23 s.
Error in 11-12 epochs is 0.033164799213409424. Time = 24 s.
Error in 13-14 epochs is 0.03308132663369179. Time = 23 s.
Error in 15-16 epochs is 0.03302651271224022. Time = 23 s.
Error in 17-18 epochs is 0.03298892825841904. Time = 23 s.
Error in 19-20 epochs is 0.032962169498205185. Time = 24 s.


In [43]:
# Train the hand-written BPR model on the liked-only interactions
# (ratings >= 4) with its default hyper-parameters.
bpr_model = Bpr()
bpr_model.fit(data=implicit_ratings)

Average error in 1 epoch is -1.0082818187021054. Time = 20 s.
Average error in 2 epoch is -1.5261330479602553. Time = 20 s.
Average error in 3 epoch is -1.8241977171781802. Time = 20 s.
Average error in 4 epoch is -1.9953837222087702. Time = 20 s.
Average error in 5 epoch is -2.097714514137024. Time = 20 s.
Average error in 6 epoch is -2.179461705038402. Time = 20 s.
Average error in 7 epoch is -2.2405373761868836. Time = 20 s.
Average error in 8 epoch is -2.2934128127536764. Time = 20 s.
Average error in 9 epoch is -2.3235382700738856. Time = 20 s.
Average error in 10 epoch is -2.359575487244047. Time = 20 s.
Average error in 11 epoch is -2.392249494449624. Time = 20 s.
Average error in 12 epoch is -2.4169218440342752. Time = 20 s.
Average error in 13 epoch is -2.4383575709912475. Time = 20 s.
Average error in 14 epoch is -2.4575321735427247. Time = 20 s.
Average error in 15 epoch is -2.473607922858665. Time = 20 s.
Average error in 16 epoch is -2.5006870014297276. Time = 20 s.
Averag

Взглянем на историю пользователя с ID = 4, а затем проверим рекомендации

In [44]:
# Liked movies (rating >= 4) of user 4; the cell output is the returned list.
get_user_history(4, implicit_ratings)

['3468. Hustler, The (1961) (Drama)',
 '2951. Fistful of Dollars, A (1964) (Action|Western)',
 '1214. Alien (1979) (Action|Horror|Sci-Fi|Thriller)',
 '1036. Die Hard (1988) (Action|Thriller)',
 '260. Star Wars: Episode IV - A New Hope (1977) (Action|Adventure|Fantasy|Sci-Fi)',
 '2028. Saving Private Ryan (1998) (Action|Drama|War)',
 '480. Jurassic Park (1993) (Action|Adventure|Sci-Fi)',
 '1198. Raiders of the Lost Ark (1981) (Action|Adventure)',
 '1954. Rocky (1976) (Action|Drama)',
 "1097. E.T. the Extra-Terrestrial (1982) (Children's|Drama|Fantasy|Sci-Fi)",
 '3418. Thelma & Louise (1991) (Action|Drama)',
 '3702. Mad Max (1979) (Action|Sci-Fi)',
 '2366. King Kong (1933) (Action|Adventure|Horror)',
 '1387. Jaws (1975) (Action|Horror)',
 '1201. Good, The Bad and The Ugly, The (1966) (Action|Western)',
 '2692. Run Lola Run (Lola rennt) (1998) (Action|Crime|Romance)',
 '2947. Goldfinger (1964) (Action)',
 '1240. Terminator, The (1984) (Action|Sci-Fi|Thriller)']

Посмотрим на все рекомендации

In [45]:
# Remove the column-width limit so long movie descriptions are shown in full.
pd.set_option('display.max_colwidth', None)

def construct_recommendations(id):
    """Build a side-by-side table of recommendations for user ``id``.

    One column per model (library ALS, SVD, ALS, BPR), one row per rank.
    Rows are zipped, so the table is as long as the shortest recommendation
    list, and at most 10 rows are returned.
    """
    models = (library_model, svd_model, als_model, bpr_model)
    per_model = [get_recommendations(id, model) for model in models]
    rows = list(zip(*per_model))
    table = pd.DataFrame(rows, columns=['Library ALS', 'SVD', 'ALS', 'BPR'])
    return table.head(10)

In [46]:
# Compare the four models' recommendations for user 4.
construct_recommendations(4)

Unnamed: 0,Library ALS,SVD,ALS,BPR
0,589. Terminator 2: Judgment Day (1991) (Action|Sci-Fi|Thriller),3421. Animal House (1978) (Comedy),589. Terminator 2: Judgment Day (1991) (Action|Sci-Fi|Thriller),1196. Star Wars: Episode V - The Empire Strikes Back (1980) (Action|Adventure|Drama|Sci-Fi|War)
1,1196. Star Wars: Episode V - The Empire Strikes Back (1980) (Action|Adventure|Drama|Sci-Fi|War),953. It's a Wonderful Life (1946) (Drama),1196. Star Wars: Episode V - The Empire Strikes Back (1980) (Action|Adventure|Drama|Sci-Fi|War),"2762. Sixth Sense, The (1999) (Thriller)"
2,1304. Butch Cassidy and the Sundance Kid (1969) (Action|Comedy|Western),"858. Godfather, The (1972) (Action|Crime|Drama)","2571. Matrix, The (1999) (Action|Sci-Fi|Thriller)",296. Pulp Fiction (1994) (Crime|Drama)
3,1291. Indiana Jones and the Last Crusade (1989) (Action|Adventure),1283. High Noon (1952) (Western),1210. Star Wars: Episode VI - Return of the Jedi (1983) (Action|Adventure|Romance|Sci-Fi|War),2858. American Beauty (1999) (Comedy|Drama)
4,1200. Aliens (1986) (Action|Sci-Fi|Thriller|War),912. Casablanca (1942) (Drama|Romance|War),1200. Aliens (1986) (Action|Sci-Fi|Thriller|War),"593. Silence of the Lambs, The (1991) (Drama|Thriller)"
5,"2571. Matrix, The (1999) (Action|Sci-Fi|Thriller)","3095. Grapes of Wrath, The (1940) (Drama)","457. Fugitive, The (1993) (Action|Thriller)",1617. L.A. Confidential (1997) (Crime|Film-Noir|Mystery|Thriller)
6,"1953. French Connection, The (1971) (Action|Crime|Drama|Thriller)",1136. Monty Python and the Holy Grail (1974) (Comedy),1291. Indiana Jones and the Last Crusade (1989) (Action|Adventure),"1197. Princess Bride, The (1987) (Action|Adventure|Comedy|Romance)"
7,3527. Predator (1987) (Action|Sci-Fi|Thriller),3363. American Graffiti (1973) (Comedy|Drama),1304. Butch Cassidy and the Sundance Kid (1969) (Action|Comedy|Western),"858. Godfather, The (1972) (Action|Crime|Drama)"
8,"858. Godfather, The (1972) (Action|Crime|Drama)",3671. Blazing Saddles (1974) (Comedy|Western),110. Braveheart (1995) (Action|Drama|War),2997. Being John Malkovich (1999) (Comedy)
9,3471. Close Encounters of the Third Kind (1977) (Drama|Sci-Fi),1203. 12 Angry Men (1957) (Drama),3703. Mad Max 2 (a.k.a. The Road Warrior) (1981) (Action|Sci-Fi),527. Schindler's List (1993) (Drama|War)


Аналогично для пользователя с ID = 7

In [47]:
# Liked movies (rating >= 4) of user 7; the cell output is the returned list.
get_user_history(7, implicit_ratings)

['648. Mission: Impossible (1996) (Action|Adventure|Mystery)',
 '861. Supercop (1992) (Action|Thriller)',
 '2916. Total Recall (1990) (Action|Adventure|Sci-Fi|Thriller)',
 '1610. Hunt for Red October, The (1990) (Action|Thriller)',
 '589. Terminator 2: Judgment Day (1991) (Action|Sci-Fi|Thriller)',
 '6. Heat (1995) (Action|Crime|Thriller)',
 '442. Demolition Man (1993) (Action|Sci-Fi)',
 '733. Rock, The (1996) (Action|Adventure|Thriller)',
 '2353. Enemy of the State (1998) (Action|Thriller)',
 '1196. Star Wars: Episode V - The Empire Strikes Back (1980) (Action|Adventure|Drama|Sci-Fi|War)',
 '2571. Matrix, The (1999) (Action|Sci-Fi|Thriller)',
 '380. True Lies (1994) (Action|Adventure|Comedy|Romance)',
 '1997. Exorcist, The (1973) (Horror)',
 '1270. Back to the Future (1985) (Comedy|Sci-Fi)',
 '457. Fugitive, The (1993) (Action|Thriller)',
 '1573. Face/Off (1997) (Action|Sci-Fi|Thriller)',
 '3753. Patriot, The (2000) (Action|Drama|War)',
 '474. In the Line of Fire (1993) (Action|Thrill

In [48]:
# Compare the four models' recommendations for user 7.
construct_recommendations(7)


Unnamed: 0,Library ALS,SVD,ALS,BPR
0,377. Speed (1994) (Action|Romance|Thriller),1198. Raiders of the Lost Ark (1981) (Action|Adventure),377. Speed (1994) (Action|Romance|Thriller),"2762. Sixth Sense, The (1999) (Thriller)"
1,1608. Air Force One (1997) (Action|Thriller),"3062. Longest Day, The (1962) (Action|Drama|War)",260. Star Wars: Episode IV - A New Hope (1977) (Action|Adventure|Fantasy|Sci-Fi),260. Star Wars: Episode IV - A New Hope (1977) (Action|Adventure|Fantasy|Sci-Fi)
2,1370. Die Hard 2 (1990) (Action|Thriller),"2764. Thomas Crown Affair, The (1968) (Crime|Drama|Thriller)",1370. Die Hard 2 (1990) (Action|Thriller),2858. American Beauty (1999) (Comedy|Drama)
3,"858. Godfather, The (1972) (Action|Crime|Drama)","318. Shawshank Redemption, The (1994) (Drama)","2058. Negotiator, The (1998) (Action|Thriller)",296. Pulp Fiction (1994) (Crime|Drama)
4,1210. Star Wars: Episode VI - Return of the Jedi (1983) (Action|Adventure|Romance|Sci-Fi|War),1287. Ben-Hur (1959) (Action|Adventure|Drama),"2763. Thomas Crown Affair, The (1999) (Action|Thriller)","593. Silence of the Lambs, The (1991) (Drama|Thriller)"
5,780. Independence Day (ID4) (1996) (Action|Sci-Fi|War),"1939. Best Years of Our Lives, The (1946) (Drama|War)",1198. Raiders of the Lost Ark (1981) (Action|Adventure),"1197. Princess Bride, The (1987) (Action|Adventure|Comedy|Romance)"
6,"2763. Thomas Crown Affair, The (1999) (Action|Thriller)",1291. Indiana Jones and the Last Crusade (1989) (Action|Adventure),1608. Air Force One (1997) (Action|Thriller),1198. Raiders of the Lost Ark (1981) (Action|Adventure)
7,"2058. Negotiator, The (1998) (Action|Thriller)",3338. For All Mankind (1989) (Documentary),1210. Star Wars: Episode VI - Return of the Jedi (1983) (Action|Adventure|Romance|Sci-Fi|War),1210. Star Wars: Episode VI - Return of the Jedi (1983) (Action|Adventure|Romance|Sci-Fi|War)
8,"1408. Last of the Mohicans, The (1992) (Action|Romance|War)","2208. Lady Vanishes, The (1938) (Comedy|Mystery|Romance|Thriller)",1214. Alien (1979) (Action|Horror|Sci-Fi|Thriller),1214. Alien (1979) (Action|Horror|Sci-Fi|Thriller)
9,260. Star Wars: Episode IV - A New Hope (1977) (Action|Adventure|Fantasy|Sci-Fi),"2183. Man Who Knew Too Much, The (1956) (Thriller)",2278. Ronin (1998) (Action|Crime|Thriller),1265. Groundhog Day (1993) (Comedy|Romance)
