### Homework 3

* 1) найти датасет, обосновать
* 2) построить MF, покрутить, построить метрики-симилары-рекоммендации
* 3) построить NCF: покрутить, построить метрики-симилары-рекоммендации
* 4) построить simple attention: покрутить, построить метрики-симилары-рекоммендации.


In [1]:
!pip install rankfm
!pip install lightfm
!pip install wget



In [2]:
import scipy.sparse as sp
import pandas as pd
import numpy as np
import torch
import wget
import os
import zipfile
import torch.nn as nn
import torch.nn.functional as F

from tqdm import tqdm
from rankfm.rankfm import RankFM
from rankfm.evaluation import hit_rate, discounted_cumulative_gain, precision, recall
from lightfm.datasets import fetch_movielens
from  torch.utils.data import DataLoader
from lightfm import LightFM
from sklearn.metrics import ndcg_score
from torch.optim import Adam, SGD



In [3]:
link = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip'

wget.download(link)

with zipfile.ZipFile('ml-1m.zip',"r") as zip_ref:
    zip_ref.extractall(os.getcwd())

In [4]:
os.chdir('ml-1m')

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else "cpu")

### 1. DATASET

В качества датасета взяла movieLens потому что:
* Уже достаточно знакома с этим датасетом по 1 - дз, есть параметры на которые оритентироваться при обучении WARP
* Интуитивно понятно качество модели по выданным айтемам
* Достаточно популярный датасет, используемый во многих статьях, в том числе в Neural Collaborative Filtering

Метрики будем смотреть как в статье - NDCG и HR10

In [6]:
ratings = pd.read_csv('ratings.dat', delimiter='::', header=None, 
        names=['user_id', 'movie_id', 'rating', 'timestamp'], 
        usecols=['user_id', 'movie_id', 'rating'], engine='python')
movie_info = pd.read_csv('movies.dat', delimiter='::', header=None, 
        names=['movie_id', 'name', 'category'], engine='python')
ratings = ratings.loc[ratings['rating'] >= 4]

In [7]:
users = np.unique(ratings["user_id"])
items = np.unique(ratings["movie_id"])

users_dict, items_dict = {i: j for j, i in enumerate(users)}, {i: j for j, i in enumerate(items)}
inv_users_dict, inv_items_dict =  {i: j for j, i in users_dict.items()}, {i: j for j, i in items_dict.items()}

new_users, new_items = ratings['user_id'].apply(lambda i: users_dict[i]), ratings['movie_id'].apply(lambda i: items_dict[i])

implicit = pd.concat([new_users, new_items], axis=1).to_numpy()

In [8]:
get_similars = lambda item_id, model : [movie_info[movie_info["movie_id"] == inv_items_dict[x]]["name"].to_string()
                                        for x in model.similar_items(items_dict[item_id])]

get_recommendations = lambda user_id, model: [movie_info[movie_info["movie_id"] == inv_items_dict[x]]["name"].to_string()
                                                   for x in model.recommend([users_dict[user_id]]).values[0]]

In [9]:
def metrics(model, dataset, n=500, k=10):
    NDCG, HR = [], []  
    labels = np.array([1] + (n-1)*[0])
    for user_item in tqdm(dataset.test, position=0, leave=False):
        user = user_item[0]
        negatives = np.column_stack([np.full(n-1, user), np.random.choice(dataset.user_neg[user],n-1, replace=False)])
        
        predictions = model.predict(np.append([user_item], negatives, axis=0))
        predictions = np.nan_to_num(predictions)

        NDCG.append(ndcg_score([labels], [predictions], k=k))
        HR.append(np.sum(np.argsort(-predictions)[:k] == 0))

    print(f"NDCG@{k}: {np.mean(NDCG):2f}")
    print(f"HR@k{k}: {np.mean(HR):2f}")

In [10]:
class Data:
    def __init__(self, dataset):
        self.data = dataset
        self.n_users = dataset[:, 0].max() + 1
        self.n_items = dataset[:, 1].max() + 1
        self.compute_pos_neg()
        self.train_test_split()
        self.create_train_dataset()

        
    def compute_pos_neg(self):
        self.user_neg = {}
        for u in tqdm(range(self.n_users), position=0, leave=False):        
            user_pos_items = self.data[self.data[:, 0] == u][:, 1]
            user_neg_items = list(set(np.arange(self.n_items)) - set(user_pos_items))
            self.user_neg[u] = user_neg_items
            
    def train_test_split(self): 
        train, test = np.array([]), np.array([])
        for u in tqdm(range(self.n_users), position=0, leave=False): 
            watched = self.data[self.data[:, 0] == u]
            train = np.append(train, watched[:-1])
            test = np.append(test, watched[-1])
        self.test = test.reshape(-1, 2).astype(int)
        self.train = train.reshape(-1, 2).astype(int)

    def create_train_dataset(self, n_negatives=5):
        self.users, self.items, self.labels = [], [], []
        for u in tqdm(range(self.n_items), position=0, leave=False): 
            positives = n_negatives * list(self.train[self.train[:,0] == u][:, 1])
            negatives = list(np.random.choice(self.user_neg[u], len(positives)))
            self.users.extend([u]*(len(positives) + len(negatives)))
            self.items.extend(positives + negatives)
            self.labels.extend([1]*len(positives) + [0]*len(negatives))
        self.dataset = np.stack([self.users, self.items, self.labels], axis=1)
     
    def __getitem__(self, idx):
        return (*self.dataset[idx],)   
             
    def __len__(self):
        return len(self.labels)


In [11]:
dataset = Data(implicit)



### 2. WARP

In [12]:
warp = RankFM(factors=30,
               loss='warp',
               max_samples=20,
               learning_rate=1e-1,
               learning_schedule='invscaling')
warp.fit(dataset.train, epochs=55)

In [13]:
get_similars(1, warp)

['584    Aladdin (1992)',
 '3045    Toy Story 2 (1999)',
 '591    Beauty and the Beast (1991)',
 '360    Lion King, The (1994)',
 '1838    Mulan (1998)',
 '1132    Wrong Trousers, The (1993)',
 '1205    Grand Day Out, A (1992)',
 '735    Close Shave, A (1995)',
 "2286    Bug's Life, A (1998)",
 '1526    Hercules (1997)']

In [14]:
get_recommendations(4, warp)

['1196    Alien (1979)',
 '1180    Raiders of the Lost Ark (1981)',
 '847    Godfather, The (1972)',
 '257    Star Wars: Episode IV - A New Hope (1977)',
 '2502    Matrix, The (1999)',
 '1178    Star Wars: Episode V - The Empire Strikes Back...',
 '1183    Good, The Bad and The Ugly, The (1966)',
 '1203    Godfather: Part II, The (1974)',
 '1023    Die Hard (1988)',
 '1366    Jaws (1975)']

In [15]:
hit_rate(warp, dataset.test)
    

0.0399204903097565

In [16]:
discounted_cumulative_gain(warp, dataset.test)

0.01873986131435244

In [17]:
precision(warp, dataset.test)

0.00399204903097565

In [18]:
metrics(warp, dataset)

                                                    

NDCG@10: 0.201512
HR@k10: 0.379430




Рекомендации и симилары хорошие
Встроенные метрики считаются немного подругому поэтому они различаются, но все равно хотелось на них посмотреть

### 3. NCF

В статье наибольшие значения метрик достигаются при конкатенции предобученных эмбедингов MLP и GMF.

In [44]:
class Base(nn.Module):
    def __init__(self):
        super().__init__()
              
    def fit(self): 
        
        if self.optimizer_type =="Adam":
            optimizer = Adam(self.parameters(), lr=self.lr)
        elif self.optimizer_type =="SGD":
            optimizer = SGD(self.parameters(), lr=self.lr)
        
        criterion = nn.BCELoss()
        bar = tqdm(total=self.epochs, position=0,leave=False, desc='')
        
        for epoch in range(self.epochs):
            self.data.create_train_dataset(n_negatives=5)
            L_epoch = 0
            train_loader = DataLoader(self.data, batch_size=self.batch_size, shuffle=True) 
            
            for batch in train_loader:
                
                users, items, labels = batch
                users = users.to(device)
                items = items.to(device)
                labels = labels.to(device)

                outputs = self.forward(users, items)

                L = criterion(outputs, labels.float().view(-1, 1))

                optimizer.zero_grad()
                L.backward()
                optimizer.step()
                
                L_epoch += L.item()
           
            bar.update(1)
            bar.set_description(f'Loss: {L_epoch:.2f}')
#             print(self.rmse())
        self.embeddings()
        
            
    def embeddings(self):       
        if self.model_type in ('MLP', 'GMF'):
            self.items_embedding_w = self.embedding_item.weight.data.cpu().numpy()
            self.users_embedding_w = self.embedding_user.weight.data.cpu().numpy()
        elif self.model_type in ('NCF'):
            self.cat_embeddings()

    def similar_items(self, item_id, k=10):
        """
        Get similar items for item_id according to fitted embeddings 
        """
        predicted_ratings = self.items_embedding_w @ self.items_embedding_w[item_id] / np.linalg.norm(self.items_embedding_w, axis=-1)
        idx_similars = np.argsort(-predicted_ratings)
        return idx_similars[:k]

    def recommend(self, user_id, k=10):
        """
        Recommend users new items which he didn't look before 
        """
        unwatched = np.array(self.data.user_neg[user_id[0]])
        negatives = np.column_stack([np.full(len(unwatched), user_id), unwatched])
        predicted_ratings = self.predict(negatives)
        idx_recommendations = unwatched[np.argsort(-predicted_ratings)]
        return pd.DataFrame(idx_recommendations[:k].reshape(1, -1))
    
    def predict(self, user_item, cold_start=None):
        predictions = np.array([])
        loader = DataLoader(user_item, batch_size=self.batch_size)
        with torch.no_grad():
            for u_i in loader:
                u = u_i[:, 0].to(device).long()
                i = u_i[:, 1].to(device).long()
                predictions = np.append(predictions, self.forward(u, i).cpu().numpy().squeeze())
        return predictions
        

In [45]:
class Model(Base):
    def __init__(self, args):
        super().__init__()
        self.model_type = args["model_type"]
        self.optimizer_type = args["optimizer_type"]
        self.lr, self.batch_size, self.epochs = args["lr"], args["batch_size"], args["epochs"]
        self.data = args["data"]
        
        self.embedding_user = nn.Embedding(num_embeddings=self.data.n_users, embedding_dim=args["embedding_dim"])
        self.embedding_item = nn.Embedding(num_embeddings=self.data.n_items, embedding_dim=args["embedding_dim"])
        self.sigm = nn.Sigmoid()

        if self.model_type == 'MLP':
            self.MLP = nn.Sequential(nn.Linear(2 * args["embedding_dim"], 128),
                                     nn.ReLU(), 
                                     nn.Linear(128, 64), 
                                     nn.ReLU(), 
                                     nn.Linear(64, 32),
                                     nn.ReLU(),
                                     nn.Linear(32, 16),
                                     nn.ReLU())
            self.output =  nn.Linear(16, 1)
        elif self.model_type == 'GMF':
            self.output =  nn.Linear(args["embedding_dim"], 1)
        else:
            raise NotImplementedError
    

    def forward(self, user_id, item_id, forward_type='full_model'):
        
        if self.model_type == 'MLP':
            out = torch.cat([self.embedding_user(user_id), self.embedding_item(item_id)], dim=-1)
            out = self.MLP(out)
            if forward_type == 'full_model':
                return self.sigm(self.output(out))
            return out
        
        elif self.model_type == 'GMF':
            out = torch.mul(self.embedding_user(user_id), self.embedding_item(item_id))
            if forward_type == 'full_model':
                return self.sigm(self.output(out))
            return out
        else:
            raise NotImplementedError

In [46]:
class NCF(Base):
    def __init__(self, args, pretrained_mlp, pretrained_gmf):
        super().__init__()
        self.model_type = args["model_type"]
        self.optimizer_type = args["optimizer_type"]
        self.lr, self.batch_size, self.epochs = args["lr"], args["batch_size"], args["epochs"]
        self.data= args["data"]
        
        self.alpha = args['alpha']

        self.pretrained_MLP = pretrained_mlp
        self.pretrained_GMF = pretrained_gmf
        
        self.output = nn.Linear(pretrained_mlp.__dict__['_modules']['output'].in_features
                                + pretrained_gmf.__dict__['_modules']['output'].in_features, 1)
    
    
    def forward(self, user_id, item_id):
        mlp_rating = self.pretrained_MLP.forward(user_id, item_id, forward_type='compound')
        gmf_rating = self.pretrained_GMF.forward(user_id, item_id,  forward_type='compound')
        rating = torch.cat([self.alpha * gmf_rating, (1 - self.alpha) * mlp_rating], dim=-1)
        return torch.sigmoid(self.output(rating)) 

    def cat_embeddings(self):
        self.items_embedding_w = torch.cat([self.alpha * self.pretrained_GMF.embedding_item.weight.data.cpu(), (1 - self.alpha)*self.pretrained_MLP.embedding_item.weight.data.cpu()], dim=-1).numpy()
        self.users_embedding_w = torch.cat([self.alpha * self.pretrained_GMF.embedding_user.weight.data.cpu(), (1 - self.alpha)*self.pretrained_MLP.embedding_user.weight.data.cpu()], dim=-1).numpy()

In [47]:

mlp_parametrs = {"data": dataset,
                 "embedding_dim": 64,
                 "hidden_dim": 64,
                 "lr":1e-2, 
                 "batch_size":512,
                 "epochs": 10,
                 "optimizer_type": "Adam",
                 "model_type" : 'MLP'}

gmf_parametrs = {"data": dataset,
                 "embedding_dim": 64,
                 "hidden_dim": 64,
                 "lr":1e-2, 
                 "batch_size":256,
                 "epochs": 30,
                 "optimizer_type": "Adam",
                 "model_type" : 'GMF'}

In [48]:
mlp = Model(mlp_parametrs).to(device)
mlp.fit()

Loss: 1309.58: 100%|██████████| 10/10 [06:09<00:00, 36.71s/it]

In [49]:
get_similars(1, mlp)

['0    Toy Story (1995)',
 '33    Babe (1995)',
 '3045    Toy Story 2 (1999)',
 '1245    Groundhog Day (1993)',
 '1250    Back to the Future (1985)',
 '352    Forrest Gump (1994)',
 '1179    Princess Bride, The (1987)',
 '360    Lion King, The (1994)',
 '584    Aladdin (1992)',
 '257    Star Wars: Episode IV - A New Hope (1977)']

In [50]:
get_recommendations(4, mlp)

['1178    Star Wars: Episode V - The Empire Strikes Back...',
 '1192    Star Wars: Episode VI - Return of the Jedi (1983)',
 '1182    Aliens (1986)',
 '1353    Star Trek: The Wrath of Khan (1982)',
 '585    Terminator 2: Judgment Day (1991)',
 '847    Godfather, The (1972)',
 '108    Braveheart (1995)',
 '1267    Ben-Hur (1959)',
 '1284    Butch Cassidy and the Sundance Kid (1969)',
 '1204    Full Metal Jacket (1987)']

In [51]:
metrics(mlp, dataset)

                                                    

NDCG@10: 0.156559
HR@k10: 0.296621




In [52]:
gmf = Model(gmf_parametrs).to(device)


In [53]:
gmf.fit()

Loss: 2280.41: 100%|██████████| 30/30 [17:55<00:00, 35.90s/it]

In [54]:
get_similars(1, gmf)

['0    Toy Story (1995)',
 '3045    Toy Story 2 (1999)',
 "2286    Bug's Life, A (1998)",
 '584    Aladdin (1992)',
 '2252    Pleasantville (1998)',
 '360    Lion King, The (1994)',
 '591    Beauty and the Beast (1991)',
 '33    Babe (1995)',
 '1245    Groundhog Day (1993)',
 '1179    Princess Bride, The (1987)']

In [55]:
get_recommendations(4, gmf)

['1178    Star Wars: Episode V - The Empire Strikes Back...',
 '2502    Matrix, The (1999)',
 '1267    Ben-Hur (1959)',
 '585    Terminator 2: Judgment Day (1991)',
 '847    Godfather, The (1972)',
 '453    Fugitive, The (1993)',
 '1192    Star Wars: Episode VI - Return of the Jedi (1983)',
 '740    Dr. Strangelove or: How I Learned to Stop Worr...',
 '1203    Godfather: Part II, The (1974)',
 '1271    Indiana Jones and the Last Crusade (1989)']

In [56]:
metrics(gmf, dataset)

                                                    

NDCG@10: 0.092125
HR@k10: 0.183504




In [57]:
ncf_parametrs = {"data": dataset,
                 "lr":1e-2, 
                 "batch_size":512,
                 "epochs": 10,
                 "optimizer_type": "SGD",
                 "alpha": 0.5,
                 "model_type" : 'NCF'}

In [58]:
ncf = NCF(ncf_parametrs, mlp, gmf).to(device)
ncf.fit()

Loss: 775.59: 100%|██████████| 10/10 [05:23<00:00, 32.28s/it]

In [59]:
get_similars(1, ncf)

['0    Toy Story (1995)',
 '3045    Toy Story 2 (1999)',
 '33    Babe (1995)',
 '1245    Groundhog Day (1993)',
 '584    Aladdin (1992)',
 '1179    Princess Bride, The (1987)',
 '360    Lion King, The (1994)',
 '1250    Back to the Future (1985)',
 '352    Forrest Gump (1994)',
 '591    Beauty and the Beast (1991)']

In [60]:
get_recommendations(4, ncf)

['1178    Star Wars: Episode V - The Empire Strikes Back...',
 '1267    Ben-Hur (1959)',
 '847    Godfather, The (1972)',
 '585    Terminator 2: Judgment Day (1991)',
 '2502    Matrix, The (1999)',
 '453    Fugitive, The (1993)',
 '1192    Star Wars: Episode VI - Return of the Jedi (1983)',
 '740    Dr. Strangelove or: How I Learned to Stop Worr...',
 '1203    Godfather: Part II, The (1974)',
 '1950    Seven Samurai (The Magnificent Seven) (Shichin...']

In [61]:
metrics(ncf, dataset)

                                                    

NDCG@10: 0.127359
HR@k10: 0.247764




Не очень получилось, видимо где-то ошибка