In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pytorch_lightning.utilities.seed import seed_everything
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning import Trainer
import wandb

seed_everything(42)
wandb.login()
wandb_logger = WandbLogger(project='neural-collaborative-filtering')

Global seed set to 42
wandb: Currently logged in as: szymon_wozniak (use `wandb login --relogin` to force relogin)


In [3]:
from ncf import NCF
from movielens_dataset import MovielensDataModule
data = MovielensDataModule()

data.setup()

trainer = Trainer(
    logger=wandb_logger,
    max_epochs=5,
    gpus=-1
)


model = NCF(16, 64, 32, 0.2, data.n_users(), data.n_movies())

GPU available: True, used: True
TPU available: None, using: 0 TPU cores


In [4]:
trainer.fit(model, data)


  | Name           | Type      | Params
---------------------------------------------
0 | user_embedding | Embedding | 96.6 K
1 | item_embedding | Embedding | 62.1 K
2 | fc1            | Linear    | 2.1 K 
3 | d1             | Dropout   | 0     
4 | fc2            | Linear    | 2.1 K 
5 | d2             | Dropout   | 0     
6 | output         | Linear    | 33    
---------------------------------------------
162 K     Trainable params
0         Non-trainable params
162 K     Total params
0.652     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

1

In [5]:
def show_user_known_ratings(user_id, top_ratings: int, data: MovielensDataModule):
    whole_data = data.users.merge(data.ratings, left_on='UserID', right_on='UserID').merge(data.movies, left_on='MovieID', right_on='MovieID')
    user_ratings = whole_data[whole_data['UserID'] == user_id]
    ratings_ordered = user_ratings.sort_values(by=['Rating', 'Timestamp'], ascending=[False, True])
    top = ratings_ordered.head(top_ratings)[['MovieID', 'Title', 'Genres', 'Rating']]
    return top

In [6]:
import torch

def get_user_unrated_movies(user_id: int, data: MovielensDataModule):
    user_ratings = data.ratings[data.ratings['UserID'] == user_id]
    temp_movies = data.movies.merge(user_ratings, how='left', indicator=True)
    unrated_movies = temp_movies[temp_movies['_merge'] == "left_only"][['MovieID', 'Title', 'Genres', 'm_id']]
    return unrated_movies

def recommend_new_movies(user_id, data: MovielensDataModule, model: NCF):
    unrated_movies = get_user_unrated_movies(user_id, data)
    unrated_movies['UserID'] = user_id
    unrated_movies['u_id'] = data.users[data.users['UserID'] == user_id]['u_id'].values[0]
    model_user_ids = torch.from_numpy(unrated_movies['u_id'].values[:, None])
    model_movies_ids = torch.from_numpy(unrated_movies['m_id'].values[:, None])

    predictions = model(model_user_ids, model_movies_ids).detach().squeeze(-1).numpy()
    print(predictions.shape)
    unrated_movies['predicted'] = predictions
    unrated_movies.sort_values(by='predicted', ascending=False, inplace=True)
    return unrated_movies

In [16]:
import pandas as pd
show_user_known_ratings(2, 150, data)

Unnamed: 0,MovieID,Title,Genres,Rating
66518,1293,Gandhi (1982),Drama,5
67349,1225,Amadeus (1984),Drama,5
12641,1193,One Flew Over the Cuckoo's Nest (1975),Drama,5
68731,318,"Shawshank Redemption, The (1994)",Drama,5
74601,1945,On the Waterfront (1954),Crime|Drama,5
77298,593,"Silence of the Lambs, The (1991)",Drama|Thriller,5
79876,515,"Remains of the Day, The (1993)",Drama,5
80326,3468,"Hustler, The (1961)",Drama,5
82178,2501,October Sky (1999),Drama,5
83187,110,Braveheart (1995),Action|Drama|War,5


In [12]:
recommend_new_movies(2, data, model).head(25)


(3754, 1)


Unnamed: 0,MovieID,Title,Genres,m_id,UserID,u_id,predicted
1328,1349,Nosferatu a Venezia (1986),Horror,1328,2,1,0.735111
3185,3254,Wayne's World 2 (1993),Comedy,3185,2,1,0.725243
3103,3172,Ulysses (Ulisse) (1954),Adventure,3103,2,1,0.725149
246,249,Immortal Beloved (1994),Drama|Romance,246,2,1,0.719561
3771,3841,Air America (1990),Action|Comedy,3771,2,1,0.719549
270,273,Mary Shelley's Frankenstein (1994),Drama|Horror,270,2,1,0.719351
840,851,Basquiat (1996),Drama,840,2,1,0.719255
3855,3925,Stranger Than Paradise (1984),Comedy,3855,2,1,0.719123
691,700,Angus (1995),Comedy,691,2,1,0.718301
1969,2038,"Cat from Outer Space, The (1978)",Children's|Comedy|Sci-Fi,1969,2,1,0.718168


In [13]:
data.users[data.users['UserID'] == 2]


Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code,u_id
1,2,M,56,16,70072,1


In [14]:
data.ratings


Unnamed: 0,UserID,MovieID,Rating,Timestamp,m_id,u_id,rating_scaled
456790,6040,858,4,956703932,847,6039,0.75
456672,6040,593,5,956703954,589,6039,0.75
456732,6040,2384,4,956703954,2315,6039,0.75
456641,6040,1961,4,956703977,1892,6039,1.00
456842,6040,2019,5,956703977,1950,6039,0.50
...,...,...,...,...,...,...,...
373230,4958,2399,1,1046454338,2330,4957,0.75
373293,4958,1407,5,1046454443,1384,4957,0.50
373382,4958,2634,3,1046454548,2565,4957,0.25
373229,4958,3264,4,1046454548,3195,4957,0.50


In [17]:
model.item_embedding

Embedding(3883, 16)

In [19]:
data.movies


Unnamed: 0,MovieID,Title,Genres,m_id
0,1,Toy Story (1995),Animation|Children's|Comedy,0
1,2,Jumanji (1995),Adventure|Children's|Fantasy,1
2,3,Grumpier Old Men (1995),Comedy|Romance,2
3,4,Waiting to Exhale (1995),Comedy|Drama,3
4,5,Father of the Bride Part II (1995),Comedy,4
...,...,...,...,...
3878,3948,Meet the Parents (2000),Comedy,3878
3879,3949,Requiem for a Dream (2000),Drama,3879
3880,3950,Tigerland (2000),Drama,3880
3881,3951,Two Family House (2000),Drama,3881


In [50]:
toy_story = model.item_embedding.weight[0].detach().numpy()

In [51]:
all_embeddings = model.item_embedding.weight.detach().numpy()

In [52]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
toy_story = toy_story[None, :]

In [53]:
similarity = cosine_similarity(toy_story, all_embeddings)
distances = euclidean_distances(toy_story, all_embeddings)

In [54]:
similarity

array([[ 0.99999994,  0.03530335,  0.38939586, ..., -0.28143138,
         0.21651693,  0.07017481]], dtype=float32)

In [55]:
import numpy as np
most_similar = np.flip(similarity.argsort())
most_similar_2 = distances.argsort()

In [56]:
most_similar

array([[   0, 3243, 2635, ..., 2783, 3199, 1375]], dtype=int64)

In [57]:
data.movies.iloc[most_similar.squeeze()].head(10)

Unnamed: 0,MovieID,Title,Genres,m_id
0,1,Toy Story (1995),Animation|Children's|Comedy,0
3243,3312,"McCullochs, The (1975)",Drama,3243
2635,2704,"Lovers on the Bridge, The (Les Amants du Pont-...",Drama|Romance,2635
2455,2524,"Towering Inferno, The (1974)",Action|Drama,2455
853,864,"Wife, The (1995)",Comedy|Drama,853
468,472,I'll Do Anything (1994),Comedy|Drama,468
2612,2681,Free Enterprise (1998),Comedy|Romance|Sci-Fi,2612
919,931,Spellbound (1945),Mystery|Romance|Thriller,919
2296,2365,King Kong vs. Godzilla (Kingukongu tai Gojira)...,Action|Sci-Fi,2296
3625,3694,"Toxic Avenger, Part II, The (1989)",Comedy|Horror,3625


In [58]:
data.movies.iloc[most_similar_2.squeeze()].head(10)


Unnamed: 0,MovieID,Title,Genres,m_id
0,1,Toy Story (1995),Animation|Children's|Comedy,0
2635,2704,"Lovers on the Bridge, The (Les Amants du Pont-...",Drama|Romance,2635
853,864,"Wife, The (1995)",Comedy|Drama,853
919,931,Spellbound (1945),Mystery|Romance|Thriller,919
2455,2524,"Towering Inferno, The (1974)",Action|Drama,2455
3625,3694,"Toxic Avenger, Part II, The (1989)",Comedy|Horror,3625
2612,2681,Free Enterprise (1998),Comedy|Romance|Sci-Fi,2612
468,472,I'll Do Anything (1994),Comedy|Drama,468
2135,2204,Saboteur (1942),Thriller,2135
3243,3312,"McCullochs, The (1975)",Drama,3243
