In [1]:
import logging
import math
import random
import time
from sklearn.manifold import TSNE
import networkx as nx
import torch
from ignite.contrib.handlers import TensorboardLogger, global_step_from_engine
from ignite.engine import (
    Engine,
    Events,
    create_supervised_evaluator,
    create_supervised_trainer,
)
from ignite.handlers import ModelCheckpoint
from ignite.metrics import Accuracy, Loss
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter
from torchvision.datasets import MNIST
from torchvision.models import resnet18
from torchvision.transforms import Compose, Normalize, ToTensor
from tqdm import tqdm
from utils import construct_graph, random_walk

logging.basicConfig(level=logging.INFO)

# Seed every RNG this notebook draws from so that Restart & Run All gives the
# same result: TripletDataset samples with `random`, and the model weights are
# initialised by torch.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# Directory holding movies.csv and ratings.csv (note the trailing slash, the
# file names are appended by plain string concatenation).
PATH = "./ml-latest-small/"

logging.info("Constructing graph")
movies_csv, ratings_csv = PATH + "movies.csv", PATH + "ratings.csv"
movies_graph, movies = construct_graph(movies_csv, ratings_csv)

logging.info("Sampling walks")
# NOTE(review): the positional arguments 5, 10, 1, 1 are opaque from this
# notebook — TODO confirm their meaning against utils.random_walk.
walks = random_walk(movies_graph, 5, 10, 1, 1)

# Two-way mapping between graph node labels and contiguous integer ids,
# following the graph's own node iteration order.
id2nodes = list(movies_graph.nodes())
nodes2id = {node: position for position, node in enumerate(id2nodes)}

# Title of the first `movies` row whose movieId equals each node label,
# aligned index-for-index with id2nodes.
id2title = [
    movies.loc[movies.movieId == node, "title"].iloc[0] for node in id2nodes
]

INFO:root:Constructing graph
100%|██████████| 573/573 [00:00<00:00, 2461.46it/s]
INFO:root:Sampling walks
100%|██████████| 1405/1405 [00:02<00:00, 650.03it/s]
100%|██████████| 1405/1405 [00:01<00:00, 851.72it/s]
100%|██████████| 1405/1405 [00:01<00:00, 828.97it/s]
100%|██████████| 1405/1405 [00:01<00:00, 718.21it/s]
100%|██████████| 1405/1405 [00:02<00:00, 686.50it/s]


In [20]:
class TripletDataset(Dataset):
    """Map-style dataset yielding ``(anchor, positive, negative)`` node ids.

    Every item is built around one node of the graph: the positive is a
    randomly chosen neighbour of the anchor, the negative a randomly chosen
    node that is neither the anchor itself nor one of its neighbours.

    Args:
        movies_graph: Graph object exposing ``adj[node]`` as a mapping keyed
            by that node's neighbours.
        id2nodes: Sequence mapping an integer index to a node label.
        nodes2id: Mapping from a node label back to its integer id.
    """

    def __init__(self, movies_graph, id2nodes, nodes2id):
        self.G = movies_graph
        self.id2nodes = id2nodes
        # Set of all node labels; the pool negatives are drawn from.
        self.nodeIdx = set(id2nodes)
        self.nodes2id = nodes2id

    def __len__(self):
        return len(self.nodeIdx)

    def __getitem__(self, idx):
        """Sample one triplet for the node stored at position ``idx``.

        Returns:
            Tuple ``(anchor_id, positive_id, negative_id)`` looked up in
            ``nodes2id``.

        Raises:
            IndexError: If the anchor has no neighbour to use as a positive,
                or no remaining node to use as a negative.
        """
        anchor = self.id2nodes[idx]
        neighborhood = set(self.G.adj[anchor].keys())
        # The anchor is not in its own adjacency (absent a self-loop), so a
        # plain "all nodes minus neighbours" pool would still contain it and
        # it could be drawn as its own negative. Exclude it explicitly.
        neg_neighborhood = self.nodeIdx - neighborhood - {anchor}

        if not neighborhood:
            raise IndexError(
                f"node {anchor!r} has no neighbours to sample a positive from"
            )
        if not neg_neighborhood:
            raise IndexError(
                f"node {anchor!r} has no non-neighbour to sample a negative from"
            )

        pos = random.choice(list(neighborhood))
        neg = random.choice(list(neg_neighborhood))
        return self.nodes2id[anchor], self.nodes2id[pos], self.nodes2id[neg]
# Smoke test: draw one (anchor, positive, negative) id triplet for index 4.
TripletDataset(movies_graph, id2nodes, nodes2id)[4]

(4, 238, 558)

In [36]:
class SimpleNN(nn.Module):
    """Embedding table followed by a two-layer MLP projection head.

    Args:
        num_nodes: Number of rows in the embedding table (one per node id).
        embedding_size: Width of the looked-up embedding and of the output.
        hidden_size: Width of the MLP's hidden layer. Defaults to 128, the
            value that used to be hard-coded.
    """

    def __init__(self, num_nodes, embedding_size, hidden_size=128):
        super().__init__()
        self.embedding = nn.Embedding(num_nodes, embedding_size)
        # Layer order (and therefore state_dict keys fc.0 / fc.2) is unchanged.
        self.fc = nn.Sequential(
            nn.Linear(embedding_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, embedding_size),
        )

    def forward(self, x):
        """Return the projected embedding for every node id in ``x``.

        Args:
            x: Integer tensor of node ids.

        Returns:
            Float tensor with one trailing dimension of size
            ``embedding_size`` appended to the shape of ``x``.
        """
        return self.fc(self.embedding(x))


# Dataset and loader. DataLoader defaults apply: batch_size=1, no shuffling.
dataset = TripletDataset(movies_graph, id2nodes, nodes2id)
train_loader = DataLoader(dataset)

# NOTE: despite its name this is the number of nodes (rows of the embedding
# table), not the embedding width; the width is the literal 256 below. The
# name and value are kept as-is because other code in this notebook reads it.
embedding_size = len(dataset)

# Move the model to `device` explicitly: train_step sends every batch there,
# so a model left on the CPU fails as soon as CUDA is available. This happens
# before the optimizer is built so it is handed the on-device parameters.
model = SimpleNN(embedding_size, 256).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
criterion = nn.TripletMarginLoss()

# `trainer` is built from train_step via Engine(...) further down this cell;
# the create_supervised_trainer engine formerly assigned here was overwritten
# before any use, so it is no longer created.

val_metrics = {"loss": Loss(criterion)}


def train_step(engine, batch):
    """Run one optimisation step on a batch of (anchor, positive, negative) ids.

    Args:
        engine: The Engine driving the loop (not used by this function).
        batch: Indexable holding three id tensors at positions 0, 1 and 2:
            anchors, positives and negatives.

    Returns:
        The value of the module-level ``criterion`` on this batch, as a
        Python float.
    """
    model.train()
    optimizer.zero_grad()

    anchor_ids, positive_ids, negative_ids = (batch[i].to(device) for i in range(3))
    anchor_emb = model(anchor_ids)
    positive_emb = model(positive_ids)
    negative_emb = model(negative_ids)

    batch_loss = criterion(anchor_emb, positive_emb, negative_emb)
    batch_loss.backward()
    optimizer.step()
    return batch_loss.item()
# Engine that calls train_step once per batch it is run over.
trainer = Engine(train_step)

def validation_step(engine, batch):
    """Embed the anchor ids of a batch without tracking gradients.

    Args:
        engine: The Engine driving evaluation (not used by this function).
        batch: Indexable whose first element is a tensor of anchor ids; any
            further elements are ignored.

    Returns:
        The model output for the anchor ids.
    """
    model.eval()
    anchor_ids = batch[0].to(device)
    with torch.no_grad():
        return model(anchor_ids)
# Engine that calls validation_step once per batch it is run over.
train_evaluator = Engine(validation_step)


@trainer.on(Events.EPOCH_COMPLETED)
def log_training_loss(engine):
    """Print the epoch number, iteration count and the engine's latest output.

    ``engine.state.output`` holds what the step function returned for the most
    recent iteration, so the printed loss is that of the epoch's last batch,
    not an average over the epoch.
    """
    summary = f"Epoch[{engine.state.epoch}], Iter[{engine.state.iteration}] Loss: {engine.state.output:.2f}"
    print(summary)
    

tb_logger = TensorboardLogger(log_dir="tb-logger")


@trainer.on(Events.COMPLETED)
def log_emb(engine):
    """Project the trained node embeddings to 2-D and log them to TensorBoard.

    Fired once when the trainer finishes. Every node id is pushed through the
    model in a single forward pass, the result is reduced to two dimensions
    with t-SNE, and the points are written with the movie titles as metadata.

    Args:
        engine: The Engine that fired the event (not used by this function).
    """
    model.eval()
    # Build the ids on whatever device the model actually lives on, rather
    # than assuming it was moved to the global `device`.
    model_device = next(model.parameters()).device
    # Ids 0..N-1 in order, so row i of the result lines up with id2title[i].
    # This replaces iterating train_loader, which sampled a positive and a
    # negative for every node only to throw them away.
    node_ids = torch.arange(len(nodes2id), device=model_device)
    with torch.no_grad():
        # .cpu() is required before .numpy() when the model runs on CUDA.
        embeddings = model(node_ids).cpu()

    tsne = TSNE(n_components=2, random_state=0)
    embeddings_2d = tsne.fit_transform(embeddings.numpy())
    # add_embedding is a SummaryWriter method; reach it through the logger's
    # `writer` attribute instead of relying on attribute forwarding.
    tb_logger.writer.add_embedding(embeddings_2d, metadata=id2title)

def _as_batch_loss(loss):
    """Wrap the trainer's scalar output in the dict the logger records."""
    return {"batch_loss": loss}


# Write the trainer's output to TensorBoard under the "training" tag once per
# completed epoch (the event is EPOCH_COMPLETED, not a per-iteration one).
tb_logger.attach_output_handler(
    trainer,
    tag="training",
    event_name=Events.EPOCH_COMPLETED,
    output_transform=_as_batch_loss,
)

<ignite.engine.events.RemovableEventHandle at 0x7fe612769790>

In [37]:
# Train for 5 epochs over train_loader; the value returned by run() is shown as the cell output.
trainer.run(train_loader, max_epochs=5)

INFO:ignite.engine.engine.Engine:Engine run starting with max_epochs=5.
INFO:ignite.engine.engine.Engine:Epoch[1] Complete. Time taken: 00:00:05.343


Epoch[1], Iter[1405] Loss: 80.68


INFO:ignite.engine.engine.Engine:Epoch[2] Complete. Time taken: 00:00:05.400


Epoch[2], Iter[2810] Loss: 0.00


INFO:ignite.engine.engine.Engine:Epoch[3] Complete. Time taken: 00:00:05.679


Epoch[3], Iter[4215] Loss: 104.92


INFO:ignite.engine.engine.Engine:Epoch[4] Complete. Time taken: 00:00:07.685


Epoch[4], Iter[5620] Loss: 11.31


INFO:ignite.engine.engine.Engine:Epoch[5] Complete. Time taken: 00:00:08.580
INFO:ignite.engine.engine.Engine:Engine run complete. Time taken: 00:00:32.692


Epoch[5], Iter[7025] Loss: 0.00


State:
	iteration: 7025
	epoch: 5
	epoch_length: 1405
	max_epochs: 5
	output: 0.0
	batch: <class 'list'>
	metrics: <class 'dict'>
	dataloader: <class 'torch.utils.data.dataloader.DataLoader'>
	seed: <class 'NoneType'>
	times: <class 'dict'>

In [38]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.manifold import TSNE

# Plain-PyTorch training loop (no ignite Engine). Relies on TripletDataset,
# SimpleNN, SummaryWriter, movies_graph, id2nodes and nodes2id being defined
# by earlier cells.
writer = SummaryWriter()

# Dataset and DataLoader setup (DataLoader defaults: batch_size=1, no shuffling)
dataset = TripletDataset(movies_graph, id2nodes, nodes2id)
train_loader = DataLoader(dataset)
# NOTE: despite its name this is the number of nodes (rows of the embedding
# table), not the embedding width; the width is the literal 256 below.
embedding_size = len(dataset)

# Device configuration (for GPU/CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model, Optimizer, Criterion. The model is moved to the device before the
# optimizer is created so the optimizer is handed the on-device parameters.
model = SimpleNN(embedding_size, 256).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
criterion = nn.TripletMarginLoss()

# Training Loop
num_epochs = 10  # Set the number of epochs
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for step, batch in enumerate(train_loader):
        a, p, n = batch[0].to(device), batch[1].to(device), batch[2].to(device)

        optimizer.zero_grad()
        a, p, n = model(a), model(p), model(n)
        loss = criterion(a, p, n)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        running_loss += batch_loss
        # The original line here was a bare `writer` expression, which logged
        # nothing. Record the per-batch loss against a global step counter.
        writer.add_scalar(
            "training/batch_loss", batch_loss, epoch * len(train_loader) + step
        )

    # Print and log the mean loss over the epoch
    epoch_loss = running_loss / len(train_loader)
    writer.add_scalar("training/epoch_loss", epoch_loss, epoch + 1)
    print(f"Epoch[{epoch + 1}], Loss: {epoch_loss:.2f}")

    # Validation or additional logging can be added here

# Make sure everything logged so far reaches disk; the writer is left open in
# case later cells keep using it.
writer.flush()

# Generate Embeddings for Visualization (optional)
# One forward pass over ids 0..N-1 yields an (N, 256) matrix whose row i is
# node i. The previous (N, embedding_size) buffer was N x N, so assigning the
# 256-wide model output into it could not work, and it was indexed with a
# tensor that may live on the GPU.
model.eval()
with torch.no_grad():
    node_ids = torch.arange(len(nodes2id), device=device)
    embeddings = model(node_ids).cpu()  # Move to CPU for TSNE

# TSNE for dimensionality reduction
tsne = TSNE(n_components=2, random_state=0)
embeddings_np = embeddings.numpy()
embeddings_2d = tsne.fit_transform(embeddings_np)

# Here you can add code to log or visualize the embeddings


Epoch[1], Loss: 8.10
Epoch[2], Loss: 12.72
Epoch[3], Loss: 14.14
Epoch[4], Loss: 14.37
Epoch[5], Loss: 16.40
Epoch[6], Loss: 17.25
Epoch[7], Loss: 18.13
Epoch[8], Loss: 21.57
Epoch[9], Loss: 18.20


KeyboardInterrupt: 