<a href="https://colab.research.google.com/github/upriyam-cmu/EDGE-Rec/blob/main/execute.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install project code

In [None]:
# Remove any previously installed copy of the package, then install the
# current directory in editable mode.
# NOTE(review): `-e .` assumes the working directory is the repository root — confirm when running on Colab.
!pip uninstall -y edge-rec
!pip install -e .

# Train model

In [None]:
from edge_rec.datasets import MovieLensDataHolder, RatingsTransform, FeatureTransform

from edge_rec.model import GraphReconstructionModel, GraphTransformer
from edge_rec.model.embed import MovieLensFeatureEmbedder, SinusoidalPositionalEmbedding

from edge_rec.diffusion import GaussianDiffusion
from edge_rec.exec import Trainer, compute_metrics_from_ratings as compute_metrics

from torch import nn

In [None]:
# Build the dataset holder. `ml100k=True` presumably selects the
# MovieLens-100k variant — confirm against MovieLensDataHolder.
data_holder = MovieLensDataHolder(
    ml100k=True,
    augmentations={
        "ratings": RatingsTransform.ToGaussian(),
        # degree 2 --> dim_size = 2 (for embedder, below)
        "rating_counts": FeatureTransform.LogPolynomial(2),
    },
)

In [None]:
# Feature embedder: only the user-ID and movie-ID dims are given a size (128);
# every other feature dim is passed as None.
# NOTE(review): None presumably disables that feature's embedding — confirm.
embed = MovieLensFeatureEmbedder(
    ml100k=True,
    user_id_dim=128,
    user_age_dim=None,
    user_gender_dim=None,
    user_occupation_dim=None,
    user_rating_counts_dims=None,
    movie_id_dim=128,
    movie_genre_ids_dim=None,
    movie_genre_multihot_dims=None,
    movie_rating_counts_dims=None,
)

# Transformer core; its feature sizes are read from the embedder so the two stay in sync.
core = GraphTransformer(
    n_blocks=16,
    n_channels=1,
    n_channels_internal=5,
    n_features=embed.output_sizes,
    time_embedder=SinusoidalPositionalEmbedding(32),
    attn_kwargs={
        "heads": 4,
        "dim_head": 32,
        "num_mem_kv": 4,
        "speed_hack": True,
        "share_weights": False,
        "dropout": 0.1,
    },
    feed_forward_kwargs={
        "hidden_dims": (2, 4, 2),
        "activation_fn": nn.SiLU(),
    },
)

# Combine embedder and core into the model handed to the diffusion wrapper.
model = GraphReconstructionModel(embed, core, feature_dim_size=None)

In [None]:
# Wrap the model for diffusion. `image_size` (50) matches the `subgraph_size`
# used for both datasets below.
diffusion_model = GaussianDiffusion(model, image_size=50)

trainer = Trainer(
    # --- model ---
    diffusion_model=diffusion_model,
    # --- datasets: same subgraph settings, train vs. test split ---
    train_dataset=data_holder.get_dataset(subgraph_size=50, target_density=None, train=True),
    test_dataset=data_holder.get_dataset(subgraph_size=50, target_density=None, train=False),
    # --- training schedule ---
    batch_size=1,
    gradient_accumulate_every=1,
    force_batch_size=True,
    train_num_steps=100_000,
    train_mask_unknown_ratings=True,
    # --- evaluation ---
    eval_batch_size=None,  # copy training batch size if None
    n_eval_iters=100,
    eval_every=200,
    sample_on_eval=False,
    # --- optimizer ---
    train_lr=1e-4,
    adam_betas=(0.9, 0.99),
    max_grad_norm=1.0,
    # --- logging / checkpoints ---
    results_folder="./results",
    ema_update_every=10,
    ema_decay=0.995,
    save_every_nth_eval=1,
    use_wandb=False,
    # --- accelerator ---
    amp=False,
    mixed_precision_type="fp16",
    split_batches=True,
)
print("Using device:", trainer.device)

In [None]:
# Start training with the Trainer configured in the previous cell.
trainer.train()

# Sample ratings

In [None]:
def eval_model(use_inpainting: bool, milestone: int):
    """Sample a denoised rating graph for one subgraph at a given milestone.

    Args:
        use_inpainting: Forwarded to ``trainer.eval`` as ``do_inpainting_sampling``.
        milestone: Forwarded to ``trainer.eval`` as ``milestone``.

    Returns:
        A tuple ``(denoised_graph, rating_data_train, rating_data_test)``: the
        output of ``trainer.eval`` plus the train-edge and test-edge slices of
        the same subgraph.
    """
    # NOTE(review): the original comment on this call read "full graph", but
    # the arguments are (50, 0.7) — confirm what they select.
    users, products = data_holder.get_subgraph_indices(50, 0.7)
    selection = {"user_indices": users, "product_indices": products}

    # Slice the same users/products twice: once with train edges, once with test edges.
    train_edges = data_holder.slice_subgraph(
        **selection,
        return_train_edges=True,
        return_test_edges=False,
    )
    test_edges = data_holder.slice_subgraph(
        **selection,
        return_train_edges=False,
        return_test_edges=True,
    )

    # A clone is passed to the sampler; the un-cloned train slice is returned to the caller.
    # Overrides left disabled in the original: batch_size=16, subgraph_size=128.
    sampled_graph = trainer.eval(
        rating_data=train_edges.clone(),
        milestone=milestone,
        do_inpainting_sampling=use_inpainting,
        tiled_sampling=False,
        silence_inner_tqdm=True,
    )
    return sampled_graph, train_edges, test_edges

In [None]:
def compute_all_metrics(milestones: tuple, n_samples: int = 5):
    """Average evaluation metrics over repeated samples for each milestone.

    For every milestone, ``eval_model`` is called ``n_samples`` times with
    inpainting enabled, ``compute_metrics`` is applied to each sampled graph,
    and the results are averaged per metric key.

    Args:
        milestones: Milestones to evaluate; each is forwarded to ``eval_model``.
        n_samples: Number of samples drawn per milestone. Must be at least 1.

    Returns:
        A dict mapping each milestone to ``{metric key: mean over samples}``.

    Raises:
        ValueError: If ``n_samples`` is less than 1, or if two samples of the
            same milestone report different sets of metric keys.
    """
    if n_samples < 1:
        raise ValueError(f"n_samples must be at least 1, got {n_samples}")

    metrics_per_milestone = {}
    for milestone in milestones:
        # Metric keys come from compute_metrics itself rather than from a
        # global defined in a later cell, so this cell runs in notebook order.
        all_metrics = {}
        for _ in range(n_samples):
            denoised_graph, rating_data_train, rating_data_test = eval_model(
                use_inpainting=True,
                milestone=milestone,
            )
            metrics = compute_metrics(
                predicted_graph=denoised_graph,
                train_rating_data=rating_data_train,
                test_rating_data=rating_data_test,
                rating_transform=data_holder.ratings_transform,
            )
            if not all_metrics:
                all_metrics = {key: [] for key in metrics}
            elif all_metrics.keys() != metrics.keys():
                # Averaging is only meaningful if every sample reports the same metrics.
                raise ValueError(
                    f"inconsistent metric keys for milestone {milestone}: "
                    f"expected {sorted(all_metrics)}, got {sorted(metrics)}"
                )
            for key, stats in metrics.items():
                all_metrics[key].append(stats)
        metrics_per_milestone[milestone] = {
            key: sum(values) / len(values)
            for key, values in all_metrics.items()
        }
    return metrics_per_milestone

In [None]:
# Average the metrics over 10 samples for each of the four milestones listed.
all_metrics = compute_all_metrics(milestones=(4000, 6000, 8000, 10000), n_samples=10)

# Evaluate metrics

In [None]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.ticker import (MultipleLocator, AutoMinorLocator)

# Global figure settings, applied before selecting the 'bmh' style sheet.
mpl.rcParams.update({
    'figure.dpi': 300,
    'font.size': 4,
})
plt.style.use('bmh')

# Metric key -> short human-readable title used on the subplots.
hrv_name = dict(
    precision='Precision',
    recall='Recall',
    mean_reciprocal_rank='MRR',
    hit_rate='HR',
    ndcg='NDCG',
)


def plot_metrics(metrics: dict, plot_name: str, nested_key_name=None):
    """Plot precision, recall, NDCG, MRR and hit rate against top-K, one subplot each.

    Args:
        metrics: Either ``{metric name: values}`` when ``nested_key_name`` is
            None, or ``{outer key: {metric name: values}}`` otherwise. Each
            ``values`` sequence is plotted against K = (1, 5, 10, 20, 30, 40, 50),
            so it must have one entry per K.
        plot_name: Text used as the y-axis label of the first (precision) subplot.
        nested_key_name: If given, ``metrics`` is treated as nested; one curve is
            drawn per outer key (in sorted order) and a legend with entries of
            the form ``"<nested_key_name>=<key>"`` is added.
    """
    def _format_plot(ax, curve_name, min_value, max_value):
        # Title, x-grid and integer tick labels every 10 along the K axis.
        ax.set_title(hrv_name[curve_name])
        ax.xaxis.grid(True, which='major')
        ax.xaxis.set_major_locator(MultipleLocator(10))
        ax.xaxis.set_major_formatter('{x:.0f}')

        ax.set_xlabel("Top-K")
        # Scale the y-range against the ~50-wide K axis. A flat curve has zero
        # range, which would give a non-finite aspect, so fall back to 'auto'.
        value_range = max_value - min_value
        if value_range > 0:
            ax.set_aspect(50 / value_range)
        else:
            ax.set_aspect('auto')

    def _plot(ax, data):
        ks = (1, 5, 10, 20, 30, 40, 50)
        ax.plot(ks, data, '.-', linewidth=1.0, markersize=4.0)

    # Outer keys are sorted once so curve order and legend order agree.
    keys = sorted(metrics) if nested_key_name is not None else None

    fig, axs = plt.subplots(1, 5)
    for ax, name in zip(axs, ['precision', 'recall', 'ndcg', 'mean_reciprocal_rank', 'hit_rate']):
        if nested_key_name is None:
            _format_plot(ax, name, min_value=np.min(metrics[name]), max_value=np.max(metrics[name]))
            _plot(ax, metrics[name])
        else:
            # The axis range is taken over every outer key so all curves share one scale.
            all_data = [m[name] for m in metrics.values()]
            _format_plot(ax, name, min_value=np.min(all_data), max_value=np.max(all_data))
            for key in keys:
                _plot(ax, metrics[key][name])

        if name == 'precision':
            ax.set_ylabel(plot_name)

    plt.tight_layout(h_pad=-25.0)
    if nested_key_name is not None:
        plt.legend([f"{nested_key_name}={key}" for key in keys])

    plt.show()

In [None]:
# One curve per milestone; the legend entries read "milestone=<key>".
plot_metrics(metrics=all_metrics, plot_name="ML-100k", nested_key_name="milestone")

# Display sampled ratings distribution

In [None]:
# Draw one inpainted sample at milestone 10000; the train/test slices that
# eval_model also returns are discarded.
denoised_graph, _, _ = eval_model(use_inpainting=True, milestone=10000)

In [None]:
# Histogram (20 bins) of the sampled ratings after inverting the ratings transform.
plt.figure(figsize=(3, 2))
plt.hist(
    data_holder.ratings_transform.invert(denoised_graph).numpy().ravel(),
    bins=20,
)
plt.show()