In [1]:
# training a skipgram model for networks

from pathlib import Path

import numpy as np
import pandas as pd

# Load the raw auction table; the columns "class", "uge" and "supplier_name"
# are the item, buyer and bidder identifiers used below.
auction_items = pd.read_csv("assets/auction_item_buyer_bidder.csv")

# Dense, zero-based index for every distinct raw value, in order of first
# appearance in the table.
item_to_idx = {item: idx for idx, item in enumerate(auction_items["class"].unique())}
bidder_to_idx = {
    bidder: idx for idx, bidder in enumerate(auction_items["supplier_name"].unique())
}
buyer_to_idx = {buyer: idx for idx, buyer in enumerate(auction_items["uge"].unique())}

# Vocabulary sizes are taken from the mappings themselves. Series.nunique()
# drops NaN by default while Series.unique() keeps it, so counting with
# nunique() could yield a size smaller than the largest index assigned above.
num_items = len(item_to_idx)
num_bidders = len(bidder_to_idx)
num_buyers = len(buyer_to_idx)

# Reverse lookups: dense index -> original raw value.
item_to_idx_r = {idx: item for item, idx in item_to_idx.items()}
bidder_to_idx_r = {idx: bidder for bidder, idx in bidder_to_idx.items()}
buyer_to_idx_r = {idx: buyer for buyer, idx in buyer_to_idx.items()}

# From here on `auction_items` is a NumPy array and is indexed positionally.
auction_items = auction_items.to_numpy()

In [3]:
import torch
from torch.utils.data import Dataset, DataLoader


class AuctionDataset(Dataset):
    """Map-style dataset over the rows of the module-level `auction_items` array.

    Each sample is the tuple (buyer_idx, supplier_idx, item_idx, buyers_vec):
    the first three are dense indices looked up in the module-level
    `buyer_to_idx`, `bidder_to_idx` and `item_to_idx` dictionaries, and
    `buyers_vec` is every remaining column of the row (column 3 onwards).
    """

    def __init__(self, n_samples):
        """
        Args:
            n_samples (int): Length reported by the dataset, i.e. how many
                leading rows of `auction_items` a sampler will draw from.
        """
        self.n_samples = n_samples

    def __len__(self):
        return self.n_samples

    def __getitem__(self, idx):
        row = auction_items[idx]
        # Row layout: column 0 = item key, 1 = buyer key, 2 = supplier key.
        buyer_idx = buyer_to_idx[row[1]]
        supplier_idx = bidder_to_idx[row[2]]
        item_idx = item_to_idx[row[0]]
        return buyer_idx, supplier_idx, item_idx, row[3:]


def auction_collate_fn(batch):
    """Collate (buyer_idx, supplier_idx, item_idx, buyers_vec) samples into tensors.

    Args:
        batch: list of 4-tuples as described above.

    Returns:
        Tuple of four tensors:
        - buyer_ids:    (batch_size,) long
        - supplier_ids: (batch_size,) long
        - item_ids:     (batch_size,) long
        - buyers_vec:   (batch_size, n_buyers) float
    """

    def _id_column(position):
        # Gather one integer field from every sample into a 1-D long tensor.
        return torch.tensor([sample[position] for sample in batch], dtype=torch.long)

    buyer_ids = _id_column(0)
    supplier_ids = _id_column(1)
    item_ids = _id_column(2)

    # Convert each per-sample vector to float, then stack along a new batch axis.
    vector_rows = [torch.tensor(sample[3], dtype=torch.float) for sample in batch]
    buyers_vec = torch.stack(vector_rows)

    return buyer_ids, supplier_ids, item_ids, buyers_vec


def make_dataloader(n_samples, batch_size):
    """Build a shuffling DataLoader over an AuctionDataset of `n_samples` items.

    Batches are assembled by `auction_collate_fn`.
    """
    return DataLoader(
        AuctionDataset(n_samples),
        batch_size=batch_size,
        shuffle=True,
        collate_fn=auction_collate_fn,
    )

In [4]:
import torch
import torch.nn as nn


class AuctionEmbeddingModel(nn.Module):
    """Buyer embedding table feeding three independent linear heads.

    A buyer index is embedded into `embed_dim` dimensions, and that single
    vector is projected to supplier logits, buyer logits and item logits.
    """

    def __init__(self, n_buyers, n_suppliers, n_items, embed_dim=32):
        super().__init__()

        # One learnable row of size embed_dim per buyer index.
        self.buyer_embedding = nn.Embedding(n_buyers, embed_dim)

        # Output heads; each maps the buyer embedding to one logit per class.
        self.supplier_head = nn.Linear(embed_dim, n_suppliers)
        self.buyer_head = nn.Linear(embed_dim, n_buyers)
        self.item_head = nn.Linear(embed_dim, n_items)

    def forward(self, buyer_ids):
        """
        Args:
            buyer_ids: (batch_size,) tensor of buyer indices.

        Returns:
            (supplier_logits, buyer_logits, item_logits) with shapes
            (batch_size, n_suppliers), (batch_size, n_buyers) and
            (batch_size, n_items).
        """
        hidden = self.buyer_embedding(buyer_ids)  # (batch_size, embed_dim)
        return (
            self.supplier_head(hidden),
            self.buyer_head(hidden),
            self.item_head(hidden),
        )

In [4]:
import torch.optim as optim


def train(
    n_samples=10000,
    batch_size=64,
    embed_dim=128,
    epochs=5,
    lr=1e-3,
    weight_suppliers=1.0,
    weight_buyers=1.0,
    weight_items=1.0,
    device="cpu",
    seed=None,
):
    """Train an AuctionEmbeddingModel and return it.

    Vocabulary sizes are read from the module-level `num_buyers`,
    `num_bidders` and `num_items`; batches come from `make_dataloader`.

    Args:
        n_samples (int): number of dataset items handed to `make_dataloader`.
        batch_size (int): mini-batch size.
        embed_dim (int): dimension of the buyer embedding.
        epochs (int): number of passes over the dataloader.
        lr (float): Adam learning rate.
        weight_suppliers, weight_buyers, weight_items (float): weight of each
            loss component in the summed training loss.
        device (str | torch.device): device the model and batches are moved
            to. Defaults to "cpu", which is what this function always used;
            pass "cuda" to train on a GPU.
        seed (int | None): if given, seeds torch's global RNG before the model
            is built so weight initialisation and shuffling are repeatable.

    Returns:
        The trained AuctionEmbeddingModel (on `device`).
    """
    if seed is not None:
        torch.manual_seed(seed)

    # -------------------------
    # 1. Data Loader
    # -------------------------
    dataloader = make_dataloader(n_samples, batch_size)

    # -------------------------
    # 2. Model, Loss, Optimizer
    # -------------------------
    device = torch.device(device)
    model = AuctionEmbeddingModel(num_buyers, num_bidders, num_items, embed_dim).to(
        device
    )

    # Suppliers and items are single-label targets (class index per sample);
    # the buyers target is a per-buyer vector, hence the element-wise BCE.
    supplier_criterion = nn.CrossEntropyLoss()  # logits (batch, n_suppliers), target (batch,)
    buyer_criterion = nn.BCEWithLogitsLoss()  # logits and target both (batch, n_buyers)
    item_criterion = nn.CrossEntropyLoss()  # logits (batch, n_items), target (batch,)

    optimizer = optim.Adam(model.parameters(), lr=lr)

    # -------------------------
    # 3. Training Loop
    # -------------------------
    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        for batch in dataloader:
            buyer_ids, supplier_ids, item_ids, buyers_vec = (
                tensor.to(device) for tensor in batch
            )

            # Forward pass
            supplier_logits, buyer_logits, item_logits = model(buyer_ids)

            # Weighted sum of the three per-head losses
            loss = (
                weight_suppliers * supplier_criterion(supplier_logits, supplier_ids)
                + weight_buyers * buyer_criterion(buyer_logits, buyers_vec)
                + weight_items * item_criterion(item_logits, item_ids)
            )

            # Backprop
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # max(..., 1) keeps the report well-defined if the loader yields no batches.
        avg_loss = total_loss / max(len(dataloader), 1)
        print(f"Epoch [{epoch+1}/{epochs}] - Loss: {avg_loss:.4f}")

    print("Training complete!")
    return model

In [26]:
%%time

model = train(
    n_samples=auction_items.shape[0],
    epochs=20,
    weight_suppliers=2.0,
    weight_items=1.0,
    weight_buyers=5.0,
    lr=5e-4,
    embed_dim=128,
)

Epoch [1/20] - Loss: 17.3148
Epoch [2/20] - Loss: 11.8637
Epoch [3/20] - Loss: 10.6908
Epoch [4/20] - Loss: 10.1928
Epoch [5/20] - Loss: 9.9233
Epoch [6/20] - Loss: 9.7565
Epoch [7/20] - Loss: 9.6465
Epoch [8/20] - Loss: 9.5734
Epoch [9/20] - Loss: 9.5200
Epoch [10/20] - Loss: 9.4799
Epoch [11/20] - Loss: 9.4459
Epoch [12/20] - Loss: 9.4219
Epoch [13/20] - Loss: 9.4034
Epoch [14/20] - Loss: 9.3867
Epoch [15/20] - Loss: 9.3724
Epoch [16/20] - Loss: 9.3609
Epoch [17/20] - Loss: 9.3509
Epoch [18/20] - Loss: 9.3416
Epoch [19/20] - Loss: 9.3380
Epoch [20/20] - Loss: 9.3293
Training complete!
CPU times: user 25min 15s, sys: 1min 35s, total: 26min 50s
Wall time: 6min 58s


In [27]:
# Snapshot of the buyer embeddings that existed before this run's export.
# On a fresh kernel `embeddings_buyers` has not been defined yet, so fall back
# to the table persisted on disk by an earlier run (a missing file raises
# FileNotFoundError).
try:
    embeddings_buyers_bkup = embeddings_buyers
except NameError:
    embeddings_buyers_bkup = pd.read_csv(
        "embeddings/new_embeddings_buyers.tsv", sep="\t", index_col=0
    )

In [28]:
def extract_buyer_embeddings(model):
    """Return the learned buyer embedding matrix of a trained model.

    Args:
        model: an AuctionEmbeddingModel (anything exposing
            `buyer_embedding.weight` works).

    Returns:
        NumPy array of shape (n_buyers, embed_dim).
    """
    weights = model.buyer_embedding.weight
    # Drop the autograd graph and move to host memory before converting.
    return weights.detach().cpu().numpy()


buyer_embeddings = extract_buyer_embeddings(model)
# The embedding rows are later labelled by buyer index 0..num_buyers-1, so a
# model trained against a different buyer vocabulary must be caught here.
assert buyer_embeddings.shape[0] == num_buyers, (
    f"expected {num_buyers} embedding rows, got {buyer_embeddings.shape[0]}"
)
buyer_embeddings.shape

(377, 128)

In [29]:
# One row per buyer index, labelled with the original buyer key; one column
# per embedding dimension (emb_0 .. emb_{d-1}).
embeddings_buyers = pd.DataFrame(
    buyer_embeddings,
    index=[buyer_to_idx_r[i] for i in range(num_buyers)],
    columns=[f"emb_{i}" for i in range(buyer_embeddings.shape[1])],
)

# Lookup tables used to attach human-readable names.
uge_column_translations = pd.read_csv(
    "assets/uge_column_translation.tsv", index_col=0, sep="\t"
)
supplier_column_translations = pd.read_csv(
    "assets/bidder_translations.tsv", index_col=0, sep="\t"
)
bidder_mapping = pd.read_csv("assets/bidder_mapping.csv", index_col=0)
buyer_mapping = pd.read_csv("assets/buyer_mapping.csv", index_col=0)


# Buyer key -> buyer name, via the "name" column of buyer_mapping.
embeddings_buyers = embeddings_buyers.assign(
    uge=[buyer_mapping.loc[int(n), "name"] for n in embeddings_buyers.index]
)
# Buyer name -> translated name; names without a translation become pd.NA.
embeddings_buyers = embeddings_buyers.assign(
    uge_translated=[
        (
            uge_column_translations.loc[name, "uge_translation"]
            if name in uge_column_translations.index
            else pd.NA
        )
        for name in embeddings_buyers.uge
    ]
)

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
embeddings_out_path = Path("embeddings/new_embeddings_buyers.tsv")
embeddings_out_path.parent.mkdir(parents=True, exist_ok=True)
embeddings_buyers.to_csv(embeddings_out_path, sep="\t")

In [30]:
# Persist the earlier snapshot under its own file name. Writing it to
# "embeddings/new_embeddings_buyers.tsv" would replace the embeddings exported
# from the model trained in this notebook.
# NOTE(review): if reverting to the old embeddings was intentional, restore
# the original path — confirm with the author.
embeddings_buyers_bkup.to_csv("embeddings/new_embeddings_buyers_bkup.tsv", sep="\t")