In [1]:
import pandas as pd

df = pd.read_csv("assets/processed_data.tsv", sep="\t")

In [2]:
# granularity choice, either "group" or "class" (class is more granular)
granularity = "class"

# 1. we need buyer sample rates
# 2. we need seller sample rates
# 3. for each buyer, we need item sample rates
# 4. for each seller, we need item sample rates
# 5. for each buyer + item, we need seller sample rates
# 6. for each seller + item, we need buyer sample rates
# 7. we need item sample rates for sampling from background distribution

buyer_sample_rates = (
    df[["id", "uge"]].drop_duplicates()["uge"].value_counts(normalize=True)
)
seller_sample_rates = (
    df[["id", "supplier_name"]]
    .drop_duplicates()["supplier_name"]
    .value_counts(normalize=True)
)

buyer_to_item_sample_rates = (
    df[["id", "uge", "sequence_item", granularity]]
    .drop_duplicates()
    .groupby(["uge"])[granularity]
    .value_counts(normalize=True)
)
seller_to_item_sample_rates = (
    df[["id", "supplier_name", "sequence_item", granularity]]
    .drop_duplicates()
    .groupby(["supplier_name"])[granularity]
    .value_counts(normalize=True)
)

buyer_item_to_seller_sample_rates = (
    df[["id", "uge", "sequence_item", "supplier_name", granularity]]
    .drop_duplicates()
    .groupby(["uge", granularity])["supplier_name"]
    .value_counts(normalize=True)
)
seller_item_to_buyer_sample_rates = (
    df[["id", "supplier_name", "sequence_item", "uge", granularity]]
    .drop_duplicates()
    .groupby(["supplier_name", granularity])["uge"]
    .value_counts(normalize=True)
)

item_sample_rates = (
    df[["id", "uge", "sequence_item", granularity]]
    .drop_duplicates()[granularity]
    .value_counts(normalize=True)
)

print(buyer_to_item_sample_rates.head())
print(buyer_item_to_seller_sample_rates.head())

uge  class
0    3862     0.500
     3860     0.125
     3861     0.125
     3863     0.125
     3864     0.125
Name: proportion, dtype: float64
uge  class  supplier_name
0    3860   377              0.111111
            378              0.111111
            379              0.111111
            380              0.111111
            381              0.111111
Name: proportion, dtype: float64


In [3]:
df.uge.max()

376

In [4]:
item_sample_rates 

class
3902    0.162237
3882    0.121984
3878    0.046036
4061    0.042922
4106    0.036473
          ...   
4064    0.000111
4059    0.000111
4055    0.000111
4054    0.000111
4176    0.000111
Name: proportion, Length: 317, dtype: float64

In [5]:
index_to_seller_mapping = (
    pd.read_csv("assets/bidder_mapping.csv").set_index("supplier_name").name.to_dict()
)
seller_to_index_mapping = {seller: i for i, seller in index_to_seller_mapping.items()}

index_to_buyer_mapping = (
    pd.read_csv("assets/buyer_mapping.csv").set_index("uge").name.to_dict()
)
buyer_to_index_mapping = {buyer: i for i, buyer in index_to_buyer_mapping.items()}

if granularity == "group":
    index_to_item_mapping = (
        pd.read_csv("assets/item_group_mapping.csv").set_index("group").name.to_dict()
    )
    item_to_index_mapping = {item: i for i, item in index_to_item_mapping.items()}
else:
    index_to_item_mapping = (
        pd.read_csv("assets/item_class_mapping.csv").set_index("class").name.to_dict()
    )
    item_to_index_mapping = {item: i for i, item in index_to_item_mapping.items()}

buyer_translations = (
    pd.read_csv("assets/uge_column_translation.tsv", sep="\t")
    .set_index("cd_uge")
    .uge_translation.to_dict()
)
seller_translations = (
    pd.read_csv("assets/bidder_translations.tsv", sep="\t")
    .set_index("supplier_name")
    .translated.to_dict()
)

In [6]:
# for background sampling, we modify buyer/seller/item sampling so that it is proportional to (sample_rate)**0.75

background_buyer_sample_rates = buyer_sample_rates**0.75
background_buyer_sample_rates = (
    background_buyer_sample_rates / background_buyer_sample_rates.sum()
)

background_seller_sample_rates = seller_sample_rates**0.75
background_seller_sample_rates = (
    background_seller_sample_rates / background_seller_sample_rates.sum()
)

background_item_sample_rates = item_sample_rates**0.75
background_item_sample_rates = (
    background_item_sample_rates / background_item_sample_rates.sum()
)

In [7]:
# random walk sampling

import random


def sample_buyer():
    return random.choices(
        buyer_sample_rates.index, weights=buyer_sample_rates.values, k=1
    )[0]


def sample_seller():
    return random.choices(
        seller_sample_rates.index, weights=seller_sample_rates.values, k=1
    )[0]


def sample_item():
    return random.choices(
        item_sample_rates.index, weights=item_sample_rates.values, k=1
    )[0]


def sample_background_buyer():
    return random.choices(
        background_buyer_sample_rates.index,
        weights=background_buyer_sample_rates.values,
        k=1,
    )[0]


def sample_background_seller():
    return random.choices(
        background_seller_sample_rates.index,
        weights=background_seller_sample_rates.values,
        k=1,
    )[0]


def sample_background_item():
    return random.choices(
        background_item_sample_rates.index,
        weights=background_item_sample_rates.values,
        k=1,
    )[0]


def sample_item_from_buyer(buyer):
    return random.choices(
        buyer_to_item_sample_rates[buyer].index,
        weights=buyer_to_item_sample_rates[buyer].values,
        k=1,
    )[0]


def sample_item_from_seller(seller):
    return random.choices(
        seller_to_item_sample_rates[seller].index,
        weights=seller_to_item_sample_rates[seller].values,
        k=1,
    )[0]


def sample_seller_from_buyer_item(buyer, item):
    return random.choices(
        buyer_item_to_seller_sample_rates[buyer][item].index,
        weights=buyer_item_to_seller_sample_rates[buyer][item].values,
        k=1,
    )[0]


def sample_buyer_from_seller_item(seller, item):
    return random.choices(
        seller_item_to_buyer_sample_rates[seller][item].index,
        weights=seller_item_to_buyer_sample_rates[seller][item].values,
        k=1,
    )[0]


def sample_random_walk(node_id, node_type, walk_length=5):
    """
    Returns a random sequence of node IDs, length = num_steps.
    (e.g., buyer -> item -> supplier -> item -> buyer ...).

    node_type is either "buyer" or "seller".
    """
    walk = []
    current_type = node_type

    walk.append(node_id)

    for _ in range(walk_length):
        if current_type == "buyer":
            node_id = sample_item_from_buyer(node_id)
            current_type = "item_from_buyer"
        elif current_type == "seller":
            node_id = sample_item_from_seller(node_id)
            current_type = "item_from_seller"
        elif current_type == "item_from_buyer":
            node_id = sample_seller_from_buyer_item(walk[-2], node_id)
            current_type = "seller"
        elif current_type == "item_from_seller":
            node_id = sample_buyer_from_seller_item(walk[-2], node_id)
            current_type = "buyer"
        walk.append(node_id)
    return walk

In [10]:
# generating skip-gram pairs


def generate_itemwalk_skip_gram_pairs(
    node_id, walk_length=5, window_size=2, negative_samples=3
) -> list[tuple[int, int, int]]:
    """
    Given a node ID, generate a walk, then generate (center, context) pairs
    within a certain window_size.

    Also generate negative samples, with `negative_samples` for each positive pair.

    E.g. walk = [A, B, C, D], window_size = 2
       center = B, context could be A, C
       center = C, context could be B, D
    We'll yield (center, context).

    We also need to sample negative samples, which we do by sampling from the background distribution.
    Return a list of (center, target, -1 or 1) depending on if it's a negative or positive sample.
    """

    if node_id in index_to_buyer_mapping:
        node_type = "buyer"
    elif node_id in index_to_seller_mapping:
        node_type = "seller"
    else:
        raise ValueError(f"node_id {node_id} not found in buyer or seller mappings")

    walk = sample_random_walk(node_id, node_type, walk_length=walk_length)

    pairs = []
    length = len(walk)
    for i in range(length):
        center = walk[i]
        # context is within [i-window_size, i+window_size] except i
        start = max(0, i - window_size)
        end = min(length, i + window_size + 1)
        for j in range(start, end):
            if j == i:
                continue
            pairs.append((center, walk[j], 1))
            for _ in range(negative_samples):
                if node_type == "buyer" or node_type == "seller":
                    pairs.append((center, sample_background_item(), -1))
                elif node_type == "item_from_buyer":
                    pairs.append((center, sample_background_buyer(), -1))
                elif node_type == "item_from_seller":
                    pairs.append((center, sample_background_seller(), -1))
        if node_type == "buyer":
            node_type = "item_from_buyer"
        elif node_type == "seller":
            node_type = "item_from_seller"
        elif node_type == "item_from_buyer":
            node_type = "seller"
        elif node_type == "item_from_seller":
            node_type = "buyer"

    return pairs

In [20]:
from torch.utils.data import Dataset

class ItemWalkDataset(Dataset):
    def __init__(self, 
                node_ids: list[int],
                samples = None,
                num_walks=10, 
                walk_length=5, 
                window_size=2,
                negative_samples=3,
    ):
        """
        - node_ids: list of node ids to generate walks with
        - samples: pre-generated samples
        - num_walks: how many random walks
        - walk_length: length of each walk
        - window_size: skip-gram window size
        - num_neg: how many negatives per positive
        """
        if samples is not None:
            self.samples = samples
            return 
        self.samples = []
        
        for i, node in enumerate(node_ids):
            if i % 100 == 0:
                print(f"Generating samples for node {i} / {len(node_ids)}")
            for _ in range(num_walks):
                sg_pairs = generate_itemwalk_skip_gram_pairs(
                    node,
                    window_size=window_size, 
                    walk_length=walk_length,
                    negative_samples=negative_samples
                )
                self.samples.extend(sg_pairs)
        
    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, idx):
        return self.samples[idx]


In [12]:
import numpy as np
import torch
import torch.nn as nn


class DoubleEmbeddingModel(nn.Module):
    def __init__(self, num_nodes, embed_dim):
        super().__init__()
        self.num_nodes = num_nodes
        self.embed_dim = embed_dim

        self.input_embedding = nn.Embedding(num_nodes, embed_dim)
        self.output_embedding = nn.Embedding(num_nodes, embed_dim)

        # Initialize embeddings
        nn.init.xavier_uniform_(self.input_embedding.weight)
        nn.init.xavier_uniform_(self.output_embedding.weight)

    def forward(self, center_nodes, context_nodes):
        """
        center_nodes: (batch_size,) indices
        context_nodes: (batch_size,) indices

        Return the dot product: (batch_size,)
        """
        center_emb = self.input_embedding(center_nodes)  # (B, embed_dim)
        context_emb = self.output_embedding(context_nodes)  # (B, embed_dim)
        # Dot product
        scores = torch.sum(center_emb * context_emb, dim=1)  # (B,)
        return scores

    def get_embeddings(self):
        return np.concatenate(
            [
                self.input_embedding.weight.detach().cpu().numpy(),
                self.output_embedding.weight.detach().cpu().numpy(),
            ],
            axis=1,
        )

In [13]:
class SingleEmbeddingModel(nn.Module):
    def __init__(self, num_nodes, embed_dim):
        super().__init__()
        self.num_nodes = num_nodes
        self.embed_dim = embed_dim

        self.embedding = nn.Embedding(num_nodes, embed_dim)

        # Initialize embedding
        nn.init.xavier_uniform_(self.embedding.weight)

    def forward(self, center_nodes, context_nodes):
        """
        center_nodes: (batch_size,) indices
        context_nodes: (batch_size,) indices

        Return the dot product: (batch_size,)
        """
        center_emb = self.embedding(center_nodes)  # (B, embed_dim)
        context_emb = self.embedding(context_nodes)  # (B, embed_dim)
        # Dot product
        scores = torch.sum(center_emb * context_emb, dim=1)  # (B,)
        return scores

    def get_embeddings(self):
        return self.embedding.weight.detach().cpu().numpy()

In [29]:
import torch.nn.functional as F
from torch.utils.data import DataLoader

def train_itemwalk(
    embed_dim: int,
    dataset: ItemWalkDataset,
    embedding_type="double",
    batch_size=128,
    epochs=5,
    lr=1e-3,
    num_nodes=max(index_to_item_mapping.keys()) + 1,
):
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    
    # 2) Model
    device = "cpu"
    if embedding_type == "double":
        model = DoubleEmbeddingModel(num_nodes, embed_dim).to(device)
    else:
        model = SingleEmbeddingModel(num_nodes, embed_dim).to(device)
    
    # 3) Optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    
    # 4) Training
    for epoch in range(epochs):
        total_loss = 0.0
        model.train()
        
        i = 0
        for centers, targets, labels in dataloader:
            if i % 1000 == 0:
                print(f"Epoch {epoch+1} - Iteration {i} / {len(dataloader)}")
            centers = centers.to(device)
            targets = targets.to(device)
            labels  = labels.to(device, dtype=torch.float)  # +1.0 or -1.0
            
            # Forward pass
            signed_scores = model(centers, targets)
            
            # Loss = -log(sigmoid(signed_scores))
            loss = -F.logsigmoid(signed_scores).mean()
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            i+=1 
        
        avg_loss = total_loss / len(dataloader)
        print(f"Epoch [{epoch+1}/{epochs}] - Loss: {avg_loss:.4f}")
    
    return model


In [None]:
seen_buyers_and_sellers = set(df.uge.unique()).union(df.supplier_name.unique())
dataset = ItemWalkDataset(
    node_ids=seen_buyers_and_sellers,
    num_walks=10,
    walk_length=5,
    window_size=2,
    negative_samples=3,
)
pd_dataset = pd.DataFrame.from_records(dataset.samples, columns=["center", "context", "label"])
pd_dataset.to_csv("assets/itemwalk_samples.tsv", sep="\t", index=False)

In [21]:
samples = list(pd_dataset.itertuples(index=False, name=None))
dataset = ItemWalkDataset(node_ids=None, samples=samples)

In [30]:
model_double = train_itemwalk(128, dataset, embedding_type="double", epochs=5)

Epoch 1 - Iteration 0 / 21229
Epoch 1 - Iteration 1000 / 21229
Epoch 1 - Iteration 2000 / 21229
Epoch 1 - Iteration 3000 / 21229
Epoch 1 - Iteration 4000 / 21229
Epoch 1 - Iteration 5000 / 21229
Epoch 1 - Iteration 6000 / 21229
Epoch 1 - Iteration 7000 / 21229
Epoch 1 - Iteration 8000 / 21229
Epoch 1 - Iteration 9000 / 21229
Epoch 1 - Iteration 10000 / 21229
Epoch 1 - Iteration 11000 / 21229
Epoch 1 - Iteration 12000 / 21229
Epoch 1 - Iteration 13000 / 21229
Epoch 1 - Iteration 14000 / 21229
Epoch 1 - Iteration 15000 / 21229
Epoch 1 - Iteration 16000 / 21229
Epoch 1 - Iteration 17000 / 21229
Epoch 1 - Iteration 18000 / 21229
Epoch 1 - Iteration 19000 / 21229
Epoch 1 - Iteration 20000 / 21229
Epoch 1 - Iteration 21000 / 21229
Epoch [1/5] - Loss: 0.0181
Epoch 2 - Iteration 0 / 21229
Epoch 2 - Iteration 1000 / 21229
Epoch 2 - Iteration 2000 / 21229
Epoch 2 - Iteration 3000 / 21229
Epoch 2 - Iteration 4000 / 21229
Epoch 2 - Iteration 5000 / 21229
Epoch 2 - Iteration 6000 / 21229
Epoch 2 - 

In [32]:
embeddings = np.concatenate([model_double.input_embedding.weight.detach().cpu().numpy(), model_double.output_embedding.weight.detach().cpu().numpy()], axis=1)
embeddings.shape

(4177, 256)

In [39]:
uge_column_translations = pd.read_csv(
    "assets/uge_column_translation.tsv", index_col=0, sep="\t"
)
supplier_column_translations = pd.read_csv(
    "assets/bidder_translations.tsv", index_col=0, sep="\t"
)
bidder_mapping = pd.read_csv("assets/bidder_mapping.csv", index_col=0)
buyer_mapping = pd.read_csv("assets/buyer_mapping.csv", index_col=0)

bidder_buyer_mapping = {
    **dict(zip(bidder_mapping.index, bidder_mapping.name)),
    **dict(zip(buyer_mapping.index, buyer_mapping.name)),
}

def process_embeddings(embeddings):
    embeddings = embeddings.assign(
        supplier_name=[
            bidder_buyer_mapping[n] if n in bidder_mapping.index else "not_supplier" for n in embeddings.index
        ]
    )
    embeddings = embeddings.assign(
        supplier_name_translated=[
            (
                supplier_column_translations.loc[name, "translated"]
                if name in supplier_column_translations.index
                else pd.NA
            )
            for name in embeddings.supplier_name
        ]
    )

    embeddings =embeddings.assign(
        uge=[buyer_mapping.loc[n, "name"] if n in buyer_mapping.index else "not_buyer" for n in embeddings.index]
    )
    embeddings = embeddings.assign(
        uge_translated=[
            (
                uge_column_translations.loc[name, "uge_translation"]
                if name in uge_column_translations.index
                else pd.NA
            )
            for name in embeddings.uge
        ]
    )

    embeddings = embeddings.assign(
        item_class = [
            index_to_item_mapping[n] if n in index_to_item_mapping else "not_item" for n in embeddings.index
        ]
    )

    return embeddings

In [None]:
embeddings = pd.DataFrame(embeddings, index=range(embeddings.shape[0]))
embeddings.columns = [f"emb_{i}" for i in range(embeddings.shape[1])]

In [40]:
processed_embeddings = process_embeddings(embeddings)
processed_embeddings 

Unnamed: 0,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,emb_9,...,emb_251,emb_252,emb_253,emb_254,emb_255,supplier_name,supplier_name_translated,uge,uge_translated,item_class
0,0.731551,-0.375008,-0.571914,0.581000,-0.556987,0.229452,0.549535,-0.523985,0.561984,-0.533132,...,-0.608045,0.523619,-0.528006,0.507977,-0.585342,not_supplier,,30030,FED-COURT OF JUSTICE,not_item
1,0.573513,-0.151575,-0.508381,0.536726,-0.501923,0.130391,0.480655,-0.478311,0.452971,-0.476044,...,-0.539418,0.453902,-0.502660,0.492334,-0.446596,not_supplier,,80101,,not_item
2,0.603562,-0.245835,-0.537313,0.545040,-0.477370,0.121336,0.417591,-0.445793,0.486382,-0.420184,...,-0.473029,0.498613,-0.481536,0.456729,-0.477192,not_supplier,,80102,,not_item
3,0.440584,-0.040289,-0.341106,0.422846,-0.345091,-0.248647,0.316007,-0.303121,0.304782,-0.326025,...,-0.415571,0.350345,-0.287183,0.325448,-0.345992,not_supplier,,80104,STATE COUNCIL OF EDUCATION-CEE,not_item
4,0.275123,0.148665,-0.303283,0.274029,-0.256393,-0.097348,0.280908,-0.264297,0.292053,-0.253449,...,-0.360640,0.249160,-0.326155,0.313843,-0.225223,not_supplier,,80261,DIR.ENS.-REG.CENTRO,not_item
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4172,0.388750,-0.245933,-0.331393,0.309766,-0.299210,0.206066,0.358286,-0.317579,0.327094,-0.298681,...,-0.543369,0.525889,-0.489960,0.542277,-0.514055,not_supplier,,not_buyer,,6092
4173,0.733462,-0.548901,-0.651002,0.624577,-0.558501,0.492347,0.550152,-0.608413,0.618378,-0.615394,...,-0.549108,0.513900,-0.532773,0.680567,-0.622711,not_supplier,,not_buyer,,7710
4174,0.622479,-0.529199,-0.584203,0.588253,-0.530147,0.376010,0.503848,-0.554078,0.568994,-0.573283,...,-0.579503,0.539232,-0.475303,0.535658,-0.565375,not_supplier,,not_buyer,,503
4175,0.700517,-0.572247,-0.698432,0.580243,-0.549358,0.488593,0.611860,-0.683755,0.601240,-0.651479,...,-0.611020,0.515947,-0.578271,0.602341,-0.639196,not_supplier,,not_buyer,,117


In [42]:
processed_embeddings.to_csv("embeddings/itemwalk_embeddings_double.tsv", sep="\t")

In [43]:
model_single = train_itemwalk(128, dataset, embedding_type="single", epochs=3)

Epoch 1 - Iteration 0 / 21229
Epoch 1 - Iteration 1000 / 21229
Epoch 1 - Iteration 2000 / 21229
Epoch 1 - Iteration 3000 / 21229
Epoch 1 - Iteration 4000 / 21229
Epoch 1 - Iteration 5000 / 21229
Epoch 1 - Iteration 6000 / 21229
Epoch 1 - Iteration 7000 / 21229
Epoch 1 - Iteration 8000 / 21229
Epoch 1 - Iteration 9000 / 21229
Epoch 1 - Iteration 10000 / 21229
Epoch 1 - Iteration 11000 / 21229
Epoch 1 - Iteration 12000 / 21229
Epoch 1 - Iteration 13000 / 21229
Epoch 1 - Iteration 14000 / 21229
Epoch 1 - Iteration 15000 / 21229
Epoch 1 - Iteration 16000 / 21229
Epoch 1 - Iteration 17000 / 21229
Epoch 1 - Iteration 18000 / 21229
Epoch 1 - Iteration 19000 / 21229
Epoch 1 - Iteration 20000 / 21229
Epoch 1 - Iteration 21000 / 21229
Epoch [1/3] - Loss: 0.0128
Epoch 2 - Iteration 0 / 21229
Epoch 2 - Iteration 1000 / 21229
Epoch 2 - Iteration 2000 / 21229
Epoch 2 - Iteration 3000 / 21229
Epoch 2 - Iteration 4000 / 21229
Epoch 2 - Iteration 5000 / 21229
Epoch 2 - Iteration 6000 / 21229
Epoch 2 - 

In [44]:
embeddings = model_single.get_embeddings()
embeddings = pd.DataFrame(embeddings, index=range(embeddings.shape[0]))
embeddings.columns = [f"emb_{i}" for i in range(embeddings.shape[1])]
processed_embeddings = process_embeddings(embeddings)
processed_embeddings.to_csv("embeddings/itemwalk_embeddings_single.tsv", sep="\t")