In [1]:
import networkx as nx

In [2]:
# Build an undirected graph from the comma-separated edge list.
# nodetype=None leaves the node labels unconverted.
G = nx.read_edgelist(
    "assets/edges_unweighted.csv",
    delimiter=",",
    nodetype=None,
    create_using=nx.Graph(),
)

In [3]:
# Sanity check: (node count, edge count) of the loaded graph.
G.number_of_nodes(), G.number_of_edges()

(3774, 11984)

In [4]:
import pandas as pd

# Translation tables (tab-separated); the first column becomes the index.
uge_column_translations = pd.read_csv(
    "assets/uge_column_translation.tsv", sep="\t", index_col=0
)
supplier_column_translations = pd.read_csv(
    "assets/bidder_translations.tsv", sep="\t", index_col=0
)

# Id -> name tables for buyers and bidders; the first column becomes the index.
buyer_mapping = pd.read_csv("assets/buyer_mapping.csv", index_col=0)
bidder_mapping = pd.read_csv("assets/bidder_mapping.csv", index_col=0)

# Combined id -> name lookup. Buyer entries are applied last, so a buyer name
# replaces a bidder name whenever both tables share an id.
bidder_buyer_mapping = dict(zip(bidder_mapping.index, bidder_mapping["name"]))
bidder_buyer_mapping.update(zip(buyer_mapping.index, buyer_mapping["name"]))

In [21]:
from ge import LINE

def _train_line_order(G, order, embedding_size, batch_size, epochs):
    """Train one LINE model of the given order and return its embeddings.

    Parameters
    ----------
    G : graph
        Graph handed unchanged to ``LINE``.
    order : str
        Value passed as ``order=`` to ``LINE``; also used as the column prefix.
    embedding_size, batch_size, epochs : int
        Forwarded to ``LINE`` / ``LINE.train``.

    Returns
    -------
    pandas.DataFrame
        One row per key of ``get_embeddings()``, sorted by index, with columns
        named ``{order}_order_{i}``.
    """
    model = LINE(G, embedding_size=embedding_size, order=order)
    model.train(batch_size=batch_size, epochs=epochs, verbose=0)

    # Transpose so that each key of the embeddings mapping becomes a row.
    frame = pd.DataFrame(model.get_embeddings()).T
    frame.columns = [f"{order}_order_{i}" for i in frame.columns]
    return frame.sort_index()


def train_LINE_embeddings(G, embedding_size=128, batch_size=1024, epochs=50):
    """Train first- and second-order LINE embeddings and join them.

    The first-order model is trained before the second-order one. Both
    results are sorted by index and joined on it, with the second-order
    columns placed first.

    Parameters
    ----------
    G : graph
        Graph handed unchanged to ``LINE``.
    embedding_size : int, optional
        Embedding size of each of the two models (default 128).
    batch_size : int, optional
        Batch size used for training (default 1024).
    epochs : int, optional
        Number of training epochs per model (default 50).

    Returns
    -------
    pandas.DataFrame
        Columns ``second_order_*`` followed by ``first_order_*``.
    """
    first_order_embeddings = _train_line_order(
        G, "first", embedding_size, batch_size, epochs
    )
    second_order_embeddings = _train_line_order(
        G, "second", embedding_size, batch_size, epochs
    )
    return second_order_embeddings.join(first_order_embeddings)

def process_embeddings(embeddings):
    """Annotate an embedding frame with supplier and buyer (UGE) labels.

    Relies on the module-level tables ``bidder_mapping``, ``buyer_mapping``,
    ``supplier_column_translations`` and ``uge_column_translations``.

    Parameters
    ----------
    embeddings : pandas.DataFrame
        Frame whose index values are matched against the indexes of
        ``bidder_mapping`` and ``buyer_mapping``.

    Returns
    -------
    pandas.DataFrame
        A new frame with four added columns:

        * ``supplier_name`` - bidder name, or ``"not_supplier"``.
        * ``supplier_name_translated`` - translated name, or ``pd.NA``.
        * ``uge`` - buyer name, or ``"not_buyer"``.
        * ``uge_translated`` - translated buyer name, or ``pd.NA``.
    """
    # Resolve supplier names from the bidder table only. The merged
    # bidder/buyer lookup lets a buyer entry overwrite a bidder entry that
    # shares the same id, which would put a buyer name in supplier_name.
    bidder_names = dict(zip(bidder_mapping.index, bidder_mapping.name))
    embeddings = embeddings.assign(
        supplier_name=[
            bidder_names.get(n, "not_supplier") for n in embeddings.index
        ]
    )
    embeddings = embeddings.assign(
        supplier_name_translated=[
            (
                supplier_column_translations.loc[name, "translated"]
                if name in supplier_column_translations.index
                else pd.NA
            )
            for name in embeddings.supplier_name
        ]
    )

    embeddings = embeddings.assign(
        uge=[
            buyer_mapping.loc[n, "name"] if n in buyer_mapping.index else "not_buyer"
            for n in embeddings.index
        ]
    )
    embeddings = embeddings.assign(
        uge_translated=[
            (
                uge_column_translations.loc[name, "uge_translation"]
                if name in uge_column_translations.index
                else pd.NA
            )
            for name in embeddings.uge
        ]
    )

    return embeddings

In [9]:
# Train both LINE orders on G. The keyword values equal the function's
# defaults and are spelled out here for visibility.
embeddings = train_LINE_embeddings(G, embedding_size=128, batch_size=1024, epochs=50)

In [22]:
# Cast the index labels to int so the sort below is numeric, then attach the
# supplier/buyer label columns and write the result as a tab-separated file.
embeddings.index = embeddings.index.astype(int)
embeddings = process_embeddings(embeddings.sort_index())
embeddings.to_csv("embeddings/LINE_embeddings.tsv", sep="\t")

In [25]:
from gensim.models.callbacks import CallbackAny2Vec
import numpy as np
from node2vec import Node2Vec

# Node2Vec setup over G; the captured output shows transition probabilities
# and walks being computed by this cell.
node2vec = Node2Vec(
    G,
    dimensions=128,
    walk_length=5,
    num_walks=10,
    workers=4,
)

# Per-epoch losses appended by the training callback.
loss_list = []

class callback(CallbackAny2Vec):
    """Record the training loss every epoch and print a windowed average.

    Each epoch's loss is appended to the module-level ``loss_list``. Every
    ``every`` epochs, the mean of the last ``every`` recorded losses is
    printed.

    Parameters
    ----------
    every : int, optional
        Number of epochs between printed reports (default 20).

    Raises
    ------
    ValueError
        If ``every`` is smaller than 1.
    """

    def __init__(self, every=20):
        if every < 1:
            raise ValueError("every must be a positive integer")
        self.epoch = 0
        self.every = every

    def on_epoch_end(self, model):
        """Store this epoch's loss, report if a window closed, reset the loss."""
        loss = model.get_latest_training_loss()
        loss_list.append(loss)
        self.epoch += 1

        if self.epoch % self.every == 0:
            print(
                f"epoch {self.epoch}, average loss epochs ({self.epoch-self.every}-{self.epoch-1}): {np.sum(loss_list[-self.every:]) / self.every}"
            )
        # Zero the model's running loss before the next epoch. Presumably
        # gensim accumulates this value across epochs, so resetting it makes
        # each reading per-epoch - confirm against the gensim documentation.
        model.running_training_loss = 0


# Embed nodes: 20-epoch fit with loss tracking through a fresh callback.
model = node2vec.fit(
    window=2,
    min_count=1,
    epochs=20,
    compute_loss=True,
    callbacks=[callback()],
)

Computing transition probabilities:   1%|          | 25/3774 [00:00<00:15, 247.16it/s]

Computing transition probabilities: 100%|██████████| 3774/3774 [00:08<00:00, 425.76it/s] 
Generating walks (CPU: 1): 100%|██████████| 3/3 [00:10<00:00,  3.40s/it]
Generating walks (CPU: 2): 100%|██████████| 3/3 [00:10<00:00,  3.64s/it]
Generating walks (CPU: 3): 100%|██████████| 2/2 [00:05<00:00,  2.80s/it]
Generating walks (CPU: 4): 100%|██████████| 2/2 [00:01<00:00,  1.17it/s]


epoch 20, average loss epochs (0-19): 101466.376953125


In [26]:
# Fit again with 200 epochs; `model` is rebound to this result.
model = node2vec.fit(
    window=2,
    min_count=1,
    epochs=200,
    compute_loss=True,
    callbacks=[callback()],
)

epoch 20, average loss epochs (0-19): 98467.9330078125
epoch 40, average loss epochs (20-39): 57767.086328125
epoch 60, average loss epochs (40-59): 54571.5751953125
epoch 80, average loss epochs (60-79): 51600.2982421875
epoch 100, average loss epochs (80-99): 50517.2810546875
epoch 120, average loss epochs (100-119): 49191.73046875
epoch 140, average loss epochs (120-139): 48830.1818359375
epoch 160, average loss epochs (140-159): 47711.3515625
epoch 180, average loss epochs (160-179): 46595.513671875
epoch 200, average loss epochs (180-199): 46049.61640625


In [28]:
# Collect the trained vector of every graph node; after the transpose each
# node is one row and each embedding dimension one column.
embeddings = pd.DataFrame({node: model.wv[node] for node in G.nodes()}).T
embeddings.columns = [f"emb_{i}" for i in embeddings.columns]

# Cast the index labels to int so the sort is numeric, then attach the
# supplier/buyer label columns and write the result as a tab-separated file.
embeddings.index = embeddings.index.astype(int)
embeddings = process_embeddings(embeddings.sort_index())
embeddings.to_csv("embeddings/node2vec_embeddings.tsv", sep="\t")

In [30]:
# Display the processed node2vec frame (embedding columns plus label columns).
embeddings

Unnamed: 0,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,emb_9,...,emb_122,emb_123,emb_124,emb_125,emb_126,emb_127,supplier_name,supplier_name_translated,uge,uge_translated
0,0.309829,-0.814386,0.347980,0.995084,0.173959,0.080703,-0.115008,-0.552635,0.639877,0.545313,...,-0.086057,0.780358,-0.056968,0.351025,-0.504943,-0.043137,not_supplier,,30030,FED-COURT OF JUSTICE
1,0.147302,-0.339653,0.249569,1.744981,1.118846,-0.185612,0.120552,-0.380483,-0.347499,0.560059,...,-0.351947,0.531129,0.054915,0.743829,-0.478763,-0.199828,not_supplier,,80101,
2,0.663155,-0.579159,-0.264292,0.557519,0.032749,0.494681,0.177326,-0.115157,0.111801,-0.211022,...,0.498620,-0.183190,0.179657,0.758300,-0.354648,-0.242429,not_supplier,,80102,
3,0.453806,-0.482251,0.114109,0.105661,0.637176,0.150782,-0.077948,-0.482876,-0.192504,0.349706,...,-0.502927,0.827053,-0.637465,0.512491,0.011196,-0.111360,not_supplier,,80104,STATE COUNCIL OF EDUCATION-CEE
4,0.770835,-0.594367,-0.053370,-0.010173,0.998811,0.312823,0.159028,-0.456464,0.232722,-0.098896,...,-0.433708,0.506628,-0.340293,-0.303454,-0.445536,0.104090,not_supplier,,80261,DIR.ENS.-REG.CENTRO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3856,0.142752,-0.009379,-0.363940,0.494758,0.853324,-0.154540,0.152566,-0.617153,0.205513,-0.651387,...,0.029118,0.466034,-0.501628,0.477862,-0.332353,0.576511,Teto Construtora S.A,Teto Construtora S.A,not_buyer,
3857,-0.018205,-0.024940,-0.075652,0.435533,0.551790,-0.492256,0.417799,-0.517681,-0.100992,-0.781634,...,0.138287,0.708059,-0.069319,0.197343,-0.417775,0.372562,PEMA ENGENHARIA LTDA.,PEMA ENGENHARIA LTDA.,not_buyer,
3858,0.178811,-0.381734,-0.244766,0.347437,0.424175,-0.239775,0.280995,-0.569306,-0.079546,-0.552313,...,0.223812,0.837864,-0.618433,0.200852,-0.437864,0.274753,SOUSA E FIGUEIREDO CONSTRUCOES LTDA,SOUSA E FIGUEIREDO CONSTRUCOES LTDA,not_buyer,
3859,0.011693,-0.539441,-0.428226,0.642909,0.387752,-0.090114,0.445424,-0.502614,-0.199881,-0.637638,...,0.152553,0.860629,-0.245292,0.333979,-0.271174,0.433200,Solint Construtora Ltda,Solint Construtora Ltda,not_buyer,
