In [None]:
! pip install wheel \
    numpy==1.23.2 \
    scikit-learn==1.1.2 \
    pandas==1.4.4 \
    tqdm==4.64.1

In [5]:
import os
import random
import sys
import pickle as pkl

import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.utils import shuffle
from tqdm import tqdm

In [None]:
random.seed(0)
np.random.seed(0)

In [6]:
config = {
    #embedding computation
    'cleora_n_iter': 5,
    'cleora_dim': 1024,
    
    #dataset preparation
    'train_test_split': 0.2,
    
    #training classification
    'input_embeddings': [
                    'output/emb__cluster_id__StarNode.out',
                    'output/emb__CliqueNode__CliqueNode.out',
                   ],
    'batch_size': 256,
    'test_batch_size': 1000,
    'epochs': [20],
}

# Dataset preparation

1. Download the Facebook dataset from SNAP: https://snap.stanford.edu/data/facebook-large-page-page-network.html
2. Extract the dataset to ./facebook_large/
3. Compute Cleora embeddings as shown in "Cleora training" section in `example_link_prediction.ipynb`

In [None]:
! wget https://snap.stanford.edu/data/facebook_large.zip
! unzip facebook_large.zip

In [9]:
df_cleora = pd.read_csv("./facebook_large/musae_facebook_edges.csv")

In [10]:
df_cleora.head()

Unnamed: 0,id_1,id_2
0,0,18427
1,1,21708
2,1,22208
3,1,22171
4,1,6829


In [11]:
train_cleora, test_cleora = train_test_split(df_cleora, test_size=config['train_test_split'])

In [12]:
fb_cleora_input_clique_filename = "fb_cleora_input_clique.txt"
fb_cleora_input_star_filename = "fb_cleora_input_star.txt"
output_dir = 'output'

In [13]:
with open(fb_cleora_input_clique_filename, "w") as f_cleora_clique, open(fb_cleora_input_star_filename, "w") as f_cleora_star:
    grouped_train = train_cleora.groupby('id_1')
    for n, (name, group) in enumerate(grouped_train):
        group_list = group['id_2'].tolist()
        group_elems = list(map(str, group_list))
        f_cleora_clique.write("{} {}\n".format(name, ' '.join(group_elems)))
        f_cleora_star.write("{}\t{}\n".format(n, name))
        for elem in group_elems:
            f_cleora_star.write("{}\t{}\n".format(n, elem))

In [14]:
df = pd.read_csv("facebook_large/musae_facebook_target.csv")

In [15]:
classes = df['page_type'].unique()
class_ids = list(range(0, len(classes)))
class_dict = {k:v for k,v in zip(classes, class_ids)}
df['page_type'] = [class_dict[item] for item in df['page_type']] 

In [16]:
train_filename = "fb_classification_train.txt"
test_filename = "fb_classification_test.txt"

In [17]:
train, test = train_test_split(df, test_size=config['train_test_split'])

In [18]:
with open(train_filename, "w") as f_train:
    for index, row in train.iterrows():
        f_train.write("{} {}\n".format(row['id'], row['page_type']))

In [19]:
with open(test_filename, "w") as f_test:
    for index, row in test.iterrows():
        f_test.write("{} {}\n".format(row['id'], row['page_type']))

# Cleora training

Download an appropriate binary Cleora release from: https://github.com/Synerise/cleora/releases . 

A Linux GNU version is assumed in this example, but any other will do.

In [20]:
import subprocess

cleora_binary_path = "./cleora-v1.0.1-x86_64-unknown-linux-gnu"

def columns2output_filename(output_dir, columns):
    columns_split = columns.split()
    if len(columns_split) == 1 and 'reflexive' in columns:
        column_name = columns.split('::')[-1]
        return os.path.join(output_dir, f'emb__{column_name}__{column_name}.out')

    column_names = [i.split('::')[-1] for i in columns_split]
    return os.path.join(output_dir, 'emb__' + '__'.join(column_names) + '.out')


def train_cleora(dim, n_iter, columns, input_filename, output_dir):
    command = [cleora_binary_path,
                '--columns', columns,
                '--dimension', str(dim), 
                '-n', str(n_iter), 
                '--input', input_filename, 
                '-o', output_dir]
    subprocess.run(command, check=True)
    return columns2output_filename(output_dir, columns)

## Star expansion

In the `fb_cleora_input_star.txt` file the first column is a virtual node. The parameter `-c "transient::cluster_id node"` means that embeddings will not be created for nodes from this column. This translates to star expansion scheme.

In [22]:
%%time
cleora_output_star_filename = train_cleora(config['cleora_dim'], config['cleora_n_iter'], "transient::cluster_id StarNode", fb_cleora_input_star_filename, output_dir)

[0m[38;5;8m[[0m2022-09-09T12:47:18Z [0m[32mINFO [0m cleora[0m[38;5;8m][0m Reading args...
[src/main.rs:222] &config = Configuration {
    produce_entity_occurrence_count: true,
    embeddings_dimension: 1024,
    max_number_of_iteration: 5,
    seed: None,
    prepend_field: false,
    log_every_n: 10000,
    in_memory_embedding_calculation: true,
    input: [
        "fb_cleora_input_star.txt",
    ],
    file_type: Tsv,
    output_dir: Some(
        "output",
    ),
    output_format: TextFile,
    relation_name: "emb",
    columns: [
        Column {
            name: "cluster_id",
            transient: true,
            complex: false,
            reflexive: false,
            ignored: false,
        },
        Column {
            name: "StarNode",
            transient: false,
            complex: false,
            reflexive: false,
            ignored: false,
        },
    ],
}
[0m[38;5;8m[[0m2022-09-09T12:47:18Z [0m[32mINFO [0m cleora[0m[38;5;8m][0m Startin

CPU times: user 9.6 ms, sys: 17.2 ms, total: 26.8 ms
Wall time: 2.95 s


[0m[38;5;8m[[0m2022-09-09T12:47:21Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Done saving embeddings.
[0m[38;5;8m[[0m2022-09-09T12:47:21Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Finalizing embeddings calculations!
[0m[38;5;8m[[0m2022-09-09T12:47:21Z [0m[32mINFO [0m cleora[0m[38;5;8m][0m Finished in 2 sec


## Clique expansion

The `fb_cleora_input_clique.txt` file has the structure of adjacency list. The parameter `-c "complex::reflexive::node"` means that edges will be created for all cominations of nodes from each line. This translates to clique expansion scheme.

In [23]:
%%time
cleora_output_clique_filename = train_cleora(config['cleora_dim'], config['cleora_n_iter'], "complex::reflexive::CliqueNode", fb_cleora_input_clique_filename, output_dir)

[0m[38;5;8m[[0m2022-09-09T12:47:33Z [0m[32mINFO [0m cleora[0m[38;5;8m][0m Reading args...
[src/main.rs:222] &config = Configuration {
    produce_entity_occurrence_count: true,
    embeddings_dimension: 1024,
    max_number_of_iteration: 5,
    seed: None,
    prepend_field: false,
    log_every_n: 10000,
    in_memory_embedding_calculation: true,
    input: [
        "fb_cleora_input_clique.txt",
    ],
    file_type: Tsv,
    output_dir: Some(
        "output",
    ),
    output_format: TextFile,
    relation_name: "emb",
    columns: [
        Column {
            name: "CliqueNode",
            transient: false,
            complex: true,
            reflexive: true,
            ignored: false,
        },
    ],
}
[0m[38;5;8m[[0m2022-09-09T12:47:33Z [0m[32mINFO [0m cleora[0m[38;5;8m][0m Starting calculation...
[src/pipeline.rs:25] &sparse_matrices = [
    SparseMatrix {
        col_a_id: 0,
        col_a_name: "CliqueNode",
        col_b_id: 1,
        col_b_name:

CPU times: user 10.6 ms, sys: 10.1 ms, total: 20.8 ms
Wall time: 6.26 s


[0m[38;5;8m[[0m2022-09-09T12:47:39Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Done saving embeddings.
[0m[38;5;8m[[0m2022-09-09T12:47:39Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Finalizing embeddings calculations!
[0m[38;5;8m[[0m2022-09-09T12:47:39Z [0m[32mINFO [0m cleora[0m[38;5;8m][0m Finished in 6 sec


## No expansion

You can also compute Cleora without any expansion scheme by providing an input file in the edgelist format (single pair of nodes per line). Run with a simple parameter: `-c "node1 node2"`.

# Classification

We train a simple multiclass Logistic Regression classifier to predict the class of node based on its embedding. We assess the quality of the classifier with of 2 metrics: micro-F1 and macro-F1.

In [24]:
def read_embeddings(input_file):
    df_full = pd.read_csv(input_file, delimiter = " ", skiprows=[0], header=None, 
                     index_col=0)
    df_full = df_full.drop([1], axis=1)

    return df_full

In [29]:
def read_train_test(embeddings):
    valid_idx = embeddings.index.to_numpy()
    
    train = np.loadtxt(train_filename, delimiter=" ", dtype=int) 
    test = np.loadtxt(test_filename, delimiter=" ", dtype=int)
    
    train = train[np.isin(train[:,0], valid_idx) & np.isin(train[:,1], valid_idx)]
    test = [t for t in test if (t[0] in valid_idx) and (t[1] in valid_idx)] 
     
    train = np.array(train)
    test = np.array(test)
    
    return train,test

In [26]:
batch_size = config['batch_size']
test_batch_size = config['test_batch_size']

In [30]:
for algo in config['input_embeddings']:
    embeddings = read_embeddings(algo)
    train,test = read_train_test(embeddings)
                                 
    y_train = train[:, 1]
    y_test = test[:, 1]

    clf = SGDClassifier(random_state=0, loss='log_loss', alpha=0.0001)
    for e in tqdm(range(0, max(config['epochs']))):
        for idx in range(0,train.shape[0],batch_size):
            ex=train[idx:min(idx+batch_size,train.shape[0]),:]

            ex_emb_in = embeddings.loc[ex[:,0]].to_numpy()
            ex_y = y_train[idx:min(idx+batch_size,train.shape[0])]
    
            clf.partial_fit(ex_emb_in, ex_y, classes=[0,1,2,3])
        
        if e+1 in config['epochs']:
            acc = 0.0
            y_pred = []
            for n, idx in enumerate(range(0,test.shape[0],test_batch_size)):
                ex=test[idx:min(idx+test_batch_size,train.shape[0]),:]
                ex_emb_in = embeddings.loc[ex[:,0]].to_numpy()
                pred = clf.predict_proba(ex_emb_in)
    
                classes = np.argmax(pred, axis=1)
                y_pred.extend(classes)

            f1_micro = f1_score(y_test, y_pred, average='micro')
            f1_macro = f1_score(y_test, y_pred, average='macro')
            print('algo: {} epochs: {}, micro f1: {}, macro f1:{}'.format(algo, e+1, f1_micro, f1_macro))


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:12<00:00,  1.55it/s]

algo: output/emb__cluster_id__StarNode.out epochs: 20, micro f1: 0.880957810718358, macro f1:0.871848594284028



100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:13<00:00,  1.51it/s]

algo: output/emb__CliqueNode__CliqueNode.out epochs: 20, micro f1: 0.8793614595210947, macro f1:0.8713233961627364



