In [1]:
import os
import os.path as osp

import XGCN
from XGCN.data import io, csr
from XGCN.utils.utils import ensure_dir, set_random_seed

  from .autonotebook import tqdm as notebook_tqdm


## Prepare the raw data

Let's begin with a small social network dataset: facebook.
The data is included in the XGCN repository: ``data/raw_facebook``. 
You can also download it from SNAP: http://snap.stanford.edu/data/facebook_combined.txt.gz .

We recommend arranging the data with a clear directory structure. 
To get started, you may manually set up an ``XGCN_data`` directory (or any other name you like) as follows: 
(We recommend putting your ``XGCN_data`` directory somewhere outside this repository.)

```
XGCN_data
└── dataset
    └── raw_facebook
        └── facebook_combined.txt
```

From now on, we'll use this directory to hold all the different datasets 
and model outputs. 
We refer to its path as ``all_data_root`` in our Python code and shell scripts. 


In [2]:
# Set your own all_data_root: either export the XGCN_DATA_ROOT environment
# variable before starting the kernel, or edit the fallback path below.
# The fallback is a machine-specific absolute path and will not exist on
# other machines.
all_data_root = os.environ.get(
    'XGCN_DATA_ROOT',
    '/home/wuyao/songxiran/code/XGCN_coda_and_data/data',
)

### Load the raw graph

In [9]:
# Dataset name; it is appended to 'dataset/raw_' to form the raw-data directory.
dataset = 'facebook'

# Path of the raw-data directory, then of the raw graph file inside it.
raw_subdir = 'dataset/raw_' + dataset
raw_data_root = osp.join(all_data_root, raw_subdir)
file_raw_graph = osp.join(raw_data_root, 'facebook_combined.txt')

print(file_raw_graph)

/home/wuyao/songxiran/code/XGCN_coda_and_data/data/dataset/raw_facebook/facebook_combined.txt


In [10]:
# Load the raw graph file. The result is unpacked into two objects which,
# judging by their names, hold the source and destination node of each edge
# (NOTE(review): confirm against io.load_txt_edges).
E_src, E_dst = io.load_txt_edges(file_raw_graph)

# Show both objects, one per line.
print(E_src, E_dst, sep='\n')

[   0    0    0 ... 4027 4027 4031]
[   1    2    3 ... 4032 4038 4038]


In [11]:
# Convert the edge arrays into CSR form for a 'homo' graph. The call yields
# three values: an info record plus the indptr and indices arrays.
csr_result = csr.from_edges_to_csr_with_info(E_src, E_dst, graph_type='homo')
info, indptr, indices = csr_result

print(info)

# from_edges_to_csr ...
# remove_repeated_edges ...
## 0 edges are removed
{'graph_type': 'homo', 'num_nodes': 4039, 'num_edges': 88234}


Loading large graphs from text files can be time-consuming (though the facebook graph here is a small one), so we can cache the graph using ``io.save_pickle``: 

In [12]:
# Cache directory for the raw graph, placed next to the raw text file.
# ensure_dir presumably creates it when it is missing.
raw_csr_root = osp.join(raw_data_root, 'csr')
ensure_dir(raw_csr_root)

# The info record is written as YAML; the two CSR arrays are pickled.
io.save_yaml(osp.join(raw_csr_root, 'info.yaml'), info)
for pkl_name, array in (('indptr.pkl', indptr), ('indices.pkl', indices)):
    io.save_pickle(osp.join(raw_csr_root, pkl_name), array)

## Split validation/test set

Assume that we don't have an existing evaluation set 
and want to split off some edges for model evaluation.

In [13]:
# Fix the random seed before sampling (presumably so the split is repeatable).
set_random_seed(1999)

num_sample = 10_000       # number of edges to split
min_src_out_degree = 3    # guarantee the minimum out-degree of a source node
min_dst_in_degree = 1     # guarantee the minimum in-degree of a destination node

# NOTE: indptr and indices are rebound to the values returned by split_edges,
# so running this cell a second time would split the already-split graph.
split_result = XGCN.data.split_edges(
    indptr, indices,
    num_sample, min_src_out_degree, min_dst_in_degree
)
indptr, indices, pos_edges = split_result

# Keep the recorded edge count in sync with the new indices array.
info.update(num_edges=len(indices))
print(info)

sampling edges 9999/10000 (99.99%)
num sampled edges: 10000
{'graph_type': 'homo', 'num_nodes': 4039, 'num_edges': 78234}


Now that we have all the positive edges (``pos_edges``), let's divide them into a 
validation set and a test set. We'll use the "whole-graph-multi-pos" evaluation method:

In [14]:
# The first num_validation positive edges form the validation split;
# everything after them forms the test split.
num_validation = 2000
val_edges, test_edges = pos_edges[:num_validation], pos_edges[num_validation:]

# Convert each edge subset into an evaluation set (validation first, then test).
val_set, test_set = (
    XGCN.data.from_edges_to_adj_eval_set(edges)
    for edges in (val_edges, test_edges)
)

Now that we have generated a complete dataset instance, let's save it:

In [16]:
# Output directory of the processed dataset instance.
# ensure_dir presumably creates it when it is missing.
instance_subdir = 'dataset/instance_' + dataset
data_root = osp.join(all_data_root, instance_subdir)
ensure_dir(data_root)

# The info record is written as YAML.
io.save_yaml(osp.join(data_root, 'info.yaml'), info)

# Everything else is pickled, in this order.
pickled_artifacts = (
    ('indptr.pkl', indptr),
    ('indices.pkl', indices),
    ('pos_edges.pkl', pos_edges),
    ('val_set.pkl', val_set),
    ('test_set.pkl', test_set),
)
for pkl_name, artifact in pickled_artifacts:
    io.save_pickle(osp.join(data_root, pkl_name), artifact)

Here we also save ``pos_edges``, so that you can later use it to build evaluation sets for 
the "one-pos-k-neg" or "whole-graph-one-pos" methods by concatenating some randomly 
sampled negative nodes. 

If you have completed the above steps successfully, your data directory will look as follows: 

```
XGCN_data
└── dataset
    ├── raw_facebook
    |   ├── facebook_combined.txt
    |   └── csr
    |       ├── indices.pkl
    |       ├── indptr.pkl
    |       └── info.yaml
    └── instance_facebook
        ├── indices.pkl
        ├── indptr.pkl
        ├── info.yaml
        ├── pos_edges.pkl
        ├── test_set.pkl
        └── val_set.pkl
```
