Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ version = "0.8.0"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
GraphMLDatasets = "21828b05-d3b3-40ad-870e-a4bc2f52d5e8"
Expand All @@ -17,7 +18,9 @@ NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
NNlibCUDA = "a00861dc-f156-4864-bf3c-e6376f28a68d"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Word2Vec = "c64b6f0f-98cd-51d1-af78-58ae84944834"
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"

[compat]
Expand All @@ -27,17 +30,18 @@ DataStructures = "0.18"
FillArrays = "0.12"
Flux = "0.12"
GraphMLDatasets = "0.1"
GraphSignals = "0.3"
Graphs = "1.4"
NNlib = "0.7"
NNlibCUDA = "0.1"
Reexport = "1.1"
Word2Vec = "0.5"
Zygote = "0.6"
julia = "1.6 - 1.7"

[extras]
Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["SparseArrays", "Test"]
test = ["Clustering", "SparseArrays", "Test"]
3 changes: 2 additions & 1 deletion docs/Project.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
[deps]
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
DocumenterCitations = "daee34ce-89f3-4625-b898-19384cb65244"

[compat]
Documenter = "0.24"
Documenter = "0.27"
17 changes: 17 additions & 0 deletions docs/bibliography.bib
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
@inproceedings{node2vec2016,
author = {Grover, Aditya and Leskovec, Jure},
title = {Node2vec: Scalable Feature Learning for Networks},
year = {2016},
isbn = {9781450342322},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/2939672.2939754},
doi = {10.1145/2939672.2939754},
abstract = {Prediction tasks over nodes and edges in networks require careful effort in engineering features used by learning algorithms. Recent research in the broader field of representation learning has led to significant progress in automating prediction by learning the features themselves. However, present feature learning approaches are not expressive enough to capture the diversity of connectivity patterns observed in networks. Here we propose node2vec, an algorithmic framework for learning continuous feature representations for nodes in networks. In node2vec, we learn a mapping of nodes to a low-dimensional space of features that maximizes the likelihood of preserving network neighborhoods of nodes. We define a flexible notion of a node's network neighborhood and design a biased random walk procedure, which efficiently explores diverse neighborhoods. Our algorithm generalizes prior work which is based on rigid notions of network neighborhoods, and we argue that the added flexibility in exploring neighborhoods is the key to learning richer representations. We demonstrate the efficacy of node2vec over existing state-of-the-art techniques on multi-label classification and link prediction in several real-world networks from diverse domains. Taken together, our work represents a new way for efficiently learning state-of-the-art task-independent representations in complex networks.},
booktitle = {Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
pages = {855–864},
numpages = {10},
keywords = {node embeddings, information networks, graph representations, feature learning},
location = {San Francisco, California, USA},
series = {KDD '16}
}
7 changes: 6 additions & 1 deletion docs/make.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
using Documenter
using DocumenterCitations
using GeometricFlux

bib = CitationBibliography(joinpath(@__DIR__, "bibliography.bib"), sorting=:nyt)

makedocs(
bib,
sitename = "GeometricFlux.jl",
format = Documenter.HTML(
assets = ["assets/flux.css"],
Expand All @@ -24,7 +28,8 @@ makedocs(
["Convolutional Layers" => "manual/conv.md",
"Pooling Layers" => "manual/pool.md",
"Models" => "manual/models.md",
"Linear Algebra" => "manual/linalg.md"]
"Linear Algebra" => "manual/linalg.md"],
"References" => "references.md",
]
)

Expand Down
4 changes: 4 additions & 0 deletions docs/src/references.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# References

```@bibliography
```
39 changes: 39 additions & 0 deletions examples/node2vec.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
using GeometricFlux
using GraphSignals
using Graphs
using SparseArrays
using Plots
using GraphPlot
using Clustering
using Cairo, Compose

# Reference community label (1 or 2) of each of the 34 karate-club nodes,
# used below to count how many learned labels disagree.
clusters = [1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]

# Map a cluster label to the fill colour used when drawing the node.
function int2col_str(x::Int)
    if x == 1
        return "lightblue"
    else
        return "red"
    end
end

# Embed the karate-club graph with node2vec and split the embeddings into two clusters.
g = smallgraph(:karate)
fg = FeaturedGraph(g)
vectors = node2vec(fg; walks_per_node=10, len=80, p=1.0, q=1.0)
R = kmeans(vectors, 2)

# k-means numbers its clusters arbitrarily; swap the two labels when needed so that
# the cluster containing node 1 is always cluster 1.
learned_clusters = if assignments(R)[1] == 1
    copy(assignments(R))
else
    [label == 1 ? 2 : 1 for label in assignments(R)]
end

# Draw the graph, colouring every node by its learned cluster.
output_plot_name = "karateclub.pdf"
node_labels = string.(1:34)
node_fill = int2col_str.(learned_clusters[1:34])
node_stroke = fill("white", 34)
draw(
    PDF(output_plot_name, 16cm, 16cm),
    gplot(g, nodelabel=node_labels, nodefillc=node_fill, nodestrokec=node_stroke),
)

# Report how many nodes ended up with a label different from the reference one.
incorrect = count(learned_clusters .!= clusters)
println(incorrect, " incorrect cluster labelings")
println("Drawn graph to ", output_plot_name)
17 changes: 14 additions & 3 deletions src/GeometricFlux.jl
Original file line number Diff line number Diff line change
@@ -1,19 +1,24 @@
module GeometricFlux

using DelimitedFiles
using SparseArrays
using Statistics: mean
using LinearAlgebra: Adjoint, norm, Transpose
using Random
using Reexport

using CUDA
using ChainRulesCore: @non_differentiable
using FillArrays: Fill
using Flux
using Flux: glorot_uniform, leakyrelu, GRUCell, @functor
using NNlib, NNlibCUDA
using GraphSignals
@reexport using GraphSignals
using Graphs
using NNlib, NNlibCUDA
using Zygote

import Word2Vec: word2vec, wordvectors, get_vector

export
# layers/graphlayers
AbstractGraphLayer,
Expand Down Expand Up @@ -52,7 +57,10 @@ export
bypass_graph,

# utils
generate_cluster
generate_cluster,

#node2vec
node2vec

include("datasets.jl")

Expand All @@ -67,6 +75,9 @@ include("layers/pool.jl")
include("models.jl")
include("layers/misc.jl")

include("sampling.jl")
include("embedding/node2vec.jl")

include("cuda/conv.jl")

using .Datasets
Expand Down
154 changes: 154 additions & 0 deletions src/embedding/node2vec.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
# Alias table `(J, q)` in the form produced by `alias_setup` and consumed by
# `alias_sample`: `J[i]` is the alias index of column `i` and `q[i]` is the
# threshold below which column `i` itself is returned instead of its alias.
const Alias = Tuple{SparseVector{Int}, SparseVector{Float64}}

"""
    node2vec(g; walks_per_node=100, len=5, p=0.5, q=0.5, dims=128)

Compute node embeddings on graph `g` according to node2vec [node2vec2016](@cite).
It performs biased random walks on the graph, then computes word embeddings by treating
those random walks as sentences and the node IDs as words.

Returns an embedding matrix of size `dims` x `nv(g)`: column `i` holds the embedding
of node `i`.

# Arguments

- `g::FeaturedGraph`: The graph to perform random walk on.
- `walks_per_node::Int`: Number of walks starting on each node,
  total number of walks is `nv(g) * walks_per_node`
- `len::Int`: Length of random walks
- `p::Real`: Return parameter from [node2vec2016](@cite), must be positive
- `q::Real`: In-out parameter from [node2vec2016](@cite), must be positive
- `dims::Int`: Number of vector dimensions

# Throws

- `ArgumentError` if `p` or `q` is not positive (the transition weights are divided
  by them).
"""
function node2vec(
    g::FeaturedGraph;
    walks_per_node::Int=100,
    len::Int=5,
    p::Real=0.5,
    q::Real=0.5,
    dims::Int=128,
)
    p > 0 || throw(ArgumentError("return parameter p must be positive, got $p"))
    q > 0 || throw(ArgumentError("in-out parameter q must be positive, got $q"))

    walks = simulate_walks(g; walks_per_node=walks_per_node, len=len, p=p, q=q)
    model = walks2vec(walks, dims=dims)

    # `walks2vec` writes node IDs as strings, so the vocabulary is looked up by `string(i)`.
    # NOTE(review): a node missing from the learned vocabulary makes `get_vector` fail;
    # presumably this can happen for very small `walks_per_node` — confirm against Word2Vec.
    vecs = [get_vector(model, string(i)) for i in 1:nv(g)]
    return reduce(hcat, vecs)
end

"""
    walks2vec(walks; dims=100)

Modified version of Node2Vec.learn_embeddings[1]. Uses
a Julia interface[2] to the original word2vec C code[3].

Treats each random walk like a sentence, and computes word
embeddings using node IDs as words. Returns the model read back by `wordvectors`.

The walks and the resulting vectors are exchanged with word2vec through files inside a
freshly created temporary directory, which is removed again before returning, whether or
not word2vec succeeds.

# Arguments

- `walks::Vector{Vector{Int}}`: Random walks, each a sequence of node IDs.
- `dims::Int`: Number of vector dimensions passed to word2vec as `size`.

[1] https://github.com/ollin18/Node2Vec.jl
[2] https://github.com/JuliaText/Word2Vec.jl
[3] https://code.google.com/archive/p/word2vec/
"""
function walks2vec(walks::Vector{Vector{Int}}; dims::Int=100)
    str_walks = map(walk -> string.(walk), walks)

    # A private temporary directory keeps concurrent runs from overwriting each other's
    # files, and `mktempdir(f)` deletes it even when an exception is thrown inside.
    return mktempdir() do dir
        the_walks = joinpath(dir, "str_walk.txt")
        the_vecs = joinpath(dir, "str_walk-vec.txt")

        writedlm(the_walks, str_walks)
        word2vec(the_walks, the_vecs, verbose=true, size=dims)
        wordvectors(the_vecs)
    end
end


"""
    node2vec_walk(g, alias_nodes, alias_edges; start_node, walk_length)

Conducts a random walk over `g`, weighted by alias sampling probabilities
`alias_nodes` and `alias_edges`.

The first step is drawn from `alias_nodes[start_node]`; every later step is drawn from
`alias_edges[(prev, curr)]`, where `prev` and `curr` are the last two visited nodes.
The walk ends early when the current node has no outgoing neighbors, so the result
holds at most `walk_length` nodes and always starts with `start_node`.

# Arguments

- `g::FeaturedGraph`: The graph to walk on.
- `alias_nodes::Dict{Int, Alias}`: Alias table per node, used for the first step.
- `alias_edges::Dict{Tuple{Int, Int}, Alias}`: Alias table per `(prev, curr)` edge.
- `start_node::Int`: Node the walk starts from.
- `walk_length::Int`: Maximum number of nodes in the walk.
"""
function node2vec_walk(
    g::FeaturedGraph,
    alias_nodes::Dict{Int, Alias},
    alias_edges::Dict{Tuple{Int, Int}, Alias};
    start_node::Int,
    walk_length::Int)::Vector{Int}
    walk = Int[start_node]
    for _ in 2:walk_length
        curr = walk[end]
        # NOTE(review): the sampled index is applied to the *sorted* neighbor list, while
        # the alias tables are built in the order returned by `neighbors` — confirm that
        # `neighbors` already returns sorted output.
        cur_nbrs = sort(neighbors(g, curr; dir=:out))

        # Dead end (isolated node or sink): nothing to sample from, stop the walk here
        # instead of indexing into an empty alias table.
        isempty(cur_nbrs) && break

        J, q = if length(walk) == 1
            alias_nodes[curr]
        else
            alias_edges[(walk[end-1], curr)]
        end
        push!(walk, cur_nbrs[alias_sample(J, q)])
    end
    return walk
end

"""
    get_alias_edge(g, src, dst, p, q)

Returns the alias table `(J, q)` used to pick the next node of a walk that has just
moved along the edge from `src` to `dst`.

Each outgoing neighbor of `dst` gets an unnormalized weight derived from its edge weight:

- `weight / p` if the neighbor is `src` itself (stepping back),
- `weight` if the neighbor has an edge to `src`,
- `weight / q` otherwise.

The weights are normalized to sum to one and handed to `alias_setup`.

# Arguments

- `g::FeaturedGraph`: The graph being walked.
- `src::Int`: Previously visited node.
- `dst::Int`: Current node, whose outgoing neighbors are the candidates.
- `p::Real`: Return parameter.
- `q::Real`: In-out parameter.
"""
function get_alias_edge(g::FeaturedGraph, src::Int, dst::Int, p::Real, q::Real)::Alias
    nbrs, weights = weighted_outneighbors(g, dst)
    unnormalized_probs = spzeros(length(nbrs))
    for (i, (dst_nbr, weight)) in enumerate(zip(nbrs, weights))
        if dst_nbr == src
            unnormalized_probs[i] = weight / p
        elseif has_edge(g, dst_nbr, src)
            unnormalized_probs[i] = weight
        else
            unnormalized_probs[i] = weight / q
        end
    end
    normalized_probs = unnormalized_probs ./ sum(unnormalized_probs)
    return alias_setup(normalized_probs)
end

# Returns `(neighbors, weights)`: the outgoing neighbors of node `i` together with the
# entries of row `i` of `sparse(graph(fg))` at those neighbors, in the same order.
# NOTE(review): `sparse(graph(fg))` is evaluated again on every call — confirm this is
# cheap for the graph representation in use.
function weighted_outneighbors(fg::FeaturedGraph, i::Int)
    out_nbrs = neighbors(fg, i; dir=:out)
    edge_weights = sparse(graph(fg))[i, out_nbrs]
    return out_nbrs, edge_weights
end

"""
    preprocess_modified_weights(g, p, q)

Builds the alias tables that drive the biased random walks, using return parameter `p`
and in-out parameter `q`, as specified in the node2vec paper [node2vec2016](@cite).

Returns `(alias_nodes, alias_edges)`:

- `alias_nodes[v]` is the alias table of a uniform distribution over the outgoing
  neighbors of node `v`.
- `alias_edges[(src, dst)]` is the table computed by `get_alias_edge` for the edge from
  `src` to `dst`; for an undirected graph the reversed pair `(dst, src)` is stored too.
"""
function preprocess_modified_weights(g::FeaturedGraph, p::Real, q::Real)
    # First step of a walk: every outgoing neighbor is equally likely.
    alias_nodes = Dict{Int, Alias}()
    for v in 1:nv(g)
        degree = length(neighbors(g, v, dir=:out))
        alias_nodes[v] = alias_setup(fill(1, degree) ./ degree)
    end

    # Later steps: transition probabilities depend on the edge just traversed.
    alias_edges = Dict{Tuple{Int, Int}, Alias}()
    for (_, (src, dst)) in edges(g)
        alias_edges[(src, dst)] = get_alias_edge(g, src, dst, p, q)
        is_directed(g) && continue
        alias_edges[(dst, src)] = get_alias_edge(g, dst, src, p, q)
    end

    return alias_nodes, alias_edges
end


"""
    simulate_walks(g; walks_per_node, len, p, q)

Given a graph, compute `walks_per_node` * nv(g) random walks.

The alias tables are built once from `p` and `q`. Then, in each of the `walks_per_node`
rounds, one walk of length `len` is started from every node, visiting the start nodes
in a freshly shuffled order.
"""
function simulate_walks(g::FeaturedGraph; walks_per_node::Int, len::Int, p::Real, q::Real)::Vector{Vector{Int}}
    node_aliases, edge_aliases = preprocess_modified_weights(g, p, q)
    all_walks = Vector{Vector{Int}}()
    for _ in 1:walks_per_node
        start_order = shuffle(1:nv(g))
        for start in start_order
            path::Vector{Int} = node2vec_walk(
                g, node_aliases, edge_aliases; start_node=start, walk_length=len
            )
            push!(all_walks, path)
        end
    end
    return all_walks
end
55 changes: 55 additions & 0 deletions src/sampling.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
"""
    alias_setup(probs)

Computes the alias table `(J, q)` for the discrete distribution `probs`, for use with
`alias_sample`.

`probs` is expected to sum to one. With `K = length(probs)`, the returned
`q[i]` is the probability of keeping column `i` when it is drawn, and `J[i]` is the
alias index returned otherwise. `J[i]` is left at `0` for columns that never redirect
(their `q[i]` is at least one).

Dense input is converted with `sparse` first; both returned vectors are sparse vectors
of length `K`. An empty `probs` yields two empty vectors.
"""
alias_setup(probs::AbstractVector{<:Real}) = alias_setup(sparse(probs))

function alias_setup(probs::SparseVector{<:Real})
    K = length(probs)
    J = spzeros(Int, K)
    q = probs * K

    smaller = Int[]  # prob idxs < 1/K
    larger = Int[]   # prob idxs >= 1/K

    for i in 1:K
        # q[i] < 1 is equivalent to prob < 1/K but saves the division
        push!(q[i] < 1.0 ? smaller : larger, i)
    end

    # Pair each under-full column with an over-full one, moving the surplus of the
    # large column into the small column's alias slot.
    while !isempty(smaller) && !isempty(larger)
        small = pop!(smaller)
        large = pop!(larger)
        J[small] = large
        q[large] = q[large] + q[small] - 1.0
        if q[large] < 1.0
            push!(smaller, large)
        else
            push!(larger, large)
        end
    end

    # Columns still marked as under-full have no alias (J is 0 there); they only remain
    # because of rounding or input that sums to less than one. Pin them to one so that
    # `alias_sample` can never return the invalid alias index 0.
    for i in smaller
        q[i] = 1.0
    end

    return J, q
end

"""
    alias_sample(J, q)

Draws one index in `1:length(J)` from the alias table `(J, q)`.

A column is picked uniformly at random; the column's own index is returned with
probability `q[column]`, otherwise its alias `J[column]` is returned.

Alias Sampling first described in [1]. [2] might be a helpful resource to understand alias sampling.

# Throws

- `ArgumentError` if the alias table is empty.

[1] A. Kronmal and A. V. Peterson. On the alias method for generating random variables from a
discrete distribution. The American Statistician, 33(4):214-218, 1979.
[2] https://lips.cs.princeton.edu/the-alias-method-efficient-sampling-with-many-discrete-outcomes/
"""
function alias_sample(J::AbstractVector{<:Integer}, q::AbstractVector{<:Real})
    isempty(J) && throw(ArgumentError("cannot sample from an empty alias table"))
    # Sample the column as an integer: `ceil(Int, rand() * K)` would give the invalid
    # index 0 whenever `rand()` returns exactly 0.0.
    small_index = rand(1:length(J))
    if rand() < q[small_index]
        return small_index
    else
        return J[small_index]
    end
end
Loading