diff --git a/Project.toml b/Project.toml
index a6f3df0f1..d92fdd945 100644
--- a/Project.toml
+++ b/Project.toml
@@ -7,6 +7,7 @@ version = "0.8.0"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
+DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
 FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
 Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
 GraphMLDatasets = "21828b05-d3b3-40ad-870e-a4bc2f52d5e8"
@@ -17,7 +18,9 @@ NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
 NNlibCUDA = "a00861dc-f156-4864-bf3c-e6376f28a68d"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
+SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+Word2Vec = "c64b6f0f-98cd-51d1-af78-58ae84944834"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [compat]
@@ -27,17 +30,18 @@ DataStructures = "0.18"
 FillArrays = "0.12"
 Flux = "0.12"
 GraphMLDatasets = "0.1"
-GraphSignals = "0.3"
 Graphs = "1.4"
 NNlib = "0.7"
 NNlibCUDA = "0.1"
 Reexport = "1.1"
+Word2Vec = "0.5"
 Zygote = "0.6"
 julia = "1.6 - 1.7"
 
 [extras]
+Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["SparseArrays", "Test"]
+test = ["Clustering", "SparseArrays", "Test"]
diff --git a/docs/Project.toml b/docs/Project.toml
index 1b9ab1f81..059c2a123 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -1,5 +1,6 @@
 [deps]
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
+DocumenterCitations = "daee34ce-89f3-4625-b898-19384cb65244"
 
 [compat]
-Documenter = "0.24"
+Documenter = "0.27"
diff --git a/docs/bibliography.bib b/docs/bibliography.bib
new file mode 100644
index 000000000..66a4f48eb
--- /dev/null
+++ b/docs/bibliography.bib
@@ -0,0 +1,17 @@
+@inproceedings{node2vec2016,
+    author = {Grover, Aditya and Leskovec, Jure},
+    title = {Node2vec: Scalable Feature Learning for Networks},
+    year = {2016},
+    isbn = {9781450342322},
+    publisher = {Association for Computing Machinery},
+    address = {New York, NY, USA},
+    url = {https://doi.org/10.1145/2939672.2939754},
+    doi = {10.1145/2939672.2939754},
+    abstract = {Prediction tasks over nodes and edges in networks require careful effort in engineering features used by learning algorithms. Recent research in the broader field of representation learning has led to significant progress in automating prediction by learning the features themselves. However, present feature learning approaches are not expressive enough to capture the diversity of connectivity patterns observed in networks. Here we propose node2vec, an algorithmic framework for learning continuous feature representations for nodes in networks. In node2vec, we learn a mapping of nodes to a low-dimensional space of features that maximizes the likelihood of preserving network neighborhoods of nodes. We define a flexible notion of a node's network neighborhood and design a biased random walk procedure, which efficiently explores diverse neighborhoods. Our algorithm generalizes prior work which is based on rigid notions of network neighborhoods, and we argue that the added flexibility in exploring neighborhoods is the key to learning richer representations. We demonstrate the efficacy of node2vec over existing state-of-the-art techniques on multi-label classification and link prediction in several real-world networks from diverse domains. Taken together, our work represents a new way for efficiently learning state-of-the-art task-independent representations in complex networks.},
+    booktitle = {Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
+    pages = {855–864},
+    numpages = {10},
+    keywords = {node embeddings, information networks, graph representations, feature learning},
+    location = {San Francisco, California, USA},
+    series = {KDD '16}
+}
diff --git a/docs/make.jl b/docs/make.jl
index 0ad4151e7..609ec81b0 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -1,7 +1,11 @@
 using Documenter
+using DocumenterCitations
 using GeometricFlux
 
+bib = CitationBibliography(joinpath(@__DIR__, "bibliography.bib"), sorting=:nyt)
+
 makedocs(
+    bib,
     sitename = "GeometricFlux.jl",
     format = Documenter.HTML(
       assets = ["assets/flux.css"],
@@ -24,7 +28,8 @@ makedocs(
                ["Convolutional Layers" => "manual/conv.md",
                 "Pooling Layers" => "manual/pool.md",
                 "Models" => "manual/models.md",
-                "Linear Algebra" => "manual/linalg.md"]
+                "Linear Algebra" => "manual/linalg.md"],
+             "References" => "references.md",
     ]
 )
 
diff --git a/docs/src/references.md b/docs/src/references.md
new file mode 100644
index 000000000..78f29bc41
--- /dev/null
+++ b/docs/src/references.md
@@ -0,0 +1,4 @@
+# References
+
+```@bibliography
+```
diff --git a/examples/node2vec.jl b/examples/node2vec.jl
new file mode 100644
index 000000000..14212ea4b
--- /dev/null
+++ b/examples/node2vec.jl
@@ -0,0 +1,39 @@
+using GeometricFlux
+using GraphSignals
+using Graphs
+using SparseArrays
+using Plots
+using GraphPlot
+using Clustering
+using Cairo, Compose
+
+clusters = [1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
+
+int2col_str(x::Int) = x==1 ? "lightblue" : "red"
+
+# embed the karate club graph and split the embedding into two clusters
+g = smallgraph(:karate)
+fg = FeaturedGraph(g)
+vectors = node2vec(fg; walks_per_node=10, len=80, p=1.0, q=1.0)
+R = kmeans(vectors, 2)
+
+# k-means numbers its clusters arbitrarily: ensure that the cluster containing node 1 is cluster 1
+learned_clusters = copy(assignments(R))
+if assignments(R)[1] != 1
+    learned_clusters = [i == 1 ? 2 : 1 for i in assignments(R)]
+end
+
+# draw the graph with every node colored by its learned cluster
+output_plot_name = "karateclub.pdf"
+draw(
+    PDF(output_plot_name, 16cm, 16cm),
+    gplot(g,
+        nodelabel=map(string, 1:nv(g)),
+        nodefillc=int2col_str.(learned_clusters),
+        nodestrokec=fill("white", nv(g))
+    )
+)
+
+incorrect = sum(learned_clusters .!= clusters)
+println(incorrect, " incorrect cluster labelings")
+println("Drawn graph to ", output_plot_name)
diff --git a/src/GeometricFlux.jl b/src/GeometricFlux.jl
index dfa179f03..c6bd41a4a 100644
--- a/src/GeometricFlux.jl
+++ b/src/GeometricFlux.jl
@@ -1,7 +1,10 @@
 module GeometricFlux
 
+using DelimitedFiles
+using SparseArrays
 using Statistics: mean
 using LinearAlgebra: Adjoint, norm, Transpose
+using Random
 using Reexport
 
 using CUDA
@@ -9,11 +12,13 @@ using ChainRulesCore: @non_differentiable
 using FillArrays: Fill
 using Flux
 using Flux: glorot_uniform, leakyrelu, GRUCell, @functor
-using NNlib, NNlibCUDA
-using GraphSignals
+@reexport using GraphSignals
 using Graphs
+using NNlib, NNlibCUDA
 using Zygote
 
+import Word2Vec: word2vec, wordvectors, get_vector
+
 export
     # layers/graphlayers
     AbstractGraphLayer,
@@ -52,7 +57,10 @@ export
     bypass_graph,
 
     # utils
-    generate_cluster
+    generate_cluster,
+
+    # embedding/node2vec
+    node2vec
 
 include("datasets.jl")
 
@@ -67,6 +75,9 @@ include("layers/pool.jl")
 include("models.jl")
 include("layers/misc.jl")
 
+include("sampling.jl")
+include("embedding/node2vec.jl")
+
 include("cuda/conv.jl")
 
 using .Datasets
diff --git a/src/embedding/node2vec.jl b/src/embedding/node2vec.jl
new file mode 100644
index 000000000..3647e194b
--- /dev/null
+++ b/src/embedding/node2vec.jl
@@ -0,0 +1,154 @@
+# Alias table `(J, q)` in the form returned by `alias_setup`: alias indices and thresholds.
+const Alias = Tuple{SparseVector{Int}, SparseVector{Float64}}
+"""
+    node2vec(g; walks_per_node, len, p, q, dims)
+
+Returns an embedding matrix with size of `dims` x `nv(g)`, one column per node. It computes
+node embeddings on graph `g` according to node2vec [node2vec2016](@cite). It performs biased
+random walks on the graph, then computes word embeddings by treating those random walks as
+sentences.
+
+# Arguments
+
+- `g::FeaturedGraph`: The graph to perform random walk on.
+- `walks_per_node::Int`: Number of walks starting on each node,
+  total number of walks is `nv(g) * walks_per_node`
+- `len::Int`: Length of random walks
+- `p::Real`: Return parameter from [node2vec2016](@cite)
+- `q::Real`: In-out parameter from [node2vec2016](@cite)
+- `dims::Int`: Number of vector dimensions
+"""
+function node2vec(g::FeaturedGraph;
+                  walks_per_node::Int=100, len::Int=5, p::Real=0.5, q::Real=0.5,
+                  dims::Int=128)
+    walks = simulate_walks(g; walks_per_node=walks_per_node, len=len, p=p, q=q)
+    model = walks2vec(walks, dims=dims)
+    # walks2vec uses the decimal node index as the "word", so look each node up by that string
+    vecs = [get_vector(model, string(i)) for i in 1:nv(g)]
+    # one column per node, in node order
+    return reduce(hcat, vecs)
+end
+
+"""
+    walks2vec(walks; dims=100)
+
+Modified version of Node2Vec.learn_embeddings[1]. Uses
+a Julia interface[2] to the original word2vec C code[3].
+
+Treats each random walk like a sentence, and computes word
+embeddings using node IDs as words. Returns the model read
+back by `wordvectors`.
+
+[1] https://github.com/ollin18/Node2Vec.jl
+[2] https://github.com/JuliaText/Word2Vec.jl
+[3] https://code.google.com/archive/p/word2vec/
+"""
+function walks2vec(walks::Vector{Vector{Int}}; dims::Int=100)
+    str_walks = map(x -> string.(x), walks)
+
+    # word2vec exchanges data through files. A private temporary directory keeps
+    # concurrent calls from clobbering each other's files, and `mktempdir` removes
+    # it even when training throws.
+    return mktempdir() do dir
+        the_walks = joinpath(dir, "str_walk.txt")
+        the_vecs = joinpath(dir, "str_walk-vec.txt")
+
+        writedlm(the_walks, str_walks)
+        word2vec(the_walks, the_vecs, verbose=true, size=dims)
+        wordvectors(the_vecs)
+    end
+end
+
+
+"""
+    node2vec_walk(g, alias_nodes, alias_edges; start_node, walk_length)
+
+Conducts a random walk over `g` starting at `start_node`, weighted by the alias tables
+`alias_nodes` (first step) and `alias_edges` (later steps, keyed by `(prev, curr)`).
+Returns at most `walk_length` nodes; the walk stops early at a node without out-neighbors.
+"""
+function node2vec_walk(
+    g::FeaturedGraph,
+    alias_nodes::Dict{Int, Alias},
+    alias_edges::Dict{Tuple{Int, Int}, Alias};
+    start_node::Int,
+    walk_length::Int)::Vector{Int}
+    walk = Int[start_node]
+    for _ in 2:walk_length
+        curr = walk[end]
+        # NOTE(review): the alias tables index `neighbors` unsorted -- confirm ids come sorted
+        cur_nbrs = sort(neighbors(g, curr; dir=:out))
+        isempty(cur_nbrs) && break  # dead end: no out-neighbor to step to
+        # the first step has no previous node, so it uses the per-node table
+        alias = length(walk) == 1 ? alias_nodes[curr] : alias_edges[(walk[end-1], curr)]
+        push!(walk, cur_nbrs[alias_sample(alias...)])
+    end
+    return walk
+end
+
+"""
+    get_alias_edge(g, src, dst, p, q)
+
+Returns the alias table `(J, q)` for picking the node that follows the step `src -> dst`.
+The weight of each out-neighbor of `dst` is divided by `p` if it is `src` itself, kept
+as is if it is connected to `src`, and divided by `q` otherwise, then normalized.
+"""
+function get_alias_edge(g::FeaturedGraph, src::Int, dst::Int, p::Real, q::Real)::Alias
+    nbrs, weights = weighted_outneighbors(g, dst)
+    unnormalized_probs = spzeros(length(nbrs))
+    for (i, (dst_nbr, weight)) in enumerate(zip(nbrs, weights))
+        # return to `src` / stay next to `src` / move away from `src`
+        unnormalized_probs[i] =
+            dst_nbr == src ? weight / p :
+            has_edge(g, dst_nbr, src) ? weight : weight / q
+    end
+    return alias_setup(unnormalized_probs ./ sum(unnormalized_probs))
+end
+
+# Returns `(nbrs, weights)`: the out-neighbors of node `i` and row `i` of `sparse(graph(fg))` at them
+function weighted_outneighbors(fg::FeaturedGraph, i::Int)
+    out_nbrs = neighbors(fg, i; dir=:out)
+    return out_nbrs, sparse(graph(fg))[i, out_nbrs]
+end
+
+"""
+    preprocess_modified_weights(g, p, q)
+
+Builds the alias tables that drive the biased walk: one per node for the first step
+(uniform over its out-neighbors) and one per traversed edge, biased by the return
+parameter `p` and the in-out parameter `q` as in the node2vec paper [node2vec2016](@cite).
+Returns `(alias_nodes, alias_edges)`.
+"""
+function preprocess_modified_weights(g::FeaturedGraph, p::Real, q::Real)
+    alias_nodes = Dict{Int, Alias}()
+    for node in 1:nv(g)
+        degree = length(neighbors(g, node, dir=:out))
+        alias_nodes[node] = alias_setup(fill(1, degree) ./ degree)
+    end
+
+    alias_edges = Dict{Tuple{Int, Int}, Alias}()
+    for (_, (src, dst)) in edges(g)
+        alias_edges[(src, dst)] = get_alias_edge(g, src, dst, p, q)
+        # an undirected edge can be walked in both directions
+        if !is_directed(g)
+            alias_edges[(dst, src)] = get_alias_edge(g, dst, src, p, q)
+        end
+    end
+    return alias_nodes, alias_edges
+end
+
+
+"""
+    simulate_walks(g; walks_per_node, len, p, q)
+
+Runs `walks_per_node` rounds of walks; each round starts one walk with `walk_length=len`
+from every node of `g`, visiting the start nodes in a freshly shuffled order.
+"""
+function simulate_walks(g::FeaturedGraph; walks_per_node::Int, len::Int, p::Real, q::Real)::Vector{Vector{Int}}
+    alias_nodes, alias_edges = preprocess_modified_weights(g, p, q)
+    walks = Vector{Vector{Int}}()
+    for _ in 1:walks_per_node, node in shuffle(1:nv(g))
+        push!(walks, node2vec_walk(g, alias_nodes, alias_edges; start_node=node, walk_length=len))
+    end
+    return walks
+end
diff --git a/src/sampling.jl b/src/sampling.jl
new file mode 100644
index 000000000..8d8da7417
--- /dev/null
+++ b/src/sampling.jl
@@ -0,0 +1,55 @@
+"""
+    alias_setup(probs)
+
+Computes the alias table `(J, q)` for the discrete distribution `probs`, which is expected
+to sum to 1.
+
+Slot `i` yields `i` with probability `q[i]` and `J[i]` otherwise (see `alias_sample`).
+`J[i] == 0` marks a slot that has no alias; such slots have `q[i] == 1`.
+"""
+alias_setup(probs::AbstractVector{<:Real}) = alias_setup(sparse(probs))
+
+function alias_setup(probs::SparseVector{<:Real})
+    K = length(probs)
+    J = spzeros(Int, K)
+    q = probs * K
+
+    # q[i] < 1 is equivalent to probs[i] < 1/K but saves the division
+    smaller = Int[i for i in 1:K if q[i] < 1.0]
+    larger = Int[i for i in 1:K if !(q[i] < 1.0)]
+
+    while !isempty(smaller) && !isempty(larger)
+        small = pop!(smaller)
+        large = pop!(larger)
+        J[small] = large
+        # `large` gives up the probability mass that tops slot `small` up to 1
+        q[large] = q[large] + q[small] - 1.0
+        push!(q[large] < 1.0 ? smaller : larger, large)
+    end
+
+    # Round-off can strand slots with `q` marginally off 1 and no alias. Pin them to
+    # exactly 1 so that `alias_sample` can never return the placeholder `J[i] == 0`.
+    for i in vcat(smaller, larger)
+        q[i] = 1.0
+    end
+
+    return J, q
+end
+
+"""
+    alias_sample(J, q)
+
+Draws one index from the alias table `(J, q)` as built by `alias_setup`.
+
+Alias sampling was first described in [1]. [2] might be a helpful resource to understand it.
+
+[1] A. Kronmal and A. V. Peterson. On the alias method for generating random variables from a
+    discrete distribution. The American Statistician, 33(4):214-218, 1979.
+[2] https://lips.cs.princeton.edu/the-alias-method-efficient-sampling-with-many-discrete-outcomes/
+"""
+function alias_sample(J::AbstractVector{<:Integer}, q::AbstractVector{<:Real})
+    isempty(J) && throw(ArgumentError("cannot sample from an empty alias table"))
+    # `rand(1:n)` never yields 0, unlike `ceil(Int, rand() * n)` when `rand()` returns 0.0
+    slot = rand(1:length(J))
+    return rand() < q[slot] ? slot : J[slot]
+end
diff --git a/test/embedding/node2vec.jl b/test/embedding/node2vec.jl
new file mode 100644
index 000000000..d1206fbe3
--- /dev/null
+++ b/test/embedding/node2vec.jl
@@ -0,0 +1,17 @@
+@testset "node2vec" begin
+    # expected cluster label of each of the 34 nodes
+    clusters = [1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
+    fg = FeaturedGraph(smallgraph(:karate))
+    vectors = node2vec(fg; walks_per_node=10, len=80, p=1.0, q=1.0)
+    labels = assignments(kmeans(vectors, 2))
+
+    # k-means numbers its clusters arbitrarily; swap the two labels when needed so that
+    # the cluster containing node 1 is cluster 1
+    learned_clusters = if labels[1] != 1
+        [label == 1 ? 2 : 1 for label in labels]
+    else
+        copy(labels)
+    end
+
+    @test sum(learned_clusters .!= clusters) < 4
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index d8f332a67..198af2ba3 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,5 +1,6 @@
 using GeometricFlux
 using GeometricFlux.Datasets
+using Clustering
 using CUDA
 using Flux
 using Flux: @functor
@@ -24,6 +25,8 @@ tests = [
     "layers/conv",
     "layers/pool",
     "layers/misc",
+    "sampling",
+    "embedding/node2vec",
     "models",
 ]
 
diff --git a/test/sampling.jl b/test/sampling.jl
new file mode 100644
index 000000000..0404a0f4d
--- /dev/null
+++ b/test/sampling.jl
@@ -0,0 +1,12 @@
+@testset "alias sampling" begin
+    probs = [0.05, 0.1, 0.2, 0.3, 0.2, 0.1, 0.05]
+    J, q = GeometricFlux.alias_setup(probs)
+    samples = [GeometricFlux.alias_sample(J, q) for _ in 1:1000]
+    @test length(J) == length(q) == length(probs)
+    @test maximum(q) <= 1.1
+    @test minimum(q) >= 0.0
+    @test maximum(J) <= length(probs)
+    @test minimum(J) >= 0
+    @test length(samples) == 1000
+    @test all(in(eachindex(probs)), samples)
+end