In [3]:
from gensim.models import word2vec
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import networkx as nx
from networkx.algorithms import bipartite
import csv
import node2vec
import itertools



In [6]:
# Build the document/name bipartite graph from all_names.csv.
# Each CSV row is read as: first cell = document id, remaining cells = names
# that are linked to that document.
B = nx.Graph()
documents = []
names = []
edges = []

# newline='' is the mode the csv module asks for, so quoted fields containing
# line breaks are parsed correctly.
with open('all_names.csv', newline='') as all_names:
    for row in csv.reader(all_names):
        # csv.reader yields [] for blank lines; row[0] would raise IndexError.
        if not row:
            continue
        document = row[0]
        # Drop empty cells (e.g. padded/ragged rows) so they do not become a
        # single '' name node connected to many unrelated documents.
        row_names = [name for name in row[1:] if name]
        documents.append(document)
        names.extend(row_names)
        edges.extend((document, name) for name in row_names)

# Add nodes with the node attribute "bipartite": 0 = documents, 1 = names.
B.add_nodes_from(documents, bipartite=0)
B.add_nodes_from(names, bipartite=1)
B.add_edges_from(edges)

In [7]:
# Node count before pruning.
print(len(B.nodes))

# B[node] is the adjacency view of `node`; its length is the number of
# neighbours. Collect every node with at most one neighbour first, then remove
# them in one call so the graph is not mutated while it is being iterated.
one_edge_nodes = [node for node in B.nodes if len(B[node]) <= 1]

B.remove_nodes_from(one_edge_nodes)

# Node count after pruning.
print(len(B.nodes))

5273
1048


# Clustering
## Girvan Newman

In [3]:
from igraph import *
# Either load a saved graph from a GML file (as below) or build your own, like the graph constructed above

# Load the graph to cluster from its GML export.
bipartite_graph = Graph.Read_GML('bipartite.gml')

# Girvan Newman clusters: run edge-betweenness community detection and turn
# the result into a flat clustering that can be iterated cluster by cluster.
gn_clusters = bipartite_graph.community_edge_betweenness().as_clustering()
# plot(gn_clusters)

# Write one CSV row per cluster: the cluster index followed by member labels.
# The `with` block guarantees the file is flushed and closed; the bare
# csv.writer(open(...)) form left the handle open, so rows could stay buffered.
with open("gn_clusters.csv", "w", newline='') as gn_file:
    writer = csv.writer(gn_file)
    writer.writerow(["cluster", "members"])
    for count, cluster in enumerate(gn_clusters):
        # Each cluster is a sequence of vertex indices; map them to labels.
        members = [bipartite_graph.vs[member]["label"] for member in cluster]
        writer.writerow([count] + members)

## D3.js viz

In [51]:
import json

# One node record per vertex: its label plus the index of the Girvan-Newman
# cluster it belongs to (used as the "group" field).
nodes = [
    {"id": bipartite_graph.vs[vertex]['label'], "group": group}
    for group, cluster in enumerate(gn_clusters)
    for vertex in cluster
]

# One link record per edge, with both endpoints identified by vertex label.
links = [
    {
        "source": bipartite_graph.vs[edge.source]['label'],
        "target": bipartite_graph.vs[edge.target]['label'],
    }
    for edge in bipartite_graph.es
]

# Last expression of the cell: the {"nodes": ..., "links": ...} JSON string.
json.dumps({"nodes": nodes, "links": links})

'{"nodes": [{"id": "1830.PA-09.24.PHIL.txt", "group": 0}, {"id": "1831.PA-06.06.PHIL.txt", "group": 0}, {"id": "1832.PA-06.04.PHIL.txt", "group": 0}, {"id": "1833.PA-06.03.PHIL.txt", "group": 0}, {"id": "1834.NY-06.02.NEWY_ADDRESS.txt", "group": 0}, {"id": "1834.NY-06.02.NEWY_MINUTES.txt", "group": 0}, {"id": "1879.TX-07.02.HOUS.txt", "group": 0}, {"id": "Cyrus Black", "group": 0}, {"id": "Junius C. Morel", "group": 0}, {"id": "James Cornish", "group": 0}, {"id": "Allen", "group": 0}, {"id": "Richard Allen", "group": 0}, {"id": "Belfast Burton", "group": 0}, {"id": "Robert Cowley", "group": 0}, {"id": "Peter Gardiner", "group": 0}, {"id": "Charles H. Leveck", "group": 0}, {"id": "Frederick A. Hinton", "group": 0}, {"id": "Austin Steward", "group": 0}, {"id": "C. Augustus", "group": 0}, {"id": "George C. Willis", "group": 0}, {"id": "Alfred Niger", "group": 0}, {"id": "Hezekiah Grice", "group": 0}, {"id": "Abraham D. Shad", "group": 0}, {"id": "Wm. Duncan", "group": 0}, {"id": "Robert B

## Louvain

In [4]:
# Louvain clusters (igraph's multilevel community detection).
louvain_clusters = bipartite_graph.community_multilevel()
# plot(louvain_clusters)

# Write one CSV row per cluster: the cluster index followed by member labels.
# The `with` block guarantees the file is flushed and closed; the bare
# csv.writer(open(...)) form left the handle open, so rows could stay buffered.
with open("louvain_clusters.csv", "w", newline='') as louvain_file:
    writer = csv.writer(louvain_file)
    writer.writerow(["cluster", "members"])
    for count, cluster in enumerate(louvain_clusters):
        # Each cluster is a sequence of vertex indices; map them to labels.
        members = [bipartite_graph.vs[member]["label"] for member in cluster]
        writer.writerow([count] + members)

# Last expression of the cell: one-line summary of the clustering.
louvain_clusters.summary()

'Clustering with 1048 elements and 11 clusters'

In [5]:
import json

# One node record per vertex: its label plus the index of the Louvain cluster
# it belongs to (used as the "group" field).
nodes = [
    {"id": bipartite_graph.vs[vertex]['label'], "group": group}
    for group, cluster in enumerate(louvain_clusters)
    for vertex in cluster
]

# One link record per edge, with both endpoints identified by vertex label.
links = [
    {
        "source": bipartite_graph.vs[edge.source]['label'],
        "target": bipartite_graph.vs[edge.target]['label'],
    }
    for edge in bipartite_graph.es
]

# Last expression of the cell: the {"nodes": ..., "links": ...} JSON string.
json.dumps({"nodes": nodes, "links": links})

'{"nodes": [{"id": "1840.NY-08.18.ALBA.1.txt", "group": 0}, {"id": "1840.NY-08.18.ALBA.2.txt", "group": 0}, {"id": "1843.NY-08.15.BUFF.txt", "group": 0}, {"id": "1844.NY-09.18.SCHE-Minutes.txt", "group": 0}, {"id": "1844.NY-09.18.SCHE-Report.txt", "group": 0}, {"id": "1847.NY-11.06.TROY.txt", "group": 0}, {"id": "1850.NY-08.21.CAZE.txt", "group": 0}, {"id": "1851.NY-07.22.ALBA.txt", "group": 0}, {"id": "1854.NY-01.02.ALBA.txt", "group": 0}, {"id": "1855.CT-04-18.HART.txt", "group": 0}, {"id": "1855.NY-01.20.ALBA.txt", "group": 0}, {"id": "1855.NY-09.04.TROY.txt", "group": 0}, {"id": "1858.NY-09.14.TROY.txt", "group": 0}, {"id": "1863.NY-07.16.POUG.txt", "group": 0}, {"id": "Schuyler", "group": 0}, {"id": "J. C. Morel", "group": 0}, {"id": "Philip A. Bell", "group": 0}, {"id": "Gurley", "group": 0}, {"id": "P. A. Bell", "group": 0}, {"id": "Charles Morton", "group": 0}, {"id": "Henry Thompson", "group": 0}, {"id": "Austin Stewart", "group": 0}, {"id": "Abner H. Francis", "group": 0}, {"

## Node2Vec

In [9]:
# FILES
EMBEDDING_FILENAME = './bipartite.emb'
EMBEDDING_MODEL_FILENAME = './bipartite.model'

# Precompute probabilities and generate walks.
# The instance is bound to `node2vec_model`, not `node2vec`: reusing the module
# name would shadow the imported `node2vec` module, and re-running this cell
# would then fail with AttributeError on `node2vec.Node2Vec`.
node2vec_model = node2vec.Node2Vec(B, dimensions=64, walk_length=30, num_walks=200, workers=4)

# Embed
model = node2vec_model.fit(window=10, min_count=1, batch_words=4)

# Any keywords acceptable by gensim.Word2Vec can be passed,
# `dimensions` and `workers` are automatically passed (from the Node2Vec constructor)

# Look for most similar nodes
# print(model.wv.most_similar('Cornish'))  # Output node names are always strings

# Save embeddings for later use
model.wv.save_word2vec_format(EMBEDDING_FILENAME)

# Save model for later use
model.save(EMBEDDING_MODEL_FILENAME)

# Loading the word2vec/node2vec model

In [12]:
# Reload the saved model from disk so the cells below do not require
# retraining in the current kernel.
model = word2vec.Word2Vec.load('bipartite.model')

# Stack the vector of every vocabulary entry, in vocabulary iteration order.
# NOTE(review): `wv.vocab` looks like the gensim 3.x vocabulary mapping -
# confirm the gensim version this notebook is pinned to.
X = model.wv[list(model.wv.vocab)]

In [14]:
# Ten nearest neighbours of the node 'Johnson' in the embedding space.
model.wv.most_similar('Johnson', topn=10)

[('J.A. Harris', 0.5270813703536987),
 ('Whittlesey', 0.5191370248794556),
 ('Isaac Sweet', 0.5173918604850769),
 ('Granville', 0.5136761665344238),
 ('Eliphalet Whittlesey', 0.509271502494812),
 ('J.P. Shooks', 0.5082486271858215),
 ('Draven', 0.501055896282196),
 ('Mr. J.H. Harris', 0.49939534068107605),
 ('Holden', 0.49890297651290894),
 ('Edgecomb', 0.4983765184879303)]

In [25]:
# Ten nearest neighbours of the bare surname node 'Ray'.
model.wv.most_similar('Ray', topn=10)

[('Charles B. Ray', 0.6662619113922119),
 ('Patrick H. Reason', 0.5894392728805542),
 ('R. P. G. Wright', 0.5465505123138428),
 ('Wm. P. Johnson', 0.5354915261268616),
 ('Beman', 0.5277462601661682),
 ('John J. Zuille', 0.5266906023025513),
 ('Theodore S. Wright', 0.5250387787818909),
 ('Fisk', 0.5201966762542725),
 ('Francis Thompson', 0.5119348168373108),
 ('A. G. Beman', 0.5045610666275024)]

In [26]:
# Ten nearest neighbours of the full-name node 'Charles B. Ray', for
# comparison with the bare-surname query 'Ray'.
model.wv.most_similar('Charles B. Ray', topn=10)

[('Ray', 0.6662619113922119),
 ('Rich', 0.6139453053474426),
 ('S. R. Ward', 0.6038385629653931),
 ('T. S. Wright', 0.5802088975906372),
 ('William P. Johnson', 0.576372504234314),
 ('B. Ray', 0.5599508881568909),
 ('H. H. Garnet', 0.5591388940811157),
 ('C. B. Ray', 0.5578903555870056),
 ('Uriah Boston', 0.5485678911209106),
 ('William P. Powell', 0.5462288856506348)]

In [22]:
# Ten nearest neighbours of the bare surname node 'Smith'.
model.wv.most_similar('Smith', topn=10)

[('Wilson', 0.666872501373291),
 ('Robertson', 0.5891510844230652),
 ('Duffin', 0.48796573281288147),
 ('Drayton', 0.47797441482543945),
 ('Loguen', 0.47647303342819214),
 ('Wm. Butler', 0.4760018587112427),
 ('Miller', 0.46731191873550415),
 ('Turpin', 0.46493780612945557),
 ('S. R. Ward', 0.46382153034210205),
 ('George W. Clark', 0.45514166355133057)]

In [31]:
# Ten nearest neighbours of the full-name node "James M'Cune Smith"
# (double quotes avoid escaping the apostrophe; the string is unchanged).
model.wv.most_similar("James M'Cune Smith", topn=10)

[('William P. Powell', 0.6907972097396851),
 ('William P. Johnson', 0.6840863227844238),
 ('R. Francis', 0.6832716464996338),
 ('Anthony', 0.6758710145950317),
 ('Ulysses B. Vidal', 0.6625845432281494),
 ('Jeremiah Powers', 0.643680214881897),
 ('T. S. Wright', 0.6127962470054626),
 ('Thompson', 0.5318195819854736),
 ('C. B. Ray', 0.528550922870636),
 ('Rich', 0.5085271000862122)]

In [29]:
# Ten nearest neighbours of the node 'William P. Powell'.
model.wv.most_similar('William P. Powell', topn=10)

[('William P. Johnson', 0.9821459054946899),
 ('H. H. Garnet', 0.7815438508987427),
 ('Garnet', 0.7548297643661499),
 ('Anthony', 0.710107684135437),
 ("James M'Cune Smith", 0.6907972097396851),
 ('T. S. Wright', 0.6891352534294128),
 ('R. Francis', 0.6783553957939148),
 ('Ulysses B. Vidal', 0.670825719833374),
 ('Thompson', 0.6603851318359375),
 ('Jeremiah Powers', 0.6599832773208618)]

# Visualizing a word2vec model (with words)
[Reference here](https://www.kaggle.com/jeffd23/visualizing-word-vectors-with-t-sne)

In [None]:
# Collect every vocabulary entry and its embedding vector, in matching order.
labels = list(model.wv.vocab)
tokens = [model.wv[label] for label in labels]

# Project the embeddings down to 2-D; random_state is fixed so the layout is
# repeatable across runs.
tsne_model = TSNE(
    perplexity=40,
    n_components=2,
    init='pca',
    n_iter=2500,
    random_state=23,
)
new_values = tsne_model.fit_transform(tokens)

# Split the projected points into separate coordinate lists.
x = [point[0] for point in new_values]
y = [point[1] for point in new_values]

plt.figure(figsize=(16, 16))

# Draw and label one point at a time (one scatter call per point).
for label, x_coord, y_coord in zip(labels, x, y):
    plt.scatter(x_coord, y_coord)
    plt.annotate(
        label,
        xy=(x_coord, y_coord),
        xytext=(5, 2),
        textcoords='offset points',
        ha='right',
        va='bottom',
    )
plt.show()