In [2]:
# Example of how to use chars2vec and DBSCAN to cluster similarly spelled words
# I chose chars2vec because it vectorizes words based on 
# individual characters, rather than whole words, meaning that 
# two similarly spelled phrases will have similar vectors,
# but the tradeoff is that chars2vec doesn't really have a concept 
# of category, unlike a model like BERT or OpenAI's GPT-2

import chars2vec
import numpy as np
import pandas as pd
import re

from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics.pairwise import cosine_similarity

# Model dimensions. chars2vec ships pretrained English models in several
# sizes (eng_50, eng_100, eng_150, eng_200, eng_300); this notebook loads
# eng_300. Feel free to tweak this: smaller models have lower accuracy
# but are quicker to process.

c2v_model = chars2vec.load_model('eng_300')

# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids mixed-type columns.
# NOTE(review): the stderr captured under this cell looks like the tail of
# pandas' mixed-type DtypeWarning -- confirm on a fresh run.
data = pd.read_csv("/scratch/hcnorris/data/fsplit/700.tsv", sep='\t', low_memory=False)


Using TensorFlow backend.
  interactivity=interactivity, compiler=compiler, result=result)


In [164]:
# Preview the first rows to check that the TSV parsed into the expected columns
data.head()

Unnamed: 0,id,1,2,3,4,5,6,a,b,c,...,p,q,r,s,t,u,v,x,y,z
0,1015503448,http://viaf.org/viaf/54181731,,,,,,"Pegge, Samuel,",,,...,,,,,,,,,,
1,1015503448,http://viaf.org/viaf/59902408,,,,,,"Simler, Georg,",,,...,,,,,,,,,,
2,1015506616,,,,,,,"Gebhard,",,,...,,,,,,,,,,
3,1015512402,http://viaf.org/viaf/26078421,,,,,,"Mortlock, Henry,",,,...,,,,,,,,,,
4,1015512402,,,,,,,"Foalkes, Robert,",,,...,,,,,,,,,,


In [103]:
# Example on how to normalize names
# As you can imagine, stripping out all punctuation 
# is bound to have edge cases where two names appear 
# the same but actually aren't

def normName(x):
    """Return ``x`` lower-cased with punctuation removed.

    Every character that is neither a word character (``\\w``) nor
    whitespace (``\\s``) is dropped. Whitespace itself is left untouched:
    nothing is trimmed or collapsed, so ' adam' and 'adam' stay distinct.
    """
    lowered = x.lower()
    punctuation = re.compile(r'[^\w\s]')
    return punctuation.sub('', lowered)
    
# Normalise every non-null entry of column 'a', then keep the distinct
# values in sorted order
raw_names = data['a'].dropna()
normalized_names = raw_names.apply(normName)
names = normalized_names.sort_values().unique()

# Show the first 1000 normalised names
names[:1000]

array(['', ' ', '  ', '   ', ' 16121685', ' abbe',
       ' abu gafar aøhmad b muøhammad al naøhøhas', ' adam', ' an h',
       ' arnold heinrich westerhoff', ' bartolt van gent', ' bentinck',
       ' berage', ' besson', ' bishop of', ' christiaen van heule',
       ' comte de', ' comtesse de', ' connye bonaventura', ' corisandre',
       ' cruce', ' de', ' de bourdonne', ' de maisoncelle', ' dutot',
       ' enoch krook', ' foquet', ' gerard kempher', ' hauptpastor in h',
       ' heinrich leonhard schurtzfleisch', ' hendrik benjamin',
       ' hendrik matthysz lussing', ' hesiodus', ' isaac vos',
       ' jan hendrik verheyk', ' jeanne antoinette poisson',
       ' joan pluimer', ' joannes matthaeus phrissemius',
       ' johan nieuhof johan nieuhof', ' johann friedrich leissner',
       ' le clerc de septchenes', ' le fevre de morsan', ' m', ' m de',
       ' m labbe', ' madame de', ' michiel de renichon', ' monsieur',
       ' nedham', ' nicolaas grevinchoven', ' nicolaas hoefnage

In [12]:
# Generate word embeddings (vectors) for the first 5000 names
# If you run this on a GPU, it will be much faster
# Later cells slice `names` and `word_embeddings` with the same bounds and
# rely on row i of one corresponding to row i of the other

word_embeddings = c2v_model.vectorize_words(names[:5000])

In [100]:
# Use word embeddings to cluster names
# eps is the maximum distance between two samples for one to be considered
# in the neighborhood of the other (it is not a bound on the distance
# between arbitrary members of a cluster)
# min_samples is the number of samples in a neighborhood for a point to be
# considered a core point
# Avoid setting eps too high as you will get more clusters of 
# items that are only similar and not misspellings of each other

# sklearn's DBSCAN isn't optimized for multiple cores,
# so I'd recommend finding a faster library
# I used this simply as a quick POC

cluster = DBSCAN(eps=.5, min_samples=2).fit(word_embeddings)

# Group the names by label in a single pass instead of re-scanning every
# name once per label. Only the names that were actually embedded are
# paired with labels; `names` itself may be longer than the label array.
names_by_label = {}
for name, label in zip(names[:len(cluster.labels_)], cluster.labels_):
    names_by_label.setdefault(label, []).append(name)

# Iterate over all unique labels in ascending order (-1 is DBSCAN's noise label)
for ln in sorted(names_by_label):
    print("----CLUSTER {}----".format(ln))

    # Print out all names that reside in that cluster
    for name in names_by_label[ln]:
        print(name)

----CLUSTER -1----
 16121685
 abbe
 abu gafar aøhmad b muøhammad al naøhøhas
 an h
 arnold heinrich westerhoff
 bartolt van gent
 bentinck
 berage
 besson
 bishop of
 christiaen van heule
 comte de
 comtesse de
 connye bonaventura
 corisandre
 cruce
 de
 de bourdonne
 de maisoncelle
 dutot
 enoch krook
 foquet
 gerard kempher
 hauptpastor in h
 heinrich leonhard schurtzfleisch
 hendrik benjamin
 hendrik matthysz lussing
 hesiodus
 isaac vos
 jan hendrik verheyk
 jeanne antoinette poisson
 joan pluimer
 joannes matthaeus phrissemius
 johan nieuhof johan nieuhof
 johann friedrich leissner
 le clerc de septchenes
 le fevre de morsan
 m
 m de
 m labbe
 madame de
 michiel de renichon
 monsieur
 nedham
 nicolaas grevinchoven
 nicolaas hoefnagel
 nicolaas willem op den hooff
 norton roger
 osiandrus
 pernin des chavanettes
 petit oudin
 petrus jacobi austrosylvius
 petrus peckius
 phillipps thomas
 pieter nuyts
 r p
 rebenlein georg
 roelof wouters
 santes pagninus
 songbooks
 spencer helen f

----CLUSTER 1----
 adam
adam
----CLUSTER 2----
a country gentleman
a countrygentleman
----CLUSTER 3----
a gentleman of lincolns inn
a gentleman of lincolnsinn
----CLUSTER 4----
a gentleman of the inner temple
a gentleman of the innertemple
----CLUSTER 5----
a true churchman
a truechurchman
----CLUSTER 6----
aa pierter van der
aa pieter van der
----CLUSTER 7----
abarbanel isaac
abrabanel isaac
----CLUSTER 8----
abbarelli luigi
albarelli luigi
----CLUSTER 9----
abeille louis paul
abeille louispaul
----CLUSTER 10----
aben hamin
abenhamin
----CLUSTER 11----
ableiges jacques d
ablieges jacques d
----CLUSTER 12----
ablijn cornelius
albijn cornelius
----CLUSTER 13----
abluzio ottavio
albuzio ottavio
----CLUSTER 14----
abreu y bertodano joseph antonio de
abreu y bertonado joseph antonio de
----CLUSTER 15----
abrial andre joseph
abrial andrejoseph
----CLUSTER 16----
acciaioli donato
acciaiuoli donato
----CLUSTER 17----
acciaioli filippo
acciaiuoli filippo
----CLUSTER 18----
acciaioli onofrio
ac

In [102]:
# Example of how to take the pairwise cosine similarities
# of all members of each cluster
# (cosine_similarity returns similarities, not distances)
# Substitute in whatever metric you'd like

# You probably want to just use numpy for this
# as Pandas incurs a significant overhead
# Pandas is used here just for a quick POC

# Build the frame column by column so that 'cid' keeps its integer dtype;
# stacking names and labels into one array would force both into a single
# common dtype. The name slice is sized from the labels so the two columns
# always have the same length.
ndf = pd.DataFrame({'name': names[:len(cluster.labels_)], 'cid': cluster.labels_})

# sort=False visits the clusters in order of first appearance, one group per
# label, without re-filtering the whole frame for every cluster
for i, members in ndf.groupby('cid', sort=False):
    print("---- Cluster {} ----".format(i))
    # members.index holds the row positions of this cluster's names, which
    # are also their row positions in word_embeddings
    print(cosine_similarity(word_embeddings[members.index, :]))

---- Cluster 0 ----
[[0.99999994 0.99999994 0.99999994 0.99999994]
 [0.99999994 0.99999994 0.99999994 0.99999994]
 [0.99999994 0.99999994 0.99999994 0.99999994]
 [0.99999994 0.99999994 0.99999994 0.99999994]]
---- Cluster -1 ----
[[1.         0.52742004 0.59591484 ... 0.68496567 0.5810586  0.6855994 ]
 [0.52742004 0.99999964 0.58275634 ... 0.7903449  0.511636   0.7580968 ]
 [0.59591484 0.58275634 1.0000001  ... 0.730366   0.6351837  0.7381768 ]
 ...
 [0.68496567 0.7903449  0.730366   ... 0.9999997  0.76358986 0.96752185]
 [0.5810586  0.511636   0.6351837  ... 0.76358986 1.0000001  0.7880773 ]
 [0.6855994  0.7580968  0.7381768  ... 0.96752185 0.7880773  0.9999995 ]]
---- Cluster 1 ----
[[1.0000001 1.0000001]
 [1.0000001 1.0000001]]
---- Cluster 2 ----
[[0.99999976 0.99778885]
 [0.99778885 0.99999976]]
---- Cluster 3 ----
[[0.99999964 0.99727714]
 [0.99727714 0.99999964]]
---- Cluster 4 ----
[[0.99999994 0.9962782 ]
 [0.9962782  0.9999999 ]]
---- Cluster 5 ----
[[1.         0.9980929 ]
 

In [99]:
# Inspect the names assigned to one cluster (label 87)
ndf.query('cid == 87')

Unnamed: 0,name,cid
2956,agostini anna di lucca,87
2977,agostini pietro di lucca,87
2987,agostino da montefalco,87


In [106]:
# Example of combining BERT embeddings with chars2vec embeddings
# to get clusters containing strings that are
# similar both in topic and in spelling.
# Not really that useful, just an interesting POC.
# NOTE(review): the model / dataset_name pair below presumably selects
# pretrained uncased English BERT weights -- confirm against the
# bert_embedding documentation.

from bert_embedding import BertEmbedding
bem = BertEmbedding(model='bert_12_768_12', dataset_name='book_corpus_wiki_en_uncased')


In [108]:
# Run BERT over the first 500 names
bert_results = bem(names[:500])

# Skip the first four results and keep element [1][0] of each remaining one.
# NOTE(review): [1][0] presumably is the vector of the first token only --
# confirm whether pooling over all tokens of a name was intended.
first_vectors = [result[1][0] for result in bert_results[4:]]
b_embeddings = np.asarray(first_vectors, dtype=np.float32)

In [134]:
# Sanity check: (rows, features) of the BERT matrix
b_embeddings.shape

(496, 768)

In [135]:
# Sanity check: the chars2vec slice must have the same number of rows as
# b_embeddings before the two can be stacked side by side
word_embeddings[4:500].shape

(496, 300)

In [142]:
# Place the BERT features and the chars2vec features side by side, one row
# per name; rows 4:500 of word_embeddings are paired with b_embeddings
combined_embeddings = np.concatenate((b_embeddings, word_embeddings[4:500]), axis=1)

In [176]:
# KMeans on the combined embeddings with the number of clusters fixed at 20
# (per the original note, chosen to match DBSCAN)

cluster2 = KMeans(n_clusters=20, random_state=0).fit(combined_embeddings)

# Bucket the names by their assigned label
members_by_label = {}
for name, label in zip(names[4:500], cluster2.labels_):
    members_by_label.setdefault(label, []).append(name)

# Walk the labels in ascending order and list each cluster's names
for ln in sorted(set(cluster2.labels_)):
    print("----CLUSTER {}----".format(ln))
    for name in members_by_label.get(ln, ()):
        print(name)

----CLUSTER 0----
 an h
a  b 
a  m
a a
a a de c
a a f
a a p
a ar
a b
a b don
a b m
a c
a c generosus
a c linc coll oxon
a c s
a d
a d c
a d m
a de b
a e
a f
a f m
a fd
a g
a i
a k
----CLUSTER 1----
a believer in politicks
a call to the jews author of
a citizen of the world
a disinterested bystander and a sincere friend to him in the truth
a friend of great britain
a friend of moses and servant of jesus christ
a friend of the authors
a friend of the clergy
a friend of true reformation and his native countrey
a friend to all mankind
a friend to liberty and justice
a friend to liberty and property
a friend to political equality
a friend to real bible religion and common sense
a friend to the church of england
a friend to the church of england and a lover of truth and peace
a friend to the parliament army and congregational churches
a friend to trade and liberty
a friend to truth and liberty
a friend to truth and peace
a hearty lover of his country
a hearty lover of the church and monarchy

In [175]:
# DBSCAN on the combined (BERT + chars2vec) embeddings

cluster2 = DBSCAN(eps=7.5, min_samples=2).fit(combined_embeddings)

# One section per label, in ascending label order
for ln in sorted(set(cluster2.labels_)):
    print("----CLUSTER {}----".format(ln))

    # Collect the names whose label matches, then print them one per line
    members = [name for name, label in zip(names[4:500], cluster2.labels_) if label == ln]
    for name in members:
        print(name)

----CLUSTER -1----
 abbe
 abu gafar aøhmad b muøhammad al naøhøhas
 adam
 an h
 arnold heinrich westerhoff
 bartolt van gent
 bentinck
 berage
 besson
 bishop of
 christiaen van heule
 comte de
 comtesse de
 connye bonaventura
 corisandre
 cruce
 de
 de bourdonne
 de maisoncelle
 dutot
 enoch krook
 foquet
 gerard kempher
 hauptpastor in h
 heinrich leonhard schurtzfleisch
 hendrik benjamin
 hendrik matthysz lussing
 hesiodus
 isaac vos
 jan hendrik verheyk
 jeanne antoinette poisson
 joan pluimer
 joannes matthaeus phrissemius
 johan nieuhof johan nieuhof
 johann friedrich leissner
 le clerc de septchenes
 le fevre de morsan
 m
 m de
 m labbe
 madame de
 michiel de renichon
 monsieur
 nedham
 nicolaas willem op den hooff
 norton roger
 osiandrus
 pernin des chavanettes
 petit oudin
 phillipps thomas
 pieter nuyts
 r p
 rebenlein georg
 roelof wouters
 santes pagninus
 songbooks
 spencer helen foresman donor
 stephani
 thomas
 titus petronius arbiter
 vavasseur
 vogel
 y bertin d
00
1 