## The Programming Historian 

# Understanding and Using Common Similarity Measures for Text Analysis

#### This Jupyter notebook is based on John R. Ladd, "Understanding and Using Common Similarity Measures for Text Analysis," The Programming Historian 9 (2020), https://doi.org/10.46430/phen0089.

In [5]:
import os
import glob
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from scipy.spatial.distance import pdist, squareform

In [69]:
# Vasari Version

# Use the glob library to create a list of file names
filenames = glob.glob('data/results/Dante/*.txt')
# Parse those filenames to create a list of file keys (ID numbers)
# You'll use these later on.
filekeys = [f.split('/')[-1].split('.')[0] for f in filenames]

# Create a CountVectorizer instance with the parameters you need
vectorizer = CountVectorizer(input="filename", max_features=1000, max_df=0.7)
# Run the vectorizer on your list of filenames to create your wordcounts
# Use the toarray() function so that SciPy will accept the results
wordcounts = vectorizer.fit_transform(filenames).toarray()


In [70]:
euclidean_distances = pd.DataFrame(squareform(pdist(wordcounts)), index=filekeys, columns=filekeys)
#print(euclidean_distances)

In [71]:
euclidean_distances

Unnamed: 0,Vasari_509,Vasari_535,Vasari_253,Vasari_247,unkown_Pesello-e-Francesco-Peselli,Borghini_302,unkown_Pulidoro-da-Caravaggio-e-Maturino-Fiorentino,unkown_Cosimo-Rosselli,Borghini_666,Bartoli_948,...,Bartoli_979,Borghini_125,unkown_Tommaso-Fiorentino-detto-Giottino,Borghini_870,Vasari_706,Vasari_289,Borghini_1033,Vasari_262,Vasari_276,Vasari_510
Vasari_509,0.000000,16.822604,10.862780,10.908712,30.099834,15.329710,105.323312,46.054316,22.338308,11.704700,...,13.820275,24.596748,67.126746,18.275667,10.440307,40.087405,10.862780,20.124612,10.535654,23.280893
Vasari_535,16.822604,0.000000,17.804494,16.970563,31.384710,21.118712,102.800778,45.934736,25.179357,18.814888,...,19.646883,26.870058,65.612499,21.748563,17.549929,39.115214,17.748239,22.671568,16.970563,25.709920
Vasari_253,10.862780,17.804494,0.000000,11.269428,29.664794,16.583124,104.952370,45.814845,22.956481,12.206556,...,14.387495,25.238859,66.962676,18.165902,11.180340,38.691084,12.000000,20.371549,11.532563,21.771541
Vasari_247,10.908712,16.970563,11.269428,0.000000,29.782545,15.165751,103.990384,45.254834,22.000000,11.832160,...,13.856406,24.698178,66.445466,17.916473,11.135529,40.174619,11.357817,20.199010,10.295630,24.392622
unkown_Pesello-e-Francesco-Peselli,30.099834,31.384710,29.664794,29.782545,0.000000,30.049958,94.005319,39.230090,33.570821,29.546573,...,30.149627,35.482390,58.223707,30.983867,29.849623,46.206060,29.899833,32.357379,29.478806,36.687873
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Vasari_289,40.087405,39.115214,38.691084,40.174619,46.206060,41.569219,99.769735,52.915026,40.224371,41.737274,...,42.071368,40.496913,67.668309,39.255573,41.569219,0.000000,40.681691,36.249138,40.024992,34.102786
Borghini_1033,10.862780,17.748239,12.000000,11.357817,29.899833,14.387495,105.142760,45.442271,21.563859,10.723805,...,12.369317,24.062419,67.126746,18.110770,10.908712,40.681691,0.000000,21.142375,10.816654,25.337719
Vasari_262,20.124612,22.671568,20.371549,20.199010,32.357379,22.847319,100.737282,44.609416,25.179357,21.354157,...,22.360680,26.570661,64.691576,23.388031,20.832667,36.249138,21.142375,0.000000,20.199010,27.073973
Vasari_276,10.535654,16.970563,11.532563,10.295630,29.478806,14.832397,104.556205,45.321077,22.090722,11.135529,...,13.564660,24.372115,66.865537,17.464249,10.862780,40.024992,10.816654,20.199010,0.000000,24.799194


## Euclidean distance

In [72]:
#unkown_Proemio-della-terza-parte-delle-Vite.txt

top5_euclidean = euclidean_distances.nsmallest(6, 'unkown_Proemio-della-terza-parte-delle-Vite')['unkown_Proemio-della-terza-parte-delle-Vite'][1:]
print(top5_euclidean)

unkown_Antonio-da-Correggio                          56.213877
unkown_Desiderio-da-Settignano                       61.057350
unkown_Antonio-Rossellino-e-Bernardo-suo-fratello    62.337790
unkown_Ambruogio-Lorenzetti                          62.449980
unkown_Torrigiano                                    62.506000
Name: unkown_Proemio-della-terza-parte-delle-Vite, dtype: float64


In [73]:
#unkown_Giorgio-Vasari.txt

top5_euclidean = euclidean_distances.nsmallest(6, 'unkown_Giorgio-Vasari')['unkown_Giorgio-Vasari'][1:]
print(top5_euclidean)

unkown_Benvenuto-Garofalo-e-Girolamo-da-Carpi-e-altri-lombardi    276.488698
unkown_Taddeo-Zucchero                                            280.937716
unkown_Francesco-detto-de-Salviati                                284.966665
unkown_Accademici-del-Disegno-e-il-Bronzino                       287.815913
unkown_Iacopo-da-Puntormo                                         295.304927
Name: unkown_Giorgio-Vasari, dtype: float64


## Cosine distance

In [76]:
cosine_distances = pd.DataFrame(squareform(pdist(wordcounts, metric='cosine')), index=filekeys, columns=filekeys)

top5_cosine = cosine_distances.nsmallest(50, 'unkown_Proemio-della-seconda-parte-delle-Vite')['unkown_Proemio-della-seconda-parte-delle-Vite'][1:]
print(top5_cosine)

unkown_Proemio-della-terza-parte-delle-Vite                       0.219535
unkown_Lorenzo-Ghiberti                                           0.340875
unkown_Raffaello-dUrbino                                          0.342655
unkown_Pulidoro-da-Caravaggio-e-Maturino-Fiorentino               0.350241
unkown_Masaccio-da-San-Giovanni-di-Valdarno                       0.359570
unkown_Giorgio-Vasari                                             0.361300
unkown_Perino-del-Vaga                                            0.369893
unkown_Francesco-Primaticcio-bolognese-abate-di-San-Martino       0.370143
unkown_Michelagnolo-Buonarruoti                                   0.370506
unkown_Battista-Franco                                            0.374902
unkown_Antonio-da-Correggio                                       0.375717
unkown_Andrea-Pisano                                              0.379119
unkown_Cronaca                                                    0.380467
unkown_Arnolfo-di-Lapo   

In [78]:
cosine_distances = pd.DataFrame(squareform(pdist(wordcounts, metric='cosine')), index=filekeys, columns=filekeys)

top5_cosine = cosine_distances.nsmallest(50, 'unkown_Michelagnolo-Buonarruoti')['unkown_Michelagnolo-Buonarruoti']
print(top5_cosine)

unkown_Michelagnolo-Buonarruoti                                                                                               0.000000
unkown_Sebastian-Viniziano-frate-del-Piombo                                                                                   0.172913
unkown_Raffaello-dUrbino                                                                                                      0.176769
unkown_Baccio-Bandinelli                                                                                                      0.200637
unkown_Filippo-Brunelleschi                                                                                                   0.202926
unkown_Perino-del-Vaga                                                                                                        0.203476
unkown_Bramante-da-Urbino                                                                                                     0.204870
unkown_Bastiano-detto-Aristotile-da-San-Gallo          

In [68]:
cosine_distances = pd.DataFrame(squareform(pdist(wordcounts, metric='cosine')), index=filekeys, columns=filekeys)

#top5_cosine = cosine_distances.nsmallest(30, 'Vasari_17')['Vasari_17']
print(top5_cosine)

unkown_Michelagnolo-Buonarruoti                      0.000000
unkown_Giuliano-Bugiardini                           0.504794
unkown_Francesco-Granacci                            0.529109
unkown_Fra-GiovannAgnolo-Montorsoli                  0.537853
unkown_Lione-Lioni-e-altri-scultori-et-architetti    0.542122
unkown_Accademici-del-Disegno-e-il-Bronzino          0.542525
unkown_Francesco-detto-de-Salviati                   0.557830
unkown_Baccio-Bandinelli                             0.565661
unkown_Raffaello-dUrbino                             0.577913
unkown_Sebastian-Viniziano-frate-del-Piombo          0.578664
Name: unkown_Michelagnolo-Buonarruoti, dtype: float64


In [79]:
cosine_distances

Unnamed: 0,Vasari_509,Vasari_535,Vasari_253,Vasari_247,unkown_Pesello-e-Francesco-Peselli,Borghini_302,unkown_Pulidoro-da-Caravaggio-e-Maturino-Fiorentino,unkown_Cosimo-Rosselli,Borghini_666,Bartoli_948,...,Bartoli_979,Borghini_125,unkown_Tommaso-Fiorentino-detto-Giottino,Borghini_870,Vasari_706,Vasari_289,Borghini_1033,Vasari_262,Vasari_276,Vasari_510
Vasari_509,0.000000,0.618767,0.620527,0.657453,0.913934,0.817899,0.904360,0.926305,0.779102,0.819813,...,0.845773,0.791385,0.872280,0.765757,0.666895,0.649585,0.744345,0.601081,0.671366,0.594195
Vasari_535,0.618767,0.000000,0.705100,0.632533,0.808811,0.859819,0.750207,0.812791,0.761831,0.854158,...,0.827601,0.759535,0.754169,0.731066,0.713457,0.611693,0.743594,0.632130,0.646629,0.661200
Vasari_253,0.620527,0.705100,0.000000,0.664549,0.857113,0.935217,0.867683,0.895130,0.830566,0.840455,...,0.879507,0.846519,0.855192,0.738889,0.719101,0.526107,0.854479,0.621553,0.757513,0.461813
Vasari_247,0.657453,0.632533,0.664549,0.000000,0.878745,0.796266,0.766181,0.841971,0.742926,0.832752,...,0.846623,0.801003,0.801730,0.726281,0.754615,0.658931,0.809315,0.608059,0.636863,0.703500
unkown_Pesello-e-Francesco-Peselli,0.913934,0.808811,0.857113,0.878745,0.000000,0.811438,0.482004,0.472273,0.817971,0.868181,...,0.868238,0.850225,0.483725,0.784536,0.908418,0.788246,0.921613,0.768870,0.862507,0.889074
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Vasari_289,0.649585,0.611693,0.526107,0.658931,0.788246,0.775984,0.661299,0.718240,0.640477,0.819983,...,0.833608,0.632429,0.686459,0.618842,0.801319,0.000000,0.696935,0.480771,0.629752,0.405390
Borghini_1033,0.744345,0.743594,0.854479,0.809315,0.921613,0.764316,0.880803,0.870178,0.713426,0.792704,...,0.739073,0.750204,0.872775,0.784413,0.843873,0.696935,0.000000,0.718004,0.817964,0.823738
Vasari_262,0.601081,0.632130,0.621553,0.608059,0.768870,0.750562,0.670763,0.708121,0.644220,0.734996,...,0.772391,0.642728,0.704900,0.673228,0.679803,0.480771,0.718004,0.000000,0.610391,0.645346
Vasari_276,0.671366,0.646629,0.757513,0.636863,0.862507,0.800519,0.815565,0.853679,0.768315,0.815774,...,0.870805,0.780802,0.843027,0.698495,0.797279,0.629752,0.817964,0.610391,0.000000,0.755051


In [80]:
cosine_distances = pd.DataFrame(squareform(pdist(wordcounts, metric='cosine')), index=filekeys, columns=filekeys)

top5_cosine = cosine_distances.nsmallest(50, 'Borghini_586')['Borghini_586']
print(top5_cosine)

Borghini_586                       0.000000
Borghini_584                       0.295851
Borghini_595                       0.340086
Borghini_846                       0.355624
Borghini_860                       0.356481
Borghini_520                       0.358828
Borghini_658                       0.360819
Borghini_873                       0.360891
Borghini_1043                      0.371984
Borghini_580                       0.375757
Borghini_629                       0.378190
Borghini_842                       0.379479
Borghini_1026                      0.381099
Borghini_981                       0.381851
Borghini_1059                      0.382523
Borghini_120                       0.385630
Borghini_578                       0.387735
Borghini_519                       0.389963
unkown_Giorgio-Vasari              0.393611
Borghini_525                       0.395324
Borghini_598                       0.395501
Borghini_848                       0.401613
unkown_Michelagnolo-Buonarruoti 

# Save and open dataframe as csv

In [81]:
filename = 'cosine_distance.csv'
#cosine_distances.to_csv(filename, index = False, header=True)
#cosine_distances = pd.read_csv(filename)
