## The Programming Historian 

# Understanding and Using Common Similarity Measures for Text Analysis

#### This Jupyter notebook is based on John R. Ladd, "Understanding and Using Common Similarity Measures for Text Analysis," The Programming Historian 9 (2020), https://doi.org/10.46430/phen0089.

In [1]:
import os
import glob
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from scipy.spatial.distance import pdist, squareform

In [96]:
# create a list of file names
#filenames = glob.glob('data/Cosine/*.txt') # Cosine: every author in a single .txt file, plus "other" (9 in total)
#filenames = glob.glob('data/Cosine2/*.txt') # Cosine plus every biography in a single file named 'unkown_artistname.txt'
filenames = glob.glob('data/Cosine3/*.txt') # Cosine2, but Vasari and VCopy in one single file

# create a list of file keys 
filekeys = [f.split('/')[-1].split('.')[0] for f in filenames]

# Create a CountVectorizer instance with the parameters you need
vectorizer = CountVectorizer(input="filename", max_features=1000, max_df=0.7)

# vectorize the documents in the list of filenames to create the wordcounts
wordcounts = vectorizer.fit_transform(filenames).toarray()

# Cosine similarity

In [97]:
# make one big datafram with all the distances
cosine_distances = pd.DataFrame(squareform(pdist(wordcounts, metric='cosine')), index=filekeys, columns=filekeys)
cosine_distances.drop('other', 1).drop('other',0)

Unnamed: 0,Minerbetti,unkown_Pesello-e-Francesco-Peselli,unkown_Pulidoro-da-Caravaggio-e-Maturino-Fiorentino,unkown_Cosimo-Rosselli,unkown_Andrea-Pisano,unkown_Giovan-Francesco-detto-il-Fattore-e-Pellegrino-da-Modana,unkown_Lazzaro-Vasari,unkown_Margaritone,unkown_Simone-Mosca,unkown_Iacopo-di-Casentino,...,unkown_Benvenuto-Garofalo-e-Girolamo-da-Carpi-e-altri-lombardi,unkown_Giulio-Romano,unkown_Andrea-dal-Castagno-di-Mugello-e-Dominico-Viniziano,unkown_Sebastian-Viniziano-frate-del-Piombo,unkown_Giorgio-Vasari,unkown_Giovannantonio-detto-il-Soddoma-da-Verzelli,unkown_Stefano-e-Ugolino,unkown_Perino-del-Vaga,unkown_Tommaso-Fiorentino-detto-Giottino,Giambullari
Minerbetti,0.000000,0.937885,0.906162,0.947169,0.909371,0.947427,0.683737,0.862213,0.882098,0.943773,...,0.761848,0.930914,0.937144,0.872896,0.393716,0.910866,0.891347,0.870870,0.910391,0.288782
unkown_Pesello-e-Francesco-Peselli,0.937885,0.000000,0.847712,0.764926,0.758686,0.853956,0.752021,0.783096,0.799383,0.823015,...,0.779603,0.890405,0.740917,0.830888,0.884712,0.815657,0.805895,0.776782,0.804624,0.921121
unkown_Pulidoro-da-Caravaggio-e-Maturino-Fiorentino,0.906162,0.847712,0.000000,0.820358,0.744969,0.762968,0.800752,0.787596,0.766415,0.824409,...,0.673929,0.770628,0.793174,0.723898,0.781679,0.738942,0.786399,0.643657,0.764297,0.897592
unkown_Cosimo-Rosselli,0.947169,0.764926,0.820358,0.000000,0.847499,0.793508,0.792603,0.811116,0.763430,0.731423,...,0.718190,0.863666,0.758296,0.782834,0.815419,0.739551,0.731893,0.768825,0.711913,0.943116
unkown_Andrea-Pisano,0.909371,0.758686,0.744969,0.847499,0.000000,0.864086,0.787199,0.733546,0.617294,0.798240,...,0.645036,0.813696,0.439473,0.765510,0.774268,0.786064,0.734316,0.711590,0.697853,0.899845
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
unkown_Giovannantonio-detto-il-Soddoma-da-Verzelli,0.910866,0.815657,0.738942,0.739551,0.786064,0.620313,0.702540,0.781325,0.647650,0.747060,...,0.478277,0.713645,0.657126,0.577383,0.704622,0.000000,0.673612,0.640054,0.649514,0.915510
unkown_Stefano-e-Ugolino,0.891347,0.805895,0.786399,0.731893,0.734316,0.785885,0.750145,0.769120,0.759882,0.781073,...,0.639111,0.826864,0.717389,0.746979,0.768611,0.673612,0.000000,0.722195,0.515369,0.889447
unkown_Perino-del-Vaga,0.870870,0.776782,0.643657,0.768825,0.711590,0.618413,0.746380,0.769340,0.630630,0.832264,...,0.586599,0.682474,0.717991,0.600833,0.711609,0.640054,0.722195,0.000000,0.744462,0.854887
unkown_Tommaso-Fiorentino-detto-Giottino,0.910391,0.804624,0.764297,0.711913,0.697853,0.778086,0.691882,0.662528,0.732002,0.614806,...,0.606039,0.808347,0.744560,0.760812,0.771813,0.649514,0.515369,0.744462,0.000000,0.924728


In [98]:
#cosine_distances.drop('other',0).sort_values(by = ['Vasariplus'])['Vasariplus'][:20]



cosine_distances.drop('other',0).drop('other',1).sort_values(by = ['Vasari'])                #['Vasari'][:20]

KeyError: 'Vasari'

In [100]:
# make new dataframe only with the author candidate columns

cosine_distances2 = pd.DataFrame()                         # comment out depending on which folder was chosen!


cosine_distances2['Vasari'] = cosine_distances['Vasariplus']   # Vasari and copies together or
#cosine_distances2['Vasari'] = cosine_distances['Vasari']         # Vasari and copies split
#cosine_distances2['VasariC'] = cosine_distances['Vcopy']

cosine_distances2['Minerbetti'] = cosine_distances['Minerbetti']
cosine_distances2['Ghiberti'] = cosine_distances['Ghiberti']
cosine_distances2['Borghini'] = cosine_distances['Borghini']
cosine_distances2['Bartoli'] = cosine_distances['Bartoli']
cosine_distances2['Sanga'] = cosine_distances['Sanga']
cosine_distances2['Giambullari'] = cosine_distances['Giambullari']

#cosine_distances2.drop('other')#.drop('unkown')
cosine_distances2

Unnamed: 0,Vasari,Minerbetti,Ghiberti,Borghini,Bartoli,Sanga,Giambullari
Minerbetti,0.268090,0.000000,0.768238,0.197637,0.230707,0.484728,0.288782
unkown_Pesello-e-Francesco-Peselli,0.922825,0.937885,0.961876,0.930589,0.912532,0.925048,0.921121
unkown_Pulidoro-da-Caravaggio-e-Maturino-Fiorentino,0.871934,0.906162,0.924489,0.863298,0.871101,0.925096,0.897592
unkown_Cosimo-Rosselli,0.936107,0.947169,0.967797,0.932236,0.867683,0.956386,0.943116
unkown_Andrea-Pisano,0.872480,0.909371,0.926984,0.868171,0.878459,0.939373,0.899845
...,...,...,...,...,...,...,...
unkown_Giovannantonio-detto-il-Soddoma-da-Verzelli,0.871368,0.910866,0.941805,0.853703,0.868761,0.925126,0.915510
unkown_Stefano-e-Ugolino,0.844073,0.891347,0.941158,0.840815,0.869462,0.917184,0.889447
unkown_Perino-del-Vaga,0.819733,0.870870,0.937129,0.812193,0.839988,0.904458,0.854887
unkown_Tommaso-Fiorentino-detto-Giottino,0.889407,0.910391,0.939892,0.897734,0.888480,0.931625,0.924728


In [82]:
cosine_distances2.loc['unkown_Vite-Introduction']

Vasari         0.671876
Minerbetti     0.759471
Ghiberti       0.819443
Borghini       0.670485
Bartoli        0.698431
Sanga          0.803080
Giambullari    0.737087
Name: unkown_Vite-Introduction, dtype: float64

In [38]:
#cosine_distances = pd.DataFrame(squareform(pdist(wordcounts, metric='cosine')), index=filekeys, columns=filekeys)
topVasari_cosine = cosine_distances.nsmallest(20, 'Vasari')['Vasari']
#print(topVasari_cosine[1:])

In [40]:
#have a look at first results
cosine_distances2.loc['unkown_Francia-Bigio']['Vasari'], cosine_distances.loc['unkown_Francia-Bigio']['Borghini']

(0.9054131305373565, 0.9094371821045837)

(0.9056021423714086, 0.9099339737093348)


In [83]:
cosine_distances2

Unnamed: 0,Vasari,Minerbetti,Ghiberti,Borghini,Bartoli,Sanga,Giambullari
Minerbetti,0.268090,0.000000,0.768238,0.197637,0.230707,0.484728,0.288782
unkown_Pesello-e-Francesco-Peselli,0.922825,0.937885,0.961876,0.930589,0.912532,0.925048,0.921121
unkown_Pulidoro-da-Caravaggio-e-Maturino-Fiorentino,0.871934,0.906162,0.924489,0.863298,0.871101,0.925096,0.897592
unkown_Cosimo-Rosselli,0.936107,0.947169,0.967797,0.932236,0.867683,0.956386,0.943116
unkown_Andrea-Pisano,0.872480,0.909371,0.926984,0.868171,0.878459,0.939373,0.899845
...,...,...,...,...,...,...,...
unkown_Giovannantonio-detto-il-Soddoma-da-Verzelli,0.871368,0.910866,0.941805,0.853703,0.868761,0.925126,0.915510
unkown_Stefano-e-Ugolino,0.844073,0.891347,0.941158,0.840815,0.869462,0.917184,0.889447
unkown_Perino-del-Vaga,0.819733,0.870870,0.937129,0.812193,0.839988,0.904458,0.854887
unkown_Tommaso-Fiorentino-detto-Giottino,0.889407,0.910391,0.939892,0.897734,0.888480,0.931625,0.924728


In [101]:
# Biographies written by Vasari, argued by Charles Hope

x = cosine_distances2.loc['unkown_Giorgio-Vasari']
y = cosine_distances2.loc['unkown_Spinello-Aretino']
z = cosine_distances2.loc['unkown_Niccolò-Aretino']
d = cosine_distances2.loc['unkown_Benvenuto-Garofalo-e-Girolamo-da-Carpi-e-altri-lombardi']
f = cosine_distances2.loc['unkown_Taddeo-Gaddi']

temp = pd.DataFrame()
temp['Giorgio Vasari'] = x
temp['Spinello Aretino'] = y
temp['Niccolò Aretino'] = z
temp['Garofalo & Girolamo'] = d
temp['Taddeo Gaddi'] = f


#temp['Francia'] = q

temp
#temp.loc['Vasari'], 

Unnamed: 0,Giorgio Vasari,Spinello Aretino,Niccolò Aretino,Garofalo & Girolamo,Taddeo Gaddi
Vasari,0.368305,0.895824,0.894559,0.684761,0.914213
Minerbetti,0.393716,0.903461,0.8996,0.761848,0.938799
Ghiberti,0.922314,0.919966,0.940599,0.907216,0.938157
Borghini,0.330595,0.90123,0.893514,0.698738,0.915288
Bartoli,0.384883,0.906465,0.904549,0.727574,0.921279
Sanga,0.646803,0.943909,0.949016,0.795127,0.943848
Giambullari,0.457802,0.923857,0.902747,0.767394,0.93381


In [74]:
#temp.loc['Vcopy'], 

In [73]:
#temp.loc['Borghini'], 

In [48]:
#temp.sort_values(by = 'Garofalo & Girolamo')[:40]

In [102]:
# Biorgaphies NOT written by Vasari (Charles Hope, 2014)


x = cosine_distances2.loc['unkown_Lionardo-da-Vinci']
y = cosine_distances2.loc['unkown_Sebastian-Viniziano-frate-del-Piombo']
z = cosine_distances2.loc['unkown_Perino-del-Vaga']
a = cosine_distances2.loc['unkown_Giulio-Romano']
b = cosine_distances2.loc['unkown_Antonio-da-San-Gallo']
c = cosine_distances2.loc['unkown_Raffaello-dUrbino']
d = cosine_distances2.loc['unkown_Bramante-da-Urbino']
q = cosine_distances2.loc['unkown_Francia-Bigio']


temp = pd.DataFrame()
temp['Lionardo da Vinci'] = x
temp['Sebastian del Piombo'] = y
temp['Perino del Vaga'] = z
temp['Giulio Romano'] = a
temp['Antonio da San Gallo'] = b
temp['Raffaello'] = c
temp['Bramante'] = d
temp['Francia Bigio'] = q


temp

Unnamed: 0,Lionardo da Vinci,Sebastian del Piombo,Perino del Vaga,Giulio Romano,Antonio da San Gallo,Raffaello,Bramante,Francia Bigio
Vasari,0.838227,0.83196,0.819733,0.88765,0.887772,0.832026,0.816323,0.906122
Minerbetti,0.894706,0.872896,0.87087,0.930914,0.939161,0.886069,0.885258,0.938437
Ghiberti,0.926913,0.925713,0.937129,0.948187,0.945193,0.898223,0.912918,0.952901
Borghini,0.833627,0.823101,0.812193,0.895242,0.912065,0.822368,0.833868,0.909321
Bartoli,0.846568,0.843865,0.839988,0.899286,0.913919,0.833984,0.850107,0.913987
Sanga,0.893628,0.880148,0.904458,0.923969,0.919343,0.896218,0.877811,0.942674
Giambullari,0.893186,0.864241,0.854887,0.923079,0.931137,0.870369,0.862951,0.941179


# Save and open dataframe as csv

In [81]:
filename = 'cosine_distance.csv'
#cosine_distances.to_csv(filename, index = False, header=True)
#cosine_distances = pd.read_csv(filename)
