## The Programming Historian

# Analyzing Documents with TF-IDF

#### Matthew J. Lavin, "Analyzing Documents with TF-IDF," The Programming Historian 8 (2019), https://doi.org/10.46430/phen0082.

In [1]:
from pathlib import Path
import os
import pandas as pd
import numpy as np

In [64]:
# collect every .txt file found anywhere below the corpus folder
all_txt_files = list(Path('data/TF-IDF').rglob("*.txt"))
# report how many documents were found
n_files = len(all_txt_files)
print(n_files)

8


In [65]:
# put the paths in sorted order so every later step sees the documents in the same sequence
all_txt_files = sorted(all_txt_files)
# show the first path as a quick sanity check
all_txt_files[0]

PosixPath('data/TF-IDF/Bartoli.txt')

In [66]:
# Read every document into memory: all_docs is a list of strings in which
# all_docs[i] holds the full text of all_txt_files[i].
# The encoding is passed explicitly because the texts contain accented
# characters and the default text encoding depends on the platform/locale.
# NOTE(review): assumes the corpus files are UTF-8 encoded — confirm.
all_docs = [Path(txt_file).read_text(encoding="utf-8") for txt_file in all_txt_files]

In [67]:
# TfidfVectorizer (scikit-learn) turns the raw texts into a document-term matrix of tf-idf weights
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    max_df=.65,       # drop terms that occur in more than 65% of the documents
    min_df=1,         # keep terms that occur in at least one document
    stop_words=None,  # no stop-word list is applied
    use_idf=True,     # weight term frequencies by inverse document frequency
    norm=None,        # leave the scores unnormalised
)
# learn the vocabulary and compute the weights for every document in one pass
transformed_documents = vectorizer.fit_transform(all_docs)

In [68]:
# convert the sparse tf-idf matrix into a dense array with one row per document
transformed_documents_as_array = transformed_documents.toarray()
# the number of rows should equal the number of documents that were read
transformed_documents_as_array.shape[0]

8

In [69]:
# make output folder if it doesn't already exist
output_dir = Path("./authors_tf_idf_output")
output_dir.mkdir(parents=True, exist_ok=True)

# One CSV per input text, named after the text file (Bartoli.txt -> Bartoli.csv).
# The name is derived from the path's components rather than by string
# replacement, so it does not depend on the platform's path separator and the
# file always lands directly inside output_dir.
output_filenames = [
    str(output_dir / Path(txt_file).with_suffix(".csv").name)
    for txt_file in all_txt_files
]

# The vocabulary is the same for every document, so fetch it once.
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and only fall back on releases that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# write one ranked term list per document
for output_filename, doc in zip(output_filenames, transformed_documents_as_array):
    # pair each term with its score and rank the terms from highest to lowest score
    one_doc_as_df = (
        pd.DataFrame({'term': feature_names, 'score': doc})
        .sort_values(by='score', ascending=False)
        .reset_index(drop=True)
    )

    # output to a csv (the row index, i.e. the rank, is written as the first column)
    one_doc_as_df.to_csv(output_filename)

### Open the per-author CSV files as a dataframe for inspection

In [70]:
#os.listdir('./authors_tf_idf_output')

In [72]:
# list the entries of the output folder and print the author name of each CSV file
filenames = os.listdir('./authors_tf_idf_output')
for csv_name in filenames:
    # skip anything that is not a CSV file (e.g. hidden folders)
    if not csv_name.endswith('.csv'):
        continue
    # str.strip('.csv') removes any run of the characters '.', 'c', 's', 'v'
    # from BOTH ends of the string, which mangles names that begin or end with
    # those letters; Path.stem removes exactly the final extension instead
    author = Path(csv_name).stem
    print(author)

Sanga
VasariC
Giambullari
Ghiberti
Minerbetti
Vasari
Borghini
Bartoli


In [73]:
# Build one table with a column per author; each column lists that author's
# terms in the row order of the author's CSV file.
author_terms = {}
for csv_name in filenames:
    csv_path = Path('./authors_tf_idf_output') / csv_name
    # ignore directory entries that are not CSV files
    if csv_path.suffix != '.csv':
        continue
    # Only the 'term' column is needed. Terms are always text, so read them as
    # strings and switch off the default NA parsing: otherwise a token such as
    # "nan" or "null" would come back as a float NaN.
    ranked_terms = pd.read_csv(
        csv_path,
        usecols=['term'],
        dtype={'term': str},
        keep_default_na=False,
    )
    # Path.stem drops exactly the ".csv" extension; str.strip('.csv') would
    # also eat leading/trailing 'c', 's', 'v' and '.' characters of the name
    author_terms[csv_path.stem] = ranked_terms['term']

df_tf_idf = pd.DataFrame(author_terms)

In [74]:
# preview the first 20 rows of the combined table
df_tf_idf.head(20)

Unnamed: 0,Sanga,VasariC,Giambullari,Ghiberti,Minerbetti,Vasari,Borghini,Bartoli
0,guglielmo,intanto,giambullari,quäle,el,tenpo,vincenzo,bartoli
1,sangalletti,senpre,pf,uiso,vescovo,intanto,anche,cosimo
2,osservandissimo,spedalingo,pierfrancesco,el,dilettissimo,senpre,vorrei,venezia
3,no,tenpo,1549,addunque,arezo,spedalingo,borghini,osservandissimo
4,magnifica,disegnio,onoratissimo,uisa,bindo,auto,vedete,cavalier
5,bascio,reverendo,vivete,superficie,altar,disegnio,batista,danese
6,affettuosissimo,bisognio,priego,piü,altoviti,reverendo,concetto,feliciti
7,dicie,palazzo,onorandissimo,sarä,vecchio,palazzo,cioè,avuto
8,bosco,auto,annotazione,spetie,luigi,el,avuto,pennelli
9,bisognia,core,tolomei,imperö,1553,nocenti,certi,accompagnerei


In [53]:
# list the column labels of the combined table (one column per author)
df_tf_idf.columns

Index(['Sanga', 'VasariC', 'Giambullari', 'Ghiberti', 'Minerbetti', 'Vasari',
       'Borghini', 'Bartoli'],
      dtype='object')

## Check which of the top 50 terms two authors have in common

In [81]:
# Terms shared by the highest-ranked terms of Vasari and Borghini.
TOP_N = 50  # how many of the first rows of each column to compare
vasari_top = df_tf_idf['Vasari'].head(TOP_N)
# a set gives constant-time membership tests, replacing the inner loop over Borghini's terms
borghini_top = set(df_tf_idf['Borghini'].head(TOP_N))

# walk Vasari's terms in rank order so the result keeps that order
store = []
for term in vasari_top:
    if term in borghini_top:
        store.append(term)
        print(term)
print(store, len(store))

el
anche
batista
cavalier
voluto
['el', 'anche', 'batista', 'cavalier', 'voluto'] 5


### Check the rank of "disegnio" (the spelling of "disegno" searched for below) for each author

In [78]:
def print_term_ranks(term_table, columns, prefix, labels=None):
    """Print the row index of every term that starts with `prefix`, column by column.

    Parameters
    ----------
    term_table : table indexable by column name (here the combined dataframe)
    columns : iterable of column names to search, in the order to report them
    prefix : str that a term has to start with to be reported
    labels : optional dict mapping a column name to the name to print for it;
        columns that are not in the dict are printed under their own name
    """
    labels = labels or {}
    for column in columns:
        label = labels.get(column, column)
        for rank, term in enumerate(term_table[column]):
            # non-string cells (e.g. missing values) cannot match a prefix
            if isinstance(term, str) and term.startswith(prefix):
                print(label + '\'s ' + term + ' has index: ' + str(rank))


# One call replaces the eight copy-pasted loops; authors are reported in the
# same order as before and 'VasariC' keeps its printed label 'Vasari_C'.
print_term_ranks(
    df_tf_idf,
    columns=['Vasari', 'VasariC', 'Giambullari', 'Bartoli',
             'Borghini', 'Ghiberti', 'Minerbetti', 'Sanga'],
    prefix='disegnio',
    labels={'VasariC': 'Vasari_C'},
)

Vasari's disegnio has index: 5
Vasari_C's disegnio has index: 4
Giambullari's disegnio has index: 22685
Bartoli's disegnio has index: 2657
Borghini's disegnio has index: 24023
Ghiberti's disegnio has index: 19488
Minerbetti's disegnio has index: 23982
Sanga's disegnio has index: 62


- Vasari's disegnio has index: 5
- Vasari_C's disegnio has index: 4
- Giambullari's disegnio has index: 22685
- Bartoli's disegnio has index: 2657
- Borghini's disegnio has index: 24023
- Ghiberti's disegnio has index: 19488
- Minerbetti's disegnio has index: 23982
- Sanga's disegnio has index: 62

In [42]:
# show the first 50 terms in Vasari's column
df_tf_idf['Vasari'].head(50)

0               tenpo
1                auto
2              senpre
3            disegnio
4                  el
5           reverendo
6             palazzo
7            bisognio
8               anche
9             batista
10            nocenti
11            caccini
12             stanze
13      michelagniolo
14            lorenzo
15     reverendissimo
16             pietre
17           signorie
18             pietro
19            fabrica
20               vole
21              unico
22           lionardo
23      obligatissimo
24            davitte
25             figure
26              scala
27                man
28           bisognia
29           qualcosa
30                vol
31              iddio
32             inperò
33             obligo
34                 gl
35            modello
36              messo
37              marmo
38         proveditor
39              dirli
40              saria
41              palco
42            stamani
43         dessiderio
44             altare
45        

# Save the combined dataframe as a CSV file

In [79]:
# write the combined author/term table to disk: header row included, row index left out
filename = 'author_tf_idf.csv'
df_tf_idf.to_csv(filename, header=True, index=False)
# to load the saved table back later: pd.read_csv(filename)