## The Programming Historian

# Analyzing Documents with TF-IDF

#### Matthew J. Lavin, "Analyzing Documents with TF-IDF," The Programming Historian 8 (2019), https://doi.org/10.46430/phen0082.

In [21]:
from pathlib import Path
import os
import pandas as pd


In [22]:
# get all the filenames
all_txt_files =[]
for file in Path('data/results/Dante_authors').rglob("*.txt"):
     all_txt_files.append(file.parent / file.name)
# counts the length of the list
n_files = len(all_txt_files)
print(n_files)

5


In [23]:
# sort the filenames
all_txt_files.sort()
all_txt_files[0]

PosixPath('data/results/Dante_authors/Bartoli.txt')

In [24]:
# make a string containing all documents
all_docs = []
for txt_file in all_txt_files:
    with open(txt_file) as f:
        txt_file_as_string = f.read()
    all_docs.append(txt_file_as_string)

In [25]:
#import the TfidfVectorizer from Scikit-Learn.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_df=.7, min_df=1, stop_words=None, use_idf=True, norm=None)
transformed_documents = vectorizer.fit_transform(all_docs)

In [26]:
transformed_documents_as_array = transformed_documents.toarray()
# use this line of code to verify that the numpy array represents the same number of documents that we have in the file list
len(transformed_documents_as_array)

5

In [27]:
# make the output folder if it doesn't already exist
Path("./authors_tf_idf_output").mkdir(parents=True, exist_ok=True)

# construct a list of output file paths using the previous list of text files the relative path for tf_idf_output
output_filenames = [str(txt_file).replace(".txt", ".csv").replace('data/results/Dante_authors/', "authors_tf_idf_output/") for txt_file in all_txt_files]

# loop each item in transformed_documents_as_array, using enumerate to keep track of the current position
for counter, doc in enumerate(transformed_documents_as_array):
    # construct a dataframe
    tf_idf_tuples = list(zip(vectorizer.get_feature_names(), doc))
    one_doc_as_df = pd.DataFrame.from_records(tf_idf_tuples, columns=['term', 'score']).sort_values(by='score', ascending=False).reset_index(drop=True)

    # output to a csv using the enumerated value for the filename
    one_doc_as_df.to_csv(output_filenames[counter])

### open authors-csv as dataframe to look at

In [31]:
# ./tf_idf_output

filenames = os.listdir('./authors_tf_idf_output')
filenames

['Giambullari.csv', 'Vcopy.csv', 'Vasari.csv', 'Borghini.csv', 'Bartoli.csv']

In [43]:
df_tf_idf = pd.DataFrame()

for i in filenames:
    author = i.strip('.csv')
    df = pd.read_csv('./authors_tf_idf_output/'+ (i))
    df_tf_idf[author + '_term'] = df['term']
    df_tf_idf[author + '_score'] = df['score']

df_tf_idf[:100]

Unnamed: 0,Giambullari_term,Giambullari_score,Vcopy_term,Vcopy_score,Vasari_term,Vasari_score,Borghini_term,Borghini_score,Bartoli_term,Bartoli_score
0,giambullari,8.432791,senpre,23.704061,tenpo,247.199488,anche,238.929068,danese,33.577797
1,pf,6.295837,tenpo,18.624619,auto,198.170580,vincenzo,238.733752,bisogna,25.298372
2,pierfrancesco,6.295837,bisognio,16.865581,senpre,194.711926,bisogna,158.817557,pennelli,23.704061
3,1549,5.621860,reverendo,15.460116,disegnio,160.223022,po,99.788023,istoria,21.081977
4,priego,4.216395,disegnio,15.460116,el,160.223022,vedete,95.571627,padroni,19.676512
...,...,...,...,...,...,...,...,...,...,...
95,voletelo,2.098612,pianta,5.079442,dallei,41.972246,cugino,22.010913,farevi,8.394449
96,sareste,2.098612,scade,5.079442,dica,40.758488,academia,22.010913,febraro,8.394449
97,persuadetevi,1.693147,piacie,5.079442,satisfatto,40.758488,bronzino,22.010913,palladio,8.394449
98,ingannato,1.693147,partire,5.079442,siano,40.758488,architettore,22.010913,libre,8.394449


## Observations:

Vasari used frequently:
- obligatissimo
- reverendissimo
- disegno
- reverendo
- bisognio

In [60]:
words = "obligatissimo, reverendissimo, disegno, reverendo, bisognio".split(',')

df_compare = pd.DataFrame()

for i, j in enumerate(df_tf_idf['Vasari_term']):
    if j.startswith('disegni'):
        print('Vasari\'s disegno is ' +j + ' and has index: ' + str(i) + ' and score: ' + str(df_tf_idf['Vasari_score'][i]))
      
            
    

Vasari's disegno is disegnio and has index: 3 and score: 160.22302232433074
Vasari's disegno is disegniare and has index: 387 and score: 18.88751059801299
Vasari's disegno is disegniato and has index: 558 and score: 15.238324625039509
Vasari's disegno is disegnia and has index: 917 and score: 10.493061443340553
Vasari's disegno is disegniata and has index: 1403 and score: 8.39444915467244
Vasari's disegno is disegniate and has index: 3099 and score: 4.19722457733622
Vasari's disegno is disegniati and has index: 3101 and score: 4.19722457733622
Vasari's disegno is disegniar and has index: 3115 and score: 4.19722457733622
Vasari's disegno is disegnierò and has index: 7137 and score: 2.09861228866811
Vasari's disegno is disegniavo and has index: 7138 and score: 2.09861228866811
Vasari's disegno is disegniava and has index: 7158 and score: 2.09861228866811
Vasari's disegno is disegniassi and has index: 7163 and score: 2.09861228866811
Vasari's disegno is disegniammo and has index: 7166 and

In [58]:
df_tf_idf['Vasari_score'][i]

0.0

In [54]:
df_tf_idf['Vasari_term'][9924:10119]

9924              storni
9925           storiette
9926            accurato
9927          fondazione
9928           leggerete
              ...       
10114         intenderai
10115             impeto
10116          codicillo
10117         chiocciole
10118    sufizientissimo
Name: Vasari_term, Length: 195, dtype: object

# Save and open csv as dataframe

In [61]:
filename = 'author_tf_idf.csv'
df_tf_idf.to_csv(filename, index = False, header=True)
#cosine_distances = pd.read_csv(filename)