# DS 5001 Project Notebook: Greek and Roman Mythology

- David Vann (dv6bq@virginia.edu)
- DS 5001
- 5 May 2021

In [1]:
import os
from glob import glob

import numpy as np
import pandas as pd
import nltk

from eta_modules.preprocessing import Document, Corpus

In [2]:
# One-time setup: uncomment to download the NLTK resources named below.
# NOTE(review): presumably required by the tokenizing / POS-tagging / stopword
# steps inside eta_modules.preprocessing — confirm before removing this cell.
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('stopwords')

## Reading in the data

We start by loading in the XML files for each work and parsing them to a reasonable degree with BeautifulSoup and NLTK. 

Since these works are all either plays or poems/epics, the concept of a "chapter" or "paragraph" doesn't apply as cleanly as it would to, e.g., a novel. However, the Perseus Digital Library (where these files are sourced from) has added at least top-level divisions to break up texts. In some cases, these divisions truly exist in the text (for example, *The Iliad* is broken into 24 books); in other cases, like plays, these divisions don't seem to be directly present in the text, but are akin to something like a "scene". I've treated each of these top-level divisions as a "chapter".

To get at something like a "paragraph", I used a different approach based on whether the work was a play or not:

- For plays, I used each speaker section (denoted by a "\<sp>" in the files) as a "paragraph". 
- For everything else, there wasn't a built-in tag for "paragraph"-type divisions, but there is a self-closing "milestone" tag that marks the start of a new "card" used on the Perseus website to denote content to be displayed on one page. Since these are self-closing, they don't actually enclose the particular block of text that I wanted to get at; instead, I replaced these with newlines and split up text based on a double newline, which seemed to give fairly satisfactory results.

In [3]:
# Project directories are resolved relative to the current working directory:
# the project root is taken to be its parent ('..').
root_dir = os.path.abspath('..')
data_dir = os.path.join(root_dir, 'data')
output_dir = os.path.join(data_dir, 'outputs')

# Every .xml file anywhere below data/raw. glob() returns matches in arbitrary,
# filesystem-dependent order, so sort them to keep the document order (and
# anything derived from it downstream) identical across machines and re-runs.
docpaths = sorted(glob(os.path.join(data_dir, 'raw', '**', '*.xml'), recursive=True))

# Hierarchy levels used to index tokens, listed from coarsest to finest.
OHCO = ['work', 'chapter', 'para', 'sent']

In [4]:
# Build one Document per source XML path, then run its parsing and
# tokenizing steps. The list is re-created on every run of this cell.
doc_list = []

for path in docpaths:
    doc = Document(path)
    # Stored before processing; the return values of the calls below are
    # discarded, so they presumably modify `doc` in place — TODO confirm.
    doc_list.append(doc)
    
    # NOTE(review): presumably splits the text into the "paragraph" units
    # described in the section above — confirm in eta_modules.preprocessing.
    doc.parse_text_to_paras()
    doc.tokenize(remove_pos_tuple=True, remove_ws=True)

In [18]:
# Each "bag" is a prefix of OHCO: the first one, two, or three hierarchy
# levels respectively (work / work+chapter / work+chapter+para).
book_bag, chapter_bag, paragraph_bag = (OHCO[:depth] for depth in (1, 2, 3))

# Assemble the corpus from the processed documents, build its vocabulary
# table, then compute TF-IDF with the bag defined at the work level.
corp = Corpus(doc_list)
corp.extract_annotate_vocab()
corp.compute_tfidf(methods=['n', 'max', 'bool'], OHCO_level=book_bag)

In [20]:
# Show the ten vocabulary rows with the largest 'tfidf_max_sum' value.
(
    corp.vocab
    .sort_values(by='tfidf_max_sum', ascending=False)
    .iloc[:10]
)

Unnamed: 0_level_0,n,stop,p_stem,df,idf,tfidf_n_sum,tfidf_max_sum,tfidf_bool_sum
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
oedipus,436,0,oedipu,5,1.925999,0.853654,0.289711,0.204559
electra,222,0,electra,5,1.925999,0.434659,0.27149,0.202865
thou,2550,0,thou,6,1.662965,4.310846,0.263936,0.121421
creon,216,0,creon,5,1.925999,0.422911,0.263725,0.164499
odysseus,787,0,odysseu,7,1.440573,1.152522,0.259,0.191628
orestes,225,0,orest,9,1.078003,0.246571,0.257692,0.16237
dionysus,138,0,dionysu,8,1.247928,0.175068,0.255391,0.205215
deathless,103,0,deathless,7,1.440573,0.150838,0.2548,0.260727
prometheus,101,0,prometheu,5,1.925999,0.19775,0.254311,0.216066
achilles,548,0,achil,8,1.247928,0.695199,0.2538,0.156809


In [7]:
# Optional export: uncomment to call corp.save_tables with the path
# <output_dir>/corpus.
# NOTE(review): left disabled, presumably to avoid overwriting previously
# saved outputs on re-run — confirm.
# corp.save_tables(os.path.join(output_dir, 'corpus'))