In [1]:
import pandas as pd
import numpy as np
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import cltk
import re
from bs4 import BeautifulSoup, SoupStrainer

### Importing & Cleaning Texts:

In [2]:
# Dictionary of Authors, their texts, the text urls and the text itself.
  # The text itself will be retrieved via the function below.

text_dict = {'Petrarca': {'Africa': 
                                    {'url': 'http://backend.bibliotecaitaliana.it/wp-json/muruca-core/v1/xml/bibit000921',
                                     'text1': '',
                                     'text2': ''},
                          'Bucc. Carmen - Pet': 
                                    {'url': 'http://backend.bibliotecaitaliana.it/wp-json/muruca-core/v1/xml/bibit000846',
                                     'text1': '',
                                     'text2': ''},
                          'Varia': 
                                    {'url': 'http://backend.bibliotecaitaliana.it/wp-json/muruca-core/v1/xml/bibit000769',
                                     'text1': '', 
                                     'text2': ''},
                          'Psalmi': 
                                    {'url': 'http://backend.bibliotecaitaliana.it/wp-json/muruca-core/v1/xml/bibit001378',
                                     'text1': '', 
                                     'text2': ''}},
            'Boccaccio': {'Carmina': 
                                    {'url': 'http://backend.bibliotecaitaliana.it/wp-json/muruca-core/v1/xml/bibit001658',
                                     'text1': '', 
                                     'text2': ''},
                          'Bucc. Carmen - Boc': 
                                    {'url': 'http://backend.bibliotecaitaliana.it/wp-json/muruca-core/v1/xml/bibit001461',
                                     'text1': '', 
                                     'text2': ''}},
             'Dante': {'Egloghe': 
                                    {'url': 'http://backend.bibliotecaitaliana.it/wp-json/muruca-core/v1/xml/bibit000808',
                                     'text1': '', 
                                     'text2': ''},
                       'Quaestio': 
                                    {'url': 'http://backend.bibliotecaitaliana.it/wp-json/muruca-core/v1/xml/bibit001095',
                                     'text1': '', 
                                     'text2': ''},
                       'DVE': 
                                    {'url': 'http://backend.bibliotecaitaliana.it/wp-json/muruca-core/v1/xml/bibit000018',
                                     'text1': '', 
                                     'text2': ''}},
             'Anon': {'Maiolichinus': 
                                     {'url':'http://backend.bibliotecaitaliana.it/wp-json/muruca-core/v1/xml/bibit000029',
                                      'text1': '', 
                                      'text2': ''}}}



In [3]:
# This function takes care of most cleaning as well.

text_holder = []

def text_prep(text_dict):

    for author in text_dict.keys():
    
        for work in text_dict[author]:
            
            text = requests.get(text_dict[author][work]['url'])
        
            text = BeautifulSoup(text.content)
            
            text = text.select('div1')
            
            text1 = ' '.join([line.get_text().replace('\n', ' ') for line in text])
            
            text_dict[author][work]['text1'] = text1
            
            text2 = [[line.get_text().replace('\n', ' '), author, work] for line in text]
            
            text_dict[author][work]['text2'] = text2
            
    

In [4]:
text_prep(text_dict)

In [5]:
# Hand-checked complete text-retrieval here:

text_dict['Anon']['Maiolichinus']['text2']

[[" Argomento Ad laudem et gloriam omnipotentis Dei et gloriosissime virginis Marie advocate Pisane civitatis atque totius triumphantis curie paradisi, amen. Et ad honorem et gloriam populi Pisani et exaltationem et augumentum status ipsius Pisane civitatis, amen. In Dei nomine amen. In questo libro che seguita qui di contra intitulato il Maiorichino si contiene chome anno Domini mille cento quatordici, cioè 1114, il populo pisano, per comandamento et a preghi di papa Paschale secondo, andò a l'insula di Maioricha con trecento vele grosse et bene armate de homini pisani io da piè et da chavallo. Et il dì di sancto Sixto uscirono di porto pisano con grande pianto di vechi et di fanciulli et di donne, et pervenuti prima a l'insula de Evisa, città fortissima in detta insula posta, et quella assediata per uno mese, et dopo molte crudele battaglie date la preseno per forsa d'arme con mirabile uccisione di Pagani, cioè il giorno di san Lorenso a dì X d'agosto anno 1114 sopra scripto. Et dipo

### Exploratory Analysis:

#### Word Tokenization and Stop Words:

In [6]:
author_str = str()
author_list = []

count_list = []

for author in text_dict.keys():
    
    for work in text_dict[author]:
        
        author_str = "{}: {}".format(author, work)
        
        author_list.append(author_str)
        
        count_list.append(len(text_dict[author][work]['text1']))

In [7]:
wordcount_df = pd.DataFrame(author_list)

wordcount_df['Word Count'] = count_list

In [8]:
sns.

SyntaxError: invalid syntax (<ipython-input-8-40c5c93bca4b>, line 1)

##### Stop Lists:

Here, we will be creating lists of stop words (i.e. words that are too common to factor into our analysis).  After creating them, we will take these out of our tokens so they don't interfere with analysis of style, etc.

In [None]:
# Variables for storing our stop words in the function below:
pet_texts = list([bc_pet2, africa2, varia_pet2, psalmi_pet2])
pet_stops = set()

gio_texts = list([carmina_gio2, bc_gio2])
gio_stops = set()

dante_texts = list([egloghe_dante2, quaestio_dante2, dve_dante2])
dante_stops = set()

anon_stops = set()

In [None]:
import cltk.stop.stop
import cltk.stop.latin

# Defining our stop list constructor:
Stopper = cltk.stop.latin.CorpusStoplist(language = 'latin')

# Creating a function so we can generate stop word-lists
def Stop_Builder(text_list, end_set):
    
    for text in text_list:
    
        x = Stopper.build_stoplist(text, 
                                   basis = 'frequency', 
                                   size = 50, 
                                   remove_punctuation = False)
        
        end_set.update(x)
        end_set.update(cltk.stop.latin.PERSEUS_STOPS)

In [None]:
# Running our functions:
Stop_Builder(pet_texts, pet_stops)
Stop_Builder(gio_texts, gio_stops)
Stop_Builder(dante_texts, dante_stops)

In [None]:
# Since this author only has one text here, we ran it outside of the function:
anon_stops.update(Stopper.build_stoplist(liber_maio2, 
                                         basis = 'frequency',
                                         size = 50,
                                         remove_punctuation = False))

Since it looks like our stop-list generator caught a few words we do not want to include in our stop-list, we will go ahead and take them out.

In [None]:
# Creating lists of words we want to keep/not consider as stop words:

pet_removal = ['amen', 'amor', 'arma', 'astra', 'bello', 'clausa', 'decus', 'dies',
               'ego', 'domine', 'fata', 'filio', 'forte', 'fortuna', 'mors', 'semper',
              'gloria', 'miserere', 'patri', 'principio', 'sancto', 'scipio', 'secula',
              'seculorum', 'signa', 'spes', 'spiritui', 'tempora', 'valle', 'vita']
gio_removal = ['agam', 'amor', 'amplexus', 'annos', 'constantia', 'dolor', 'forte',
               'iacet', 'montes', 'pascua', 'precor', 'silvas', 'silvis', 'tempus', 
               'videre', 'vita', 'fata', 'mors']
dante_removal = ['alphesibeus', 'capelle', 'carmine', 'ego', 'melibeus',
                 'pascua', 'potest', 'secundum', 'senex', 'tityre', 'tityrus',
                 'vero', 'videtur', 'vulgare', 'vix', 'dicimus', 'mopsus']
anon_removal = ['agmina', 'balee', 'bella', 'hostes', 'menia', 'plures', 'populus',
                'turres', 'urbem', 'viribus']

In [None]:
def Remove_Function(stop_set, removal_list):
    
    for x in removal_list:
        stop_set.remove(x)

In [None]:
Remove_Function(pet_stops, pet_removal)
Remove_Function(gio_stops, gio_removal)
Remove_Function(dante_stops, dante_removal)
Remove_Function(anon_stops, anon_removal)

##### Word Tokens:

Now, we will create our word-based tokens, and remove any stop words from them:

In [None]:
from cltk.tokenize.word import WordTokenizer
from nltk.tokenize.punkt import PunktLanguageVars

word_tokenizer = WordTokenizer('latin')
punctuation = PunktLanguageVars()

In [None]:
# Petrarca:
psalmi_tokens = list()
pet_varia_tokens = list()
pet_bc_tokens = list()
africa_tokens = list()
pet_container = [psalmi_tokens, pet_varia_tokens, pet_bc_tokens, africa_tokens]

# Gio:
carmina_gio_tokens = list()
bc_gio_tokens = list()
gio_container = [carmina_gio_tokens, bc_gio_tokens]

# Dante:
egloghe_tokens = list()
quaestio_tokens = list()
dve_tokens = list()
dante_container = [egloghe_tokens, quaestio_tokens, dve_tokens]

# Anon:
liber_maio_tokens = list()

In [None]:
def Tokens(texts, containers, stops):
    
    y = 0
    
    for x in texts:
        
        beta = word_tokenizer.tokenize(x)
        
        beta2 = [w for w in beta if not w in stops]
        
        containers[y].append(beta2)
        
        y += 1

In [None]:
Tokens(pet_texts, pet_container, pet_stops)
Tokens(gio_texts, gio_container, gio_stops)
Tokens(dante_texts, dante_container, dante_stops)

beta = word_tokenizer.tokenize(liber_maio2)
liber_maio_tokens.append(beta)

In [None]:
# Deleting a few variables we no-longer need:
del pet_texts, gio_texts, dante_texts, pet_removal, dante_removal, gio_removal, anon_removal

#### Token and Stop Analysis:

In [None]:
import collections

In [None]:
# Un-nesting our lists from the above loop:

# Petrarca
psalmi_tokens = psalmi_tokens[0]
pet_varia_tokens = pet_varia_tokens[0]
pet_bc_tokens = pet_bc_tokens[0]
africa_tokens = africa_tokens[0]

# Gio:
carmina_gio_tokens = carmina_gio_tokens[0]
bc_gio_tokens = bc_gio_tokens[0]

# Dante:
egloghe_tokens = egloghe_tokens[0]
quaestio_tokens = quaestio_tokens[0]
dve_tokens = dve_tokens[0]

# Anon:
liber_maio_tokens = liber_maio_tokens[0]

In [None]:
psalmi_common = collections.Counter(psalmi_tokens).most_common(50)
africa_common = collections.Counter(africa_tokens).most_common(50)
pet_bc_common = collections.Counter(pet_bc_tokens).most_common(50)
pet_varia_common = collections.Counter(pet_varia_tokens).most_common(50)

carmina_gio_common = collections.Counter(carmina_gio_tokens).most_common(50)
bc_gio_common = collections.Counter(bc_gio_tokens).most_common(50)

egloghe_common = collections.Counter(egloghe_tokens).most_common(50)
dve_common = collections.Counter(dve_tokens).most_common(50)
quaestio_common = collections.Counter(quaestio_tokens).most_common(50)

anon_common = collections.Counter(psalmi_tokens).most_common(50)

Let's take a look at how these stop words compare between each author. 

In [None]:
pet_stops = list(pet_stops)
pet_stops.sort()

gio_stops = list(gio_stops)
gio_stops.sort()

dante_stops = list(dante_stops)
dante_stops.sort()

anon_stops = list(anon_stops)
anon_stops.sort()

In [None]:
pet = pd.Series(pet_stops)
gio = pd.Series(gio_stops)
dante = pd.Series(dante_stops)
anon = pd.Series(anon_stops)

In [None]:
stops_df = pd.DataFrame(data = [pet_stops, gio_stops, dante_stops, anon_stops]).T

stops_df.rename(columns = {0: 'Petrarch', 
                           1: 'G. Boccaccio', 
                           2: 'Dante', 
                           3: 'Anonymous'}, inplace = True)

In [None]:
stops_df

If we take a look at the most common words between Petrarch and Gio (considering their Buccolicum Carmen works together since those two words are considered a similar genre), we can see how different the two authors are in their recurrent vocabulary.

As you can see from above, our stop list is pretty well populated, but it looks liike some punctuation still made it in (a bug we will need to go back and fix later / look more at CLTK's documentation).  In order to account for this, we will skim off the first 15 of our most common words for each author (since they would be considered stops)

In [None]:
f, ax = plt.subplots(2, 2, sharey = True, figsize = (12, 10))

ax1 = sns.barplot(x = psalmi_common[0], y = [1], ax = ax[0][0])
ax1.tick_params(axis = 'x', labelrotation = 60, labelsize = 11)

ax2 = sns.barplot(x = b[0], y = b[1], ax = ax[0][1])
ax2.tick_params(axis = 'x', labelrotation = 60, labelsize = 11)

ax3 = sns.barplot(x = b[0], y = b[1], ax = ax[1][0])
ax3.tick_params(axis = 'x', labelrotation = 60, labelsize = 11)

ax4 = sns.barplot(x = b[0], y = b[1], ax = ax[1][1])
ax4.tick_params(axis = 'x', labelrotation = 60, labelsize = 11)

plt.show()

In [None]:
to-do notes:
    
    - get overall length of text along with how much percentage the stops reflect and the cleaned tokens reflect. 

__[Beautiful Soup Tags](https://stackoverflow.com/questions/50486567/combine-find-all-beautiful-soup-tags-into-one-string)__

__[Nested Dictionaries](https://www.geeksforgeeks.org/python-nested-dictionary/)__

__[Nested Dictionary Iterations](https://stackoverflow.com/questions/43752962/how-to-iterate-through-a-nested-dict/43753252)__

In [None]:
# Looking at stop-words:
 ## Looks like this actually gives a bag of word count for all words:
    
import cltk.stop.stop

psalmi_stops = cltk.stop.stop.Counter(psalmi_pet2_tokens)
africa_stops = cltk.stop.stop.Counter(africa2_tokens)
varia_stops = cltk.stop.stop.Counter(varia_pet2_tokens)
bc_pet_stops = cltk.stop.stop.Counter(bc_pet2_tokens)

a = pd.DataFrame(psalmi_stops.most_common(n = 20))
b = pd.DataFrame(africa_stops.most_common(n = 20))
c = pd.DataFrame(varia_stops.most_common(n = 20))
d = pd.DataFrame(bc_pet_stops.most_common(n = 20))