# Dendrogram samples

This notebook applies the methods explored in [progress_report_4.ipynb](progress_report_4.ipynb) to an assortment of poems with different properties. It begins by consolidating the run script into a function, which is then called against different samples.

## Reload libraries each time, since we’re tinkering with them

In [239]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load libraries

In [240]:
from xml.dom import pulldom  # parse input XML
from xml.dom.minidom import Document  # construct output XML
import numpy as np
import pandas as pd
from scipy import stats
from scipy.cluster.hierarchy import dendrogram, linkage
# see https://joernhees.de/blog/2015/08/26/scipy-hierarchical-clustering-and-dendrogram-tutorial/
from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist
import matplotlib.pyplot as plt
%matplotlib inline
import regex as re
import json
from cyr2phon import cyr2phon  # custom package
pd.set_option('display.max_colwidth', -1) # show all text in cell, without truncation; default is 50
pd.set_option('display.max_columns', None) # show all columns; default is 20

## Class and variables for parsing input XML

In [241]:
class Stack(list):  # keep track of open nodes while constructing XML output
    def push(self, item):
        self.append(item)

    def peek(self):  
        return self[-1]


open_elements = Stack()
WS_RE = re.compile(r'\s+')  # normalize white space in output

## Function to parse the XML

Returns a list of lists, with stanza number, line number, and `<line>` element for each line. We use the light-weight *xml.dom.pulldom* library to parse the input XML and *xml.dom.minidom* to construct the lines as simplified XML, removing elements we don’t care about, such as `<latin>` and `<italic>`, before serializing them to the output. (We actually do care about `<latin>`, but we are ignoring it temporarily, and we’ll return to it at a later stage in the project.)

In [242]:
def process(input_xml):
    poemId = ""
    stanzaNo = 0
    lineNo = 0
    inline = 0  # flag to control behavior inside and outside lines
    result = []  # array of arrays, one per line, with stanzaNo, lineNo, and serialized XML
    doc = pulldom.parse(input_xml)
    for event, node in doc:
        if event == pulldom.START_ELEMENT and node.localName == 'poem':
            poemId = node.getAttribute("opid")
        elif event == pulldom.START_ELEMENT and node.localName == 'stanza':
            stanzaNo = node.getAttribute("stanzaNo")
        elif event == pulldom.START_ELEMENT and node.localName == 'line':
            d = Document()  # each line is an output XML document
            open_elements.push(d)  # document node
            lineNo = node.getAttribute("lineNo")
            inline = 1  # we’re inside a line
            open_elements.peek().appendChild(node)  # add as child of current node in output tree
            open_elements.push(node)  # keep track of open elements
        elif event == pulldom.END_ELEMENT and node.localName == 'line':
            inline = 0  # when we finish our work here, we’ll no longer be inside a line
            open_elements.pop()  # line is finished
            # serialize XML, strip declaration, rewrite &quot; entity as character
            result.append([poemId, int(stanzaNo), int(lineNo),
                WS_RE.sub(" " ,
                open_elements.pop().toxml().replace('<?xml version="1.0" ?>', '').replace('&quot;', '"'))])
        elif event == pulldom.START_ELEMENT and node.localName == 'stress':
            open_elements.peek().appendChild(node)  # add as child of current node in output tree
            open_elements.push(node)  # keep track of open elements
        elif event == pulldom.END_ELEMENT and node.localName == 'stress':
            open_elements.pop()  # stress element is finished
        elif event == pulldom.CHARACTERS and inline:  # keep text only inside lines
            t = d.createTextNode(node.data)
            open_elements.peek().appendChild(t)
    return result

## Function to extract rhyme zone from rhyme word

In [243]:
rhymezonepat = re.compile(r'(.?[AEIOU]$)|([AEIOU].*$)')
def remove_pretonic_segments(s: str) -> str: # removes segments in place
    try:
        return rhymezonepat.search(s).group(0)
    except: # modify this to raise a real error, instead of just reporting
        print(s)

## Imports sample file, analyzes, outputs reports

In [358]:
def explore(filepath, ceiling=1000, ward=None):
    '''Render text and dendrograms of rhyme clustering
    
    Parameters:
        filepath (str): path to XML file with poem, required
        ceiling (int): maximum number of stanzas to return (useful for sampling long poems), 
            defaults to high value
        ward (boolean): show Ward dendrogram separately (improves legibility of long stanzas),
            defaults to None
    
    Does not return a value; all output is written directly from within the function
    '''
    
    # Read file
    with open(filepath) as f:
        data = process(f)
    df = pd.DataFrame(data, columns=["PoemId", "StanzaNo", "LineNo", "Text"])
    
    # Prepare data
    trans_vec = np.vectorize(cyr2phon.transliterate)
    df["Phonetic"] = trans_vec(df["Text"])
    df["RhymeWord"] = df["Phonetic"].str.split().str[-1] # clitics have already been joined
    df["RhymeZone"] = df["RhymeWord"].apply(remove_pretonic_segments)
    df.loc[df["RhymeZone"].isnull(), "RhymeZone"] = "Abcde" # provisional placeholder for nulls
    df["tokenized"] = [x[0] for x in df["RhymeZone"].str.
                       findall(r"(.?)([AEIOU])([^aeiou]*)([aeiou]?)([^aeiou]*)([aeiou]?)([^aeiou]*)([aeiou]?)([^aeiou]*)([aeiou]?)([^aeiou]*)([aeiou]?)([^aeiou]*)([aeiou]?)([^aeiou]*)([aeiou]?)([^aeiou]*)([aeiou]?)")]
    i = 0
    while pd.np.count_nonzero([item[i] for item in df["tokenized"]]) > 0:
#         print([item[i] for item in df["tokenized"]]) # diagnostic
        df["token" + str(i)] = [item[i] for item in df["tokenized"]]
        i += 1
    tokenheaders = df.filter(regex="^token\d").columns
    df[tokenheaders] = df[tokenheaders].replace(r'^$', 'Q', regex=True) # replace empty strings with specific value (in features.json); inplace doesn't work (?)
    df.filter(regex=r"StanzaNo|LineNo|RhymeWord|Text|^token\d").head() # columns we care about
#     print(df[tokenheaders])
    
    with open('features.json') as json_file:
        feature_matrix = json.load(json_file)
    feature_names = {feature for segment,features in feature_matrix.items() for feature in features}
    
    for column_label in df.filter(regex="^token\d$").columns:
        for feature in feature_names:
            df[column_label + '_' + feature] = df[column_label].apply(lambda x: np.nanmean(np.asarray([feature_matrix.get(char, dict()).get(feature, np.nan) for char in list(x)]).astype(np.float32)))

    #     dummy = pd.get_dummies(df, columns=df.filter(regex="^token\d").columns, drop_first=True)
#     df = df.merge(dummy, on=["PoemId", "StanzaNo", "LineNo", "Text", "Phonetic", "RhymeWord", "RhymeZone", "tokenized"])
#     df.set_index(["PoemId", "StanzaNo", "LineNo"], inplace=True)
    
    # Visualize
#     methods = ["single", "complete", "average", "weighted", "centroid", "median", "ward"]
#     stanzas = df.groupby(level=[0,1])
#     pd.set_option('display.width', 1000) # don't wrap long rows
#     for offset, (id, lines) in enumerate(stanzas):
#         if offset < ceiling:
#             print(pd.concat([lines["Text"].str.replace(r"<[^>]+?>", ""), lines[["RhymeWord", "RhymeZone"]]], axis=1)) # diagnostic
#             data = lines.copy().filter(regex=r"^token\d_") # only one-hot features
#             missing = data.filter(regex=r"missing$")
#             missing = missing * 0.2 # reduce weight of missing values
#             labelList = list(range(1, len(lines)+1)) # labels are line numbers within stanza
#             data.loc[:,"LineNo"] = [n / (len(labelList) * 2) for n in labelList] # scale to avoid tyranny of proximity, currently 0 <= n <= 0.5
#             plt.figure(figsize=(20, 2))
#             for n, m in enumerate(methods):
#                 linked = linkage(data, method=m)
#                 m_c, m_coph_dist = cophenet(linked, pdist(data))
#                 plt.subplot(1, 7, n + 1)
#                 plt.title(m + ": " + str(round(m_c, 3)))
#                 dendrogram(linked, labels=labelList)
#             plt.show()
#             if ward:
#                 linked = linkage(data, method="ward")
#                 labelList=list(range(1,len(lines)+1))
#                 c, coph_dist = cophenet(linked, pdist(data))
#                 plt.figure(figsize=(20, 10))
#                 plt.title("Ward: " + str(c))
#                 dendrogram(linked, labels=labelList)
#                 plt.show()
#     pd.set_option('display.width', 80) # restore default
    return df

## Test with Eugene Onegin chapter 1

In [359]:
explore("data_samples/eo1.xml", 1, True) # no return; explore() writes the output



Unnamed: 0,PoemId,StanzaNo,LineNo,Text,Phonetic,RhymeWord,RhymeZone,tokenized,token0,token1,token2,token3,token4,token0_Syllabic,token0_High,token0_Delayedrelease,token0_Palatalized,token0_Lateral,token0_Anterior,token0_Low,token0_Back,token0_Continuant,token0_Voiced,token0_Nasal,token0_Coronal,token0_Sonorant,token1_Syllabic,token1_High,token1_Delayedrelease,token1_Palatalized,token1_Lateral,token1_Anterior,token1_Low,token1_Back,token1_Continuant,token1_Voiced,token1_Nasal,token1_Coronal,token1_Sonorant,token2_Syllabic,token2_High,token2_Delayedrelease,token2_Palatalized,token2_Lateral,token2_Anterior,token2_Low,token2_Back,token2_Continuant,token2_Voiced,token2_Nasal,token2_Coronal,token2_Sonorant,token3_Syllabic,token3_High,token3_Delayedrelease,token3_Palatalized,token3_Lateral,token3_Anterior,token3_Low,token3_Back,token3_Continuant,token3_Voiced,token3_Nasal,token3_Coronal,token3_Sonorant,token4_Syllabic,token4_High,token4_Delayedrelease,token4_Palatalized,token4_Lateral,token4_Anterior,token4_Low,token4_Back,token4_Continuant,token4_Voiced,token4_Nasal,token4_Coronal,token4_Sonorant
0,Eo.1,1,1,"<line lineNo=""001"">""Мой дядя самых честных пр<stress>а</stress>вил,</line>",maJ DiDi samix Čistnix prAVil,prAVil,AVil,"(, A, V, i, l, , , , , , , , , , , , , )",Q,A,V,i,l,,,,,,,,,,,,,,1.0,0.0,,,,,1.0,1.0,,,,,1.0,0.0,,0.0,1.0,0.0,1.0,,,1.00,1.0,0.0,0.00,0.00,1.0,1.0,,,,,0.0,0.0,,,,,1.0,0.0,,0.0,0.0,1.0,1.0,,,1.0,1.0,0.0,1.0,1.0
1,Eo.1,1,2,"<line lineNo=""002"">Когда не в шутку занем<stress>о</stress>г,</line>",kagda Nifšutku zaNimOk,zaNimOk,Ok,"(, O, k, , , , , , , , , , , , , , , )",Q,O,k,Q,Q,,,,,,,,,,,,,,1.0,0.0,,,,,0.0,1.0,,,,,1.0,0.0,,0.0,0.0,0.0,0.0,,,0.00,0.0,0.0,0.00,0.00,,,,,,,,,,,,,,,,,,,,,,,,,,
2,Eo.1,1,3,"<line lineNo=""003"">Он уважать себя заст<stress>а</stress>вил</line>",an uvažaT SiBi zastAVil,zastAVil,AVil,"(, A, V, i, l, , , , , , , , , , , , , )",Q,A,V,i,l,,,,,,,,,,,,,,1.0,0.0,,,,,1.0,1.0,,,,,1.0,0.0,,0.0,1.0,0.0,1.0,,,1.00,1.0,0.0,0.00,0.00,1.0,1.0,,,,,0.0,0.0,,,,,1.0,0.0,,0.0,0.0,1.0,1.0,,,1.0,1.0,0.0,1.0,1.0
3,Eo.1,1,4,"<line lineNo=""004"">И лучше выдумать не м<stress>о</stress>г.</line>",iluČši vidumaT NimOk,NimOk,Ok,"(, O, k, , , , , , , , , , , , , , , )",Q,O,k,Q,Q,,,,,,,,,,,,,,1.0,0.0,,,,,0.0,1.0,,,,,1.0,0.0,,0.0,0.0,0.0,0.0,,,0.00,0.0,0.0,0.00,0.00,,,,,,,,,,,,,,,,,,,,,,,,,,
4,Eo.1,1,5,"<line lineNo=""005"">Его пример другим на<stress>у</stress>ка;</line>",Jiva pRiMir druGim naUka,naUka,Uka,"(, U, k, a, , , , , , , , , , , , , , )",Q,U,k,a,Q,,,,,,,,,,,,,,1.0,1.0,,,,,0.0,1.0,,,,,1.0,0.0,,0.0,0.0,0.0,0.0,,,0.00,0.0,0.0,0.00,0.00,1.0,0.0,,,,,1.0,1.0,,,,,1.0,,,,,,,,,,,,,
5,Eo.1,1,6,"<line lineNo=""006"">Но, боже мой, какая ск<stress>у</stress>ка</line>",nabaži maJ kakaJi skUka,skUka,Uka,"(, U, k, a, , , , , , , , , , , , , , )",Q,U,k,a,Q,,,,,,,,,,,,,,1.0,1.0,,,,,0.0,1.0,,,,,1.0,0.0,,0.0,0.0,0.0,0.0,,,0.00,0.0,0.0,0.00,0.00,1.0,0.0,,,,,1.0,1.0,,,,,1.0,,,,,,,,,,,,,
6,Eo.1,1,7,"<line lineNo=""007"">С больным сидеть и день и н<stress>о</stress>чь,</line>",zbaLnim SiDiT iDiN inOČ,inOČ,OČ,"(, O, Č, , , , , , , , , , , , , , , )",Q,O,Č,Q,Q,,,,,,,,,,,,,,1.0,0.0,,,,,0.0,1.0,,,,,1.0,0.0,,1.0,1.0,0.0,0.0,,,1.00,0.0,0.0,1.00,0.00,,,,,,,,,,,,,,,,,,,,,,,,,,
7,Eo.1,1,8,"<line lineNo=""008"">Не отходя ни шагу пр<stress>о</stress>чь!</line>",NiatxaDi Nišagu prOČ,prOČ,OČ,"(, O, Č, , , , , , , , , , , , , , , )",Q,O,Č,Q,Q,,,,,,,,,,,,,,1.0,0.0,,,,,0.0,1.0,,,,,1.0,0.0,,1.0,1.0,0.0,0.0,,,1.00,0.0,0.0,1.00,0.00,,,,,,,,,,,,,,,,,,,,,,,,,,
8,Eo.1,1,9,"<line lineNo=""009"">Какое низкое ков<stress>а</stress>рство</line>",kakaJi NiskaJi kavArstva,kavArstva,Arstva,"(, A, rstv, a, , , , , , , , , , , , , , )",Q,A,rstv,a,Q,,,,,,,,,,,,,,1.0,0.0,,,,,1.0,1.0,,,,,1.0,0.0,,0.0,0.0,0.0,1.0,,,0.75,0.5,0.0,0.75,0.25,1.0,0.0,,,,,1.0,1.0,,,,,1.0,,,,,,,,,,,,,
9,Eo.1,1,10,"<line lineNo=""010"">Полу-живого забавл<stress>я</stress>ть,</line>",palu-živava zabavLAT,zabavLAT,AT,"(, A, T, , , , , , , , , , , , , , , )",Q,A,T,Q,Q,,,,,,,,,,,,,,1.0,0.0,,,,,1.0,1.0,,,,,1.0,0.0,,0.0,1.0,0.0,1.0,,,0.00,0.0,0.0,1.00,0.00,,,,,,,,,,,,,,,,,,,,,,,,,,
