In [1]:
import csv
from datetime import datetime
import giasmus
from lxml import html
import numpy as np
import re
import requests
from sklearn.decomposition import NMF

np.set_printoptions(precision=2)

# 1. Try non-negative matrix factorization for chiasmus determination

## Background
"**Non-negative matrix factorization** (**NMF** or **NNMF**), also **non-negative matrix approximation** is a group of algorithms in multivariate analysis and linear algebra where a matrix V is factorized into (usually) two matrices W and H, with the property that all three matrices have no negative elements. [...] NMF can be used for text mining applications. In this process, a document-term matrix is constructed with the weights of various terms (typically weighted word frequency information) from a set of documents. This matrix is factored into a term-feature and a feature-document matrix. The features are derived from the contents of the documents, and the feature-document matrix describes data clusters of related documents." (Source: [Wikipedia: NMF](https://en.wikipedia.org/wiki/Non-negative_matrix_factorization?oldformat=true))

In [2]:
# Define data
C = np.random.binomial(10,0.5,[20,5])
c = np.random.binomial(1,0.1,[20,5]) # added to C to slightly perturb the latter half of the chiasmus
C = np.concatenate((C,np.fliplr(C)+c),axis=1)
print(C)
print(np.shape(C))

[[7 5 4 5 5 5 6 4 6 7]
 [3 4 3 4 6 6 4 3 4 3]
 [3 6 2 4 6 6 4 2 6 4]
 [4 3 3 3 3 3 3 3 3 4]
 [6 6 4 6 7 7 6 5 7 6]
 [7 3 4 4 4 4 4 4 3 7]
 [6 5 4 6 5 5 6 4 5 6]
 [4 5 5 8 5 5 8 5 5 5]
 [4 7 2 2 6 6 3 2 7 4]
 [4 3 5 7 7 7 7 5 3 5]
 [6 5 6 3 5 5 3 6 6 6]
 [3 7 7 1 7 7 1 7 7 3]
 [5 3 8 7 4 4 7 8 3 5]
 [4 6 6 3 6 6 3 6 6 4]
 [7 6 3 3 5 5 3 3 6 7]
 [5 6 2 4 4 4 4 2 6 5]
 [7 4 5 5 6 6 5 5 4 7]
 [5 7 3 3 4 4 3 3 7 5]
 [7 5 5 5 7 7 5 5 5 7]
 [4 6 4 5 4 4 5 4 6 4]]
(20, 10)


In [3]:
# Define non-negative matrix factorization (NMF) model
model = NMF(n_components=5, init='random', random_state=0)

In [4]:
# Perform the NMF
W = model.fit_transform(C)
H = model.components_

In [5]:
# Identify the coupled features
MH = np.argmax(H, axis=0)
print('Max(H):\n', MH, '\n')

Max(H):
 [2 4 2 3 4 4 3 2 4 2] 



In [6]:
# Check if the coupled features form a chiasmus
print('Giasmus?', giasmus.check_giasmus(MH.tolist()))

Giasmus? True


# 2. Try the above technique on text data

In [7]:
string1 = "This is a test string. It's not very interesting, since it's just a test string."
string2 = "This is another test string, but this one is very interesting."
string1c = "This is a another test string. It's not very interesting, since it's just another test string."
string2c = "This is yet another test string, but this one is also very interesting."
# print('# times test occurs:\t', string1.count('test'))
# print('# times string occurs:\t', string1.count('string'))

In [8]:
string_list = re.findall(r"[\w']+", (string1+string1c+string2+string2c).lower())
string_set = set(string_list)
print(string_set)

{'just', 'another', 'but', 'very', 'a', 'string', 'also', "it's", 'interesting', 'test', 'is', 'since', 'this', 'not', 'yet', 'one'}


In [9]:
A1 = np.zeros(len(string_set))
A2 = np.zeros(len(string_set))
B1 = np.zeros(len(string_set))
B2 = np.zeros(len(string_set))
for i, word in enumerate(string_set):
    A1[i] = string1.count(word)
    A2[i] = string1c.count(word)
    B1[i] = string2.count(word)
    B2[i] = string2c.count(word)
print('A1:', A1)
print('A2:', A2)
print('B1:', B1)
print('B2:', B2)

A1: [ 1.  0.  0.  1.  2.  2.  0.  1.  1.  2.  2.  1.  0.  1.  0.  0.]
A2: [ 1.  2.  0.  1.  3.  2.  0.  1.  1.  2.  2.  1.  0.  3.  0.  0.]
B1: [ 0.  1.  1.  1.  1.  1.  0.  0.  1.  1.  4.  0.  1.  1.  0.  1.]
B2: [ 0.  1.  1.  1.  2.  1.  1.  0.  1.  1.  4.  0.  1.  1.  1.  1.]


In [10]:
# Prepare the data
# print(np.expand_dims(np.asarray(A1), axis=1))
C_text = np.concatenate((np.expand_dims(np.asarray(A1), axis=1), np.expand_dims(np.asarray(B1), axis=1),\
                     np.expand_dims(np.asarray(B2), axis=1), np.expand_dims(np.asarray(A2), axis=1)), axis=1)
print(C_text.shape)

(16, 4)


In [11]:
# Define non-negative matrix factorization (NMF) model
model_text = NMF(n_components=2, init='random', random_state=0)

In [12]:
# Perform the NMF
W_text = model_text.fit_transform(C_text)
H_text = model_text.components_

In [13]:
# Identify the coupled features
MH_text = np.argmax(H_text, axis=0)
print('Max(H):\n', MH_text, '\n')

Max(H):
 [0 1 1 0] 



In [14]:
# Check if the coupled features form a chiasmus
print('Giasmus?', giasmus.check_giasmus(MH_text.tolist()))

Giasmus? True


# 3. Try to detect the chiastic structure in the Dutch National Anthem

## Background
"The Wilhelmus, the national anthem of the Netherlands, has a structure composed around a thematic chiasmus: the 15 stanzas of the text are symmetrical, in that verses one and 15 resemble one another in meaning, as do verses two and 14, three and 13, etc., until they converge in the eighth verse, the heart of the song." (Source: [Wikipedia: Thematic Chiasmus](https://en.wikipedia.org/wiki/Chiasmus?oldformat=true#Thematic_chiasmus))

In [15]:
# Get the data
page = requests.get('https://www.koninklijkhuis.nl/onderwerpen/volkslied/tekst-van-het-wilhelmus')
tree = html.fromstring(page.content)

In [16]:
raw = np.asarray(tree.xpath('//p/text()')[5:139])
string_list = re.findall(r"[\w']+", str(raw).lower())
string_set = set(string_list)
print(string_set)

{"vrezen'", 'dienaar', 'nu', 'konings', 'tirannie', "vermeten'", 'ook', "leven'", 'begeven', 'ende', 'betracht', 'hoogsten', 'hebben', "opgezeten'", 'te', 'god', 'aan', 'gebracht', 'wil', 'bij', 'al', "namen'", 'tempeest', 'nacht', 'geweest', 'redden', 'gemoed', "getrouwe'", 'daaraan', 'gij', 'luid', 'getrouwen', 'rijks', "wassen'", 'bloed', 'vaderland', 'o', "zuchten'", 'zoet', 'van', 'hierboven', 'christen', 'blijf', 'moeten', 'gedenke', 'gerechtigheid', 'altijd', 'dien', 'onversaagd', 'duitsen', 'stam', 'goed', 'had', "'t", 'een', 'adolf', 'vroom', 'verheven', 'ruiters', 'handen', 'zwaar', 'heeft', 'der', 'broeders', "gebeden'", 'maastricht', "nassouwe'", 'vrees', 'loven', 'zaak', 'ontvangen', 'bidt', 'geern', 'geven', 'die', 'slag', 'hij', 'mijner', 'zonder', 'oprecht', 'moed', 'prinse', 'david', 'edelman', 'dat', "betrouwen'", 'regeert', 'oorlof', "vluchten'", 'groten', "'", 'ziel', 'nimmermeer', 'meer', 'tegenspoed', 'slapen', "obediëren'", 'christenman', 'moedig', 'prinselijk', 

In [17]:
# Separate the anthem into the 15 verses
anthem = np.zeros(shape=(len(string_set),15))
verse_borders = [-1] + np.ndarray.tolist(np.where(raw == " ")[0]) + [139]
for i in range(15):
    temp = str(raw[verse_borders[i]+1:verse_borders[i+1]]).lower()
    print('verse',i+1,'\n',temp)
    for j, word in enumerate(string_set):
        anthem[j,i] = temp.count(word)

verse 1 
 ['wilhelmus van nassouwe' ' ben ik, van duitsen bloed,'
 ' den vaderland getrouwe' ' blijf ik tot in den dood.'
 ' een prinse van oranje' ' ben ik, vrij, onverveerd,'
 ' den koning van hispanje' ' heb ik altijd geëerd.']
verse 2 
 [' in godes vrees te leven' ' heb ik altijd betracht,'
 ' daarom ben ik verdreven,' ' om land, om luid gebracht.'
 ' maar god zal mij regeren' ' als een goed instrument,'
 ' dat ik zal wederkeren' ' in mijnen regiment.']
verse 3 
 [' lijdt u, mijn onderzaten' ' die oprecht zijt van aard,'
 ' god zal u niet verlaten,' ' al zijt gij nu bezwaard.'
 ' die vroom begeert te leven,' ' bidt god nacht ende dag,'
 ' dat hij mij kracht zal geven,' ' dat ik u helpen mag.']
verse 4 
 [' lijf ende goed tezamen' ' heb ik u niet verschoond,'
 ' mijn broeders, hoog van namen' " hebben 't u ook vertoond"
 ' graaf adolf is gebleven' ' in friesland in den slag,'
 " zijn ziel in 't eeuwig leven" ' verwacht den jongsten dag.']
verse 5 
 [' edel en hooggeboren,' ' van kei

In [18]:
# Define non-negative matrix factorization (NMF) model
model = NMF(n_components=15, init='random', random_state=0)

In [19]:
# Perform the NMF
W_text = model_text.fit_transform(anthem)
H_text = model_text.components_

In [20]:
# Identify the coupled segments
MH_text = np.argmax(H_text, axis=0)
print('Number of features found:', len(H_text))
print('Text structure:', MH_text)

Number of features found: 2
Text structure: [1 0 1 1 1 1 0 1 1 0 0 0 0 1 1]


In [21]:
# Check if the coupled features form a chiasmus
print('Giasmus?', giasmus.check_giasmus(MH_text.tolist()))

Giasmus? False


## Conclusion
While the current **NMF**-based technique may be succesfully applied to determine the chiastic structure of a text where the corresponding sections have similar word frequencies, as seen in **(2)**, the current technique appears to be less succesfull in the detection of a thematic chiasmus, as seen in **(3)**, where the similarities in word frequencies between corresponding sections may be reduced. Thus, the latter task would require a form of topic modelling, which may be solved using a more specialized **NMF**-based method (Arora _et al._, 2012).

# References

Arora, S., Ge, R., & Moitra, A. (2012, October). Learning topic models--going beyond SVD. In Foundations of Computer Science (FOCS), 2012 IEEE 53rd Annual Symposium on (pp. 1-10). IEEE.