# Analyze the dataset

In [1]:
from unidecode import unidecode
import json

STOP_WORDS = set([
    "ἄλλος", "ἄν", "ἄρα", "ἀλλ'", "ἀλλά", "ἀπό", "αὐτός", "δ'", "δαί", "δαίς", "δέ", "δή",
    "διά", "ἑαυτοῦ", "ἔτι", "ἐάν", "ἐγώ", "ἐκ", "ἐμός", "ἐν", "ἐπί", "εἰ", "εἰμί", "εἶμι",
    "εἰς", "γάρ", "γὰ", "γε", "ἡ", "ἦ", "καί", "κατά", "μέν", "μετά", "μή", "ὁ", "ὅδε",
    "ὅς", "ὅστις", "ὅτι", "οἱ", "οὕτως", "οὗτος", "οὐ", "οὔτε", "οὖν", "οὐδέ", "οὐδείς",
    "οὐκ", "παρά", "περί", "πρός", "σός", "σύ", "σύν", "τά", "τε", "τήν", "τῆς", "τῇ",
    "τί", "τί", "τίς", "τις", "τό", "τόν", "τοί", "τοιοῦτος", "τούς", "τοῦ", "τῶν", "τῷ",
    "ὑμός", "ὑπέρ", "ὑπό", "ὥστε", "ὡς", "ὦ"
])


def remove_diacritics(string):
    return unidecode(string)


def remove_stop_words(string, STOP_WORDS=STOP_WORDS):
    return " ".join([word for word in string.split() if word not in STOP_WORDS])

In [2]:
def extract_text_info(file):
    
    chapter_info = {}
    book_info = {"lemmatized": "",
                 "raw": "",
                 "pos": "",
                 "dep": "",
                 "tense": "",
                 "aspect": "",
                 "mood": ""}
    

    with open(file, 'r') as f:
        text = json.load(f)


    for chapter, content in text.items():
        chapter_info[chapter] = {"lemmatized": "",
                                "raw": "",
                                "pos": "",
                                "dep": "",
                                "tense": "",
                                "aspect": "",
                                "mood": ""}
        for words in content:
            if words["pos"] != "PUNCT":
                chapter_info[chapter]["lemmatized"] += words["lemma"] + " "
                chapter_info[chapter]["raw"] += words["raw"] + " "
                chapter_info[chapter]["pos"] += words["pos"] + " "
                chapter_info[chapter]["dep"] += words["dep"] + " "
                if words["pos"] == "VERB":
                    chapter_info[chapter]["tense"] += words["morph"]["Tense"] + " "
                    if "Aspect" in words["morph"]:
                        chapter_info[chapter]["aspect"] += words["morph"]["Aspect"] + " "
                    if "VerbForm" in words["morph"]:
                        chapter_info[chapter]["aspect"] += words["morph"]["VerbForm"] + " "
                    if "Mood" in words["morph"]:
                        chapter_info[chapter]["mood"] += words["morph"]["Mood"] + " "
        book_info["lemmatized"] += chapter_info[chapter]["lemmatized"]
        book_info["raw"] += chapter_info[chapter]["raw"]
        book_info["pos"] += chapter_info[chapter]["pos"]
        book_info["dep"] += chapter_info[chapter]["dep"]
        book_info["tense"] += chapter_info[chapter]["tense"]
        book_info["aspect"] += chapter_info[chapter]["aspect"]
        book_info["mood"] += chapter_info[chapter]["mood"]
    return book_info, chapter_info

In [3]:
book_info, chapter_info = extract_text_info("../data/lemmatized/aam/tisch_acta_andreae_matthei_anthropophag.json")

## Term count analysis

In [4]:
print("VOCABULARY ANALYSIS")
print("===============================================")
print("===============================================")
print("Distinct number of terms (lemmatized):")
print(len(set(book_info["lemmatized"].split())))
print(len(set(book_info["lemmatized"].split()))/len(book_info["lemmatized"].split()))

print("===============================================")
print("Distinct number of terms (non lemmatized):")
print(len(set(book_info["raw"].split())))
print(len(set(book_info["raw"].split()))/len(book_info["raw"].split()))

print("===============================================")

print("Distinct number of terms after removing stop words:")
print(len(set(remove_stop_words(book_info["lemmatized"]).split())))
print("===============================================")
print("===============================================")

print("GRAMMATICAL ANALYSIS")
print("===============================================")
print("===============================================")
print("Number of NOUNS")
print(book_info["pos"].count("NOUN"))

print("===============================================")
print("Number of VERBS")
print(book_info["pos"].count("VERB"))

print("===============================================")
print("Number of ADJECTIVES")
print(book_info["pos"].count("ADJ"))

print("===============================================")
print("Number of NMOD")
print(book_info["dep"].count("nmod"))

VOCABULARY ANALYSIS
Distinct number of terms (lemmatized):
1375
0.21636506687647522
Distinct number of terms (non lemmatized):
2176
0.34240755310778914
Distinct number of terms after removing stop words:
1326
GRAMMATICAL ANALYSIS
Number of NOUNS
1092
Number of VERBS
1306
Number of ADJECTIVES
301
Number of NMOD
170


## Multivariate analysis

### Lexicometry analysis: tf (-idf)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from umap import UMAP
import pandas as pd

vectorizer = TfidfVectorizer(stop_words=list(STOP_WORDS), use_idf=True)
umap = UMAP(n_components=3)

In [6]:
term_matrix = vectorizer.fit_transform([info["lemmatized"] for info in chapter_info.values()])



In [7]:
tf_idf = pd.DataFrame(term_matrix.todense(), columns=vectorizer.get_feature_names_out(), index=chapter_info.keys())
tf_idf_reduced = pd.DataFrame(umap.fit_transform(tf_idf), columns=["x", "y", "z"], index=chapter_info.keys())
tf_idf_reduced["chapter"] = tf_idf_reduced.index

tf_idf_reduced["is_addition"] = tf_idf_reduced["chapter"].apply(lambda x: int(x) in list(range(11, 16)))

In [8]:
import plotly.express as px

fig = px.scatter_3d(tf_idf_reduced, y="y", x="x", z="z", text="chapter", color="is_addition")
fig.update_traces(marker_size=10)
fig.show()

In [30]:
tf_idf.loc["11"].sort_values(ascending=False).head(20)

κρυπτός         0.420022
ἀποκρίνω        0.309209
ψυχή            0.299642
ἐκπειράζω       0.250396
λέγω            0.225691
μόνος           0.213080
ἰησοῦς          0.185292
πᾶς             0.152403
ανδρέας         0.143267
ἔπεισάς         0.140007
ἐπερωτήσις      0.140007
ἀγάλλομαι       0.140007
ναί             0.140007
ἐποίησενκαί     0.140007
εἶπενποῖαι      0.140007
φανέρωσόνμος    0.140007
τὸπνεῦμα        0.140007
ἀναγγέλλω       0.140007
φανερός         0.125198
ἄρχιερος        0.125198
Name: 11, dtype: float64

In [31]:
tf_idf.loc["12"].sort_values(ascending=False).head(10)

ἡμεῖς         0.566266
υἱός          0.282706
ἀρχιερεύς     0.202888
θεός          0.190098
καρδία        0.176690
λέγω          0.133083
ῥῆμα          0.123837
εγένετο       0.123837
ταλαίπωρος    0.123837
πώποτε        0.123837
Name: 12, dtype: float64

In [32]:
tf_idf.loc["13"].sort_values(ascending=False).head(10)

δεξιός        0.352204
ἀρχιερεύς     0.322644
σφίγξ         0.215096
ἰησοῦς        0.173753
εἷς           0.159547
ἔρχομαι       0.139841
εἰσέρχομαι    0.139841
ἡμεῖς         0.138539
ὃς            0.131288
κάτω          0.131288
Name: 13, dtype: float64

### Stylometry analysis

#### Stop words analysis

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from umap import UMAP
import pandas as pd

vectorizer = TfidfVectorizer(vocabulary=list(STOP_WORDS), use_idf=False)
umap = UMAP(n_components=3)

In [34]:
stop_words_matrix = vectorizer.fit_transform([info["lemmatized"] for info in chapter_info.values()])

In [35]:
tf_idf_stopwords = pd.DataFrame(stop_words_matrix.todense(), columns=vectorizer.get_feature_names_out())
tf_idf_reduced = pd.DataFrame(umap.fit_transform(tf_idf_stopwords), columns=["x", "y", "z"], index=chapter_info.keys())
tf_idf_reduced["chapter"] = tf_idf_reduced.index

tf_idf_reduced["is_addition"] = tf_idf_reduced["chapter"].apply(lambda x: int(x) in list(range(10, 18)))

In [36]:
tf_idf

Unnamed: 0,άλλὰ,ίς,ίἐστιν,αβραάμ,αδελφέ,αδὰμ,ακουσος,αληθῶς,αμαήλ,ανάστα,...,ᾔνεσεν,ᾠδή,ῥάβδος,ῥέον,ῥέω,ῥεύω,ῥύμα,ῥύμαιςτέη,ῥύμη,ῥῆμα
1,0.065891,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.058206,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.045557,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.056272,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.047064,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.036855,0.083438,0.0,0.0,0.0,0.0,0.083438,0.0,0.0,0.0,...,0.0,0.0,0.083438,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.03729,0.0,0.0,0.0,0.075493,0.0,0.0,0.0,0.0,0.084423,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.061452,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.049476,0.0,0.0,0.0,0.0,0.0,0.0,0.112012,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
import plotly.express as px

fig = px.scatter_3d(tf_idf_reduced, y="y", x="x", z="z", text="chapter", color="is_addition")
fig.update_traces(marker_size=10)
fig.show()

### Part of speech analysis

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from umap import UMAP
import pandas as pd

vectorizer = TfidfVectorizer(use_idf=False)
umap = UMAP(n_components=3)

pos_matrix = vectorizer.fit_transform([info["pos"] for info in chapter_info.values()])

tf_idf_pos = pd.DataFrame(pos_matrix.todense(), columns=vectorizer.get_feature_names_out())
tf_idf_reduced = pd.DataFrame(umap.fit_transform(tf_idf_pos), columns=["x", "y", "z"], index=chapter_info.keys())
tf_idf_reduced["chapter"] = tf_idf_reduced.index

tf_idf_reduced["is_addition"] = tf_idf_reduced["chapter"].apply(lambda x: int(x) in list(range(11, 16)))

In [18]:
tf_idf_pos

Unnamed: 0,adj,adp,adv,aux,cconj,det,intj,noun,num,pron,propn,sconj,verb
0,0.074284,0.173329,0.099045,0.049523,0.222851,0.495226,0.0,0.619032,0.0,0.272374,0.024761,0.024761,0.445703
1,0.081422,0.113991,0.26055,0.032569,0.130275,0.537385,0.0,0.439679,0.0,0.374541,0.113991,0.065138,0.504816
2,0.133345,0.26669,0.333363,0.053338,0.226687,0.41337,0.0,0.466708,0.013335,0.306694,0.133345,0.040004,0.493377
3,0.030114,0.180681,0.240908,0.030114,0.271022,0.51193,0.0,0.481816,0.030114,0.225851,0.090341,0.04517,0.526987
4,0.12869,0.200184,0.257379,0.057195,0.142989,0.486161,0.0,0.414667,0.042897,0.228782,0.171586,0.028598,0.600552
5,0.08981,0.15396,0.28226,0.02566,0.17962,0.37207,0.0,0.52603,0.01283,0.30792,0.19245,0.06415,0.55169
6,0.135216,0.16902,0.259164,0.033804,0.11268,0.416916,0.0,0.484524,0.045072,0.270432,0.146484,0.101412,0.597204
7,0.026104,0.130521,0.24799,0.052208,0.143573,0.482928,0.013052,0.574293,0.0,0.234938,0.117469,0.078313,0.509033
8,0.111249,0.088999,0.378245,0.066749,0.155748,0.400495,0.044499,0.489494,0.0,0.244747,0.155748,0.111249,0.556243
9,0.192351,0.144263,0.240439,0.064117,0.240439,0.304556,0.0,0.400732,0.064117,0.240439,0.176322,0.032059,0.689258


In [19]:
import plotly.express as px

fig = px.scatter_3d(tf_idf_reduced, y="y", x="x", z="z", text="chapter", color="is_addition")
fig.update_traces(marker_size=10)
fig.show()

### Dependency parsing study

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from umap import UMAP
import pandas as pd

vectorizer = TfidfVectorizer(use_idf=False)
umap = UMAP(n_components=3)

dep_matrix = vectorizer.fit_transform([info["dep"] for info in chapter_info.values()])

tf_idf_dep = pd.DataFrame(dep_matrix.todense(), columns=vectorizer.get_feature_names_out())
tf_idf_reduced = pd.DataFrame(umap.fit_transform(tf_idf_dep), columns=["x", "y", "z"], index=chapter_info.keys())
tf_idf_reduced["chapter"] = tf_idf_reduced.index

tf_idf_reduced["is_addition"] = tf_idf_reduced["chapter"].apply(lambda x: int(x) in list(range(11, 16)))

In [21]:
import plotly.express as px

fig = px.scatter_3d(tf_idf_reduced, y="y", x="x", z="z", text="chapter", color="is_addition")
fig.update_traces(marker_size=10)
fig.show()

### Tense analysis

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
from umap import UMAP
import pandas as pd

vectorizer = TfidfVectorizer(use_idf=False)
umap = UMAP(n_components=3)

tense_matrix = vectorizer.fit_transform([info["aspect"] for info in chapter_info.values()])

tf_idf_tense = pd.DataFrame(tense_matrix.todense(), columns=vectorizer.get_feature_names_out())
tf_idf_reduced = pd.DataFrame(umap.fit_transform(tf_idf_tense), columns=["x", "y", "z"], index=chapter_info.keys())
tf_idf_reduced["chapter"] = tf_idf_reduced.index

tf_idf_reduced["is_addition"] = tf_idf_reduced["chapter"].apply(lambda x: int(x) in list(range(11, 16)))

In [23]:
import plotly.express as px

fig = px.scatter_3d(tf_idf_reduced, y="y", x="x", z="z", text="chapter", color="is_addition")
fig.update_traces(marker_size=10)
fig.show()

### Mood analysis

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from umap import UMAP
import pandas as pd

vectorizer = TfidfVectorizer(use_idf=False)
umap = UMAP(n_components=3)

mood_matrix = vectorizer.fit_transform([info["mood"] for info in chapter_info.values()])

tf_idf_mood = pd.DataFrame(mood_matrix.todense(), columns=vectorizer.get_feature_names_out())
tf_idf_reduced = pd.DataFrame(umap.fit_transform(tf_idf_tense), columns=["x", "y", "z"], index=chapter_info.keys())
tf_idf_reduced["chapter"] = tf_idf_reduced.index

tf_idf_reduced["is_addition"] = tf_idf_reduced["chapter"].apply(lambda x: int(x) in list(range(11, 16)))

In [25]:
import plotly.express as px

fig = px.scatter_3d(tf_idf_reduced, y="y", x="x", z="z", text="chapter", color="is_addition")
fig.update_traces(marker_size=10)
fig.show()

In [26]:
tf_idf_mood

Unnamed: 0,imp,ind,opt,sub
0,0.0,0.995037,0.0,0.099504
1,0.131024,0.982683,0.0,0.131024
2,0.092253,0.968658,0.0,0.230633
3,0.228665,0.971825,0.0,0.057166
4,0.20601,0.97855,0.0,0.0
5,0.305788,0.917365,0.0,0.254824
6,0.109618,0.950019,0.0,0.292314
7,0.115087,0.97824,0.0,0.172631
8,0.209529,0.977802,0.0,0.0
9,0.083045,0.996546,0.0,0.0


### Merging all

In [27]:
full_style_df = pd.concat([tf_idf_stopwords, tf_idf_pos, tf_idf_dep, tf_idf_tense], axis=1)

umap = UMAP(n_components=3)

tf_idf_reduced = pd.DataFrame(umap.fit_transform(full_style_df), columns=["x", "y", "z"], index=chapter_info.keys())
tf_idf_reduced["chapter"] = tf_idf_reduced.index

tf_idf_reduced["is_addition"] = tf_idf_reduced["chapter"].apply(lambda x: int(x) in list(range(11, 16)))
fig = px.scatter_3d(tf_idf_reduced, y="y", x="x", z="z", text="chapter", color="is_addition")
fig.update_traces(marker_size=10)
fig.show()

In [28]:
full_style_df = pd.concat([tf_idf, tf_idf_stopwords, tf_idf_pos, tf_idf_dep, tf_idf_tense], axis=1)

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
from umap import UMAP
import pandas as pd

umap = UMAP(n_components=3)

tf_idf_reduced = pd.DataFrame(umap.fit_transform(full_style_df), columns=["x", "y", "z"], index=chapter_info.keys())
tf_idf_reduced["chapter"] = tf_idf_reduced.index

tf_idf_reduced["is_addition"] = tf_idf_reduced["chapter"].apply(lambda x: int(x) in list(range(11, 16)))
fig = px.scatter_3d(tf_idf_reduced, y="y", x="x", z="z", text="chapter", color="is_addition")
fig.update_traces(marker_size=10)
fig.show() 

ValueError: Input contains NaN.