# Imports and Data Wrangling

In [1]:
import glob
import os
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import h2o
from h2o.automl import H2OAutoML

import joblib
import re
import string
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, cohen_kappa_score, f1_score, classification_report
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.naive_bayes import MultinomialNB

# Authors whose files are labelled with genre 'history' by clean_files_to_df.
historiography = {"herodotus", "thucydides", "xenophon", "polybius", "plutarch"}

# Authors whose files are labelled with genre 'poetry' by clean_files_to_df.
poetry = {"homer", "hesiod", "apollonius", "callimachus", "theocritus", "pindar"}

# Integer class label for every author; the label is the author's position in
# the tuple below (the five historians come first, then the six poets).
author_target = {
    name: label
    for label, name in enumerate(
        (
            "herodotus",
            "thucydides",
            "xenophon",
            "polybius",
            "plutarch",
            "homer",
            "hesiod",
            "apollonius",
            "callimachus",
            "theocritus",
            "pindar",
        )
    )
}

# Integer class label for each genre.
genre_target = dict(poetry=0, history=1)

def standardize_accents(target_token):
    """Rewrite grave-accented Greek vowels as their acute-accented forms.

    Every character listed on the left of a pair below is replaced by the
    character on its right; all other characters are returned untouched.
    The pairs cover plain vowels, vowels carrying a breathing mark, vowels
    with a diaeresis and vowels with an iota subscript, in lower and upper
    case.

    Args:
        target_token: the string (a single token or a whole line) to rewrite.

    Returns:
        A new string with the replacements applied.
    """
    # Applied strictly in this order, one pair at a time.
    grave_to_acute = (
        # lower case
        ('ὰ', 'ά'), ('ἂ', 'ἄ'), ('ἃ', 'ἅ'),
        ('ὲ', 'έ'), ('ἒ', 'ἔ'), ('ἓ', 'ἕ'),
        ('ὴ', 'ή'), ('ἢ', 'ἤ'), ('ἣ', 'ἥ'),
        ('ὶ', 'ί'), ('ἲ', 'ἴ'), ('ἳ', 'ἵ'), ('ῒ', 'ΐ'),
        ('ὸ', 'ό'), ('ὂ', 'ὄ'), ('ὃ', 'ὅ'),
        ('ὺ', 'ύ'), ('ὒ', 'ὔ'), ('ὓ', 'ὕ'), ('ῢ', 'ΰ'),
        ('ὼ', 'ώ'), ('ὢ', 'ὤ'), ('ὣ', 'ὥ'),
        # lower case with iota subscript
        ('ᾲ', 'ᾴ'), ('ᾂ', 'ᾄ'), ('ᾃ', 'ᾅ'),
        ('ῂ', 'ῄ'), ('ᾒ', 'ᾔ'), ('ᾓ', 'ᾕ'),
        ('ῲ', 'ῴ'), ('ᾢ', 'ᾤ'), ('ᾣ', 'ᾥ'),
        # upper case
        ('Ὰ', 'Ά'), ('Ἂ', 'Ἄ'), ('Ἃ', 'Ἅ'),
        ('Ὲ', 'Έ'), ('Ἒ', 'Ἔ'), ('Ἓ', 'Ἕ'),
        ('Ὴ', 'Ή'), ('Ἢ', 'Ἤ'), ('Ἣ', 'Ἥ'),
        ('Ὶ', 'Ί'), ('Ἲ', 'Ἴ'), ('Ἳ', 'Ἵ'),
        ('Ὸ', 'Ό'), ('Ὂ', 'Ὄ'), ('Ὃ', 'Ὅ'),
        ('Ὺ', 'Ύ'), ('Ὓ', 'Ὕ'),
        ('Ὼ', 'Ώ'), ('Ὢ', 'Ὤ'), ('Ὣ', 'Ὥ'),
        # upper case with iota adscript
        ('ᾊ', 'ᾌ'), ('ᾋ', 'ᾍ'),
        ('ᾚ', 'ᾜ'), ('ᾛ', 'ᾝ'),
        ('ᾪ', 'ᾬ'), ('ᾫ', 'ᾭ'),
    )
    token = target_token
    for grave_form, acute_form in grave_to_acute:
        token = token.replace(grave_form, acute_form)
    return token

def is_wine_line(line):
    """Report whether ``line`` contains a listed form of οἶνος as a token.

    The line is passed through ``standardize_accents`` and split on single
    space characters; the function returns True as soon as any resulting
    token is exactly equal to one of the listed forms.

    Args:
        line: one line of text.

    Returns:
        True if at least one space-delimited token equals a listed form,
        otherwise False.
    """
    # NOTE(review): tokens are compared verbatim, so a form with punctuation
    # attached (e.g. a trailing comma or full stop) is not recognised, and
    # neither is a form delimited by tabs rather than spaces - confirm this
    # is the intended matching rule.
    wine_forms = frozenset(['οἶνος', 'οἴνου', 'οἴνῳ', 'οἶνον', 'οἴνω', 'οἶνοι', 'οἴνου', 'οἴνῳ', 'οἶνον', 'οἶνε', 'οἴνοιν', 'οἴνω','οἴνων', 'οἴνοις', 'οἴνους', 'χοἷνος', 'οἶνων', 'οἴνωι', 'οἴνω', 'οἷνου', 'οἵνου', 'οἶνου', 'οἴνου', 'ὀίνου', 'ὄινου', 'οίνου', 'οινου', 'οἷνος', 'οἶνος', 'οινος',  'οἷνον', 'οἵνον', 'οἶνον', 'οἴνον', 'οινον', 'οἴνοισιν', 'οἴνοισι', 'οἴνοις', 'οἴνοιο','οἶνοι','οἶνο','οἶνε','οίνε','κᾦνος','κᾦνον','ϝοῖνον','γοῖνος'])
    # Split once (the original re-split the line for every candidate form)
    # and let the set do the membership test.
    tokens = standardize_accents(line).split(" ")
    return not wine_forms.isdisjoint(tokens)
    
def clean_files_to_df(quiet=True):
    """Read every ``*.csv`` file in ``<cwd>/data_clean`` into one DataFrame.

    Each file name is split on underscores: the first token is taken as the
    author and the second as the work; when the name has exactly four
    tokens the third is taken as the section, otherwise the section is
    ``"NA"``. The genre is ``'history'`` or ``'poetry'`` depending on which
    of the module-level ``historiography`` / ``poetry`` sets contains the
    author.

    Only lines with more than three space-separated tokens are kept. For
    each kept line the wine flag is computed on the stripped raw line
    (before any characters are removed), then a fixed set of punctuation
    characters is deleted and the result is stripped. ``line_number``
    counts the kept lines of a file, starting at 1.

    Args:
        quiet: when False, print the name of every file as it is read.

    Returns:
        DataFrame with columns ``genre``, ``author``, ``work``, ``section``,
        ``line_number``, ``string`` and ``wine_line``.

    Raises:
        ValueError: if a file's author is in neither ``historiography`` nor
            ``poetry``. Previously such a file silently inherited the genre
            of the preceding file (or failed with an unbound-variable error
            when it came first).
    """
    # os.path.join / os.path.basename instead of hand-written backslashes:
    # the literal '\data_clean' relied on an invalid escape sequence and the
    # "\\" splitting only worked with Windows path separators.
    data_dir = os.path.join(os.getcwd(), 'data_clean')
    text_files = glob.glob(os.path.join(data_dir, "*.csv"))
    header = ['genre', 'author', 'work', 'section', 'line_number','string', 'wine_line']
    # Characters deleted from every kept line, in a single pass.
    removal_table = str.maketrans('', '', "†…'”“—῾" + '"!*->')
    all_lines = []
    for file in text_files:
        file_name = os.path.basename(file)
        if not quiet:
            print('File Name:', file_name)
        file_tokens = file_name.split("_")
        author = file_tokens[0]
        work = file_tokens[1]
        if author in historiography:
            genre = 'history'
        elif author in poetry:
            genre = 'poetry'
        else:
            raise ValueError(
                f"Unknown author {author!r} in file {file_name!r}: "
                "not listed in historiography or poetry"
            )
        section = file_tokens[2] if len(file_tokens) == 4 else "NA"
        # The context manager guarantees the handle is closed.
        with open(file, 'r', encoding='utf8') as open_file:
            line_number = 1
            for line in open_file:
                if len(line.split(' ')) > 3:
                    # Flag first: detection runs on the unmodified text.
                    wine = is_wine_line(line.strip())
                    cleaned_line = line.translate(removal_table).strip()
                    all_lines.append([genre, author, work, section, line_number, cleaned_line, wine])
                    line_number += 1
    corpus_df = pd.DataFrame(all_lines, columns=header)
    return corpus_df

def process_text(text_processing):
    """Normalise a value into lower-case, punctuation-free, single-spaced text.

    The argument is converted with ``str()`` and lower-cased, every
    character in ``string.punctuation`` (ASCII punctuation) is turned into a
    space, and runs of whitespace are collapsed to one space with leading
    and trailing whitespace removed.

    Args:
        text_processing: the value to normalise; non-strings are stringified.

    Returns:
        The normalised string.
    """
    punctuation_to_space = str.maketrans(
        string.punctuation, " " * len(string.punctuation)
    )
    lowered = str(text_processing).lower()
    words = lowered.translate(punctuation_to_space).split()
    return " ".join(words)


The lines below build a dataframe of our corpus and encode genre and author as integer labels. I also remove all empty strings from the corpus. I define our training corpus as all the lines that do not contain a form of οἶνος and the experiment corpus as all the lines that do.

In [None]:
# Build the corpus and drop rows whose cleaned text is empty. The explicit
# .copy() makes the filtered frame independent, so the column assignments
# below do not operate on a slice of another frame (SettingWithCopyWarning).
corpus = clean_files_to_df(quiet=True)
corpus = corpus[corpus.string != ""].copy()

# Integer class labels for the genre and author classification tasks.
corpus['target'] = corpus['genre'].map(genre_target)
corpus['author_target'] = corpus['author'].map(author_target)

# Lines flagged as containing a form of οἶνος form the experiment set; all
# remaining lines are the training pool. Both are copies so that later cells
# can add columns to them without touching `corpus`.
is_wine = corpus.wine_line.astype(bool)
experiment = corpus[is_wine].copy()
training = corpus[~is_wine].copy()

# Classification by Genre

The two cell blocks below train a multinomial Naive Bayes classifier to classify lines of ancient Greek text into one of two genres, poetry and history (historiographical prose). We train the MNB using all the lines in the corpus that do not contain a form of οἶνος and then test the model using the 159 lines that do contain a form of οἶνος. The second cell is a control experiment that uses both training and test data from the non-οἶνος corpus. (Note: Validation sets are drawn from training data.)

In [None]:
# Experiment
#
# Train a genre classifier on the non-οἶνος lines and classify every οἶνος
# line with it. Results are written to mnb_results.csv.

# Work on a copy so that adding `clean_text` does not modify `training`.
df = training.copy()
df["clean_text"] = df.string.map(process_text)
# 80/20 split, stratified by genre; the 20% part is the validation set.
df_train, df_test = train_test_split(df, test_size=0.20, stratify=df.target)

# Bag of word uni-, bi- and tri-grams.
vec = CountVectorizer(ngram_range=(1, 3))

X_train = vec.fit_transform(df_train.clean_text)
X_test = vec.transform(df_test.clean_text)
y_train = df_train.target
y_test = df_test.target

nb = MultinomialNB()
nb.fit(X_train, y_train)

preds = nb.predict(X_test)
# Save the model
# NOTE: every cell in this notebook writes to these same two file names, so
# the files always hold the most recently trained model.
joblib.dump(nb, "nb.joblib")
joblib.dump(vec, "vec.joblib")
# Print a report of the model's performance
print(classification_report(y_test, preds))
# Load the model again
nb_saved = joblib.load("nb.joblib")
vec_saved = joblib.load("vec.joblib")
# Test the model and record the results
header=['author', 'text', 'line', 'expected', 'result']
# The experiment lines get the same process_text cleaning as the training
# text before they are vectorized (previously the cleaned line was computed
# but the raw line was vectorized). All lines are predicted in one call.
experiment_text = experiment.string.map(process_text)
if len(experiment_text):
    experiment_preds = nb_saved.predict(vec_saved.transform(experiment_text))
else:
    experiment_preds = []

# One row per experiment line; 'line' holds the original (uncleaned) string.
all_mnb_results = [
    list(result_row)
    for result_row in zip(
        experiment['author'],
        experiment['work'],
        experiment['string'],
        experiment['target'],
        experiment_preds,
    )
]

results_df = pd.DataFrame(all_mnb_results, columns=header)
results_df.to_csv('mnb_results.csv')

In [None]:
# Control
#
# Same genre classifier as the experiment, but trained and tested entirely
# on non-οἶνος lines. Results are written to mnb_control_results.csv.

# We define our training data as 95% of the non-οἶνος corpus
df = training.sample(frac=0.95)
# The test data for the control experiment is the remaining 5%
test = training.drop(df.index)

df["clean_text"] = df.string.map(process_text)
# 80/20 split, stratified by genre; the 20% part is the validation set.
df_train, df_test = train_test_split(df, test_size=0.20, stratify=df.target)

# Bag of word uni-, bi- and tri-grams.
vec = CountVectorizer(ngram_range=(1, 3))

X_train = vec.fit_transform(df_train.clean_text)
X_test = vec.transform(df_test.clean_text)

y_train = df_train.target
y_test = df_test.target

nb = MultinomialNB()
nb.fit(X_train, y_train)

preds = nb.predict(X_test)

# NOTE: every cell in this notebook writes to these same two file names, so
# the files always hold the most recently trained model.
joblib.dump(nb, "nb.joblib")
joblib.dump(vec, "vec.joblib")

print(classification_report(y_test, preds))

nb_saved = joblib.load("nb.joblib")
vec_saved = joblib.load("vec.joblib")
header=['author', 'text', 'line', 'expected', 'result']

# The held-out lines get the same process_text cleaning as the training text
# before they are vectorized (previously the cleaned line was computed but
# the raw line was vectorized). All lines are predicted in one call.
test_text = test.string.map(process_text)
if len(test_text):
    test_preds = nb_saved.predict(vec_saved.transform(test_text))
else:
    test_preds = []

# One row per held-out line; 'line' holds the original (uncleaned) string.
all_mnb_results = [
    list(result_row)
    for result_row in zip(
        test['author'],
        test['work'],
        test['string'],
        test['target'],
        test_preds,
    )
]

results_df = pd.DataFrame(all_mnb_results, columns=header)
results_df.to_csv('mnb_control_results.csv')

# Classification by Author

The two cell blocks below train a multinomial Naive Bayes classifier to classify lines of ancient Greek text by author. We train the MNB using all the lines in the corpus that do not contain a form of οἶνος and then test the model using the 159 lines that do contain a form of οἶνος. The second cell is a control experiment that uses both training and test data from the non-οἶνος corpus. (Note: Validation sets are drawn from training data.)

In [None]:
# Experiment
#
# Train an author classifier on all non-οἶνος lines and classify every οἶνος
# line with it. Results are written to mnb_author_results.csv.
#
# This cell used to hold out 5% of the non-οἶνος lines and evaluate on those,
# which made it a duplicate of the control cell and left the `experiment`
# lines unused. It now mirrors the genre experiment: train on `training`,
# evaluate on `experiment`.

# Work on a copy so that adding `clean_text` does not modify `training`.
df = training.copy()
df["clean_text"] = df.string.map(process_text)
# 80/20 split, stratified by author; the 20% part is the validation set.
df_train, df_test = train_test_split(df, test_size=0.20, stratify=df.author_target)

# Bag of word uni-, bi- and tri-grams.
vec = CountVectorizer(ngram_range=(1, 3))

X_train = vec.fit_transform(df_train.clean_text)
X_test = vec.transform(df_test.clean_text)

y_train = df_train.author_target
y_test = df_test.author_target

nb = MultinomialNB()
nb.fit(X_train, y_train)

preds = nb.predict(X_test)

# NOTE: every cell in this notebook writes to these same two file names, so
# the files always hold the most recently trained model.
joblib.dump(nb, "nb.joblib")
joblib.dump(vec, "vec.joblib")

print(classification_report(y_test, preds))

nb_saved = joblib.load("nb.joblib")
vec_saved = joblib.load("vec.joblib")

header=['author', 'text', 'line', 'expected', 'result']
# The experiment lines get the same process_text cleaning as the training
# text before they are vectorized (previously the cleaned line was computed
# but the raw line was vectorized). All lines are predicted in one call.
experiment_text = experiment.string.map(process_text)
if len(experiment_text):
    experiment_preds = nb_saved.predict(vec_saved.transform(experiment_text))
else:
    experiment_preds = []

# One row per experiment line; 'line' holds the original (uncleaned) string.
all_mnb_author_results = [
    list(result_row)
    for result_row in zip(
        experiment['author'],
        experiment['work'],
        experiment['string'],
        experiment['author_target'],
        experiment_preds,
    )
]

results_df = pd.DataFrame(all_mnb_author_results, columns=header)
results_df.to_csv('mnb_author_results.csv')

In [None]:
# Control
#
# Same author classifier as the experiment, but trained and tested entirely
# on non-οἶνος lines. Results are written to mnb_control_author_results.csv.

# We define our training data as 95% of the non-οἶνος corpus
df = training.sample(frac=0.95)
# The test data for the control experiment is the remaining 5%
test = training.drop(df.index)

df["clean_text"] = df.string.map(process_text)
# 80/20 split, stratified by author; the 20% part is the validation set.
df_train, df_test = train_test_split(df, test_size=0.20, stratify=df.author_target)

# Bag of word uni-, bi- and tri-grams.
vec = CountVectorizer(ngram_range=(1, 3))

X_train = vec.fit_transform(df_train.clean_text)
X_test = vec.transform(df_test.clean_text)

y_train = df_train.author_target
y_test = df_test.author_target

nb = MultinomialNB()
nb.fit(X_train, y_train)

preds = nb.predict(X_test)

# NOTE: every cell in this notebook writes to these same two file names, so
# the files always hold the most recently trained model.
joblib.dump(nb, "nb.joblib")
joblib.dump(vec, "vec.joblib")

print(classification_report(y_test, preds))

nb_saved = joblib.load("nb.joblib")
vec_saved = joblib.load("vec.joblib")
header=['author', 'text', 'line', 'expected', 'result']
# The held-out lines get the same process_text cleaning as the training text
# before they are vectorized (previously the cleaned line was computed but
# the raw line was vectorized). All lines are predicted in one call.
test_text = test.string.map(process_text)
if len(test_text):
    test_preds = nb_saved.predict(vec_saved.transform(test_text))
else:
    test_preds = []

# One row per held-out line; 'line' holds the original (uncleaned) string.
all_mnb_author_results = [
    list(result_row)
    for result_row in zip(
        test['author'],
        test['work'],
        test['string'],
        test['author_target'],
        test_preds,
    )
]

results_df = pd.DataFrame(all_mnb_author_results, columns=header)
results_df.to_csv('mnb_control_author_results.csv')