In [None]:
import os
import re
import spacy
import pandas as pd

# Preprocess Text

In [None]:
# Location of the keyword spreadsheet, relative to this notebook.
FILE_PATH = "../assets/NER_assets/Ancient_Words.csv"
# Load the CSV into `df`, the working DataFrame that the following cells clean and read from.
df = pd.read_csv(FILE_PATH)

In [None]:
# Show the column names exactly as loaded, before the next cell renames some of them.
df.columns

In [None]:
# Rename the spreadsheet columns to the names the rest of the notebook uses.
df = df.rename(columns={'Word': 'Keyword', 'Category Types': 'Label'})

# If a cell is empty (NaN), fill it with the value of its parallel "Early" column.
# Each fill is done once, with plain assignment: looping over the frame only
# repeated the same fill once per column name, and a chained
# `df[col].fillna(..., inplace=True)` is not guaranteed to write back into `df`.
early_fallbacks = {
    'Quote': 'Early Quote',
    'Word Before': 'Early Word Before',
    'Word After': 'Early Word After',
    'Label': 'Early Category Type',
}
for target_col, early_col in early_fallbacks.items():
    df[target_col] = df[target_col].fillna(df[early_col])

# Remove rows with no Keyword.
df = df.dropna(subset=['Keyword'])

# Remove any row whose Keyword contains Arabic letters (i.e. is not Greek).
# `.copy()` detaches the filtered frame so the column assignments below
# operate on a real frame rather than on a view of the unfiltered one.
pat = '[ء-ي]+'
df = df[~df.Keyword.str.contains(pat, na=False)].copy()

# The clean-up steps below run in a fixed order: removing digits and hyphens
# can leave double spaces, which are only collapsed afterwards.
# Patterns are raw strings so that "\d", "\." and "\s" reach the regex engine
# unchanged instead of being treated as (invalid) string escapes.

# Remove new lines from the Keyword column.
df['Keyword'] = df['Keyword'].replace('\n', '', regex=True)
# Remove numbers from every column.
df = df.replace(r'\d+', '', regex=True)
# Remove hyphens from every column.
df = df.replace('-', '', regex=True)
# Remove commas from the Keyword column.
df['Keyword'] = df['Keyword'].replace(',', '', regex=True)
# Remove periods from the Keyword column.
df['Keyword'] = df['Keyword'].replace(r'\.', '', regex=True)
# Remove the interpunct from the Keyword column.
df['Keyword'] = df['Keyword'].replace(r'\·', '', regex=True)
# Collapse runs of spaces in every column.
df = df.replace(' +', ' ', regex=True)
# Strip trailing whitespace from the Keyword column.
df['Keyword'] = df['Keyword'].replace(r'\s+$', '', regex=True)

# Mark the remaining empty cells with 0. The result must be assigned:
# `fillna` returns a new frame, and the later "drop rows equal to 0" cell
# depends on this marker being present.
df = df.fillna(0)
df = df.reset_index(drop=True)


In [None]:
# Preview the cleaned frame and report its row count.
# NOTE(review): a notebook cell renders only its last expression, so the
# `df.head(10)` line is presumably evaluated but not shown — confirm.
df.head(10)
len(df)

In [None]:
# Drop every row in which "Keyword", "Quote", "Word Before" or "Word After"
# is missing. A missing cell can show up either as NaN or as the 0 marker,
# so both are removed; comparing against 0 alone lets NaN rows through.
required_cols = ['Keyword', 'Quote', 'Word Before', 'Word After']
df = df.dropna(subset=required_cols)
df = df[(df[required_cols] != 0).all(axis=1)]


In [None]:
# Preview the frame again after the incomplete rows were dropped, with its new row count.
# NOTE(review): a notebook cell renders only its last expression, so the
# `df.head(10)` line is presumably evaluated but not shown — confirm.
df.head(10)
len(df)

In [None]:
# import requirements for converting the dataframe to Spacy Docs
from collections import defaultdict
from typing import List
from spacy.tokens import Doc, DocBin
from unicodedata import normalize
import random


# Create dictionaries from dendrosearch and conllu files (supplied by Jacobo)

In [None]:
# Tokens that are never stored as dictionary keys.
PUNCTUATION = ['.', ")", ".", "·", "(", "[", "]", ":", ";", ",", "?", "!", "،", "_"]

# Extract from df a dictionary {word: lemma}. When a Keyword occurs in several
# rows the last row wins, exactly as with a row-by-row assignment.
lemma_dict = dict(zip(df['Keyword'], df['Lemma']))

# Load the dendrosearch lemma dictionary: whitespace-separated lines whose
# first field is the word and whose second field is the lemma.
dendrosearch_lemma_dict = {}
with open('../assets/dendrosearch_lemma_dict.txt', 'r', encoding='utf-8') as dendro_file:
    for line in dendro_file:
        fields = line.split()
        # Skip short lines and entries whose word is a punctuation mark.
        if len(fields) > 1 and fields[0] not in PUNCTUATION:
            dendrosearch_lemma_dict[fields[0]] = fields[1]

# Create one dictionary from all conllu files.
PATH = "../assets/Lemmatization_training_files/"
conllu_lemma_dict = {}

# Files are visited in sorted order: a later file overwrites the lemma of a
# word seen in an earlier one, so the directory listing order (which the OS
# does not guarantee) must not decide the result.
for file_name in sorted(os.listdir(PATH)):
    if not file_name.endswith(".conllu"):
        continue
    with open(os.path.join(PATH, file_name), 'r', encoding='utf-8') as conllu_file:
        for line in conllu_file:
            # CoNLL-U comment lines ("# sent_id = ...", "# text = ...") are not
            # token rows; splitting them would add bogus entries such as
            # "text" -> "=".
            if line.startswith('#'):
                continue
            # Token row is: id | keyword | lemma | pos | _
            # Only keyword and lemma are needed.
            fields = line.split()
            if len(fields) > 2 and fields[1] not in PUNCTUATION:
                conllu_lemma_dict[fields[1]] = fields[2]



## Create dictionary from INCEpTION files

In [None]:
from cassis import *
import zipfile
import tempfile
import os
from tqdm import tqdm


# Covered text -> lemma value, filled from the Lemma annotations of the INCEpTION exports.
inception_dict = {}
# One entry per Sentence annotation found in the INCEpTION exports.
inception_sentences = [] # list of tuples (sentence, source_file)

In [None]:
# Read every INCEpTION export: unzip the archives into a temporary folder,
# load the shared type system, then collect lemmas and sentences from each
# XMI document.
INCEPTION_DIR = "../assets/NER_assets/INCEpTION_files/"
LEMMA_TYPE = 'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma'
SENTENCE_TYPE = "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence"

# Start from empty containers so that re-running this cell does not append
# the same sentences a second time.
inception_dict.clear()
inception_sentences.clear()

with tempfile.TemporaryDirectory() as tempdir:
    # Extract all archives into the temp folder. Sorted so that, if two
    # archives contain a file with the same name, the winner is deterministic.
    for archive_name in sorted(os.listdir(INCEPTION_DIR)):
        if archive_name.endswith(".zip"):
            with zipfile.ZipFile(os.path.join(INCEPTION_DIR, archive_name), 'r') as zip_ref:
                zip_ref.extractall(tempdir)
    print (tempdir)

    # Open the type system and print it once (not once per extracted file).
    with open(os.path.join(tempdir, "TypeSystem.xml"), 'rb') as typesystem_file:
        typesystem = load_typesystem(typesystem_file)
    print (typesystem)

    # Iterate over the extracted files in sorted order: the order decides
    # which lemma wins for a repeated token and the order of the collected
    # sentences, which in turn feeds the seeded train/dev/test split.
    for xmi_name in sorted(os.listdir(tempdir)):
        if not xmi_name.endswith(".xmi"):
            continue
        with open(os.path.join(tempdir, xmi_name), 'rb') as xmi_file:
            cas = load_cas_from_xmi(xmi_file, typesystem=typesystem)
            for token in cas.select(LEMMA_TYPE):
                inception_dict[token.get_covered_text()] = token.value
            for sentence in cas.select(SENTENCE_TYPE):
                # Record only the file name, not the temporary directory path.
                inception_sentences.append((sentence.get_covered_text(), xmi_name))


In [None]:
# Spot check: print one randomly chosen (sentence, source_file) entry from inception_sentences.
print (random.choice(inception_sentences))

In [None]:
# Group the four {word: lemma} dictionaries under the name of their source.
big_dict = {
    'Conllu': conllu_lemma_dict,  # files from Jacobo
    'Inception': inception_dict,
    'Coda': lemma_dict,
    'Dendrosearch': dendrosearch_lemma_dict
}

# Dump one line per source for inspection. The encoding is given explicitly
# because the content is Greek and the platform default may not be able to
# encode it; the `with` block closes the file, so no manual close is needed.
with open('big_dict.txt', 'w', encoding='utf-8') as f:
    for k, v in big_dict.items():
        f.write(f'{k}: {v}\n')


In [None]:
# Build two normalised copies of big_dict: one in NFKD form, one in NFKC form.
# big_dict maps each source name to its own {word: lemma} dictionary.

# Keep only the sources that actually contain entries.
big_dict = {name: entries for name, entries in big_dict.items() if entries}
big_dict_nfkd = {}
big_dict_nfkc = {}

for source, entries in big_dict.items():
    nfkd_entries = big_dict_nfkd[source] = {}
    nfkc_entries = big_dict_nfkc[source] = {}
    for word, lemma in entries.items():
        # Skip pairs whose word is empty or null ...
        if not word or pd.isnull(word):
            continue
        # ... and pairs whose lemma is empty or null.
        if not lemma or pd.isnull(lemma):
            continue
        nfkd_entries[normalize('NFKD', word)] = normalize('NFKD', lemma)
        nfkc_entries[normalize('NFKC', word)] = normalize('NFKC', lemma)


In [None]:
from collections import Counter

def create_lemma_dict(dicts):
    """Merge several {word: lemma} dictionaries into a single one.

    The dictionaries are visited in the given order. A usable lemma is stored
    directly, so a later dictionary overwrites an earlier one for the same
    word. When an entry carries no usable lemma, the usable lemmas recorded
    for that word across *all* dictionaries are collected and the most
    frequent one is stored; if there is none, the entry leaves the result
    untouched.

    A lemma is usable when it is a string other than the empty marks "_",
    " " and "". Non-string values (for example the NaN that pandas uses for
    an empty cell, or None) are treated as missing instead of being copied
    into the result.

    Args:
        dicts: list of dictionaries mapping a word to its lemma.

    Returns:
        dict mapping each word to the lemma selected for it.
    """
    empty_marks = ("_", " ", "")

    def _is_usable(lemma):
        # Only real, non-placeholder strings count as lemmas.
        return isinstance(lemma, str) and lemma not in empty_marks

    word_lemma_dict = {}

    for d in dicts:
        for word, lemma in d.items():
            if _is_usable(lemma):
                word_lemma_dict[word] = lemma
                continue

            # No usable lemma here: fall back to the lemma that the
            # dictionaries most often give for this word.
            candidates = [
                other[word]
                for other in dicts
                if word in other and _is_usable(other[word])
            ]
            if candidates:
                word_lemma_dict[word] = Counter(candidates).most_common(1)[0][0]

    return word_lemma_dict

In [None]:
# Merge the four lemma dictionaries into one. create_lemma_dict visits them in
# this list order, so for a word present in several of them a later
# dictionary's lemma replaces an earlier one.
dicts = [conllu_lemma_dict, inception_dict, lemma_dict, dendrosearch_lemma_dict]
word_lemma_dict = create_lemma_dict(dicts)

# Now you can use the word_lemma_dict dictionary outside the function

In [None]:
# Collect the (word, lemma) pairs of the merged dictionary in which the word
# or the lemma ends with an apostrophe-like character.
# `str.endswith` takes a tuple of suffixes; an expression such as
# `"'" or "’" or "‘"` evaluates to its first operand and would test only "'".
apostrophe_marks = ("'", "’", "‘")
apostrophe_items = [
    (k, v)
    for k, v in word_lemma_dict.items()
    # Non-string entries (e.g. NaN lemmas) cannot end with anything; skip them
    # instead of failing on a missing `.endswith`.
    if any(isinstance(text, str) and text.endswith(apostrophe_marks) for text in (v, k))
]
print(apostrophe_items)

In [None]:
import unicodedata

# List, for each source dictionary in big_dict, the (word, lemma) pairs in
# which the word or the lemma ends with an apostrophe-like character.
# The source's own entries are inspected (not the merged dictionary), and the
# source is identified by its name: a dict has no `.index` method.
apostrophe_marks = ("'", "’", "‘")
for source_name, source_entries in big_dict.items():
    source_apostrophe_items = [
        (k, v)
        for k, v in source_entries.items()
        # Guard against non-string entries such as NaN lemmas.
        if any(isinstance(text, str) and text.endswith(apostrophe_marks) for text in (v, k))
    ]
    print(f"Apostrophe items in {source_name}:")
    for k, v in source_apostrophe_items:
        print(f"{k}: {v}")

In [None]:
# Characters counted as an apostrophe at the end of a word or lemma.
apostrophes = ["'", "‘", "ʼ"]
# str.endswith needs a tuple to test several suffixes at once.
apostrophe_suffixes = tuple(apostrophes)
# Keep the CoNLL-U pairs whose lemma or word ends with one of them.
apostrophe_items = [
    (word, lemma)
    for word, lemma in conllu_lemma_dict.items()
    if lemma.endswith(apostrophe_suffixes) or word.endswith(apostrophe_suffixes)
]
print((apostrophe_items))

In [None]:
# Save apostrophe_items to a txt file, one (word, lemma) tuple per line.
# The encoding is explicit because the items are Greek text and the platform
# default encoding may not be able to represent them.
with open('apostrophe_items.txt', 'w', encoding='utf-8') as f:
    for item in apostrophe_items:
        print(item, file=f)

In [None]:
    #'Conllu': conllu_lemma_dict,  # files from Jacobo
    #'Inception': inception_dict,
    #'Coda': lemma_dict,
    #'Dendrosearch': dendrosearch_lemma_dict

In [None]:
# Collect the CoNLL-U (word, lemma) pairs in which the word or the lemma ends
# with any of the four apostrophe-like characters.
# The suffixes are passed as a tuple: `"'" or "’" or "‘" or "ʼ"` evaluates to
# its first operand, so only the plain "'" would be tested.
apostrophe_marks = ("'", "’", "‘", "ʼ")
apostrophe_items = [
    (k, v)
    for k, v in conllu_lemma_dict.items()
    if v.endswith(apostrophe_marks) or k.endswith(apostrophe_marks)
]
print((apostrophe_items))

In [None]:
# Save inception_dict to a txt file, one "word:lemma" pair per line.
# UTF-8 is requested explicitly because the content is Greek; the `with`
# block closes the file, so no manual close is needed.
with open('inception_dict.txt', 'w', encoding='utf-8') as f:
    for key, value in inception_dict.items():
        f.write('%s:%s\n' % (key, value))

In [None]:
# Example of a "word:lemma" line as written to the dictionary text files: τῶν:ὁ
# NOTE(review): presumably a line pasted from one of those files. It is kept as
# a comment because, as code, it is a bare variable annotation whose evaluation
# fails with a NameError wherever annotations are evaluated eagerly.


In [None]:
# Display the whole inception_dict (covered text -> lemma value) as the cell output.
inception_dict

In [None]:
# Number of distinct words in the merged word -> lemma dictionary.
print(len(word_lemma_dict))

In [None]:
# Print the total number of entries over all sources in big_dict.
# Each source is counted separately, so a word present in several sources is counted once per source.
print('Total number of all items in all sources in big_dict: ', sum(len(v) for v in big_dict.values()))

In [None]:
# Compare big_dict with word_lemma_dict, source by source.
# big_dict maps a source name to that source's {word: lemma} dictionary, so
# the comparison has to go one level down: looking the source names up in
# word_lemma_dict would never match a word.
for source_name, source_entries in big_dict.items():
    same = 0
    different = 0
    missing = 0
    for word, lemma in source_entries.items():
        if word not in word_lemma_dict:
            missing += 1
        elif word_lemma_dict[word] == lemma:
            same += 1
        else:
            different += 1
    print(
        f'{source_name}: {same} lemmas are the same, '
        f'{different} are not the same, '
        f'{missing} words are not in word_lemma_dict'
    )

In [None]:
# Word-by-word comparison between the source dictionaries and word_lemma_dict.
# big_dict is keyed by source name, so it is first regrouped by word:
# word -> {source name: lemma given by that source}.
lemmas_by_word = {}
for source_name, source_entries in big_dict.items():
    for word, lemma in source_entries.items():
        lemmas_by_word.setdefault(word, {})[source_name] = lemma

for word in set(lemmas_by_word) | set(word_lemma_dict):
    in_sources = word in lemmas_by_word
    in_merged = word in word_lemma_dict
    if in_sources and not in_merged:
        print(f"{word}: {lemmas_by_word[word]} (in big_dict)")
    elif in_merged and not in_sources:
        print(f"{word}: {word_lemma_dict[word]} (in word_lemma_dict)")
    else:
        # Sources whose lemma for this word differs from the merged lemma.
        disagreeing = {
            source_name: lemma
            for source_name, lemma in lemmas_by_word[word].items()
            if lemma != word_lemma_dict[word]
        }
        if disagreeing:
            print(f"{word}: {disagreeing} (in big_dict), {word_lemma_dict[word]} (in word_lemma_dict) - LEMMA MISMATCH")
        else:
            print(f"{word}: {word_lemma_dict[word]} (in both)")

In [None]:
# Number of distinct top-level keys of big_dict.
# NOTE(review): big_dict is built as source name -> dictionary, so this counts
# sources rather than words — confirm that this is the intended number.
print(len(set(big_dict.keys())))

In [None]:
# Save word_lemma_dict to a txt file, one "word:lemma" pair per line.
# UTF-8 is requested explicitly because the content is Greek; the `with`
# block closes the file, so no manual close is needed.
with open('word_lemma_dict.txt', 'w', encoding='utf-8') as f:
    for key, value in word_lemma_dict.items():
        f.write('%s:%s\n' % (key, value))

In [None]:
# Find every CoNLL-U word that contains "Φειδίαν" and print it with its lemma.
matching_entries = [
    (form, lemma)
    for form, lemma in conllu_lemma_dict.items()
    if "Φειδίαν" in form
]
for form, lemma in matching_entries:
    print(form, lemma)

In [None]:
# Save conllu_lemma_dict to a txt file, one "word:lemma" pair per line.
# UTF-8 is requested explicitly because the content is Greek; the `with`
# block closes the file, so no manual close is needed.
with open('big_dict_test.txt', 'w', encoding='utf-8') as f:
    for key, value in conllu_lemma_dict.items():
        f.write('%s:%s\n' % (key, value))

# Run NLP pipeline on INCEpTION and Coda files

In [None]:
# Load the spaCy pipeline named "grc_proiel_trf" into `nlp`.
# This line only loads the pipeline; it does not install it, so the model
# package presumably has to be installed beforehand — confirm in the setup notes.
nlp = spacy.load("grc_proiel_trf") # Use your preferred model here


In [None]:
# Build one list of (text, source) pairs covering both corpora.
# INCEpTION entries keep the file name they were read from ...
sentences = [(entry[0], entry[1]) for entry in inception_sentences]

# ... and every quote of the original df is tagged with the fixed source 'Coda'.
sentences += [(quote, 'Coda') for quote in df['Quote'].tolist()]


In [None]:

# Run the pipeline on every NFKD-normalised sentence and overwrite the
# predicted lemma of a token whenever a source dictionary lists a different
# lemma for that token.

# create list of Doc objects
docs_nfkd: List[Doc] = []

# Corrections are collected as plain records and turned into a DataFrame once
# at the end: growing a DataFrame with pd.concat inside the loop copies the
# whole frame for every single correction.
correction_columns_nfkd = ['sentence', 'source', 'token', 'lemma', 'lemma_corrected', 'correction_source']
correction_records_nfkd = []
# Despite its name this counts corrected tokens (one increment per correction).
corrected_sentences = 0

for sentence in tqdm(sentences):
    sentence = (normalize('NFKD', sentence[0]), sentence[1])
    doc = nlp(sentence[0])

    for token in doc:
        for source in big_dict_nfkd: # each source is a dictionary
            source_lemmas = big_dict_nfkd[source]
            if token.text not in source_lemmas:
                continue
            dictionary_lemma = source_lemmas[token.text]
            if dictionary_lemma != token.lemma_:
                correction_records_nfkd.append({
                    'sentence': sentence[0],
                    'source': sentence[1],
                    'token': token.text,
                    'lemma': token.lemma_,
                    'lemma_corrected': dictionary_lemma,
                    'correction_source': source
                })
                corrected_sentences += 1
                token.lemma_ = dictionary_lemma
                # The first source that disagrees with the current lemma
                # decides; a source that agrees does not stop the search.
                break

    docs_nfkd.append(doc)

# create df to record corrections
corrections_df_nfkd = pd.DataFrame(correction_records_nfkd, columns=correction_columns_nfkd)

In [None]:
# Show up to 10 randomly chosen NFKD corrections. The sample size is capped at
# the number of rows because asking for more rows than exist raises an error.
corrections_df_nfkd.sample(min(10, len(corrections_df_nfkd)))

In [None]:
# Run the pipeline on every NFKC-normalised sentence and overwrite the
# predicted lemma of a token whenever a source dictionary lists a different
# lemma for that token.

# create list of Doc objects
docs_nfkc: List[Doc] = []

# Corrections are collected as plain records and turned into a DataFrame once
# at the end: growing a DataFrame with pd.concat inside the loop copies the
# whole frame for every single correction.
correction_columns_nfkc = ['sentence', 'source', 'token', 'lemma', 'lemma_corrected', 'correction_source']
correction_records_nfkc = []
# Despite its name this counts corrected tokens (one increment per correction).
corrected_sentences = 0

for sentence in tqdm(sentences):
    sentence = (normalize('NFKC', sentence[0]), sentence[1])
    doc = nlp(sentence[0])

    for token in doc:
        for source in big_dict_nfkc: # each source is a dictionary
            source_lemmas = big_dict_nfkc[source]
            if token.text not in source_lemmas:
                continue
            dictionary_lemma = source_lemmas[token.text]
            if dictionary_lemma != token.lemma_:
                correction_records_nfkc.append({
                    'sentence': sentence[0],
                    'source': sentence[1],
                    'token': token.text,
                    'lemma': token.lemma_,
                    'lemma_corrected': dictionary_lemma,
                    'correction_source': source
                })
                corrected_sentences += 1
                token.lemma_ = dictionary_lemma
                # The first source that disagrees with the current lemma
                # decides; a source that agrees does not stop the search.
                break

    docs_nfkc.append(doc)

# create df to record corrections
corrections_df_nfkc = pd.DataFrame(correction_records_nfkc, columns=correction_columns_nfkc)




In [None]:
# Show up to 10 randomly chosen NFKC corrections. The sample size is capped at
# the number of rows because asking for more rows than exist raises an error.
corrections_df_nfkc.sample(min(10, len(corrections_df_nfkc)))

In [None]:
# Count the recorded NFKD corrections per dictionary that supplied the corrected lemma.
corrections_df_nfkd.groupby('correction_source').count()

In [None]:
# Count the recorded NFKC corrections per dictionary that supplied the corrected lemma.
corrections_df_nfkc.groupby('correction_source').count()

In [None]:
# split docs to train, dev, test randomly
from sklearn.model_selection import train_test_split
from pathlib import Path

# Split the docs into train, dev and test randomly, for each normalization.
# First 20% is held out as test, then 20% of the remainder becomes dev.
# Both normalizations use the same seed, so lists of equal length are split
# at the same positions.

train_docs_nfkd, test_docs_nfkd = train_test_split(docs_nfkd, test_size=0.2, random_state=42)
train_docs_nfkd, dev_docs_nfkd = train_test_split(train_docs_nfkd, test_size=0.2, random_state=42)

train_docs_nfkc, test_docs_nfkc = train_test_split(docs_nfkc, test_size=0.2, random_state=42)
train_docs_nfkc, dev_docs_nfkc = train_test_split(train_docs_nfkc, test_size=0.2, random_state=42)

print (f"train: {len(train_docs_nfkd)}\ndev: {len(dev_docs_nfkd)}\ntest: {len(test_docs_nfkd)} for nfkd")
print (f"train: {len(train_docs_nfkc)}\ndev: {len(dev_docs_nfkc)}\ntest: {len(test_docs_nfkc)} for nfkc")

# Save each split to a DocBin.
# The files go into the lemma_* sub-folders, so those are the directories that
# must exist; creating only "../corpus/<split>" leaves the target folder
# missing and the write fails.
train_dir = Path("../corpus/train/lemma_train")
dev_dir = Path("../corpus/dev/lemma_dev")
test_dir = Path("../corpus/test/lemma_test")
for output_dir in (train_dir, dev_dir, test_dir):
    output_dir.mkdir(parents=True, exist_ok=True)

train_bin_nfkd = DocBin(docs=train_docs_nfkd)
train_bin_nfkd.to_disk(train_dir / "train_lemma_NFKD.spacy")
test_bin_nfkd = DocBin(docs=test_docs_nfkd)
test_bin_nfkd.to_disk(test_dir / "test_lemma_NFKD.spacy")
dev_bin_nfkd = DocBin(docs=dev_docs_nfkd)
dev_bin_nfkd.to_disk(dev_dir / "dev_lemma_NFKD.spacy")

train_bin_nfkc = DocBin(docs=train_docs_nfkc)
train_bin_nfkc.to_disk(train_dir / "train_lemma_NFKC.spacy")
test_bin_nfkc = DocBin(docs=test_docs_nfkc)
test_bin_nfkc.to_disk(test_dir / "test_lemma_NFKC.spacy")
dev_bin_nfkc = DocBin(docs=dev_docs_nfkc)
dev_bin_nfkc.to_disk(dev_dir / "dev_lemma_NFKC.spacy")



The spacy dataset should be exported to '../corpus/' folder.\
More specifically:\
train to '../corpus/train/lemma_train/'\
dev to '../corpus/dev/lemma_dev/'\
test to '../corpus/test/lemma_test/'