In [None]:
import os
import re
from pathlib import Path

In [None]:
import spacy
from spacy.util import compounding, minibatch
from spacy import displacy
# Require spaCy to use the GPU for training (comment the next line out to run on CPU). Note - this will use transformer architecture
spacy.require_gpu()

# Training

In [None]:
# Use Weights and Biases platform to log the model training
import wandb
wandb.init(project="Category Types (Greek NER)", entity="atlomy-nlp")

In [None]:
# only for first time using a new pretrained model pipeline
# nlp.config.to_disk('./grc_ud_proiel_trf.cfg')

In [None]:
# Use if you need spaCy to fill missing default values in config
#!python -m spacy init fill-config --diff ../Models/grc_ud_proiel_trf/NER_S_grc_ud_proiel_trf-trainable-lemmatizer.cfg

In [None]:
nlp = spacy.load("/root/Projects/Atlomy/git/greCy_ATLOMY/training/transformer/assembled/model-best")

# NEW_NER - Self-built from source

## ner training (2X)

In [None]:
# check for config validity
!python -m spacy debug config /root/Projects/Atlomy/git/greCy_ATLOMY/configs/ner_2Xnorm.cfg

In [None]:
!python -m spacy debug data /root/Projects/Atlomy/git/greCy_ATLOMY/configs/ner_2Xnorm.cfg

In [None]:
!python -m spacy train ../configs/ner_2Xnorm.cfg --output ../training/transformer/NER --gpu-id 0 --nlp.lang=grc


In [None]:
!python -m spacy benchmark accuracy /root/Projects/Atlomy/git/greCy_ATLOMY/training/transformer/NER/model-best /root/Projects/Atlomy/git/greCy_ATLOMY/corpus/test/ner_test/ner_test.spacy --gpu-id 0 --displacy-limit 200 --displacy-path /root/Projects/Atlomy/git/greCy_ATLOMY/training/transformer/NER/model-best

## ner training (old files)

In [None]:
!python -m spacy debug data /root/Projects/Atlomy/git/greCy_ATLOMY/configs/ner.cfg

In [None]:
!python -m spacy train ../configs/ner.cfg --output ../training/NER --gpu-id 0 --nlp.lang=grc


In [None]:
!python -m spacy benchmark accuracy /root/Projects/Atlomy/git/greCy_ATLOMY/training/NER/model-best /root/Projects/Atlomy/git/greCy_ATLOMY/corpus/test/test.spacy --gpu-id 0 --displacy-limit 200 --displacy-path /root/Projects/Atlomy/git/greCy_ATLOMY/training/NER/model-best

In [None]:
!python -m spacy debug config /root/Projects/Atlomy/git/greCy_ATLOMY/configs/ner2.cfg

In [None]:
!python -m spacy train ../configs/ner2.cfg --output ../training/NER2 --gpu-id 0 --nlp.lang=grc --verbose

In [None]:
# benchmark the model
!python -m spacy benchmark accuracy /root/Projects/Atlomy/git/greCy_ATLOMY/training/NER2/model-best /root/Projects/Atlomy/git/greCy_ATLOMY/corpus/test/test.spacy --gpu-id 0 --displacy-limit 200 --displacy-path /root/Projects/Atlomy/git/greCy_ATLOMY/training/NER2/model-best

In [None]:
import spacy
from spacy.tokens import Doc, DocBin, Span
# load the model you just trained
nlp = spacy.load("/root/Projects/Atlomy/git/greCy_ATLOMY/training/NER/model-best")


In [None]:
# load the test data
test_data = DocBin().from_disk('/root/Projects/Atlomy/git/greCy_ATLOMY/corpus/test/test.spacy')
# get the docs from the test data
test_docs = list(test_data.get_docs(nlp.vocab))

In [None]:
# Run the loaded pipeline over each Doc read from the test DocBin and print, per token:
# text, lemma, coarse POS, fine-grained tag, dependency label and entity type.
for doc in test_docs:
    docced = nlp(doc)
    for token in docced:
        fields = (token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.ent_type_)
        print(*fields)

In [None]:
# define WandB sweep for NER_S_grc_ud_proiel_trf-new_db_3-accumulate-gradient.cfg
# A nested group of hyperparameters must be wrapped in its own "parameters" key
# in a W&B sweep config; "training" is such a group, so its members live under
# sweep_configuration["parameters"]["training"]["parameters"].
sweep_configuration = {
    # Bayesian search that maximizes the entity F-score.
    "method": "bayes",
    "metric": {
        "name": "ents_f",
        "goal": "maximize",
    },
    "parameters": {
        "training": {
            "parameters": {
                "n_iter": {
                    "values": [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
                },
                "batch_size": {
                    "values": [8, 16, 32, 64, 128, 256, 512, 1024]
                },
                "accumulate_gradient": {
                    "values": [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
                },
                "dropout": {
                    "values": [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
                },
                "learning_rate": {
                    "values": [0.0001, 0.001, 0.01, 0.1, 1.0]
                },
                "weight_decay": {
                    "values": [0.0, 0.0001, 0.001, 0.01, 0.1, 1.0]
                },
                "grad_norm": {
                    "values": [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
                },
            },
        },
    },
    # Hyperband early stopping of poorly performing runs.
    "early_terminate": {
        "type": "hyperband",
        "min_iter": 10,
    }
}


In [None]:
import typer
from pathlib import Path
from spacy.training.loop import train
from spacy.training.initialize import init_nlp
from spacy import util
from thinc.api import Config
import wandb
spacy.require_gpu()


def sweep_ner(default_config: Path, output_path: Path):
    """Create a W&B Bayesian sweep over NER hyperparameters and run 20 trials.

    Args:
        default_config: Path to the base spaCy training config. Each trial
            loads it and merges the values sampled by the sweep on top.
        output_path: Directory handed to spaCy's ``train`` as output location
            (created if missing). Every trial is given this same path.

    The sweep is registered under the W&B project "Greek_NER_sweeps" and
    maximizes ``ents_f`` with hyperband early termination.
    """
    # A weighted combination of ents_f / ents_p / ents_r (0.5 / 0.3 / 0.2) was
    # sketched as an alternative metric; only ents_f is optimized here.
    sweep_config = {
        "method": "bayes",
        "metric": {"name": "ents_f", "goal": "maximize"},
        "early_terminate": {"type": "hyperband", "min_iter": 3, "s": 2, "eta": 3},
        # Keys are dotted paths into the spaCy config; they are expanded to a
        # nested dict with util.dot_to_dict inside each trial.
        "parameters": {
            "training.dropout": {"distribution": "uniform", "min": 0.1, "max": 0.4},
            "training.optimizer.learn_rate": {"distribution": "uniform", "min": 0.00001, "max": 0.1},
            "training.batcher.size": {"distribution": "int_uniform", "min": 64, "max": 1000},
            "components.ner.model.maxout_pieces": {"values": [1, 2, 3]},
            "components.ner.model.hidden_width": {"values": [64, 128, 256, 512, 1024]},
            "components.ner.model.extra_state_tokens": {"values": [True, False]},
            "components.ner.model.use_upper": {"values": [True, False]},
            #"corpora.train.augmenter": {"values": [None, "lemmatizer"]},
            "nlp.batch_size": {"values": [128, 256, 512, 1024]},
        },
    }

    def train_spacy():
        """Run a single sweep trial: merge sampled values into the base config and train."""
        base_config = util.load_config(default_config)
        with wandb.init() as run:
            sampled_overrides = Config(util.dot_to_dict(run.config))
            merged_config = Config(base_config).merge(sampled_overrides)
            print(merged_config)
            trial_nlp = init_nlp(merged_config, use_gpu=0)
            output_path.mkdir(parents=True, exist_ok=True)
            train(trial_nlp, output_path, use_gpu=0)

    sweep_id = wandb.sweep(sweep_config, project="Greek_NER_sweeps")
    wandb.agent(sweep_id, train_spacy, count=20)


#if __name__ == "__main__":
#    typer.run(main)

In [None]:
sweep_ner(default_config=Path("../Models/grc_ud_proiel_trf/NER_S_grc_ud_proiel_trf-sweep.cfg"), output_path=Path("../Models/grc_ud_proiel_trf/NER_S_grc_ud_proiel_trf-sweep2/"))

## Solo Training

In [None]:
# Begin train model
!python -m spacy train ../Models/grc_ud_proiel_trf/NER_S_grc_ud_proiel_trf_updating-newtest2-changes.cfg --gpu-id 0 --output ./Models/grc_ud_proiel_trf/NER_S_grc_ud_proiel_trf_updating-newtest2-changes/ --verbose


In [None]:
# Evaluate model on test data
!python -m spacy evaluate ../Models/grc_ud_proiel_trf/grc_ud_proiel_trf_Lem_NER/model-best ../Corpus/test.spacy --gpu-id 0 --displacy-limit 200 --displacy-path ../Models/grc_ud_proiel_trf/grc_ud_proiel_trf_Lem_NER/model-best

In [None]:
#benchmark model speed - NEW feature, needs spacy V3.5
# If using 3.5 make sure to run !python -m spacy validate to check installed pipelines compatibility to current version
!python -m spacy benchmark speed  ./Models/grc_ud_proiel_trf/NER_S_grc_ud_proiel_trf_0.2/model-best ./Models/grc_ud_proiel_trf/corpus --gpu-id 0

# Test model on new text

In [None]:
# Pre-process a text prompt. A normalization step similar to the normalization done on the model dataset.
import unicodedata as ud

def clean_text(text):
    """Strip bracketed spans from ``text`` and return it NFKD-normalized.

    Everything from an opening ``(`` or ``[`` up to the nearest closing ``)``
    or ``]`` on the same line is removed (the two bracket kinds are not paired
    with each other), then the remainder is decomposed with Unicode NFKD.
    """
    without_brackets = re.sub(r"[\(\[].*?[\)\]]", "", text)
    return ud.normalize('NFKD', without_brackets)

### Test model 1

In [None]:
# Test the saved model
output_dir = Path("../Models/grc_ud_proiel_trf/grc_ud_proiel_trf_Lem_NER/model-best")

print("Loading from", output_dir)
nlp = spacy.load(output_dir)

Text Examples:
Ἀνατομικὰς ἐγχειρήσεις ἔγραψα μὲν καὶ πρόσθεν, ἡνίκα τὸ πρῶτον ἀνῆλθον ἔναγχος εἰς Ῥώμην, ἄρχειν ἠργμένου τοῦ καὶ νῦν ἡμῖν ἄρχοντος Ἀντωνίνου, γράφειν δ' αὖθις ἄλλας ἔοικα ταύτας διὰ διττὴν αἰτίαν.
Κατά την υπόθεση που είχε ανακοινωθεί, οι δύο άνδρες υπό περιπολία συνελήφθησαν στον οδό Μελίνου και Κορινθίου στην Αθήνα, μετά από συνομιλία με την αστυνομία.

Ταῦτ’ οὖν εἴς τε τὸν παρόντα καὶ τὸν ἑξῆς ἅπαντα λόγον οἷον ὑποθέσεις τινὰς τῶν ἀποδείξεων λαμβάνοντες ἐν ἑκάστῳ τῶν ὀργάνων τὴν ἐξ αὐτῶν ὠφέλειαν ἐροῦμεν ἀπὸ τῶν δακτύλων αὖθις ἀρξάμενοι. 
ἐπειδὴ γὰρ τὴν τῶν ὀστῶν κατασκευὴν ἐν αὐτοῖς ἐπιτηδειοτάτην ὀργάνοις ἀντιληπτικοῖς ἡ φύσις ἐποίησεν, ἦν δ’ ἀμήχανον αὐτοῖς τοῖς ὀστοῖς οὕτω γεώδεσί τε καὶ λιθώδεσιν οὖσι μεταδοῦναι τῆς καθ’ ὁρμὴν κινήσεως, ἐξεῦρεν, ὅτῳ τρόπῳ δι’ ἑτέρων αὐτὰ κινήσει. 
τῶν οὖν κατὰ τὸν πῆχυν μυῶν ἀποφύσασα τένοντας εὐθὺ τῶν δακτύλων ἤγαγεν. 
ἃ γὰρ οἱ παλαιοὶ καλοῦσι νεῦρα, ταυτὶ τὰ προφανῆ, τὰ κινοῦντα τοὺς δακτύλους, οἱ τένοντές εἰσιν· 

In [None]:
doc_noclean = nlp("Ταῦτ’ οὖν εἴς τε τὸν παρόντα καὶ τὸν ἑξῆς ἅπαντα λόγον οἷον ὑποθέσεις τινὰς τῶν ἀποδείξεων λαμβάνοντες ἐν ἑκάστῳ τῶν ὀργάνων τὴν ἐξ αὐτῶν ὠφέλειαν ἐροῦμεν ἀπὸ τῶν δακτύλων αὖθις ἀρξάμενοι. ἐπειδὴ γὰρ τὴν τῶν ὀστῶν κατασκευὴν ἐν αὐτοῖς ἐπιτηδειοτάτην ὀργάνοις ἀντιληπτικοῖς ἡ φύσις ἐποίησεν, ἦν δ’ ἀμήχανον αὐτοῖς τοῖς ὀστοῖς οὕτω γεώδεσί τε καὶ λιθώδεσιν οὖσι μεταδοῦναι τῆς καθ’ ὁρμὴν κινήσεως, ἐξεῦρεν, ὅτῳ τρόπῳ δι’ ἑτέρων αὐτὰ κινήσει. τῶν οὖν κατὰ τὸν πῆχυν μυῶν ἀποφύσασα τένοντας εὐθὺ τῶν δακτύλων ἤγαγεν. ἃ γὰρ οἱ παλαιοὶ καλοῦσι νεῦρα, ταυτὶ τὰ προφανῆ, τὰ κινοῦντα τοὺς δακτύλους, οἱ τένοντές εἰσιν·")
for sent in doc_noclean.sents:
    print(sent.text)
#for ent in doc.ents:
#    print(ent.text, ent.start_char, ent.end_char, ent.label_)
for token in doc_noclean:
     print(token.text, token.pos_, token.dep_, token.ent_type_, token.lemma_)

In [None]:
#Test text prompt
test_text = input("Enter your testing text: ")
#clean text
test = clean_text(test_text)
#predict
doc = nlp(test)
for sent in doc.sents:
    print(sent.text)
#for ent in doc.ents:
#    print(ent.text, ent.start_char, ent.end_char, ent.label_)
for token in doc:
     print(token.text, token.pos_, token.dep_, token.ent_type_, token.lemma_)

In [None]:
cleancomparetext = 'Ταῦτ’ οὖν εἴς τε τὸν παρόντα καὶ τὸν ἑξῆς ἅπαντα λόγον οἷον ὑποθέσεις τινὰς τῶν ἀποδείξεων λαμβάνοντες ἐν ἑκάστῳ τῶν ὀργάνων τὴν ἐξ αὐτῶν ὠφέλειαν ἐροῦμεν ἀπὸ τῶν δακτύλων αὖθις ἀρξάμενοι. ἐπειδὴ γὰρ τὴν τῶν ὀστῶν κατασκευὴν ἐν αὐτοῖς ἐπιτηδειοτάτην ὀργάνοις ἀντιληπτικοῖς ἡ φύσις ἐποίησεν, ἦν δ’ ἀμήχανον αὐτοῖς τοῖς ὀστοῖς οὕτω γεώδεσί τε καὶ λιθώδεσιν οὖσι μεταδοῦναι τῆς καθ’ ὁρμὴν κινήσεως, ἐξεῦρεν, ὅτῳ τρόπῳ δι’ ἑτέρων αὐτὰ κινήσει. τῶν οὖν κατὰ τὸν πῆχυν μυῶν ἀποφύσασα τένοντας εὐθὺ τῶν δακτύλων ἤγαγεν. ἃ γὰρ οἱ παλαιοὶ καλοῦσι νεῦρα, ταυτὶ τὰ προφανῆ, τὰ κινοῦντα τοὺς δακτύλους, οἱ τένοντές εἰσιν·'

In [None]:
# Compare the lemmas predicted for the raw text with those for the cleaned
# (clean_text) version. Tokens are paired by position with zip, so the
# comparison stops at the end of the shorter doc.
doc_noclean = nlp(cleancomparetext)
doc = nlp(clean_text(cleancomparetext))
for token, token2 in zip(doc, doc_noclean):
    if token.lemma_ == token2.lemma_:
        # both versions agree on the lemma
        print(token.text, ": ", "is the Lemma ", token.lemma_, " ? ")
    else:
        # lemmas differ: cleaned-text lemma first, raw-text lemma second
        print(token.text, ": ", token.lemma_, " or ", token2.lemma_, '? ')

In [None]:
# Parse the cleaned text so that `doc` is a spaCy Doc: the following cells read
# token attributes from it and render it with displaCy, whereas clean_text on
# its own returns a plain str.
doc = nlp(clean_text(cleancomparetext))

In [None]:
for token in doc:
    print(token.text, token.pos_, token.dep_, token.ent_type_, token.lemma_)

In [None]:
nlp.get_pipe("trainable_lemmatizer").label_data

In [None]:
for token in doc:
    print('TOKEN: ', token.text, 'LEMMA: ', token.lemma_, 'POS: ', token.pos_, 'TAG: ', token.tag_, 'DEP: ', token.dep_,
            'SHAPE: ', token.shape_, ' ALPHA: ', token.is_alpha, 'STOP: ', token.is_stop)

In [None]:
displacy.render(doc, style="dep")

### Test model 2

In [None]:
# Test the saved model
# Use "/" for every path component: "\m" is an invalid string escape and, on
# POSIX, a literal backslash inside a directory name rather than a separator.
output_dir2 = Path("../Models/grc_ud_proiel_trf/NER_S_grc_ud_proiel_trf_updating/model-best")

print("Loading from", output_dir2)
nlp2 = spacy.load(output_dir2)

In [None]:
#Test text prompt
test_text = input("Enter your testing text: ")
#clean text
test = clean_text(test_text)
#predict
doc2 = nlp2(test)
for sent in doc2.sents:
    print(sent.text)
for ent in doc2.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

In [None]:
#train.py
import random
import numpy as np

def train_one_epoch(epoch, lr, bs):
    """Simulate one training epoch and return ``(acc, loss)``.

    The values are synthetic: a linear trend in ``epoch`` plus uniform noise
    from ``random.random()``. ``lr`` and ``bs`` are accepted but not used.
    """
    acc = 0.25 + ((epoch/30) +  (random.random()/10))
    loss = 0.2 + (1 - ((epoch-1)/10 +  random.random()/5))
    return acc, loss

def evaluate_one_epoch(epoch):
    """Simulate one validation pass and return ``(acc, loss)``.

    Like ``train_one_epoch``, the result is a trend in ``epoch`` plus noise.
    """
    acc = 0.1 + ((epoch/20) +  (random.random()/10))
    loss = 0.25 + (1 - ((epoch-1)/10 +  random.random()/6))
    return acc, loss

# Fixed hyperparameters for this stand-alone (non-sweep) run.
config = {
    'lr' : 0.0001,
    'bs' : 16,
    'epochs': 5
}

def main():
    """Run the simulated train/validate loop with the values in ``config`` and print the metrics."""
    # In this version the values come from the module-level `config` dict,
    # not from `wandb.config`.
    lr = config['lr']
    bs = config['bs']
    epochs = config['epochs']

    # np.arange(1, epochs) yields 1 .. epochs-1, i.e. epochs-1 iterations.
    # NOTE(review): confirm whether `epochs` iterations were intended.
    for epoch in np.arange(1, epochs):
        train_acc, train_loss = train_one_epoch(epoch, lr, bs)
        val_acc, val_loss = evaluate_one_epoch(epoch)

        print('epoch: ', epoch)
        print('training accuracy:', train_acc,'training loss:', train_loss)
        # Label fixed: the second value on this line is the validation loss.
        print('validation accuracy:', val_acc,'validation loss:', val_loss)

# Call the main function.
main()

In [None]:
import wandb
import numpy as np 
import random

# Define sweep config
sweep_configuration = {
    'method': 'random',
    'name': 'sweep',
    'metric': {'goal': 'maximize', 'name': 'val_acc'},
    'parameters': 
    {
        'batch_size': {'values': [16, 32, 64]},
        'epochs': {'values': [5, 10, 15]},
        'lr': {'max': 0.1, 'min': 0.0001}
     }
}

# Initialize sweep by passing in config. (Optional) Provide a name of the project.
sweep_id = wandb.sweep(sweep=sweep_configuration, project='my-first-sweep')

# Define training function that takes in hyperparameter values from `wandb.config` and uses them to train a model and return metric
def train_one_epoch(epoch, lr, bs):
    """Return a simulated ``(acc, loss)`` pair for one training epoch.

    Both values are a linear trend in ``epoch`` plus uniform noise drawn from
    ``random.random()`` (accuracy noise first, then loss noise). ``lr`` and
    ``bs`` are accepted but do not influence the result.
    """
    acc_noise = random.random() / 10
    acc = 0.25 + ((epoch / 30) + acc_noise)
    loss_noise = random.random() / 5
    loss = 0.2 + (1 - ((epoch - 1) / 10 + loss_noise))
    return acc, loss

def evaluate_one_epoch(epoch):
    """Return a simulated ``(acc, loss)`` pair for one validation pass.

    Both values are a linear trend in ``epoch`` plus uniform noise drawn from
    ``random.random()`` (accuracy noise first, then loss noise).
    """
    acc_noise = random.random() / 10
    acc = 0.1 + ((epoch / 20) + acc_noise)
    loss_noise = random.random() / 6
    loss = 0.25 + (1 - ((epoch - 1) / 10 + loss_noise))
    return acc, loss

def main():
    """Run one sweep trial: read hyperparameters from ``wandb.config`` and log simulated metrics."""
    run = wandb.init()

    # Hyperparameters are supplied by the sweep through `wandb.config`
    # rather than being hard-coded.
    sweep_values = wandb.config
    lr = sweep_values.lr
    bs = sweep_values.batch_size
    epochs = sweep_values.epochs

    # np.arange(1, epochs) yields 1 .. epochs-1.
    for epoch in np.arange(1, epochs):
        train_acc, train_loss = train_one_epoch(epoch, lr, bs)
        val_acc, val_loss = evaluate_one_epoch(epoch)

        epoch_metrics = {
            'epoch': epoch,
            'train_acc': train_acc,
            'train_loss': train_loss,
            'val_acc': val_acc,
            'val_loss': val_loss,
        }
        wandb.log(epoch_metrics)

# Start sweep job.
wandb.agent(sweep_id, function=main, count=4)

Default spaCy:
Ταῦτ’ -> οὗτος
οὖν -> οὖν
εἴς -> εἰς
τε -> τε
τὸν -> ὁ
παρόντα -> πάρειμι
καὶ -> καί
τὸν -> ὁ
ἑξῆς -> ἑξής
ἅπαντα -> ἅπας
λόγον -> λόγος
οἷον -> οἷος
ὑποθέσεις -> ὑπόθεσις
τινὰς -> τις
τῶν -> ὁ
ἀποδείξεων -> ἀπόδειξις
λαμβάνοντες -> λαμβάνω
ἐν -> ἐν

Grecy:
Ταῦτ’ -> οὗτος
οὖν -> οὖν
εἴς -> εἰς
τε -> τε
τὸν -> ὁ
παρόντα -> πάρειμι
καὶ -> καί
τὸν -> ὁ
ἑξῆς -> ἑξής
ἅπαντα -> ἅπας
λόγον -> λόγος
οἷον -> οἷος
ὑποθέσεις -> ὑπόθεσις
τινὰς -> τις

In [None]:
from spacy.tokens import Doc, DocBin, Span
train_data = DocBin().from_disk('../Corpus/train.spacy')
train_data_docs = list(train_data.get_docs(nlp.vocab))
dev_data = DocBin().from_disk('../Corpus/dev.spacy')
dev_data_docs = list(dev_data.get_docs(nlp.vocab))
test_data = DocBin().from_disk('../Corpus/test.spacy')
test_data_docs = list(test_data.get_docs(nlp.vocab))


In [None]:
import unicodedata
φλέψ = "φλέψ"
# Normalize the search lemma to the NFKD (decomposed) Unicode form.
def normalize_text(text):
    """Return ``text`` in Unicode NFKD normalization form."""
    return unicodedata.normalize('NFKD', text)

φλέψ = normalize_text(φλέψ)
# Later cells look this lemma up under the name `texwor`; bind it here so they
# run on a fresh kernel instead of raising NameError.
texwor = φλέψ


In [None]:
print(texwor)

In [None]:
textartery = 'ἀρτηρία'
textartery = normalize_text(textartery)

In [None]:
# `train_data_docs_norm` is never defined in this notebook, so iterate the training
# docs loaded from the DocBin instead.
# NOTE(review): if a normalized copy of the corpus was intended, build it explicitly
# before this loop.
for doc in train_data_docs:
    docced = nlp(doc)
    # if "φλέβες" occurs in the text of the doc, print the text and every token's annotations
    if "φλέβες" in docced.text:
        print(docced.text)
        for token in docced:
            print(token.text, token.pos_, token.dep_, token.ent_type_, token.lemma_)



In [None]:
for doc in train_data_docs:
    docced = nlp(doc)
    for token in docced:
        if token.lemma_ == texwor:
            print(docced, ' | ', token.text, token.pos_, token.dep_, token.ent_type_, token.lemma_)
        #if token.lemma_ == textartery:
          #  print(docced, ' | ', token.text, token.pos_, token.dep_, token.ent_type_, token.lemma_)

In [None]:
for doc in dev_data_docs:
    docced = nlp(doc)
    for token in docced:
        #if token.lemma_ == texwor:
        #    print(docced, ' | ', token.text, token.pos_, token.dep_, token.ent_type_, token.lemma_)
        if token.lemma_ == textartery:
            print(docced, ' | ', token.text, token.pos_, token.dep_, token.ent_type_, token.lemma_)

In [None]:
for doc in test_data_docs:
    docced = nlp(doc)
    for token in docced:
        if token.lemma_ == texwor:
            print(docced, ' | ', token.text, token.pos_, token.dep_, token.ent_type_, token.lemma_)
            #displacy.render(docced, style='ent', jupyter=True)
        #if token.lemma_ == textartery:
        #    print(docced, ' | ', token.text, token.pos_, token.dep_, token.ent_type_, token.lemma_)
            # displacy
        #    displacy.render(docced, style='ent', jupyter=True)

In [None]:
for doc in test_data_docs:
    docced = nlp(doc)
    for token in docced:
        print(token.text, token.pos_, token.dep_, token.ent_type_, token.lemma_)

In [None]:
for doc in test_data_docs:
    docced = nlp(doc)
    # Print the doc once, at its first "Body Part" entity, then move on to the next doc.
    for ent in docced.ents:
        if ent.label_ =="Body Part":
            # Report the entity's own root token and its lemma. The bare `token` used
            # here before was not bound by this loop: it was whatever an earlier cell
            # left behind (or undefined on a fresh kernel).
            print(docced, ' | ', ent.text, ent.label_, ent.root, ent.root.lemma_)
            break

# Test body part and lemma finding

In [None]:
text = "Προϊόντι γὰρ καὶ καταβαίνοντι τῷ περιττώματι εὐρυχωρία γίνεται, καὶ πρὸς τὸ μεταβάλλειν ἱσταμένῳ τοῖς εὐχιλοτέροις τῶν ζῴων καὶ πλείονος δεομένοις τροφῆς, διὰ τὸ μέγεθος ἢ τὴν θερμότητα τῶν τόπων. Εἶτ' ἐντεῦθεν πάλιν, ὥσπερ ἀπὸ τῆς ἄνω κοιλίας δέχεται στενώτερον ἔντερον, οὕτως ἐκ τοῦ κώλου καὶ τῆς εὐρυχωρίας ἐν τῇ κάτω κοιλίᾳ πάλιν εἰς στενώτερον ἔρχεται καὶ εἰς τὴν ἕλικα τὸ περίττωμα ἐξικμασμένον πάμπαν."

In [None]:
colors = {"Body Part": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}
options = {"ents": ["Body Part"], "colors": colors}

In [None]:
nlp = spacy.load("../training/NER2/model-best")
nlp_lemmatizer = spacy.load("../training/transformer/lemmatizer/model-best")

nlp.add_pipe('lemmatizer', source=nlp_lemmatizer, before='trainable_lemmatizer')
nlp.remove_pipe('trainable_lemmatizer')

nlp.remove_pipe('transformer')
nlp.add_pipe('transformer', source=nlp_lemmatizer, before='morphologizer')

In [None]:
nlp_ner = spacy.load("../training/NER2/model-best")
nlp = spacy.load("../training/transformer/lemmatizer/model-best")

nlp.add_pipe('ner', source=nlp_ner, last=True)


In [None]:
# Add an attribute ruler (placed before "ner") that sets the entity type to "BodyPart"
# on tokens whose lemma is "κοιλία".
# NOTE(review): the label written here is "BodyPart" (no space), while the displaCy
# options and the entity checks elsewhere in this notebook use "Body Part" - confirm
# which spelling the model's labels use.
ruler = nlp.add_pipe("attribute_ruler", before="ner")
# one single-token pattern: a token whose LEMMA is "κοιλία"
patterns = [[{"LEMMA": "κοιλία"}]]
attrs = {"ENT_TYPE": "BodyPart"}
ruler.add(patterns=patterns, attrs=attrs)



In [None]:
nlp.pipe_names

In [None]:
with open("../assets/Evaluations/Aristotle_Partibus_Animalium_675a31-6a5.txt", encoding='utf8') as text:
    text = text.read()
    docs = nlp(text)

In [None]:
#from spacy.tokens import Doc, DocBin, Span
#test_data = DocBin().from_disk("../Archive/corpus_morefiles_model/test/test.spacy")
#docs = list(test_data.get_docs(nlp.vocab))

In [None]:
text= "Μετὰ δὲ τὴν κοιλίαν ἡ τῶν ἐντέρων ἔγκειται φύσις πᾶσι τοῖς ζῴοις. Ἔχει δὲ διαφορὰς πολλάς, καθάπερ ἡ κοιλία, καὶ τοῦτο τὸ μόριον. Τοῖς μὲν γὰρ ἁπλοῦν ἐστι καὶ ὅμοιον ἀναλυόμενον, τοῖς δ' ἀνόμοιον· ἐνίοις μὲν γὰρ εὐρύτερον τὸ πρὸς τῇ κοιλίᾳ, τὸ δὲ πρὸς τῷ τέλει στενώτερον (διόπερ αἱ κύνες μετὰ πόνου προΐενται τὴν τοιαύτην περίττωσιν), τοῖς δὲ πλείοσιν ἄνωθεν στενώτερον, πρὸς τῷ τέλει δ' εὐρύτερον.[…] Πᾶσι δὲ τοῖς μὴ εὐθυεντέροις προϊοῦσιν εὐρύτερον γίνεται τὸ μόριον τοῦτο, καὶ τὸ καλούμενον κόλον ἔχουσι, καὶ τοῦ ἐντέρου τυφλόν τι καὶ ὀγκῶδες, εἶτ' ἐκ τούτου πάλιν στενώτερον καὶ εἱλιγμένον. Τὸ δὲ μετὰ τοῦτο εὐθὺ πρὸς τὴν ἔξοδον διατείνει τοῦ περιττώματος, καὶ τοῖς μὲν τοῦτο τὸ μόριον, ὁ καλούμενος ἀρχός, κνισσώδης ἐστί, τοῖς δ' ἀπίμελος."
#text = "Ὑπὸ δὲ τὸ ὑπόζωμα κεῖται ἡ κοιλία τοῖς ζῴοις, τοῖς μὲν ἔχουσιν οἰσοφάγον ᾗ τελευτᾷ τοῦτο τὸ μόριον, τοῖς δὲ μὴ ἔχουσιν εὐθὺς πρὸς τῷ στόματι· τῆς δὲ κοιλίας ἐχόμενον τὸ καλούμενον ἔντερον. Δι' ἣν δ' αἰτίαν ἔχει ταῦτα τὰ μόρια τῶν ζῴων ἕκαστον, φανερὸν πᾶσιν. Καὶ γὰρ δέξασθαι τὴν εἰσελθοῦσαν τροφὴν καὶ τὴν ἐξικμασμένην ἀναγκαῖον ἐκπέμψαι, καὶ μὴ τὸν αὐτὸν τόπον εἶναι τῆς τε ἀπέπτου καὶ τοῦ περιττώματος, εἶναί τέ τινα δεῖ τόπον ἐν ᾧ μεταβάλλει."

In [None]:
# add BodyPart label to ner in pipeline
ner = nlp.get_pipe("ner")
ner.add_label("BodyPart")

In [None]:
nlp_ner.pipe_labels

In [None]:
import spacy
spacy.require_gpu()
from spacy import displacy

In [None]:
nlp=spacy.load("/root/Projects/Atlomy/git/greCy_ATLOMY/training/transformer/NER/model-best")

In [None]:
text =("Ταῦτ’ οὖν εἴς τε τὸν παρόντα καὶ τὸν ἑξῆς ἅπαντα λόγον οἷον ὑποθέσεις τινὰς τῶν ἀποδείξεων λαμβάνοντες ἐν ἑκάστῳ τῶν ὀργάνων τὴν ἐξ αὐτῶν ὠφέλειαν ἐροῦμεν ἀπὸ τῶν δακτύλων αὖθις ἀρξάμενοι.")

In [None]:
# parse the text
doc = nlp(text)
# print each token with its entity type, then render the doc's entities with displaCy
for token in doc:
    print (token.text, token.ent_type_)
displacy.render(doc, style="ent", jupyter=True)



In [None]:
# parse the text
doc = nlp(text)
# print each token with its entity type, then render the doc's entities with displaCy
for token in doc:
    print (token.text, token.ent_type_)
displacy.render(doc, style="ent", jupyter=True)



In [None]:
print([(ent.text, ent.label_) for ent in doc.ents])

In [None]:
for token in doc:
    if token.lemma_ == "κοιλία":
        print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.ent_type_)

In [None]:
docced = nlp(text)
for token in docced:
    if token.lemma_ == "κοιλία":
        print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_)


In [None]:
docced = nlp(text)
for sent in docced:
    print(sent.text, sent.lemma_, sent.pos_, sent.tag_, sent.dep_, sent.ent_type_)

In [None]:
# Render the entity view once. The previous loop re-rendered the same whole-doc
# visualization for every token in `docced`.
displacy.render(docced, style='ent', jupyter=True, options=options)


In [None]:
# find lemma in doclines sentences. If found, print sentence and move to next sentence
docced = nlp(text)
#for doc in docs:
#    docced = nlp(doc)
for token in docced:
    #print(token.text, token.pos_, token.dep_, token.ent_type_, token.lemma_)
    if token.ent_type_ == "Body Part" and token.lemma_ == "κοιλία":
        print(token.text, token.pos_, token.dep_, token.ent_type_, token.lemma_)
        #print(docced, ' | ', token.text, token.pos_, token.dep_, token.ent_type_, token.lemma_)
        #print(docced, token.lemma_, token.ent_type_)
        displacy.render(docced, style='ent', jupyter=True, options=options)
        break
    

In [None]:
# Print text, POS, dependency label, entity type and lemma for every token of every doc.
# NOTE(review): `doclines` is not defined anywhere in the visible notebook - presumably an
# iterable of already-parsed Docs from a removed cell; define it before running this cell.
for doc in doclines:
    #docced = nlp(doc)
    for token in doc:
        print(token.text, token.pos_, token.dep_, token.ent_type_, token.lemma_)

In [None]:
# For each doc, render its entities at the first token that is tagged "Body Part" and
# whose lemma equals the value held in the variable φλέψ, then move on to the next doc.
# NOTE(review): `doclines` is not defined anywhere in the visible notebook - presumably an
# iterable of already-parsed Docs from a removed cell; define it before running this cell.
for doc in doclines:
    #docced = nlp(doc)
    for token in doc:
        #find lemma
        if token.ent_type_ == "Body Part" and token.lemma_ == φλέψ:
            displacy.render(doc, style='ent', jupyter=True, options=options)
            break

In [None]:
# `docs` is the single Doc parsed from the evaluation text file, so this loop walks its tokens.
for token in docs:
    # find the first token whose lemma is κοιλία and render the sentence that contains it
    if token.lemma_ == 'κοιλία':
        # A Token exposes its sentence as `.sent` (a Span); it has no `.sents` attribute.
        displacy.render(token.sent, style='ent', jupyter=True, options=options)
        break
    # The former second check (entity type "Body Part" plus the same lemma) is dropped:
    # any token satisfying it had already matched the test above and hit `break`, and it
    # compared the lemma against an undefined bare name instead of the string 'κοιλία'.

In [None]:
for doc in test_data_docs:
    docced = nlp(doc)
    for token in docced:
        #find lemma
        if token.ent_type_ == "Body Part" and token.lemma_ == φλέψ:
            displacy.render(docced, style='ent', jupyter=True, options=options)
            break

In [None]:
for doc in test_data_docs:
    docced = nlp(doc)
    for token in docced:
        # if entity is "Body Part" then print the text
        if token.ent_type_ == "Body Part":
            displacy.render(docced, style='ent', jupyter=True)
            break


In [None]:
!python -m spacy package ../Models/grc_ud_proiel_trf/grc_ud_proiel_trf_Lem_NER/model-best ../Models/grc_ud_proiel_trf/trf_Lem_NER --name trf_Lem_NER --version 0.1

In [None]:
output_dir = Path("../Models/grc_ud_proiel_trf/grc_ud_proiel_trf_Lem_NER/model-best")


print("Loading from", output_dir)
nlp = spacy.load(output_dir)
# disable parser
#nlp.disable_pipes("parser")

nlp2 = spacy.load("el_core_news_sm")

In [None]:
nlp.pipe_names

In [None]:
paragraph = 'Ταῦτ’ οὖν εἴς τε τὸν παρόντα καὶ τὸν ἑξῆς ἅπαντα λόγον οἷον ὑποθέσεις τινὰς τῶν ἀποδείξεων λαμβάνοντες ἐν ἑκάστῳ τῶν ὀργάνων τὴν ἐξ αὐτῶν ὠφέλειαν ἐροῦμεν ἀπὸ τῶν δακτύλων αὖθις ἀρξάμενοι. ἐπειδὴ γὰρ τὴν τῶν ὀστῶν κατασκευὴν ἐν αὐτοῖς ἐπιτηδειοτάτην ὀργάνοις ἀντιληπτικοῖς ἡ φύσις ἐποίησεν, ἦν δ’ ἀμήχανον αὐτοῖς τοῖς ὀστοῖς οὕτω γεώδεσί τε καὶ λιθώδεσιν οὖσι μεταδοῦναι τῆς καθ’ ὁρμὴν κινήσεως, ἐξεῦρεν, ὅτῳ τρόπῳ δι’ ἑτέρων αὐτὰ κινήσει. τῶν οὖν κατὰ τὸν πῆχυν μυῶν ἀποφύσασα τένοντας εὐθὺ τῶν δακτύλων ἤγαγεν. ἃ γὰρ οἱ παλαιοὶ καλοῦσι νεῦρα, ταυτὶ τὰ προφανῆ, τὰ κινοῦντα τοὺς δακτύλους, οἱ τένοντές εἰσιν·'
#paragraph = normalize_text(paragraph)

docced = nlp(paragraph)
# print sentences
#for sent in docced.sents:
    #print(sent, ' | ', "sentence")

#for doc in docced:
#    print(doc.text, doc.pos_, doc.dep_, doc.ent_type_, doc.lemma_)
for token in docced:
#    if token.lemma_ == texwor:
    #if token.pos_ == 'PUNCT':
        print(token, ' | ', token.text, token.pos_, token.dep_, token.ent_type_, token.lemma_)
        #displacy.render(docced, style='ent', jupyter=True)
    #if token.lemma_ == textartery:
    #    print(docced, ' | ', token.text, token.pos_, token.dep_, token.ent_type_, token.lemma_)
        # displacy
    #displacy.render(sent, style='dep', jupyter=True)
    #for token in sent:
    #    print(sent, ' | ', token.text, token.pos_, ' | ',token.dep_, token.ent_type_, token.lemma_)
