<a href="https://colab.research.google.com/github/Derinhelm/graph_syntax_parsing/blob/main/Parser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installation

In [1]:
!pip install -q transformers

In [2]:
!pip install torch_geometric

from IPython.display import clear_output

clear_output()

# Loading scripts

In [3]:
# Fetch the project sources; later cells read code and data from /content/graph_syntax_parsing.
!git clone https://github.com/Derinhelm/graph_syntax_parsing.git

Cloning into 'graph_syntax_parsing'...
remote: Enumerating objects: 358, done.[K
remote: Counting objects: 100% (96/96), done.[K
remote: Compressing objects: 100% (50/50), done.[K
remote: Total 358 (delta 51), reused 84 (delta 45), pack-reused 262[K
Receiving objects: 100% (358/358), 97.11 MiB | 22.83 MiB/s, done.
Resolving deltas: 100% (205/205), done.
Updating files: 100% (22/22), done.


# Logging

# uuparser/utils.py

# uuparser/multilayer_perceptron.py


# uuparser/arc_hybrid.py

## Config

## Oracle

## Parser

# uuparser/parser.py

In [4]:
import sys
# Put the cloned repository first on the import path
# (presumably so modules inside it can import each other — TODO confirm).
sys.path.insert(0, '/content/graph_syntax_parsing')

In [5]:
from graph_syntax_parsing.utils import set_seeds, read_conll, ConllEntry, get_irels
from graph_syntax_parsing.project_logging import logging
from graph_syntax_parsing.project_parser import Parser

root - DEBUG - New debug
root - INFO - New info

root - DEBUG - New debug
root - INFO - New info
root - DEBUG - New debug
root - INFO - New info



In [6]:
import pickle, os, time, sys, copy, itertools, re, random

from shutil import copyfile

def evaluate_uas(sentence_descr):
    """Return the unlabeled attachment score (UAS) of one parsed sentence.

    Args:
        sentence_descr: list describing a sentence; elements 0, 1, 2 are
            auxiliary and every following element is treated as a token.

    Returns:
        Fraction of tokens whose predicted head (``pred_parent_id``) equals
        the gold head (``parent_id``). Elements that are not ``ConllEntry``
        instances never count as correct but still count in the denominator.
        Returns 0.0 when the sentence has no tokens, instead of dividing by
        zero.
    """
    tokens = sentence_descr[3:]
    if len(tokens) == 0:
        # Nothing to score: avoid ZeroDivisionError on a token-less sentence.
        return 0.0
    # TODO: investigate the cases where a token is not a ConllEntry — a reading error?
    right_parent_tokens = sum(
        1
        for token in tokens
        if isinstance(token, ConllEntry) and token.pred_parent_id == token.parent_id
    )
    return right_parent_tokens / len(tokens)

def evaluate_uas_epoche(sentence_list):
    """Return the mean per-sentence UAS over a list of parsed sentences.

    Args:
        sentence_list: sequence of sentence descriptions accepted by
            ``evaluate_uas``.

    Returns:
        Arithmetic mean of ``evaluate_uas`` over all sentences (every sentence
        has equal weight regardless of its length), or 0.0 when the list is
        empty instead of dividing by zero.
    """
    if len(sentence_list) == 0:
        # No predictions at all: report 0.0 rather than raising ZeroDivisionError.
        return 0.0
    return sum(evaluate_uas(sent) for sent in sentence_list) / len(sentence_list)

# Module-level handle to the Parser created by the latest run() call.
p = None

def run(traindata, valdata, testdata, embeds, options):
    """Train the parser, pick the best epoch on dev data and score it on test data.

    Args:
        traindata: training sentences; passed to ``get_irels`` and ``Parser.Train``.
        valdata: development sentences; passed to ``Parser.Predict`` after every epoch.
        testdata: test sentences; passed to ``Parser.Predict`` once at the end.
        embeds: word embeddings handed to the ``Parser`` constructor.
        options: dict of settings handed to the ``Parser`` constructor. This
            function itself reads ``options["first_epoch"]`` and
            ``options["epochs"]`` (inclusive epoch range).

    Side effects:
        Stores the parser in the module-level ``p``, saves the model after
        every epoch via ``parser.Save(epoch)``, writes log records and prints
        the dev/test scores. Returns nothing.
    """
    irels = get_irels(traindata)
    logging.debug('Initializing the model')
    parser = Parser(options, irels, embeds)
    global p
    p = parser

    dev_best = [options["epochs"], -1.0]  # [best epoch, best score]

    for epoch in range(options["first_epoch"], options["epochs"] + 1):
        # Training
        logging.info(f'Starting epoch {epoch} (training)')
        parser.Train(traindata)
        logging.info(f'Finished epoch {epoch} (training)')

        # Every epoch is saved so that the best one can be reloaded afterwards.
        parser.Save(epoch)

        logging.info(f"Predicting on dev data")
        dev_pred = list(parser.Predict(valdata))
        mean_dev_score = evaluate_uas_epoche(dev_pred)
        logging.info(f"Dev score {mean_dev_score:.2f} at epoch {epoch:d}")
        print(f"Dev score {mean_dev_score:.2f} at epoch {epoch:d}")

        if mean_dev_score > dev_best[1]:
            dev_best = [epoch, mean_dev_score]  # update best dev score

    logging.info(f"Loading best model from epoche{dev_best[0]:d}")
    # Loading best_models to parser.labeled_GNN and parser.unlabeled_GNN.
    # Use the recorded best epoch: the loop variable `epoch` is merely the last
    # epoch trained (and is undefined when the epoch range is empty).
    parser.Load(dev_best[0])

    logging.info(f"Predicting on test data")

    test_pred = list(parser.Predict(testdata))
    mean_test_score = evaluate_uas_epoche(test_pred)

    logging.info(f"On test obtained UAS score of {mean_test_score:.2f}")
    print(f"On test obtained UAS score of {mean_test_score:.2f}")

    logging.debug('Finished predicting')


# Execution

In [7]:
!mkdir "/content/models"

In [8]:
real_dataset = False  # True: full UD_Russian-SynTagRus (train split into parts a/b/c); False: the "-small" variant
colab_using = True  # presumably marks a Google Colab runtime — TODO confirm what it is meant to change
embed_pickle_using = True  # True: load embeddings from embeds.pickle; False: compute them with BERT

In [9]:
# Read the treebank splits. Every path below is already absolute, so nothing is
# prefixed here. The former `for p in [...]: p = prefix + p` loops only rebound
# the loop variable (no path ever changed) and overwrote the global `p` that
# run() uses to expose the parser, so they were removed.
if real_dataset:
  train_a_dir = '/content/graph_syntax_parsing/UD_Russian-SynTagRus/ru_syntagrus-ud-train-a.conllu'
  train_b_dir = '/content/graph_syntax_parsing/UD_Russian-SynTagRus/ru_syntagrus-ud-train-b.conllu'
  train_c_dir = '/content/graph_syntax_parsing/UD_Russian-SynTagRus/ru_syntagrus-ud-train-c.conllu'

  val_dir = '/content/graph_syntax_parsing/UD_Russian-SynTagRus/ru_syntagrus-ud-dev.conllu'
  test_dir = '/content/graph_syntax_parsing/UD_Russian-SynTagRus/ru_syntagrus-ud-test.conllu'

  # The full training split ships as three files; read each and concatenate.
  train_a, train_words_a = read_conll(train_a_dir)
  train_b, train_words_b = read_conll(train_b_dir)
  train_c, train_words_c = read_conll(train_c_dir)
  train = train_a + train_b + train_c
else:
  train_dir = '/content/graph_syntax_parsing/UD_Russian-SynTagRus-small/ru_syntagrus-ud-train.conllu'
  val_dir = '/content/graph_syntax_parsing/UD_Russian-SynTagRus-small/ru_syntagrus-ud-dev.conllu'
  test_dir = '/content/graph_syntax_parsing/UD_Russian-SynTagRus-small/ru_syntagrus-ud-test.conllu'
  train, train_words = read_conll(train_dir)
# Dev and test are single files in both variants.
val, val_words = read_conll(val_dir)
test, test_words = read_conll(test_dir)

In [10]:
# The combined vocabulary is only needed when embeddings are computed from
# scratch; with a pickled embedding table this cell does nothing.
if not embed_pickle_using:
  all_words = (
      (train_words_a | train_words_b | train_words_c) if real_dataset else train_words
  ) | val_words | test_words


In [11]:
# Settings passed to Parser(...) by run(); run() itself reads "first_epoch" and "epochs".
options = {
    "hidden_dims": 100,  # MLP hidden layer dimensions
    "learning_rate": 0.001,  # Learning rate for neural network optimizer
    # NOTE(review): the previous comment here read "Use the static oracle instead
    # of the dynamic oracle", which contradicts the flag name — confirm which
    # oracle True actually selects.
    "dynamic_oracle": True,
    "epochs": 10,  # Number of epochs (last epoch index, inclusive)
    "first_epoch": 1,  # First epoch index used by run()
}

# really important to do this before anything else to make experiments reproducible
set_seeds()

In [12]:
from transformers import AutoTokenizer, BertModel
def get_embed(tokenizer, model, word): # TODO: rewrite or remove!
    """Embed a single word with the given tokenizer/model pair.

    The word is tokenized with ``return_tensors="pt"``, fed to ``model`` and
    the hidden state at index ``[0][0]`` of ``last_hidden_state`` (first batch
    item, first position) is returned, detached and moved to the CPU.
    """
    encoded = tokenizer(word, return_tensors="pt")
    first_position_state = model(**encoded).last_hidden_state[0][0]
    return first_position_state.detach().cpu()

def create_embeds(embed_pickle=None, words=None):
    """Return word embeddings, either computed with BERT or loaded from a pickle.

    Args:
        embed_pickle: path of a pickle file with precomputed embeddings. When
            None, embeddings are computed with the "bert-base-uncased"
            tokenizer and model instead.
        words: iterable of words to embed when ``embed_pickle`` is None.
            Defaults to the notebook-level ``all_words`` so existing calls
            keep working; ignored when ``embed_pickle`` is given.

    Returns:
        When computing: a dict mapping each word to the value returned by
        ``get_embed``. When loading: whatever object the pickle file contains
        (expected to be the same kind of mapping — verify against the file).

    Raises:
        NameError: if embeddings must be computed, ``words`` is None and the
            global ``all_words`` has not been defined.
    """
    if embed_pickle is None:
        # Resolve the vocabulary first so a missing `all_words` fails before
        # the (slow) model download starts.
        vocabulary = all_words if words is None else words
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        model = BertModel.from_pretrained("bert-base-uncased")

        logging.debug('Creating embeddings')
        ts = time.time()

        embeds = {word: get_embed(tokenizer, model, word) for word in vocabulary}
        logging.debug(f'{len(embeds)} embeddings were created')
        te = time.time()
        logging.info(f'Time of embedding creation: {te-ts:.2g}s')
    else:
        ts = time.time()
        # NOTE: pickle.load can execute arbitrary code from the file; only
        # load embedding pickles that come from a trusted source.
        with open(embed_pickle, 'rb') as f:
            embeds = pickle.load(f)
        te = time.time()
        logging.info(f'Time of embedding downloading: {te-ts:.2g}s')
    return embeds

In [13]:
import time
import pickle
# Either compute the embeddings or load the pickle that ships with the chosen dataset.
if not embed_pickle_using:
  embeds = create_embeds()
elif real_dataset:
  embeds = create_embeds("/content/graph_syntax_parsing/UD_Russian-SynTagRus/embeds.pickle")
else:
  embeds = create_embeds("/content/graph_syntax_parsing/UD_Russian-SynTagRus-small/embeds.pickle")


In [14]:
# Sanity check: number of entries in the embedding table.
len(embeds)

516

In [15]:
# Sanity check: size of the training data returned by read_conll.
len(train)

5

In [16]:
from pathlib import Path

# Show the current contents of the log file /content/app.log.
print(Path('/content/app.log').read_text())

root - DEBUG - New debug
root - INFO - New info
root - DEBUG - New debug
root - INFO - New info
jaxlib.mlir._mlir_libs - DEBUG - Initializing MLIR with module: _site_initialize_0
jaxlib.mlir._mlir_libs - DEBUG - Registering dialects from initializer <module 'jaxlib.mlir._mlir_libs._site_initialize_0' from '/usr/local/lib/python3.10/dist-packages/jaxlib/mlir/_mlir_libs/_site_initialize_0.so'>
jax._src.path - DEBUG - etils.epath found. Using etils.epath for file I/O.
root - INFO - Reading /content/graph_syntax_parsing/UD_Russian-SynTagRus-small/ru_syntagrus-ud-train.conllu
root - DEBUG - 5 sentences read
root - INFO - Time: 0.0012s
root - INFO - Reading /content/graph_syntax_parsing/UD_Russian-SynTagRus-small/ru_syntagrus-ud-dev.conllu
root - DEBUG - 25 sentences read
root - INFO - Time: 0.011s
root - INFO - Reading /content/graph_syntax_parsing/UD_Russian-SynTagRus-small/ru_syntagrus-ud-test.conllu
root - DEBUG - 27 sentences read
root - INFO - Time: 0.006s
root - DEBUG - Using default 

In [17]:
# NOTE(review): the output saved with this notebook shows this call ending in an
# IndexError; the traceback was not preserved, so the failing frame cannot be
# identified from the notebook alone.
run(train, val, test, embeds, options)



IndexError: ignored

In [None]:
# NOTE(review): `evaluate_time` and `transform_time` are not assigned in any
# visible cell, so this line raises NameError as written (the cell has no
# execution count). Define them or remove the cell.
logging.info(f'evaluate_time: {evaluate_time:.2g}s, transform_time:{transform_time:.2g}')

In [None]:
from google.colab import files

# Download the log through the Colab helper; "app.log" is resolved relative to
# the current working directory.
files.download("app.log")

# TODO

TODO:
В sentence последний элемент — служебный корневой узел (`*root*`):

{'id': 0,
 'form': '*root*',
 'char_rep': '*root*',
 'norm': '*root*',
 'cpos': 'ROOT-CPOS',
 'pos': 'ROOT-POS',
 'parent_id': -1,
 'relation': 'rroot',
 'lemma': '*root*',
 'feats': '_',
 'deps': '_',
 'misc': '_',
 'pred_parent_id': None,
 'pred_relation': None,
 'treebank_id': None,
 'proxy_tbank': None,
 'pred_pos': None,
 'pred_cpos': None,
 'projective_order': 0,
 'rdeps': [8],
 'children': [],
 'scores': None,
 'parent': None,
 'vecs': None}


В какую сторону сейчас растёт стек в коде?
В коде используются stack[-1] и stack[-2].
Это стек или очередь?

Разобраться, какие метрики считают при обучении (на train)