<a href="https://colab.research.google.com/github/Derinhelm/parser_stat/blob/main/parser_running.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Downloading the code

In [None]:
# Clone the project repository into the current working directory; later cells
# import `data_classes` and load the pickled treebank data from this checkout.
!git clone https://github.com/Derinhelm/parser_stat.git

Cloning into 'parser_stat'...
remote: Enumerating objects: 198, done.[K
remote: Counting objects: 100% (76/76), done.[K
remote: Compressing objects: 100% (54/54), done.[K
remote: Total 198 (delta 43), reused 41 (delta 22), pack-reused 122 (from 1)[K
Receiving objects: 100% (198/198), 45.05 MiB | 10.26 MiB/s, done.
Resolving deltas: 100% (103/103), done.
Updating files: 100% (18/18), done.


In [None]:
import pickle

import sys
sys.path.append('/content/parser_stat')

from data_classes import ConllEntry, Sentence

# Preparation

In [None]:
from IPython.display import clear_output

In [None]:
import time
import traceback


In [None]:
# Pickled treebank test sets shipped inside the cloned repository.
pickle_data_path = "/content/parser_stat/treebank_test_sets/treebank_data.pickle"

# NOTE: unpickling executes code embedded in the file, so only load pickles
# that come from a trusted source.
with open(pickle_data_path, mode='rb') as pickle_file:
    data = pickle.load(pickle_file)

In [None]:
# Sanity check: display the raw text of the first sentence stored under 'gsd'.
data['gsd'][0].text

'Билли начал играть за резервный состав ``Черка&#39;&#39; в возрасте 16 лет, а через пару сезонов был приглашён в основной состав.'

# Running UDepPLLaMA

In [None]:
!pip install peft transformers bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.13.0->peft)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.13.0->peft)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.13.0

In [None]:
import transformers
import torch
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer, GenerationConfig
from peft import PeftModel

In [None]:
# Bracket characters that delimit nodes in the tree string produced by the LLM.
OP = '['
CP = ']'


class UDepPLLaMAParser:
    """Dependency parser driven by the U-DepPLLaMA adapter on top of Llama-2-7b.

    The LLM is prompted with a raw sentence and is expected to answer with a
    bracketed tree (nodes delimited by ``OP``/``CP``), e.g.
    ``[ROOT [NSUBJ [He]] [recognized]]``.  The remaining methods turn that
    string into a flat list of token dictionaries.
    """

    def __init__(self):
        """Load the 4-bit quantized base model, the PEFT adapter and the tokenizer."""
        # NOTE(review): the 4-bit compute dtype is bfloat16 while torch_dtype
        # below is float16 -- confirm that this mix is intended.
        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16
        )
        model_from = AutoModelForCausalLM.from_pretrained(
            "NousResearch/Llama-2-7b-hf",
            quantization_config=quant_config,
            torch_dtype=torch.float16,
            trust_remote_code=True,
            device_map={"": 0},  # place the whole model on device 0
        )

        model = PeftModel.from_pretrained(
            model_from,
            "sag-uniroma2/u-depp-llama-2-7b"
        )

        # Deterministic beam search with 4 beams.
        generation_config = GenerationConfig(
            num_beams=4,
            do_sample=False,
            early_stopping=True,
        )
        tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-hf", trust_remote_code=True)
        self.model = model
        self.generation_config = generation_config
        self.tokenizer = tokenizer

    def get_llm_output(self, input):
        """Generate the model's answer for one sentence.

        Args:
            input: the sentence text to insert into the prompt.

        Returns:
            The decoded text that follows the first ``### Answer:`` marker,
            stripped of surrounding whitespace.

        Raises:
            IndexError: if the decoded output contains no ``### Answer:``
                marker (possible when the prompt is truncated to 512 tokens).
        """
        # The prompt text, including its leading indentation, is kept verbatim.
        prompt = f"""
        ### Input:
        {input}
        ### Answer:"""
        inputs = self.tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)
        input_ids = inputs["input_ids"].to(self.model.device)
        # Pass the mask explicitly instead of letting generate() infer it.
        attention_mask = inputs["attention_mask"].to(self.model.device)
        with torch.no_grad():
            # Per-step scores are never read, so they are not requested;
            # only the generated sequences are used below.
            gen_outputs = self.model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                generation_config=self.generation_config,
                return_dict_in_generate=True,
                max_new_tokens=1024,
                use_cache=True,
            )
        s = gen_outputs.sequences[0]
        output = self.tokenizer.decode(s, skip_special_tokens=True)

        response = output.split("### Answer:")[1].strip()
        return response

    def parseExpression(self, expression):
        """Replace every label of a bracketed expression with a numeric id.

        Args:
            expression: bracketed tree string, e.g. ``"[A [b]]"``.

        Returns:
            A tuple ``(retExp, nodeMap)``.  ``retExp`` is the expression with
            whitespace removed and each label replaced by a counter starting
            at 1 (``"[1[2]]"``); ``nodeMap`` maps ``str(id)`` to the original
            label (``{"1": "A", "2": "b"}``).  Text after the last bracket is
            dropped.
        """
        nodeMap = dict()
        counter = 1
        node = ""
        retExp = ""
        for char in expression:
            if char == OP or char == CP:
                # A bracket ends the label collected so far (if any).
                if node:
                    nodeMap[str(counter)] = node
                    retExp += str(counter)
                    counter += 1
                retExp += char
                node = ""
            elif char in ' \t\r\n':
                # Layout whitespace is never part of a label.  Skipping only
                # ' ' would turn a newline between brackets into a bogus node.
                continue
            else:
                node += char
        return retExp, nodeMap

    def toTree(self, expression):
        """Build a parent -> children mapping from an id-only expression.

        Args:
            expression: output of ``parseExpression``, e.g. ``"[1[2]]"``.

        Returns:
            Dict mapping each parent id (string) to the list of its child ids
            in order of appearance.  The root is stored as the child of the
            empty-string key: ``{"1": ["2"], "": ["1"]}``.

        Raises:
            IndexError: if a closing bracket has no matching opening bracket.
        """
        tree = dict()
        msg = ""
        stack = list()
        for char in expression:
            if char == OP:
                # Descend: remember the id read so far as the pending parent.
                stack.append(msg)
                msg = ""
            elif char == CP:
                # Ascend: attach the finished node to its parent.
                parent = stack.pop()
                tree.setdefault(parent, []).append(msg)
                msg = parent
            else:
                msg += char
        return tree

    def _decode(self, tree, representation_type, node, nodeMap, parent, grand_parent, tid2treenodeMap, res):
        """Depth-first walk of ``tree`` that emits one token record per leaf.

        Args:
            tree: parent id -> list of child ids, as built by ``toTree``.
            representation_type: ``"lct"`` or ``"grct"``; selects which of the
                leaf and its parent supplies ``'form'`` and which ``'deprel'``.
            node: id of the node being visited.
            nodeMap: id -> label, as built by ``parseExpression``.
            parent: id of ``node``'s parent (``None`` at the root).
            grand_parent: id of ``parent``'s parent, or ``None``.
            tid2treenodeMap: filled in place; maps the id of each leaf's
                parent node to that leaf's token id (as a string).
            res: filled in place; token id (int) -> token record.

        Raises:
            ValueError: for an unsupported ``representation_type``.
            KeyError: if a required id is missing from ``nodeMap``.
        """
        if node not in tree:
            # Leaf: allocate the next token id (ids start at 1).
            tid = 1
            if res:
                tid = int(max(res.keys())) + 1

            grand_parent_label = "ROOT"
            if grand_parent in nodeMap:
                grand_parent_label = nodeMap[grand_parent]

            if representation_type == "lct":
                res[tid] = { "id": tid, "form": nodeMap[parent], "to": grand_parent_label, "toid" : grand_parent, "deprel": nodeMap[node] }
            elif representation_type == "grct":
                res[tid] = { "id": tid, "form": nodeMap[node], "to": grand_parent_label, "toid" : grand_parent, "deprel": nodeMap[parent] }
            else:
                raise ValueError("The representation_type\t" + representation_type + "\t is not supported in decoding.")

            tid2treenodeMap[parent] = str(tid)

            return

        for child in tree[node]:
            self._decode(tree, representation_type, child, nodeMap, node, parent, tid2treenodeMap, res)

    def decode(self, tree, nodeMap, representation_type="lct"):
        """Convert a tree into token records with resolved head ids.

        Args:
            tree: parent id -> child ids, as built by ``toTree``.
            nodeMap: id -> label, as built by ``parseExpression``.
            representation_type: ``"lct"`` (default) or ``"grct"``.

        Returns:
            Dict mapping token id (int, starting at 1) to a record with keys
            ``id``, ``form``, ``to``, ``toid`` and ``deprel``.  ``toid`` is the
            head's token id as a string, or ``'0'`` when the token has no head
            or the head cannot be resolved.
        """
        res = dict()
        tid2treenodeMap = dict()
        # The walk starts at node "1", the first label of the expression.
        self._decode(tree, representation_type, "1", nodeMap, None, None, tid2treenodeMap, res)

        # After the walk 'toid' still holds a tree-node id; translate it into
        # a token id, falling back to '0' when there is nothing to point at.
        for i in range(1, len(res) + 1):
            toid = res[i]["toid"]
            res[i]["toid"] = '0' if toid is None else tid2treenodeMap.get(toid, '0')

        return res

    def _parse(self, s):
        """Run the full pipeline (LLM -> ids -> tree -> records) for sentence ``s``."""
        llm_output = self.get_llm_output(s)
        retExp, nodeMap = self.parseExpression(llm_output)
        tree = self.toTree(retExp)
        res = self.decode(tree, nodeMap)
        return res

    def parse(self, sent):
        """Parse a sentence into a list of token dictionaries.

        Args:
            sent: the sentence text.

        Returns:
            List of dicts with keys ``id`` (str), ``form``, ``parent_id`` and
            ``relation``, one per leaf of the generated tree.
        """
        parsing_res = self._parse(sent)
        # decode() runs in its default "lct" mode, which stores the leaf label
        # under 'deprel' and the label of the leaf's parent under 'form'.
        # Here the leaf is treated as the word and its parent as the relation,
        # so the two fields are swapped back.
        return [
            {'id': str(token['id']), 'form': token['deprel'],
             'parent_id': token['toid'], 'relation': token['form']}
            for token in parsing_res.values()
        ]

In [None]:
# Build the parser once; the constructor loads the base model, the adapter and
# the tokenizer, so every later cell reuses this single instance.
parser = UDepPLLaMAParser()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/538 [00:00<?, ?B/s]

adapter_model.bin:   0%|          | 0.00/81.3M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

In [None]:
# Smoke test: wall-clock seconds needed to parse one short sentence.
ts = time.time()
parser.parse("Мама мыла раму.")
elapsed_short = time.time() - ts
print(elapsed_short)


17.49436855316162


In [None]:
# Same measurement on the first sentence stored under 'gsd'.
te = time.time()
gsd_first_text = data['gsd'][0].text
parser.parse(gsd_first_text)
print(time.time() - te)

98.25247955322266


# Experiments

In [None]:
import gc

# --- Run configuration ------------------------------------------------------
time_dict = {}
treebank_name = 'syntagrus'
treebank_sents = data[treebank_name]

start_i = 0 # In the listing shown below, start_i was equal to 400
finish_i = len(treebank_sents)
pickle_name = f'{treebank_name}'

# --- Parsing loop -----------------------------------------------------------
# t_res maps a sentence index to (Sentence, seconds) on success
# or to (exception,) when parsing raised.
t_res = {}
print("\n", treebank_name)
t_time = []
for i in range(start_i, finish_i):
    # At every index divisible by 20: checkpoint all results gathered so far,
    # then run the garbage collector.
    if not i % 20:
        with open(f"{pickle_name}_{start_i}_{i}.pickle", 'wb') as f:
            pickle.dump(t_res, f)
        gc.collect()

    sent = treebank_sents[i]
    ts = time.time()
    try:
        token_list = parser.parse(sent.text)
    except Exception as err:
        # Record the failure and carry on with the next sentence.
        t_res[i] = (err, )
        print(i, err)
        continue

    te = time.time()
    t_time.append(te - ts)

    # Wrap the parsed tokens in a Sentence carrying the original id and text.
    cur_res = Sentence()
    cur_res.set_sent_id(sent.sent_id)
    cur_res.set_text(sent.text)
    for t in token_list:
        cur_res.add_token(t)
    t_res[i] = (cur_res, t_time[-1])
    print(i, t_time[-1])

# --- Summary and final dump -------------------------------------------------
# Only successfully parsed sentences contribute to the total time.
time_dict[treebank_name] = sum(t_time)

print("\ntime results (s):")
for p, t in time_dict.items():
    print(f"{p:10}: {t:5.3f} (s)")

with open(f"{pickle_name}.pickle", 'wb') as f:
    pickle.dump(t_res, f)


 syntagrus
400 136.89211106300354
401 94.18637895584106
402 37.51187229156494
403 71.14284372329712
404 61.45914125442505
405 72.26844477653503
406 40.21013402938843
407 31.287010431289673
408 42.24173927307129
409 68.86345171928406
410 54.858309268951416
411 22.691984176635742
412 12.825880765914917
413 20.712909936904907
414 40.21846294403076
415 14.776346206665039
416 18.072959899902344
417 50.16623330116272
418 31.260660409927368
419 10.17096996307373
420 39.89760661125183
421 47.50709843635559
422 63.65354776382446
423 82.69220042228699
424 70.98271369934082
425 30.691453218460083
426 211.69219613075256
427 148.24338388442993
428 152.50018429756165
429 94.06128907203674
430 38.46248006820679
431 38.70719528198242
432 124.71310877799988
433 23.359429359436035
434 47.373963832855225
435 47.658276081085205
436 43.794363260269165
437 44.18161106109619
438 56.90594553947449
439 44.759275674819946
440 27.33616352081299
441 18.127307653427124
442 41.503233194351196
443 62.97620677947998

In [None]:
# Manual checkpoint: dump whatever is in t_res, naming the file after the
# current value of the loop index `i`.
checkpoint_path = f"{pickle_name}_{start_i}_{i}.pickle"
with open(checkpoint_path, 'wb') as f:
    pickle.dump(t_res, f)
gc.collect()

In [None]:
# Force a garbage-collection pass; as the cell's last expression, the value
# returned by gc.collect() is displayed.
import gc
gc.collect()

In [None]:
# Display the class of the `parser` object currently held by the kernel.
type(parser)

In [None]:
# NOTE(review): `x` is not referenced anywhere else in the visible notebook --
# this looks like a leftover scratch cell; confirm before removing.
x = 0