In [1]:
# !pip install autoawq[cpu]

In [1]:
# Import necessary modules
from transformers import AutoModelForCausalLM, AutoTokenizer
import json
import re
from typing import List

from transformers import Pipeline, pipeline


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
DEFAULT_SYSTEM_PROMPT = 'You are a helpful information extraction system.'

DEFAULT_FIRST_PROMPT = 'Given a passage, your task is to extract all entities and identify their entity types. First, read the passage, later I will ask you questions about entities. The output should be in a list of tuples of the following format: [["entity 1", "type of entity 1"], ... ]. You must not write any text other than a list of tuples. \n\nPassage: {passage}'

DEFAULT_QUESTION_PROMPT = 'What describes "{entity}" in the given passage?'


In [3]:
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct" # "Qwen/Qwen2.5-7B-Instruct-AWQ" # 

TOKENIZER = MODEL_NAME

DATASET_NAME = 'adsabs/WIESP2022-NER'

In [4]:
from tqdm import tqdm


class QwenPipelineNER(Pipeline):
    messages: List[dict] = []

    def __init__(self, *args, llm_pipe, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.llm_pipe = llm_pipe


    def _sanitize_parameters(self, *, entities={}, first_prompt=DEFAULT_FIRST_PROMPT, system_prompt=DEFAULT_SYSTEM_PROMPT,question_prompt=DEFAULT_QUESTION_PROMPT, **kwargs):
        preprocess_kwargs = {'system_prompt':system_prompt,
                             'first_prompt': first_prompt,
                            }
        forward_kwargs = {'question_prompt':question_prompt, 'entities':entities}
        postprocess_kwargs = {}
        return preprocess_kwargs, forward_kwargs, postprocess_kwargs


    def preprocess(self, inputs, system_prompt, first_prompt):
        self.input_text = inputs
        self.first_messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": first_prompt.format_map({'passage': inputs})  }
        ]
        self.messages = []
        return {"model_input": inputs}

    def _forward(self, model_inputs, question_prompt=DEFAULT_QUESTION_PROMPT, entities={}):
        results = {}
        for ent in entities:
            prompt = question_prompt.format_map({'entity': ent})
            messages = self.first_messages + [{"role": "user", "content": prompt}]
            response_message = self.llm_pipe(messages, max_new_tokens=512)[0]["generated_text"][-1]
            response_text = response_message['content']
            self.messages.extend([messages, response_message])
            try:
                entries_list = json.loads(response_text)
            except Exception as e:
                entries_list = []
                print(f"CANT PARSE LLM OUTPUT AS JSON: {e}\n")
            finally:
                print(f"OUTPUT:\n{response_text}\n")
                print(f"TEXT:\n{self.input_text}\n")
                print(f"ENTITY:\n{ent}\n")
                print(f"PROMPT:\n {prompt}\n")
                print('============================================================')
                results[ent] = entries_list
        return results

    def postprocess(self, model_outputs):
        list_result = []
        text = self.input_text
        entries_set = set()
        for entity in tqdm(model_outputs):
            entries = model_outputs.get(entity)
            for entry_pair in entries:
                entries_set.add(tuple(entry_pair))
        for entry_pair in entries_set:
            if len(entry_pair) != 2:
                continue
            entry, entity = entry_pair
            entry_len = len(entry)
            for occurency in re.finditer(entry, text):
                list_result.append({'entity': entity, 'word': entry, 'start': occurency.start(), 'end': occurency.start() + entry_len - 1})
        print("CHAT:")
        print('\n'.join(map(str, self.messages)))
        print(entries_set)
        return list_result

In [5]:
pipe = pipeline("text-generation", MODEL_NAME, torch_dtype="auto", device_map="auto")

In [6]:
ner_pipe = QwenPipelineNER(pipe.model, pipe.tokenizer, llm_pipe=pipe)

In [7]:
ner_pipe.device

device(type='mps')

In [9]:
res = ner_pipe("I am living in New York city. It is located right next to Los Angeles.", start_new_dialog=True, entities={"Place", "Car", "Furtniture"})
res

CANT PARSE LLM OUTPUT AS JSON: Expecting value: line 1 column 1 (char 0)

OUTPUT:
None

TEXT:
I am living in New York city. It is located right next to Los Angeles.

ENTITY:
Furtniture

PROMPT:
 What describes "Furtniture" in the given passage?

CANT PARSE LLM OUTPUT AS JSON: Expecting value: line 1 column 1 (char 0)

OUTPUT:
The word "car" does not appear in the provided passage.

TEXT:
I am living in New York city. It is located right next to Los Angeles.

ENTITY:
Car

PROMPT:
 What describes "Car" in the given passage?



Python(41048) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


OUTPUT:
[["New York city", "Location"], ["Los Angeles", "Location"]]

TEXT:
I am living in New York city. It is located right next to Los Angeles.

ENTITY:
Place

PROMPT:
 What describes "Place" in the given passage?



100%|██████████| 3/3 [00:00<00:00, 43539.49it/s]

CHAT:
[{'role': 'system', 'content': 'You are a helpful information extraction system.'}, {'role': 'user', 'content': 'Given a passage, your task is to extract all entities and identify their entity types. First, read the passage, later I will ask you questions about entities. The output should be in a list of tuples of the following format: [["entity 1", "type of entity 1"], ... ]. You must not write any text other than a list of tuples. \n\nPassage: I am living in New York city. It is located right next to Los Angeles.'}, {'role': 'user', 'content': 'What describes "Furtniture" in the given passage?'}]
{'role': 'assistant', 'content': 'None'}
[{'role': 'system', 'content': 'You are a helpful information extraction system.'}, {'role': 'user', 'content': 'Given a passage, your task is to extract all entities and identify their entity types. First, read the passage, later I will ask you questions about entities. The output should be in a list of tuples of the following format: [["entity




[{'entity': 'Location', 'word': 'Los Angeles', 'start': 58, 'end': 68},
 {'entity': 'Location', 'word': 'New York city', 'start': 15, 'end': 27}]

In [8]:
text = "Авторы хотели бы поблагодарить Адама Бургассера, Брендана Боулера, Келли Круз, Майка Кушинга, Майкла Лю и Эмили Райс за полезные обсуждения систем бенчмарков, обработки данных и различных подходов к сравнению моделей данных. Авторы благодарят Ричарда Фридмана и Роксану Лупу за предоставление непрозрачности газа и Кэролайн Морли за сравнения кодов переноса излучения и полезные обсуждения. Мы благодарим Джейкоба Люстига-Йегера и Кайла Лютера за переписывание частей кода на Python и C для значительного улучшения скорости, а также Дэна Формана-Макки за предоставление EMCEE сообществу. Наконец, мы благодарим анонимного рецензента и консультанта по статистике за полезные и проницательные комментарии."
entities = ['Archive',
 'CelestialObject',
 'CelestialObjectRegion',
 'CelestialRegion',
 'Citation',
 'Collaboration',
 'ComputingFacility',
 'Database',
 'Dataset',
 'EntityOfFutureInterest',
 'Event',
 'Fellowship',
 'Formula',
 'Grant',
 'Identifier',
 'Instrument',
 'Location',
 'Mission',
 'Model',
 'ObservationalTechniques',
 'Observatory',
 'Organization',
 'Person',
 'Proposal',
 'Software',
 'Survey',
 'Tag',
 'Telescope',
 'TextGarbage',
 'URL',
 'Wavelength']



In [None]:
res = ner_pipe(text, entities=entities)
res

In [10]:
import random
from spacy import displacy
import matplotlib.colors as mcolors

sample_for_rendering_3 = {
    'text': text,
    'ents': [({'start': it['start'], 'end': it['end'], 'label': it['entity']})
             for it in res]
}

entity_colors = [mcolors.rgb2hex((0.5 + random.random() / 2, 0.5 +random.random() / 2, 0.5 +random.random() / 2))
                 for _ in range(len(entities))]

NameError: name 'res' is not defined

In [12]:
rendered_3 = displacy.render(
    sample_for_rendering_3, style='ent',
    # options={'ents': entities, 'colors': dict(zip(entities, entity_colors))},
    manual=True, jupyter=True
)
rendered_3

### TEST


In [9]:
from datasets import load_dataset
from typing import Tuple
from nltk.tokenize.treebank import TreebankWordDetokenizer

testset = load_dataset(DATASET_NAME, split='test')

In [10]:
def find_token(token_bounds: List[Tuple[int, int]], char_idx: int) -> int:
    res = -1
    for token_idx, (token_start, token_end) in enumerate(token_bounds):
        if (char_idx >= token_start) and (char_idx < token_end):
            res = token_idx
            break
    return res

In [11]:
def predictions_to_bio(text: str, tokens: List[str], predictions: List[Tuple[int, int, str]]) -> List[str]:
    token_bounds = []
    token_labels = []
    start_pos = 0
    for cur_token in tokens:
        found_idx = text[start_pos:].find(cur_token)
        if found_idx < 0:
            err_msg = f'The token {cur_token} is not found in the text {text}'
            raise RuntimeError(err_msg)
        token_start = found_idx + start_pos
        token_end = token_start + len(cur_token)
        start_pos = token_end
        token_bounds.append((token_start, token_end))
        token_labels.append('O')
    for span_start, span_end, span_label in predictions:
        start_token = find_token(token_bounds, span_start)
        end_token = find_token(token_bounds, span_end - 1)
        if (start_token >= 0) and (end_token >= 0):
            for token_idx in range(start_token, end_token + 1):
                token_labels[token_idx] = span_label
        elif start_token >= 0:
            token_labels[start_token] = span_label
        elif end_token >= 0:
            token_labels[end_token] = span_label
    corrected_token_labels = []
    previous_label = 'O'
    for cur_label in token_labels:
        label = 'I-' + cur_label if cur_label != 'O' else cur_label
        if cur_label == previous_label:
            corrected_token_labels.append(label)
        else:
            if (cur_label == 'O') or cur_label.startswith('B-'):
                corrected_token_labels.append(label)
            else:
                if previous_label == 'O':
                    corrected_token_labels.append('B-' + cur_label)
                elif previous_label[2:] != cur_label[2:]:
                    corrected_token_labels.append('B-' + cur_label)
                else:
                    corrected_token_labels.append(label)
        previous_label = cur_label
    return corrected_token_labels

In [12]:

y_true = []
y_pred = []
testset = testset[:2]
for tokens, reference_tags in zip(testset['tokens'], testset['ner_tags']):
    y_true.append(reference_tags)
    cur_text = TreebankWordDetokenizer().detokenize(tokens)
    cur_res = ner_pipe(cur_text, entities=entities)
    y_pred.append(
        predictions_to_bio(
            cur_text,
            tokens,
            [(it['start'], it['end'], it['entity']) for it in cur_res]
        )
    )

for true_label, predicted_label in zip(y_true[0], y_pred[0]):
    print('{0:>25}   {1:>25}'.format(true_label, predicted_label))

CANT PARSE LLM OUTPUT AS JSON: Expecting value: line 1 column 1 (char 0)

OUTPUT:
None

TEXT:
The authors would like to thank Adam Burgasser, Brendan Bowler, Kelle Cruz, Mike Cushing, Michael Liu, and Emily Rice for useful discussions on benchmark systems, data treatment, and various data-model comparison approaches. The authors thank Richard Freedman and Roxana Lupu for providing gas opacities and Caroline Morley for radiative transfer code comparisons and helpful discussions. We thank Jacob Lustig-Yeager and Kyle Luther for rewriting portions of the code in python and C for significant speed improvements and also Dan Foreman-Mackey for making EMCEE available to the community. Finally, we thank the anonymous referee and statistics consultant for useful and insightful comments. J.T. acknowledges financial support from the Carnegie Origins Postdoctoral Fellowship Program. B.B. acknowledges financial support from the European Commission in the form of a Marie Curie International Outgoing

100%|██████████| 31/31 [00:00<00:00, 124068.15it/s]

CHAT:
[{'role': 'system', 'content': 'You are a helpful information extraction system.'}, {'role': 'user', 'content': 'Given a passage, your task is to extract all entities and identify their entity types. First, read the passage, later I will ask you questions about entities. The output should be in a list of tuples of the following format: [["entity 1", "type of entity 1"], ... ]. You must not write any text other than a list of tuples. \n\nPassage: The authors would like to thank Adam Burgasser, Brendan Bowler, Kelle Cruz, Mike Cushing, Michael Liu, and Emily Rice for useful discussions on benchmark systems, data treatment, and various data-model comparison approaches. The authors thank Richard Freedman and Roxana Lupu for providing gas opacities and Caroline Morley for radiative transfer code comparisons and helpful discussions. We thank Jacob Lustig-Yeager and Kyle Luther for rewriting portions of the code in python and C for significant speed improvements and also Dan Foreman-Mac




OUTPUT:
["archive"]

TEXT:
Acknowledgments I thank Dustin Lang for providing the TGAS-matched APASS data, the 2016 NYC Gaia Sprint participants for stimulating discussions, and the anonymous referee for a helpful report. I also thank Maarten Breddels for finding and fixing a bug in Figs 2 and 3 . JB received support from the Natural Sciences and Engineering Research Council of Canada. JB also received partial support from an Alfred P. Sloan Fellowship and from the Simons Foundation. The MCMC analyses in this work were run using emcee (Foreman-Mackey etal. 2013). This project was developed in part at the 2016 NYC Gaia Sprint, hosted by the Center for Computational Astrophysics at the Simons Foundation in New York City. This work has made use of data from the European Space Agency (ESA) mission Gaia (http://www.cosmos.esa.int/gaia), processed by the Gaia Data Processing and Analysis Consortium (DPAC, http://www.cosmos.esa.int/web/gaia/dpac/consortium). Funding for the DPAC has been provi

100%|██████████| 31/31 [00:00<00:00, 120727.41it/s]

CHAT:
[{'role': 'system', 'content': 'You are a helpful information extraction system.'}, {'role': 'user', 'content': 'Given a passage, your task is to extract all entities and identify their entity types. First, read the passage, later I will ask you questions about entities. The output should be in a list of tuples of the following format: [["entity 1", "type of entity 1"], ... ]. You must not write any text other than a list of tuples. \n\nPassage: Acknowledgments I thank Dustin Lang for providing the TGAS-matched APASS data, the 2016 NYC Gaia Sprint participants for stimulating discussions, and the anonymous referee for a helpful report. I also thank Maarten Breddels for finding and fixing a bug in Figs 2 and 3 . JB received support from the Natural Sciences and Engineering Research Council of Canada. JB also received partial support from an Alfred P. Sloan Fellowship and from the Simons Foundation. The MCMC analyses in this work were run using emcee (Foreman-Mackey etal. 2013). Th




In [13]:
from seqeval.scheme import IOB2
from seqeval.metrics import classification_report

In [14]:



print(classification_report(y_true, y_pred, digits=4))


                   precision    recall  f1-score   support

         Citation     0.0000    0.0000    0.0000         1
    Collaboration     0.0000    0.0000    0.0000         1
ComputingFacility     0.0000    0.0000    0.0000         0
            Event     0.0000    0.0000    0.0000         2
       Fellowship     0.0000    0.0000    0.0000         4
          Funding     0.0000    0.0000    0.0000         0
            Grant     0.0000    0.0000    0.0000         3
            Group     0.0000    0.0000    0.0000         0
         Location     0.0000    0.0000    0.0000         3
            Model     0.0000    0.0000    0.0000         1
      Observatory     0.0000    0.0000    0.0000         1
     Organization     0.5000    0.0769    0.1333        13
           Person     0.9231    0.6316    0.7500        19
         Software     0.0000    0.0000    0.0000         4
           Survey     0.0000    0.0000    0.0000         3
        Telescope     0.0000    0.0000    0.0000       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
print(classification_report(y_true, y_pred, digits=4, mode='strict', scheme=IOB2))

                   precision    recall  f1-score   support

         Citation     0.0000    0.0000    0.0000         1
    Collaboration     0.0000    0.0000    0.0000         1
ComputingFacility     0.0000    0.0000    0.0000         0
            Event     0.0000    0.0000    0.0000         2
       Fellowship     0.0000    0.0000    0.0000         4
          Funding     0.0000    0.0000    0.0000         0
            Grant     0.0000    0.0000    0.0000         3
            Group     0.0000    0.0000    0.0000         0
         Location     0.0000    0.0000    0.0000         3
            Model     0.0000    0.0000    0.0000         1
      Observatory     0.0000    0.0000    0.0000         1
     Organization     0.5000    0.0769    0.1333        13
           Person     0.9231    0.6316    0.7500        19
         Software     0.0000    0.0000    0.0000         4
           Survey     0.0000    0.0000    0.0000         3
        Telescope     0.0000    0.0000    0.0000       