<a href="https://colab.research.google.com/github/AbeHandler/AbeHandler.github.io/blob/master/runeconberta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [74]:
# Use a pipeline as a high-level helper
from transformers import pipeline

from google.colab import userdata
# Hugging Face access token, read from the Colab secret named 'huggingface_token'.
hf = userdata.get('huggingface_token')

# Token-classification pipeline backed by the econberta checkpoint; the token
# is passed through so the model can be fetched with the user's credentials.
pipe = pipeline(
    task="token-classification",
    model="abehandlerorg/econberta",
    token=hf,
)

# Function to preprocess the tagged token's word for matching
def preprocess_word(word):
    """Return *word* lowercased with every '▁' marker character removed."""
    without_marker = word.replace('▁', '')
    return without_marker.lower()

# Fuzzy matching function
def is_match(spacy_token, tagged_token):
    """Return True when a spaCy token and a tagged token refer to the same word.

    Both arguments are dicts. ``spacy_token`` supplies 'text', 'start' and
    'end'; ``tagged_token`` supplies 'word', 'start' and 'end'. The tokens
    match when the lowercased spaCy text equals the preprocessed tagged word
    and both character offsets differ by at most one.
    """
    tagged_word = preprocess_word(tagged_token['word'])

    # Guard clause: different surface text can never be a match.
    if spacy_token['text'].lower() != tagged_word:
        return False

    # Allow one character of slack on each boundary.
    return (
        abs(spacy_token['start'] - tagged_token['start']) <= 1
        and abs(spacy_token['end'] - tagged_token['end']) <= 1
    )

In [79]:
import spacy
from transformers import pipeline, AutoTokenizer

# Load the spaCy pipeline named "en_core_web_sm"; it is used below to tokenize `para`.
nlp = spacy.load("en_core_web_sm")
# The paragraph to annotate (an abstract about CEO narcissism and firm outcomes).
para = '''Using the size of CEO signatures in SEC filings to measure individual narcissism, we find that CEO narcissism is associated with several negative firm outcomes. We first validate signature size as a measure of narcissism but not overconfidence using two laboratory studies, and also find that our measure is correlated with employee perceptions of CEO narcissism used in prior research. We then use CEO signatures to study the relation between CEO narcissism and the firm’s investment policies and performance. CEO narcissism is associated with overinvestment, particularly in R&D and M&A expenditures (but not in capital expenditures). Firms led by narcissistic CEOs experience lower financial productivity in the form of profitability and operating cash flows. Despite this negative performance, narcissistic CEOs enjoy higher absolute and relative compensation. Our results are robust to several alternative specifications, including controlling for a popular options-based overconfidence measure used in prior research.'''
# Run the token-classification pipeline over the paragraph. The code below
# reads 'word', 'start', 'end' and 'entity' from each returned item.
taggedtokens = pipe(para)

# Tokenize the paragraph with spaCy and record one dict per token holding its
# text, character offsets, index in the doc, and a trailing-whitespace flag.
doc = nlp(para)
tokens_list = []
for i, token in enumerate(doc):
    start = token.idx
    tokens_list.append({
        "text": token.text,
        "start": start,
        # End offset is exclusive: start plus the token's character length.
        "end": start + len(token),
        "id": i,
        # spaCy stores the trailing whitespace on the token itself. Reading it
        # here, rather than comparing against the next token's offset, also
        # gives the right answer for the final token in the doc.
        "ws": bool(token.whitespace_),
    })

# Copy the entity label of the first matching tagged token onto each spaCy
# token; tokens with no match are left without an 'entity' key.
for spacy_token in tokens_list:
    first_hit = next(
        (candidate for candidate in taggedtokens if is_match(spacy_token, candidate)),
        None,
    )
    if first_hit is None:
        continue
    # Drop the "B-"/"I-" markers and uppercase what remains of the label.
    raw_label = first_hit['entity']
    spacy_token['entity'] = raw_label.replace("B-", "").replace("I-", "").upper()


In [87]:
# Merge consecutive tokens that carry the same entity label into spans.
# Each span records character offsets ('start'/'end'), token indices
# ('token_start'/'token_end') and the shared 'label'.
spans = []
current_span = None

for token in tokens_list:
    extends_open_span = (
        current_span is not None
        and 'entity' in token
        and token['entity'] == current_span['label']
    )
    if extends_open_span:
        # Same label as the open span: stretch it to cover this token.
        current_span['end'] = token['end']
        current_span['token_end'] = token['id']
        continue

    # Either the label changed or this token is unlabelled: close any open span.
    if current_span is not None:
        spans.append(current_span)
        current_span = None

    # A labelled token that did not extend a span opens a fresh one.
    if 'entity' in token:
        current_span = {
            'start': token['start'],
            'end': token['end'],
            'token_start': token['id'],
            'token_end': token['id'],
            'label': token['entity']
        }

# Flush a span that runs through the final token.
if current_span is not None:
    spans.append(current_span)

# Bundle the raw text, the token dicts and the spans into one record.
out = {"text": para, "tokens": tokens_list, "spans": spans}

In [88]:
# Imported here because no other visible cell imports json; without it the
# json.dumps call below raises NameError.
import json

# Write the annotation record as a single JSONL line. The trailing newline
# terminates the record, so further records can be appended one per line.
with open("annotations2.jsonl", "w", encoding="utf-8") as of:
    of.write(json.dumps(out) + "\n")

In [86]:
# Bare expression: the notebook echoes the list of span dicts as the cell output below.
spans

[{'start': 10,
  'end': 32,
  'token_start': 2,
  'token_end': 5,
  'label': 'INTERVENTION'},
 {'start': 70,
  'end': 80,
  'token_start': 12,
  'token_end': 12,
  'label': 'OUTCOME'},
 {'start': 95,
  'end': 98,
  'token_start': 17,
  'token_end': 17,
  'label': 'POPULATION'},
 {'start': 99,
  'end': 109,
  'token_start': 18,
  'token_end': 18,
  'label': 'INTERVENTION'},
 {'start': 146,
  'end': 159,
  'token_start': 24,
  'token_end': 25,
  'label': 'OUTCOME'},
 {'start': 179,
  'end': 193,
  'token_start': 30,
  'token_end': 31,
  'label': 'INTERVENTION'},
 {'start': 210,
  'end': 220,
  'token_start': 36,
  'token_end': 36,
  'label': 'OUTCOME'},
 {'start': 352,
  'end': 362,
  'token_start': 58,
  'token_end': 58,
  'label': 'OUTCOME'},
 {'start': 399,
  'end': 402,
  'token_start': 67,
  'token_end': 67,
  'label': 'INTERVENTION'},
 {'start': 444,
  'end': 447,
  'token_start': 74,
  'token_end': 74,
  'label': 'POPULATION'},
 {'start': 448,
  'end': 458,
  'token_start': 75,
  