<a href="https://colab.research.google.com/github/AbeHandler/AbeHandler.github.io/blob/master/April10%2C2024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pathlib import Path

Path("src/models").mkdir(parents=True, exist_ok=True)
Path("config").mkdir(parents=True, exist_ok=True)

In [170]:
from transformers import pipeline
from google.colab import userdata
from transformers import BertTokenizer, BertModel
import torch
import spacy
import numpy as np


class TokenProcessor:
    @staticmethod
    def preprocess_word(word):
        return word.replace('▁', '').lower()

    @staticmethod
    def is_match(spacy_token, tagged_token):
        tagged_word = TokenProcessor.preprocess_word(tagged_token['word'])
        if spacy_token['text'].lower() == tagged_word:
            if abs(spacy_token['start'] - tagged_token['start']) <= 1 and abs(spacy_token['end'] - tagged_token['end']) <= 1:
                return True
        return False

# Class for managing the NLP pipeline
class NLPipeline:
    def __init__(self):
        self.hf = userdata.get('huggingface_token')
        self.pipe = pipeline("token-classification", model="abehandlerorg/econberta", token=self.hf)
        self.nlp = spacy.load("en_core_web_sm")

    def process_text(self, text):
        tagged_tokens = self.pipe(text)
        doc = self.nlp(text)
        tokens_list = self._generate_tokens_list(doc)
        SpanGenerator.assign_entity_labels(tokens_list, tagged_tokens)
        spans = SpanGenerator.generate_spans(tokens_list)
        return {"text": text, "tokens": tokens_list, "spans": spans}

    @staticmethod
    def _generate_tokens_list(doc):
        tokens_list = []
        for i, token in enumerate(doc):
            ws = False
            if i + 1 < len(doc) and doc[i + 1].idx > token.idx + len(token):
                ws = True
            tokens_list.append({
                "text": token.text,
                "start": token.idx,
                "end": token.idx + len(token),
                "id": i,
                "ws": ws
            })
        return tokens_list

# Class for generating spans
class SpanGenerator:
    @staticmethod
    def assign_entity_labels(tokens_list, tagged_tokens):
        for spacy_token in tokens_list:
            for tagged_token in tagged_tokens:
                if TokenProcessor.is_match(spacy_token, tagged_token):
                    spacy_token['entity'] = tagged_token['entity'].replace("B-", "").replace("I-", "").upper()
                    break

    @staticmethod
    def generate_spans(tokens_list):
        spans = []
        current_span = None
        for token in tokens_list:
            if 'entity' in token:
                if current_span is None:
                    current_span = SpanGenerator._start_new_span(token)
                else:
                    if token['entity'] == current_span['label']:
                        SpanGenerator._update_current_span(current_span, token)
                    else:
                        spans.append(current_span)
                        current_span = SpanGenerator._start_new_span(token)
            else:
                if current_span is not None:
                    spans.append(current_span)
                    current_span = None
        if current_span is not None:
            spans.append(current_span)
        return spans

    @staticmethod
    def _start_new_span(token):
        return {
            'start': token['start'],
            'end': token['end'],
            'token_start': token['id'],
            'token_end': token['id'],
            'label': token['entity']
        }

    @staticmethod
    def _update_current_span(span, token):
        span['end'] = token['end']
        span['token_end'] = token['id']

import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import BertTokenizer, BertModel
import torch

class PhraseEncoder:
    def __init__(self, model_name='whaleloops/phrase-bert'):
        self.model = SentenceTransformer(model_name)

    def encode_phrases(self, phrase_list):
        return self.model.encode(phrase_list)

class Span2BertEmbedding:
    def __init__(self, model_name='bert-base-uncased'):
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)

    def get_span_embedding(self, text, span_info):
        encoded_input = self.tokenizer(text, return_tensors='pt')
        token_start_position, token_end_position = self._find_token_positions(encoded_input, span_info)

        if token_start_position is not None and token_end_position is not None:
            with torch.no_grad():
                outputs = self.model(**encoded_input)
            span_embeddings = outputs.last_hidden_state[0, token_start_position:token_end_position + 1]
            aggregated_embedding = torch.mean(span_embeddings, dim=0)
            return aggregated_embedding
        else:
            print("Span does not align with BERT tokenization.")
            return None

    def _find_token_positions(self, encoded_input, span_info):
        token_start_position, token_end_position = None, None
        cumulative_position = 0

        for idx, token_id in enumerate(encoded_input['input_ids'][0]):
            token = self.tokenizer.decode([token_id])
            token_start = cumulative_position
            token_end = cumulative_position + len(token.replace("##", ""))

            if token_start_position is None and span_info['start'] <= token_end:
                token_start_position = idx
            if token_end_position is None and span_info['end'] <= token_end:
                token_end_position = idx
                break
            cumulative_position = token_end

        return token_start_position, token_end_position

class PostProcessor:
    def __init__(self, bert_model='bert-base-uncased', phrase_model='whaleloops/phrase-bert'):
        self.phrase_encoder = PhraseEncoder(model_name=phrase_model)
        self.span_mapper = Span2BertEmbedding(model_name=bert_model)

    @staticmethod
    def cosine_similarity(A, B):
        dot_product = np.dot(A, B)
        norm_A = np.linalg.norm(A)
        norm_B = np.linalg.norm(B)
        return dot_product / (norm_A * norm_B)

    def process_spans(self, text, spans, target_phrase):
        target_emb = self.phrase_encoder.encode_phrases([target_phrase])[0]
        outcomes = [span for span in spans if span["label"] == "OUTCOME"]

        for outcome in outcomes:
            embed = self.span_mapper.get_span_embedding(text, outcome)
            if embed is not None:
                outcome["cos"] = self.cosine_similarity(embed.numpy(), target_emb)
            else:
                outcome["cos"] = 0

        outcomes.sort(key=lambda x: x["cos"], reverse=True)

        processed_spans = outcomes[:5] + [span for span in spans if span["label"] == "INTERVENTION"]
        return [self.map2str(span) for span in processed_spans]

    @staticmethod
    def map2str(span):
        if "cos" in span:
            span["cos"] = str(span["cos"])
        if span["label"] == "INTERVENTION":
            span["label"] = "X"
        elif span["label"] == "OUTCOME":
            span["label"] = "Y"
        return span

In [171]:
import pandas as pd
from src.models.semantic_scholar import SemanticScholarAPI
from src.logger import get_logger
from tqdm.notebook import tqdm


logger = get_logger()
api = SemanticScholarAPI(logger=logger)
Y = "research and development expenditures"
results = api.search("determinants " + Y, minCitationCount=25)

pipeline = NLPipeline()
processor = PostProcessor()

output = []
for searchresult in tqdm(results, total=len(results)):
    try:
        para = searchresult.text
        if para is not None and len(para) > 0:
            result = pipeline.process_text(para)
            processed_spans = processor.process_spans(para, result["spans"], Y)
            result["spans"] = processed_spans
            output.append(result)
    except RuntimeError:
        print("Error")
        print(searchresult.text)



<ipython-input-171-a0a09e8d6a75> 7




  0%|          | 0/48 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Span does not align with BERT tokenization.
Span does not align with BERT tokenization.
Span does not align with BERT tokenization.
Span does not align with BERT tokenization.
Span does not align with BERT tokenization.
Span does not align with BERT tokenization.
Span does not align with BERT tokenization.
Span does not align with BERT tokenization.
Span does not align with BERT tokenization.
Span does not align with BERT tokenization.
Span does not align with BERT tokenization.
Span does not align with BERT tokenization.
Span does not align with BERT tokenization.
Span does not align with BERT tokenization.
Span does not align with BERT tokenization.
Span does not align with BERT tokenization.
Span does not align with BERT tokenization.
Span does not align with BERT tokenization.
Span does not align with BERT tokenization.
Span does not align with BERT tokenization.
Span does not align with BERT tokenization.
Span does not align with BERT tokenization.
Span does not align with BERT to

Token indices sequence length is longer than the specified maximum sequence length for this model (866 > 512). Running this sequence through the model will result in indexing errors


Error
591 Transforming our world: the 2030 agenda for sustainable development outlines a transformative vision with 17 sustainable development goals (SDGs) for economic, social and environmental development.1 While only SDG 3, to ensure healthy lives and promote well-being for all at all ages, focuses on human health, all goals are interrelated. This issue of the Bulletin of the World Health Organization examines the relationship between health and the SDGs. Implementing the 2030 agenda requires a multistakeholder, multi-actor response. Innovations and development in policy, technology and research must include dialogue between governments, the private sector, civil society organizations and nongovernmental organizations; most importantly, strong community involvement is needed.2 The focus of the 2030 agenda on addressing country-level needs is based on the engagement of all actors and sectors, as opposed to the traditional top-down, single-sector approach. Intersectoral governance is 

In [172]:
import json
with open(Y.replace(" ", "_") + ".jsonl", "w") as of:
    for i in output:
        of.write(json.dumps(i) + "\n")