In [1]:
# Add rebel component https://github.com/Babelscape/rebel/blob/main/spacy_component.py
import spacy
import crosslingual_coreference
import requests
import re
import hashlib
import pandas as pd
from spacy import Language
from typing import List
from spacy.tokens import Doc, Span
from transformers import pipeline

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\gcmar\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
def call_wiki_api(item):
    try:
        url = f"https://www.wikidata.org/w/api.php?action=wbsearchentities&search={item}&language=en&format=json"
        data = requests.get(url).json()
        # Return the first id (Could upgrade this in the future)
        return data['search'][0]['id']
    except:
        return 'id-less'

def extract_triplets(text):
    """
    Function to parse the generated text and extract the triplets
    """
    triplets = []
    relation, subject, relation, object_ = '', '', '', ''
    text = text.strip()
    current = 'x'
    for token in text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").split():
        if token == "<triplet>":
            current = 't'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
                relation = ''
            subject = ''
        elif token == "<subj>":
            current = 's'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
            object_ = ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    if subject != '' and relation != '' and object_ != '':
        triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})

    return triplets


@Language.factory(
    "rebel",
    requires=["doc.sents"],
    assigns=["doc._.rel"],
    default_config={
        "model_name": "Babelscape/rebel-large",
        "device": 0,
    },
)
class RebelComponent:
    def __init__(
        self,
        nlp,
        name,
        model_name: str,
        device: int,
    ):
        assert model_name is not None, ""
        self.triplet_extractor = pipeline("text2text-generation", model=model_name, tokenizer=model_name, device=device)
        self.entity_mapping = {}
        # Register custom extension on the Doc
        if not Doc.has_extension("rel"):
            Doc.set_extension("rel", default={})

    def get_wiki_id(self, item: str):
        mapping = self.entity_mapping.get(item)
        if mapping:
            return mapping
        else:
            res = call_wiki_api(item)
            self.entity_mapping[item] = res
            return res

    
    def _generate_triplets(self, sent: Span) -> List[dict]:
        output_ids = self.triplet_extractor(sent.text, return_tensors=True, return_text=False)[0]["generated_token_ids"]["output_ids"]
        extracted_text = self.triplet_extractor.tokenizer.batch_decode(output_ids[0])
        extracted_triplets = extract_triplets(extracted_text[0])
        return extracted_triplets

    def set_annotations(self, doc: Doc, triplets: List[dict]):
        for triplet in triplets:

            # Remove self-loops (relationships that start and end at the entity)
            if triplet['head'] == triplet['tail']:
                continue

            # Use regex to search for entities
            head_span = re.search(triplet["head"], doc.text)
            tail_span = re.search(triplet["tail"], doc.text)

            # Skip the relation if both head and tail entities are not present in the text
            # Sometimes the Rebel model hallucinates some entities
            if not head_span or not tail_span:
                continue

            index = hashlib.sha1("".join([triplet['head'], triplet['tail'], triplet['type']]).encode('utf-8')).hexdigest()
            if index not in doc._.rel:
                # Get wiki ids and store results
                doc._.rel[index] = {"relation": triplet["type"], "head_span": {'text': triplet['head'], 'id': self.get_wiki_id(triplet['head'])}, "tail_span": {'text': triplet['tail'], 'id': self.get_wiki_id(triplet['tail'])}}

    def __call__(self, doc: Doc) -> Doc:
        for sent in doc.sents:
            sentence_triplets = self._generate_triplets(sent)
            self.set_annotations(doc, sentence_triplets)
        return doc

In [3]:
DEVICE = 0 # Number of the GPU, -1 if want to use CPU

# Add coreference resolution model
coref = spacy.load('en_core_web_sm', disable=['ner', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer'])
coref.add_pipe("xx_coref", 
    config={"chunk_size": 2500, 
            "chunk_overlap": 2, 
            "device": DEVICE}
              )

# Define rel extraction model
rel_ext = spacy.load('en_core_web_sm', disable=['ner', 'lemmatizer', 'attribute_rules', 'tagger'])
rel_ext.add_pipe("rebel", 
                 config={
                     'device':DEVICE,
                     'model_name':'Babelscape/rebel-large'} # Model used, will default to 'Babelscape/rebel-large' if not given
                )

error loading _jsonnet (this is expected on Windows), treating C:\Users\gcmar\AppData\Local\Temp\tmpu_npggwv\config.json as plain json
Some weights of the model checkpoint at nreimers/mMiniLMv2-L12-H384-distilled-from-XLMR-Large were not used when initializing XLMRobertaModel: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at nreimers/mMiniLMv2-L12-H384-distilled-from-XLMR-Large an

<__main__.RebelComponent at 0x1b113e3cfa0>

In [4]:
# crosslingual_coreference implementation
def coref_res(text_series):
    coref_text_series = text_series.apply(lambda x : coref(x)._.resolved_text)
    return(coref_text_series)

# # choose minilm for speed/memory and info_xlm for accuracy
# predictor = Predictor(
#     language="en_core_web_sm", device=-1, model_name="minilm"
# )

In [5]:
def link_entities_text(text):
    try:
        ent_rel_lst = list(rel_ext(text)._.rel.values())
    except:
        print("Could not extract relationships for text")
        ent_rel_lst = [{'relation': 'rel_err',
                        'head_span': {'text': 'rel_err', 'id': 'rel_err'},
                        'tail_span': {'text': 'rel_err', 'id': 'rel_err'}}]
        
    entity_df = pd.DataFrame()
    rel_lst = []
    head_text_lst = []
    head_wiki_id_lst = []
    tail_text_lst = []
    tail_wiki_id_lst = []
    for i in range(len(ent_rel_lst)):
        rel_lst.append(ent_rel_lst[i]['relation'])
        head_text_lst.append(ent_rel_lst[i]['head_span']['text'])
        head_wiki_id_lst.append(ent_rel_lst[i]['head_span']['id'])
        tail_text_lst.append(ent_rel_lst[i]['tail_span']['text'])
        tail_wiki_id_lst.append(ent_rel_lst[i]['tail_span']['id'])
    entity_df['head_text'] = head_text_lst
    entity_df['head_wiki_id'] = head_wiki_id_lst
    entity_df['relation'] = rel_lst
    entity_df['tail_text'] = tail_text_lst
    entity_df['tail_wiki_id'] = tail_wiki_id_lst
    return(entity_df)

def link_entities(text_series):
    entity_df_series = text_series.apply(lambda x : link_entities_text(x))
    return(entity_df_series)

In [6]:
data_df = pd.read_csv('C:\\Users\\gcmar\\Desktop\\GIT_REPOS\\LAB\\Literature_summary\\TPN\\Entity_edgelist\\entomology_machine_learning.csv',sep=';')
data_df

Unnamed: 0,Authors,Author(s) ID,Title,Year,Source title,Volume,Issue,Art. No.,Page start,Page end,...,ISBN,CODEN,PubMed ID,Language of Original Document,Abbreviated Source Title,Document Type,Publication Stage,Open Access,Source,EID
0,"Wetzel M.C., Stuart D.G.",7006494575;57203057621;,Ensemble characterivstics ofcat locovmotionand...,1976,Progress in Neurobiology,7,PART 1,,1,98,...,,PGNBA,785547.0,English,Prog. Neurobiol.,Review,Final,,Scopus,2-s2.0-0017165063
1,Jenkins E.W.,14325242300;,Some Sources for the history of science educat...,1980,Studies in Science Education,7,1,,27,86,...,,,,English,Stud. Sci. Educ.,Article,Final,,Scopus,2-s2.0-84947353367
2,"MOORE D., PENIKAS J., RANKIN M.A.",55197635600;6504171017;7006868718;,Regional specialization for an optomotor respo...,1981,Physiological Entomology,6,1,,61,69,...,,,,English,Physiol.Entomol.,Article,Final,,Scopus,2-s2.0-84981620126
3,Fernald R.D.,7006718987;,Neuroethology according to Hoyle,1984,Behavioral and Brain Sciences,7,3,,387,388,...,,,,English,Behav. Brain Sci.,Article,Final,,Scopus,2-s2.0-84974510172
4,Erber J.,7006073714;,Neuroethology or motorethology?,1984,Behavioral and Brain Sciences,7,3,,386,,...,,,,English,Behav. Brain Sci.,Article,Final,,Scopus,2-s2.0-84974505693
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4328,"Ryu S., Kim S.-C.",55561499900;57007270300;,Knocking and listening: Learning mechanical im...,2020,Sensors (Switzerland),20,2,369,,,...,,,31936449.0,English,Sensors,Article,Final,"All Open Access, Gold, Green",Scopus,2-s2.0-85077845047
4329,"Lhomme P., Williams S.D., Ghisbain G., Martine...",24577383300;57259021000;57210204005;5662285280...,Diversification pattern of the widespread hola...,2020,Insect Systematics and Diversity,5,2,5,,,...,,,,English,Insect Syst. Divers.,Article,Final,"All Open Access, Hybrid Gold",Scopus,2-s2.0-85106259106
4330,"Srikanth R., Rekha K.S., Kiran D.R., Raju P.V.",57222598784;57209336734;57222597171;57222605271;,Use of artificial intelligence in IPM,2020,Indian Journal of Entomology,82,4,,609,616,...,,,,English,Indian. J. Entomol.,Article,Final,,Scopus,2-s2.0-85103391838
4331,"Shi Z., Dang H., Liu Z., Zhou X.",57201977457;57200641138;57201996356;9746372900;,Detection and identification of stored-grain i...,2020,IEEE Access,8,,,163703,163714,...,,,,English,IEEE Access,Article,Final,"All Open Access, Gold",Scopus,2-s2.0-85102839536


In [7]:
# import data
# data_df = pd.read_csv('E:\GIT_REPOS\LAB\Literature_summary\TPN\Papers\\scopus.csv')
data_df = data_df[data_df["Abstract"] != '[No abstract available]']
data_df.reset_index(inplace=True, drop=True)
data_df["Abstract"] = data_df["Abstract"].str.replace(r'(', '')
data_df["Abstract"] = data_df["Abstract"].str.replace(r')', '')
data_df["Abstract"] = data_df["Abstract"].str.replace(r"'", '')
data_df["Abstract"] = data_df["Abstract"].str.replace(r"'", '')
data_df["Abstract"] = data_df["Abstract"].str.replace(r'"', '')
data_df["Abstract"] = data_df["Abstract"].str.replace(r'"', '')
data_df["Abstract"] = data_df["Abstract"].astype(str)

  data_df["Abstract"] = data_df["Abstract"].str.replace(r'(', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_df["Abstract"] = data_df["Abstract"].str.replace(r'(', '')
  data_df["Abstract"] = data_df["Abstract"].str.replace(r')', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_df["Abstract"] = data_df["Abstract"].str.replace(r')', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/use

In [None]:
win_size = 100
start_point = 0 # default to 0
# coref_lst = []
entities_df_lst = []
for i in range(start_point, len(data_df), win_size):
    coref_series = coref_res(text_series=data_df["Abstract"].iloc[i:i+win_size])
    print('coref done', i)
    link_entities_series = link_entities(text_series=coref_series)
    print('entity linking done', i)
    entities_df = pd.concat(link_entities_series.tolist())
    print('df create done', i)
    entities_df_lst.append(entities_df)
    print('df to list done', i, '\n')
all_entities_df = pd.concat(entities_df_lst)
all_entities_df.reset_index(drop=True, inplace=True)
edge_lst_df = all_entities_df.value_counts().reset_index().rename(columns={0: "count"})
edge_lst_df.to_csv('entity_weighted_edgelist.csv')

coref done 0
entity linking done 0
df create done 0
df to list done 0 

coref done 100


In [None]:
# coref_series = coref_res(text_series=data_df["Abstract"][:1000])
# link_entities_series = link_entities(text_series=coref_series)
# all_entities_df = pd.concat(link_entities_series.tolist())