In [1]:
# Add rebel component https://github.com/Babelscape/rebel/blob/main/spacy_component.py
import spacy
import crosslingual_coreference
import requests
import re
import hashlib
import pandas as pd
from spacy import Language
from typing import List
from spacy.tokens import Doc, Span
from transformers import pipeline

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\GCM\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
def call_wiki_api(item):
    """
    Look up `item` with the Wikidata `wbsearchentities` API.

    Args:
        item: Entity surface form to search for (English search).

    Returns:
        The `id` of the first search hit, or the sentinel string 'id-less'
        when the request fails, the response is not the expected JSON, or
        the search returns no hits.
    """
    try:
        # Pass the query through `params` so requests URL-encodes the search
        # term; entity names containing '&', '#', '+', ... would otherwise
        # corrupt the query string.
        response = requests.get(
            "https://www.wikidata.org/w/api.php",
            params={
                "action": "wbsearchentities",
                "search": item,
                "language": "en",
                "format": "json",
            },
            # Without a timeout a stalled connection blocks the whole run.
            timeout=10,
        )
        response.raise_for_status()
        data = response.json()
        # Return the first id (Could upgrade this in the future)
        return data['search'][0]['id']
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
        # Best-effort lookup: network/HTTP errors, undecodable JSON, or an
        # empty/unexpected payload all map to the same sentinel. Unlike a bare
        # `except:`, this lets KeyboardInterrupt propagate.
        return 'id-less'

def extract_triplets(text):
    """
    Parse the generated text and extract the triplets it encodes.

    The text is whitespace-tokenised after removing the "<s>", "<pad>" and
    "</s>" markers. Words following "<triplet>" build the head, words
    following "<subj>" build the tail, and words following "<obj>" build the
    relation type. Words seen before any marker are ignored.

    Returns:
        A list of dicts with the keys 'head', 'type' and 'tail'.
    """
    head_words, tail_words, type_words = [], [], []
    found = []

    def snapshot():
        # Freeze the current accumulators into a triplet dict.
        return {
            'head': ' '.join(head_words),
            'type': ' '.join(type_words),
            'tail': ' '.join(tail_words),
        }

    cleaned = text.strip()
    for marker in ("<s>", "<pad>", "</s>"):
        cleaned = cleaned.replace(marker, "")

    sink = None  # accumulator that receives plain words; None before any marker
    for piece in cleaned.split():
        if piece == "<triplet>":
            sink = head_words
            # A new head starts: flush the pending triplet if it has a relation.
            if type_words:
                found.append(snapshot())
                type_words.clear()
            head_words.clear()
        elif piece == "<subj>":
            sink = tail_words
            # Same head, new tail: flush the pending triplet. The relation is
            # kept here on purpose; it is only reset by "<obj>"/"<triplet>".
            if type_words:
                found.append(snapshot())
            tail_words.clear()
        elif piece == "<obj>":
            sink = type_words
            type_words.clear()
        elif sink is not None:
            sink.append(piece)

    # The trailing triplet is only kept when all three parts are non-empty.
    if head_words and type_words and tail_words:
        found.append(snapshot())

    return found


@Language.factory(
    "rebel",
    requires=["doc.sents"],
    assigns=["doc._.rel"],
    default_config={
        "model_name": "Babelscape/rebel-large",
        "device": 0,
    },
)
class RebelComponent:
    """
    spaCy pipeline component that extracts relation triplets per sentence
    with a text2text-generation model and stores them on `doc._.rel`.

    `doc._.rel` maps a SHA-1 key (built from head + tail + relation type) to
    a dict with the keys "relation", "head_span" and "tail_span"; each span
    holds the entity 'text' and the Wikidata 'id' returned by `get_wiki_id`.
    """

    def __init__(
        self,
        nlp,
        name,
        model_name: str,
        device: int,
    ):
        """
        Args:
            nlp: The pipeline object passed in by the spaCy factory (unused).
            name: The component instance name passed by spaCy (unused).
            model_name: Model/tokenizer identifier handed to `pipeline`.
            device: Device argument handed to `pipeline`.
        """
        assert model_name is not None, "model_name must not be None"
        self.triplet_extractor = pipeline("text2text-generation", model=model_name, tokenizer=model_name, device=device)
        # Cache of entity text -> wiki id, so each entity is looked up once.
        self.entity_mapping = {}
        # Register custom extension on the Doc
        if not Doc.has_extension("rel"):
            Doc.set_extension("rel", default={})

    def get_wiki_id(self, item: str):
        """Return the wiki id for `item`, calling `call_wiki_api` on a cache miss."""
        mapping = self.entity_mapping.get(item)
        if mapping:
            return mapping
        res = call_wiki_api(item)
        self.entity_mapping[item] = res
        return res

    def _generate_triplets(self, sent: Span) -> List[dict]:
        """Generate text for one sentence and parse it into triplet dicts."""
        # NOTE(review): the nested indexing below assumes a specific output
        # layout of the transformers pipeline for return_tensors=True; this
        # may depend on the installed transformers version - confirm.
        output_ids = self.triplet_extractor(sent.text, return_tensors=True, return_text=False)[0]["generated_token_ids"]["output_ids"]
        extracted_text = self.triplet_extractor.tokenizer.batch_decode(output_ids[0])
        extracted_triplets = extract_triplets(extracted_text[0])
        return extracted_triplets

    def set_annotations(self, doc: Doc, triplets: List[dict]):
        """
        Store `triplets` on `doc._.rel`, skipping self-loops, triplets whose
        head or tail does not occur in the document text, and duplicates.
        """
        doc_text = doc.text
        for triplet in triplets:
            head, tail, rel_type = triplet['head'], triplet['tail'], triplet['type']

            # Remove self-loops (relationships that start and end at the entity)
            if head == tail:
                continue

            # Skip the relation if either the head or the tail entity is not
            # present in the text; sometimes the Rebel model hallucinates
            # entities. A literal substring test is used rather than
            # re.search, because entity text is not a regex: names containing
            # '(', '+', '[' ... would raise re.error or match the wrong thing.
            if head not in doc_text or tail not in doc_text:
                continue

            index = hashlib.sha1("".join([head, tail, rel_type]).encode('utf-8')).hexdigest()
            if index not in doc._.rel:
                # Get wiki ids and store results
                doc._.rel[index] = {
                    "relation": rel_type,
                    "head_span": {'text': head, 'id': self.get_wiki_id(head)},
                    "tail_span": {'text': tail, 'id': self.get_wiki_id(tail)},
                }

    def __call__(self, doc: Doc) -> Doc:
        """Extract triplets for every sentence in `doc` and annotate the doc."""
        for sent in doc.sents:
            sentence_triplets = self._generate_triplets(sent)
            self.set_annotations(doc, sentence_triplets)
        return doc

In [3]:
DEVICE = 0  # Number of the GPU, -1 if want to use CPU

# Add coreference resolution model
coref = spacy.load(
    'en_core_web_sm',
    disable=['ner', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer'],
)
coref.add_pipe(
    "xx_coref",
    config={"chunk_size": 2500, "chunk_overlap": 2, "device": DEVICE},
)

# Define rel extraction model.
# The parser is deliberately left enabled: the "rebel" component declares
# requires=["doc.sents"]. The component name is 'attribute_ruler' (as in the
# coref pipeline above); the previous 'attribute_rules' spelling matched no
# component, so it was never actually disabled.
rel_ext = spacy.load(
    'en_core_web_sm',
    disable=['ner', 'lemmatizer', 'attribute_ruler', 'tagger'],
)
rel_ext.add_pipe(
    "rebel",
    config={
        'device': DEVICE,
        # Model used, will default to 'Babelscape/rebel-large' if not given
        'model_name': 'Babelscape/rebel-large',
    },
)

error loading _jsonnet (this is expected on Windows), treating C:\Users\GCM\AppData\Local\Temp\tmpv7oelwq4\config.json as plain json
Some weights of the model checkpoint at nreimers/mMiniLMv2-L12-H384-distilled-from-XLMR-Large were not used when initializing XLMRobertaModel: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at nreimers/mMiniLMv2-L12-H384-distilled-from-XLMR-Large and 

<__main__.RebelComponent at 0x15a36441b50>

In [4]:
# crosslingual_coreference implementation
def coref_res(text_series):
    """
    Run the module-level `coref` pipeline over each text in `text_series`.

    Returns:
        The result of `text_series.apply`, holding each document's
        `._.resolved_text` value.
    """
    def _resolve(text):
        return coref(text)._.resolved_text

    return text_series.apply(_resolve)

# # choose minilm for speed/memory and info_xlm for accuracy
# predictor = Predictor(
#     language="en_core_web_sm", device=-1, model_name="minilm"
# )

In [5]:
def link_entities_text(text):
    """
    Extract relations from `text` with the module-level `rel_ext` pipeline
    and return them as a DataFrame.

    Returns:
        A DataFrame with the columns head_text, head_wiki_id, relation,
        tail_text and tail_wiki_id - one row per entry of `doc._.rel`. When
        extraction raises, a single placeholder row filled with 'rel_err' is
        returned instead. When nothing is extracted the frame is empty but
        still has the five columns.
    """
    try:
        ent_rel_lst = list(rel_ext(text)._.rel.values())
    except Exception:
        # Deliberately best-effort so one bad text does not abort a long run;
        # `Exception` (rather than a bare except) still lets Ctrl-C through.
        print("Could not extract relationships for text")
        ent_rel_lst = [{'relation': 'rel_err',
                        'head_span': {'text': 'rel_err', 'id': 'rel_err'},
                        'tail_span': {'text': 'rel_err', 'id': 'rel_err'}}]

    columns = ['head_text', 'head_wiki_id', 'relation', 'tail_text', 'tail_wiki_id']
    rows = [
        (
            rel['head_span']['text'],
            rel['head_span']['id'],
            rel['relation'],
            rel['tail_span']['text'],
            rel['tail_span']['id'],
        )
        for rel in ent_rel_lst
    ]
    # Explicit `columns` keeps the schema stable for an empty result.
    return pd.DataFrame(rows, columns=columns)

def link_entities(text_series):
    """
    Apply `link_entities_text` to every text in `text_series`.

    Returns:
        The result of `text_series.apply`: one relation DataFrame per text.
    """
    return text_series.apply(link_entities_text)

In [6]:
# Load the labelled export, discard the 'File' column, then remove rows that
# are exact duplicates across the remaining columns.
# NOTE(review): the input path is machine-specific; consider moving it to a
# configurable constant.
data_df = (
    pd.read_csv('E:\\GIT_REPOS\\LAB\\Literature_summary\\Test\\Entity_edgelist\\Input\\front_facing_labels.csv')
    .drop(columns='File')
    .drop_duplicates()
)
data_df

Unnamed: 0,﻿Authors,Author(s) ID,Title,Year,Source title,Volume,Issue,Art. No.,Page start,Page end,...,Link,Abstract,Author Keywords,Index Keywords,References,Document Type,Publication Stage,Open Access,Source,EID
0,"Dolan J.R., Coats D.W.",7202840577;7005698202;,A study of feeding in predacious ciliates usin...,1991,Journal of Plankton Research,13,3,,609,627,...,https://www.scopus.com/inward/record.uri?eid=2...,Feeding in predacious estuarine ciliates was i...,,ciliate; feeding; Cyclidium; Euplotes vannus; ...,"Allen, R.D., Food vacuole membrane growth with...",Article,Final,,Scopus,2-s2.0-0026309866
1,"Hrovat K.B., Harris K.Z., Leach A.D., Russell ...",55923565400;56983156000;7102097910;57216211472...,"The New Food Label, Type of Fat, and Consumer ...",1994,Archives of Family Medicine,3,8,,690,695,...,https://www.scopus.com/inward/record.uri?eid=2...,To determine how frequently lay consumers eval...,,fat; article; classification; consumer; female...,"(1992) Consumer Expenditures in 1991, , Washin...",Article,Final,,Scopus,2-s2.0-0028488355
2,Gilron I.,57207532791;,The introduction of new drugs into anaesthetic...,1995,Canadian Journal of Anaesthesia,42,6,,516,522,...,https://www.scopus.com/inward/record.uri?eid=2...,This article reviews the process by which new ...,Anaesthetics; Clinical pharmacology; Drug indu...,anesthetic agent; new drug; anesthesia; drug a...,"(1970) Br J Anaesth, 42, p. 911. , Anonymous. ...",Article,Final,"All Open Access, Bronze",Scopus,2-s2.0-0029073821
3,"Keller S.B., Landry M., Olson J., Velliquette ...",8069869700;57214620571;7402882503;6507416913;7...,"The effects of nutrition package claims, nutri...",1997,Journal of Public Policy and Marketing,16,2,,256,269,...,https://www.scopus.com/inward/record.uri?eid=2...,In a laboratory experiment using a between-sub...,,,"Alba, J.W., Hutchinson, J.W., Lynch, J., Memor...",Article,Final,"All Open Access, Green",Scopus,2-s2.0-0031314114
4,"Roe B., Levy A.S., Derby B.M.",7102359355;7401905997;7006282373;,The impact of health claims on consumer search...,1999,Journal of Public Policy and Marketing,18,1,,89,105,...,https://www.scopus.com/inward/record.uri?eid=2...,The authors report results of a mall-intercept...,,,"Andrews, J.C., Netemeyer, R.G., Burton, S., Co...",Review,Final,,Scopus,2-s2.0-0033455136
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2351,"Windholtz S., Vinsonneau E., Farris L., Thibon...",57219722820;57216896427;57211680831;2457811710...,Yeast and Filamentous Fungi Microbial Communit...,2021,Frontiers in Microbiology,12,,748416,,,...,https://www.scopus.com/inward/record.uri?eid=2...,Changes are currently being made to winemaking...,biodiversity; bioprotection; maturity level; p...,sulfur dioxide; Article; Ascomycetes; biodiver...,"Agarbati, A., Canonico, L., Mancabelli, L., Mi...",Article,Final,"All Open Access, Gold, Green",Scopus,2-s2.0-85122448660
2352,"Greenberg D., Drewnowski A., Black R., Weststr...",7403344263;7005010204;57546488700;57397988700;...,A Progressive Nutrient Profiling System to Gui...,2021,Frontiers in Nutrition,8,,774409,,,...,https://www.scopus.com/inward/record.uri?eid=2...,Improving the nutrient density of processed fo...,energy density; food choice; food quality; nut...,,(2013) Global Action Plan for the Prevention a...,Article,Final,"All Open Access, Gold, Green",Scopus,2-s2.0-85122294120
2353,"Chen J., Chen X., Ho C.L.",57227468700;57397957000;42761342400;,Recent Development of Probiotic Bifidobacteria...,2021,Frontiers in Bioengineering and Biotechnology,9,,770248,,,...,https://www.scopus.com/inward/record.uri?eid=2...,"Bifidobacterium is a non-spore-forming, Gram-p...",bifidobacteria; genetic engineering; probiotic...,Genetic engineering; Health; Probiotics; Synth...,"Abdelhamid, A.G., Esaam, A., Hazaa, M.M., Cell...",Review,Final,"All Open Access, Gold, Green",Scopus,2-s2.0-85122285325
2354,"Song M., Zhao Y., Li X., Meng L.",57396406300;57221094813;57775236800;57210121930;,The Influence Exerted by Time Frames on Consum...,2021,Frontiers in Psychology,12,,790727,,,...,https://www.scopus.com/inward/record.uri?eid=2...,With the development of Internet e-commerce ch...,date; delay; nearly expired food; time frames;...,,"Aschemann-Witzel, J., Consumer perception and ...",Article,Final,"All Open Access, Gold, Green",Scopus,2-s2.0-85122230852


In [7]:
# import data
# data_df = pd.read_csv('E:\GIT_REPOS\LAB\Literature_summary\TPN\Papers\\scopus.csv')
# Drop rows whose abstract is the '[No abstract available]' placeholder.
# .copy() makes the filtered frame independent, so the column assignment below
# does not write into a slice of the original (SettingWithCopyWarning).
data_df = (
    data_df[data_df["Abstract"] != '[No abstract available]']
    .copy()
    .reset_index(drop=True)
)

# Strip parentheses and quote characters from the abstracts.
# regex=False makes the replacement literal: '(' and ')' are not valid regular
# expressions on their own, and relying on the implicit default raises a
# FutureWarning / changes meaning across pandas versions.
# NOTE(review): the original cell repeated the "'" and '"' replacements twice;
# the duplicates may originally have targeted typographic quotes - confirm.
abstracts = data_df["Abstract"]
for char in ('(', ')', "'", '"'):
    abstracts = abstracts.str.replace(char, '', regex=False)
data_df["Abstract"] = abstracts.astype(str)

  data_df["Abstract"] = data_df["Abstract"].str.replace(r'(', '')
  data_df["Abstract"] = data_df["Abstract"].str.replace(r')', '')


In [None]:
# Process the abstracts in windows of `win_size` rows: resolve coreferences,
# extract/link relations, and collect one relation frame per window.
win_size = 100
start_point = 0  # default to 0
entities_df_lst = []
for window_start in range(start_point, len(data_df), win_size):
    window_abstracts = data_df["Abstract"].iloc[window_start:window_start + win_size]

    coref_series = coref_res(text_series=window_abstracts)
    print('coref done', window_start)

    link_entities_series = link_entities(text_series=coref_series)
    print('entity linking done', window_start)

    entities_df = pd.concat(link_entities_series.tolist())
    print('df create done', window_start)

    entities_df_lst.append(entities_df)
    print('df to list done', window_start, '\n')

# Stack every window and renumber the rows from 0.
all_entities_df = pd.concat(entities_df_lst).reset_index(drop=True)

# Count identical (head, relation, tail) rows to obtain the edge weights.
edge_counts = all_entities_df.value_counts()
edge_lst_df = edge_counts.reset_index().rename(columns={0: "count"})
edge_lst_df.to_csv('entity_weighted_edgelist_FRONLABELS.csv')

coref done 0


In [None]:
# coref_series = coref_res(text_series=data_df["Abstract"][:1000])
# link_entities_series = link_entities(text_series=coref_series)
# all_entities_df = pd.concat(link_entities_series.tolist())