In [1]:
import pandas as pd
import spacy
import neuralcoref
import urllib
from string import punctuation
import nltk
import json
import urllib
import pandas as pd
from neo4j import GraphDatabase
import itertools

# download english files
# spacy.cli.download("en_core_web_md")
# python -m spacy download en

In [2]:
# import data
data_df = pd.read_csv('E:\\GIT_REPOS\\LAB\\Literature_summary\\Papers\\scopus_bark_ambrosia_beetles.csv')
data_df.columns

Index(['Unnamed: 0', 'Authors', 'Author(s) ID', 'Title', 'Year',
       'Source title', 'Volume', 'Issue', 'Art. No.', 'Page start', 'Page end',
       'Page count', 'Cited by', 'DOI', 'Link', 'Affiliations',
       'Authors with affiliations', 'Abstract', 'Author Keywords',
       'Index Keywords', 'Molecular Sequence Numbers', 'Chemicals/CAS',
       'Tradenames', 'Manufacturers', 'Funding Details', 'Funding Text 1',
       'Funding Text 2', 'Funding Text 3', 'Funding Text 4', 'Funding Text 5',
       'Funding Text 6', 'Funding Text 7', 'Funding Text 8', 'Funding Text 9',
       'Funding Text 10', 'References', 'Correspondence Address', 'Editors',
       'Sponsors', 'Publisher', 'Conference name', 'Conference date',
       'Conference location', 'Conference code', 'ISSN', 'ISBN', 'CODEN',
       'PubMed ID', 'Language of Original Document',
       'Abbreviated Source Title', 'Document Type', 'Publication Stage',
       'Open Access', 'Source', 'EID'],
      dtype='object')

In [3]:
data_df['Abstract']

0       Scolytinae species that, in high populations, ...
1       Bamboo (Bambusa sp.) is a grass species with h...
2       Background: Biological invasions are responsib...
3       Background: Fungi associated with insects repr...
4       Background: Separation of biotic and abiotic i...
                              ...                        
3672    Background: Wood-feeding insects often work in...
3673    Twig beetles in the genus Pityophthorus Eichho...
3674    All 8237 species-group taxa of Coleoptera know...
3675    In Australia, the bark beetle Ips grandicollis...
3676    The taxonomic status of Thamnurgus rossicus wa...
Name: Abstract, Length: 3677, dtype: object

In [4]:
sentence = """Elon Musk is a business magnate, industrial designer, and engineer. 
Elon Musk is the founder, CEO, CTO, and chief designer of SpaceX. 
Elon Musk is also early investor, CEO, and product architect of Tesla, Inc. 
Elon Musk is also the founder of The Boring Company and the co-founder of Neuralink.
A centibillionaire, Musk became the richest person in the world in January 2021, 
with an estimated net worth of $185 billion at the time, surpassing Jeff Bezos.
Musk was born to a Canadian mother and South African father and raised in Pretoria, South Africa. 
Elon Musk briefly attended the University of Pretoria before moving to Canada aged 17 to attend Queen's University.
Elon Musk transferred to the University of Pennsylvania two years later, 
where Elon Musk received dual bachelor's degrees in economics and physics. Elon Musk moved to California in 1995 to attend Stanford University, 
but decided instead to pursue a business career. Elon Musk went on co-founding a web software company Zip2 with Elon Musk brother Kimbal Musk."""

## Coreference resolution

In [5]:
# Load SpaCy
nlp = spacy.load('en')
# Add neural coref to SpaCy's pipe
neuralcoref.add_to_pipe(nlp)

def coref_resolution(text):
    """Function that executes coreference resolution on a given text"""
    doc = nlp(text)
    # fetches tokens with whitespaces from spacy document
    tok_list = list(token.text_with_ws for token in doc)
    for cluster in doc._.coref_clusters:
        # get tokens from representative cluster name
        cluster_main_words = set(cluster.main.text.split(' '))
        for coref in cluster:
            if coref != cluster.main:  # if coreference element is not the representative element of that cluster
                if coref.text != cluster.main.text and bool(set(coref.text.split(' ')).intersection(cluster_main_words)) == False:
                    # if coreference element text and representative element text are not equal and none of the coreference element words are in representative element. This was done to handle nested coreference scenarios
                    tok_list[coref.start] = cluster.main.text + \
                        doc[coref.end-1].whitespace_
                    for i in range(coref.start+1, coref.end):
                        tok_list[i] = ""

    return "".join(tok_list)

## Named Entity Linking

In [6]:
ENTITY_TYPES = ["human", "person", "company", "enterprise", "business", "geographic region",
                "human settlement", "geographic entity", "territorial entity type", "organization"]

def wikifier(text, lang="en", threshold=0.8):
    """Function that fetches entity linking results from wikifier.com API"""
    # Prepare the URL.
    data = urllib.parse.urlencode([
        ("text", text), ("lang", lang),
        ("userKey", "tgbdmkpmkluegqfbawcwjywieevmza"),
        ("pageRankSqThreshold", "%g" %
         threshold), ("applyPageRankSqThreshold", "true"),
        ("nTopDfValuesToIgnore", "100"), ("nWordsToIgnoreFromList", "100"),
        ("wikiDataClasses", "true"), ("wikiDataClassIds", "false"),
        ("support", "true"), ("ranges", "false"), ("minLinkFrequency", "2"),
        ("includeCosines", "false"), ("maxMentionEntropy", "3")
    ])
    url = "http://www.wikifier.org/annotate-article"
    # Call the Wikifier and read the response.
    req = urllib.request.Request(url, data=data.encode("utf8"), method="POST")
    with urllib.request.urlopen(req, timeout=60) as f:
        response = f.read()
        response = json.loads(response.decode("utf8"))
    # Output the annotations.
    results = list()
    for annotation in response["annotations"]:
        # Filter out desired entity classes
        if ('wikiDataClasses' in annotation) and (any([el['enLabel'] in ENTITY_TYPES for el in annotation['wikiDataClasses']])):

            # Specify entity label
            if any([el['enLabel'] in ["human", "person"] for el in annotation['wikiDataClasses']]):
                label = 'Person'
            elif any([el['enLabel'] in ["company", "enterprise", "business", "organization"] for el in annotation['wikiDataClasses']]):
                label = 'Organization'
            elif any([el['enLabel'] in ["geographic region", "human settlement", "geographic entity", "territorial entity type"] for el in annotation['wikiDataClasses']]):
                label = 'Location'
            else:
                label = None

            results.append({'title': annotation['title'], 'wikiId': annotation['wikiDataItemId'], 'label': label,
                            'characters': [(el['chFrom'], el['chTo']) for el in annotation['support']]})
    return results

## Relationship extraction

In [7]:
entities_threshold = 0.1
# First get all the entities in the sentence
entities = wikifier(sentence, threshold=entities_threshold)
# Iterate over every permutation pair of entities
for permutation in itertools.permutations(entities, 2):
    for source in permutation[0]['characters']:
        for target in permutation[1]['characters']:
            # Relationship extraction with OpenNRE
            data = relation_model.infer(
                {'text': sentence, 'h': {'pos': [source[0], source[1] + 1]}, 't': {'pos': [target[0], target[1] + 1]}})
            if data[1] > relation_threshold:
                relations_list.append(
                    {'source': permutation[0]['title'], 'target': permutation[1]['title'], 'type': data[0]})


## Knowledge graph

In [9]:
driver = GraphDatabase.driver('bolt://localhost:7687', auth=('neo4j', 'letmein'))

def ie_pipeline(text, relation_threshold=0.9, entities_threshold=0.8):
    # Prepare the URL.
    data = urllib.parse.urlencode([
        ("text", text), ("relation_threshold", relation_threshold),
        ("entities_threshold", entities_threshold)])
    
    url = "http://localhost:5000?" + data
    req = urllib.request.Request(url, data=data.encode("utf8"), method="GET")
    with urllib.request.urlopen(req, timeout=150) as f:
        response = f.read()
        response = json.loads(response.decode("utf8"))
    # Output the annotations.
    return response

import_refactored_query = """
UNWIND $params as value
CREATE (a:Article{content:value.content})
FOREACH (rel in value.ie.relations | 
  MERGE (s:Entity{name:rel.source})
  MERGE (t:Entity{name:rel.target})
  MERGE (s)-[:RELATION]->(r:Relation{type:rel.type})-[:RELATION]->(t)
  MERGE (a)-[:MENTIONS_REL]->(r))
WITH value, a
UNWIND value.ie.entities as entity
MERGE (e:Entity{name:entity.title})
SET e.wikiId = entity.wikiId
MERGE (a)-[:MENTIONS_ENT]->(e)
WITH entity, e
CALL apoc.create.addLabels(e,[entity.label]) YIELD node
RETURN distinct 'done'
"""

with driver.session() as session:
    params = []
    for i,article in list(data.iterrows())[:500]:
        content = article['content']
        ie_data = ie_pipeline(content)
        params.append({'content':content, 'ie':ie_data})

        if (len(params) % 100 == 0):
            session.run(import_refactored_query, {'params':params})
            params = []

    session.run(update_query, {'params':params})

NameError: name 'data' is not defined