# nlp

This module supports NLP methods. For documentation, see:
https://stanfordnlp.github.io/CoreNLP/index.html
We are aware that starting the engine inside each method affects efficiency.
Still, don't set `core_nlp_engine` as a global variable,
because that way the Java processes will not be killed.

In [None]:
#| default_exp ie_func.nlp

In [None]:
#| hide
from nbdev.showdoc import show_doc
%load_ext autoreload
%autoreload 2

In [None]:
#| export
import json
import logging
from io import BytesIO
from os import popen
from pathlib import Path
from typing import Iterator
from zipfile import ZipFile
import os
import configparser

import jdk
from spanner_nlp.StanfordCoreNLP import StanfordCoreNLP

from spannerlib.utils import download_file_from_google_drive, get_base_file_path, get_lib_name
from spannerlib.span import Span

In [None]:
#| export
# Minimum Java version accepted by _is_installed_java; compared as a float
# against the "major.minor" prefix parsed from `java -version`.
JAVA_MIN_VERSION = 1.8

# Download URL for the CoreNLP archive. Not referenced elsewhere in the visible
# code; the download itself goes through STANFORD_ZIP_GOOGLE_DRIVE_ID.
NLP_URL = "https://drive.google.com/u/0/uc?export=download&id=1QixGiHD2mHKuJtB69GHDQA0wTyXtHzjl"
# Name of the CoreNLP directory looked up under CURR_DIR.
NLP_DIR_NAME = 'stanford-corenlp-4.1.0'
# Base directory used both for the downloaded zip and for the extracted files.
CURR_DIR = get_base_file_path()/get_lib_name()
# String form of the CoreNLP directory; this is what gets passed to StanfordCoreNLP.
NLP_DIR_PATH = str(CURR_DIR / NLP_DIR_NAME)
# Not referenced elsewhere in the visible code.
JAVA_DOWNLOADER = "install-jdk"
_USER_DIR = Path.home()
# _is_installed_java treats the existence of this directory as "Java is installed".
# NOTE(review): presumably the default target of jdk.install(..., jre=True) - confirm.
INSTALLATION_PATH = _USER_DIR / ".jre"

# Google Drive file id and local file name/path of the CoreNLP zip archive.
STANFORD_ZIP_GOOGLE_DRIVE_ID = "1QixGiHD2mHKuJtB69GHDQA0wTyXtHzjl"
STANFORD_ZIP_NAME = "stanford-corenlp-4.1.0.zip"
STANFORD_ZIP_PATH = CURR_DIR / STANFORD_ZIP_NAME

In [None]:
#| export
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

In [None]:
#| export
def _is_installed_nlp() -> bool:
    """Report whether the CoreNLP directory (`NLP_DIR_PATH`) exists on disk."""
    nlp_dir = Path(NLP_DIR_PATH)
    return nlp_dir.is_dir()

In [None]:
#| export
def _install_nlp() -> None:
    """Download (if needed) and unpack the Stanford CoreNLP archive into `CURR_DIR`.

    The archive is fetched from Google Drive only when `STANFORD_ZIP_PATH` is
    not already present on disk; it is then extracted into `CURR_DIR`.

    Raises:
        zipfile.BadZipFile: if the archive on disk is not a valid zip file.
            The broken file is removed before re-raising, so a later call
            downloads it again instead of failing on the same cached file.
    """
    # Local import: the module-level import only brings `ZipFile` into scope.
    from zipfile import BadZipFile

    logger.info(f"Installing {NLP_DIR_NAME} into {CURR_DIR}.")
    if not STANFORD_ZIP_PATH.is_file():
        logger.info(f"downloading {STANFORD_ZIP_NAME}...")
        download_file_from_google_drive(STANFORD_ZIP_GOOGLE_DRIVE_ID, STANFORD_ZIP_PATH)

    try:
        # ZipFile can read straight from the path, so the archive does not have
        # to be loaded into memory as a whole first.
        with ZipFile(STANFORD_ZIP_PATH) as zfile:
            logger.info("Extracting files from the zip folder...")
            zfile.extractall(CURR_DIR)
    except BadZipFile:
        # Drop the unusable download; otherwise the `is_file()` check above
        # would keep reusing it on every subsequent attempt.
        STANFORD_ZIP_PATH.unlink()
        raise

    logger.info("installation completed.")

In [None]:
#| export
def _is_installed_java() -> bool:
    """Report whether a usable Java runtime appears to be available.

    The check passes when either
      * the `java` found by the shell reports a "major.minor" version that,
        read as a float, is at least `JAVA_MIN_VERSION`, or
      * `INSTALLATION_PATH` exists as a directory.

    The version is extracted with a shell pipeline (`grep`/`awk`); when the
    pipeline prints nothing or something that is not a number, the function
    falls back to the directory check instead of raising.
    """
    command = "java -version 2>&1 | grep 'version' 2>&1 | awk -F\\\" '{ split($2,a,\".\"); print a[1]\".\"a[2]}'"
    # Use the pipe as a context manager so it is closed deterministically.
    with popen(command) as pipe:
        version = pipe.read()

    try:
        if float(version) >= JAVA_MIN_VERSION:
            return True
    except ValueError:
        # Empty or unparsable output (e.g. no java on PATH, or an unexpected
        # `java -version` format): fall through to the directory check.
        pass

    return Path(INSTALLATION_PATH).is_dir()

In [None]:
#| export
def _run_installation() -> None:
    """Make sure CoreNLP and a Java runtime are installed, installing what is missing.

    Raises:
        IOError: if a component is still not detected after its installation step.
    """
    if not _is_installed_nlp():
        _install_nlp()
        # Explicit check rather than `assert`, which is stripped under `python -O`.
        if not _is_installed_nlp():
            raise IOError("installation failed")

    if not _is_installed_java():
        logger.info(f"Installing JRE into {INSTALLATION_PATH}.")
        jdk.install('8', jre=True)
        if not _is_installed_java():
            raise IOError("installation failed")
        logger.info("installation completed.")

In [None]:
#| export
# Shared CoreNLP engine used by all the *_wrapper functions below; stays None
# until download_and_install_nlp() succeeds.
CoreNLPEngine = None


def download_and_install_nlp():
    """Install the NLP prerequisites and start the module-level `CoreNLPEngine`.

    Runs `_run_installation()` and then binds the global `CoreNLPEngine` to a
    `StanfordCoreNLP` instance created from `NLP_DIR_PATH`.

    This is best-effort: any exception is logged (with its traceback) and
    swallowed, in which case `CoreNLPEngine` keeps its previous value.
    """
    global CoreNLPEngine
    try:
        _run_installation()
        CoreNLPEngine = StanfordCoreNLP(NLP_DIR_PATH)
    except Exception as e:
        # logger.exception logs at ERROR level like before, but also records
        # the traceback so the failure can actually be diagnosed.
        logger.exception(f"Installation NLP failed {e}")

In [None]:
# download_and_install_nlp()

Installation NLP failed File is not a zip file


In [None]:
#| export
def tokenize_wrapper(sentence: str) -> Iterator:
    """Yield a `(token, span)` pair for each entry returned by `CoreNLPEngine.tokenize`.

    Relies on the module-level `CoreNLPEngine` having been initialised.
    """
    yield from (
        (entry["token"], entry["span"])
        for entry in CoreNLPEngine.tokenize(sentence)
    )

In [None]:
#| export
# Record describing tokenize_wrapper: [name, callable, input types, output types].
# NOTE(review): field meanings are inferred from the values - confirm against the consumer.
Tokenize = [
    'Tokenize',
    tokenize_wrapper,
    [str],
    [str, Span]  # one entry per element of the tuples tokenize_wrapper yields
]

In [None]:
#| export
def ssplit_wrapper(sentence: str) -> Iterator:
    """Yield a 1-tuple for each item returned by `CoreNLPEngine.ssplit`.

    Relies on the module-level `CoreNLPEngine` having been initialised.
    """
    pieces = CoreNLPEngine.ssplit(sentence)
    for piece in pieces:
        # Wrapped in a tuple on purpose: every wrapper yields tuples.
        yield (piece,)

In [None]:
#| export
# Record describing ssplit_wrapper: [name, callable, input types, output types].
# NOTE(review): field meanings are inferred from the values - confirm against the consumer.
SSplit = [
    'SSplit',
    ssplit_wrapper,
    [str],
    [str]  # ssplit_wrapper yields 1-tuples
]

In [None]:
#| export
def pos_wrapper(sentence: str) -> Iterator:
    """Yield `(token, pos, span)` for each entry returned by `CoreNLPEngine.pos`.

    Relies on the module-level `CoreNLPEngine` having been initialised.
    """
    for entry in CoreNLPEngine.pos(sentence):
        token, tag, span = entry["token"], entry["pos"], entry["span"]
        yield token, tag, span

In [None]:
#| export
# Record describing pos_wrapper: [name, callable, input types, output types].
# NOTE(review): field meanings are inferred from the values - confirm against the consumer.
POS = [
    'POS',
    pos_wrapper,
    [str],
    [str, str, Span]  # one entry per element of the tuples pos_wrapper yields
]

In [None]:
#| export
def lemma_wrapper(sentence: str) -> Iterator:
    """Yield `(token, lemma, span)` for each entry returned by `CoreNLPEngine.lemma`.

    Relies on the module-level `CoreNLPEngine` having been initialised.
    """
    yield from (
        (entry["token"], entry["lemma"], entry["span"])
        for entry in CoreNLPEngine.lemma(sentence)
    )

In [None]:
#| export
# Record describing lemma_wrapper: [name, callable, input types, output types].
# NOTE(review): field meanings are inferred from the values - confirm against the consumer.
Lemma = [
    'Lemma',
    lemma_wrapper,
    [str],
    [str, str, Span]  # one entry per element of the tuples lemma_wrapper yields
]

In [None]:
#| export
def ner_wrapper(sentence: str) -> Iterator:
    """Yield `(token, ner, span)` for entries of `CoreNLPEngine.ner` whose tag is not 'O'.

    Relies on the module-level `CoreNLPEngine` having been initialised.
    """
    yield from (
        (entry["token"], entry["ner"], entry["span"])
        for entry in CoreNLPEngine.ner(sentence)
        if entry["ner"] != 'O'  # entries tagged 'O' are skipped
    )

In [None]:
#| export
# Record describing ner_wrapper: [name, callable, input types, output types].
# NOTE(review): field meanings are inferred from the values - confirm against the consumer.
NER = [
    'NER',
    ner_wrapper,
    [str],
    [str, str, Span]  # one entry per element of the tuples ner_wrapper yields

]

In [None]:
#| export
def entitymentions_wrapper(sentence: str) -> Iterator:
    """Yield one 9-tuple per item returned by `CoreNLPEngine.entitymentions`.

    The tuple holds, in order: docTokenBegin, docTokenEnd, tokenBegin, tokenEnd,
    text, characterOffsetBegin, characterOffsetEnd, ner, and the item's
    `nerConfidences` serialised to JSON with double quotes replaced by single
    quotes. Relies on the module-level `CoreNLPEngine` having been initialised.
    """
    leading_keys = ("docTokenBegin", "docTokenEnd", "tokenBegin", "tokenEnd", "text",
                    "characterOffsetBegin", "characterOffsetEnd", "ner")
    for mention in CoreNLPEngine.entitymentions(sentence):
        confidences_json = json.dumps(mention["nerConfidences"])
        confidence = confidences_json.replace("\"", "'")
        yield (*(mention[key] for key in leading_keys), confidence)

In [None]:
#| export
# Record describing entitymentions_wrapper: [name, callable, input types, output types].
# NOTE(review): field meanings are inferred from the values - confirm against the consumer.
EntityMentions = [
    'EntityMentions',
    entitymentions_wrapper,
    [str],
    # one entry per element of the 9-tuples entitymentions_wrapper yields
    [int, int, int, int, str, int, int, str, str]
]

In [None]:
#| export
def cleanxml_wrapper(sentence: str) -> Iterator:
    """Yield one 5-tuple per entry under the "tokens" key of `CoreNLPEngine.cleanxml`'s result.

    The tuple holds, in order: index, word, originalText, characterOffsetBegin,
    characterOffsetEnd. Relies on the module-level `CoreNLPEngine` having been
    initialised.
    """
    columns = ('index', 'word', 'originalText', 'characterOffsetBegin', 'characterOffsetEnd')
    tokens = CoreNLPEngine.cleanxml(sentence)["tokens"]
    for token in tokens:
        yield tuple(token[column] for column in columns)

In [None]:
#| export
# Record describing cleanxml_wrapper: [name, callable, input types, output types].
# NOTE(review): field meanings are inferred from the values - confirm against the consumer.
CleanXML = [
    'CleanXML',
    cleanxml_wrapper,
    [str],
    [int, str, str, int, int]  # one entry per element of the tuples cleanxml_wrapper yields
]

In [None]:
#| export
def parse_wrapper(sentence: str) -> Iterator:
    """Yield a 1-tuple holding a flattened string for each item of `CoreNLPEngine.parse`.

    Newlines are replaced with the marker `<nl>` and carriage returns are
    removed, so that each result stays on a single line and results can be told
    apart. Relies on the module-level `CoreNLPEngine` having been initialised.
    """
    for tree in CoreNLPEngine.parse(sentence):
        flattened = tree.replace("\n", "<nl>")
        flattened = flattened.replace("\r", "")
        # Wrapped in a tuple on purpose: every wrapper yields tuples.
        yield (flattened,)

In [None]:
#| export
# Record describing parse_wrapper: [name, callable, input types, output types].
# NOTE(review): field meanings are inferred from the values - confirm against the consumer.
Parse = [
    'Parse',
    parse_wrapper,
    [str],
    [str]  # parse_wrapper yields 1-tuples
]

In [None]:
#| export
def dependency_parse_wrapper(sentence: str) -> Iterator:
    """Yield `(dep, governor, governorGloss, dependent, dependentGloss)` per dependency.

    One tuple is produced for each item returned by
    `CoreNLPEngine.dependency_parse`. Relies on the module-level
    `CoreNLPEngine` having been initialised.
    """
    yield from (
        (edge['dep'], edge['governor'], edge['governorGloss'], edge['dependent'], edge['dependentGloss'])
        for edge in CoreNLPEngine.dependency_parse(sentence)
    )

In [None]:
#| export
# Record describing dependency_parse_wrapper: [name, callable, input types, output types].
# NOTE(review): field meanings are inferred from the values - confirm against the consumer.
DepParse = [
    'DepParse',
    dependency_parse_wrapper,
    [str],
    [str, int, str, int, str]  # one entry per element of the tuples dependency_parse_wrapper yields
]

In [None]:
#| export
def coref_wrapper(sentence: str) -> Iterator:
    """Yield one 12-tuple per item returned by `CoreNLPEngine.coref`.

    The tuple holds, in order: id, text, type, number, gender, animacy,
    startIndex, endIndex, headIndex, sentNum, the item's `position` converted
    to a tuple, and `isRepresentativeMention` converted to `str`.
    Relies on the module-level `CoreNLPEngine` having been initialised.
    """
    plain_keys = ('id', 'text', 'type', 'number', 'gender', 'animacy',
                  'startIndex', 'endIndex', 'headIndex', 'sentNum')
    for mention in CoreNLPEngine.coref(sentence):
        plain_values = tuple(mention[key] for key in plain_keys)
        position = tuple(mention['position'])
        is_representative = str(mention['isRepresentativeMention'])
        yield plain_values + (position, is_representative)

In [None]:
#| export
# Record describing coref_wrapper: [name, callable, input types, output types].
# NOTE(review): field meanings are inferred from the values - confirm against the consumer.
Coref = [
    'Coref',
    coref_wrapper,
    [str],
    # one entry per element of the 12-tuples coref_wrapper yields
    [int, str, str, str, str, str, int, int, int, int, Span, str]
]

In [None]:
#| export
def openie_wrapper(sentence: str) -> Iterator:
    """Yield `(subject, subjectSpan, relation, relationSpan, object, objectSpan)` tuples.

    `CoreNLPEngine.openie` returns an iterable of lists; one tuple is produced
    for every element of every list, with the three span values converted to
    tuples. Relies on the module-level `CoreNLPEngine` having been initialised.
    """
    triples = (triple for group in CoreNLPEngine.openie(sentence) for triple in group)
    for triple in triples:
        subject = triple['subject']
        subject_span = tuple(triple['subjectSpan'])
        relation = triple['relation']
        relation_span = tuple(triple['relationSpan'])
        obj = triple['object']
        object_span = tuple(triple['objectSpan'])
        yield subject, subject_span, relation, relation_span, obj, object_span

In [None]:
#| export
# Record describing openie_wrapper: [name, callable, input types, output types].
# NOTE(review): field meanings are inferred from the values - confirm against the consumer.
OpenIE = [
    'OpenIE',
    openie_wrapper,
    [str],
    [str, Span, str, Span, str, Span]  # one entry per element of the tuples openie_wrapper yields
]

In [None]:
#| export
def kbp_wrapper(sentence: str) -> Iterator:
    """Yield `(subject, subjectSpan, relation, relationSpan, object, objectSpan)` tuples.

    `CoreNLPEngine.kbp` returns an iterable of lists; one tuple is produced for
    every element of every list, with the three span values converted to
    tuples. Relies on the module-level `CoreNLPEngine` having been initialised.
    """
    yield from (
        (
            triple['subject'], tuple(triple['subjectSpan']),
            triple['relation'], tuple(triple['relationSpan']),
            triple['object'], tuple(triple['objectSpan']),
        )
        for group in CoreNLPEngine.kbp(sentence)
        for triple in group
    )

In [None]:
#| export
# Record describing kbp_wrapper: [name, callable, input types, output types].
# NOTE(review): field meanings are inferred from the values - confirm against the consumer.
KBP = [
    'KBP',
    kbp_wrapper,
    [str],
    [str, Span, str, Span, str, Span]  # one entry per element of the tuples kbp_wrapper yields
]

In [None]:
#| export
def quote_wrapper(sentence: str) -> Iterator:
    """Yield one 10-tuple per item returned by `CoreNLPEngine.quote`.

    The tuple holds, in order: id, text, beginIndex, endIndex, beginToken,
    endToken, beginSentence, endSentence, speaker, canonicalSpeaker.
    Relies on the module-level `CoreNLPEngine` having been initialised.
    """
    columns = ('id', 'text', 'beginIndex', 'endIndex', 'beginToken', 'endToken',
               'beginSentence', 'endSentence', 'speaker', 'canonicalSpeaker')
    for quote in CoreNLPEngine.quote(sentence):
        yield tuple(quote[column] for column in columns)

In [None]:
#| export
# Record describing quote_wrapper: [name, callable, input types, output types].
# NOTE(review): field meanings are inferred from the values - confirm against the consumer.
Quote = [
    'Quote',
    quote_wrapper,
    [str],
    # one entry per element of the 10-tuples quote_wrapper yields
    [int, str, int, int, int, int, int, int, str, str]
]

In [None]:
#| export
# currently ignoring sentimentTree
def sentiment_wrapper(sentence: str) -> Iterator:
    """Yield `(sentimentValue, sentiment, sentimentDistribution)` per item of `CoreNLPEngine.sentiment`.

    `sentimentValue` is converted to `int` and `sentimentDistribution` is
    serialised to a JSON string. Relies on the module-level `CoreNLPEngine`
    having been initialised.
    """
    for item in CoreNLPEngine.sentiment(sentence):
        value = int(item['sentimentValue'])
        label = item['sentiment']
        distribution = json.dumps(item['sentimentDistribution'])
        yield value, label, distribution

In [None]:
#| export
# Record describing sentiment_wrapper: [name, callable, input types, output types].
# NOTE(review): field meanings are inferred from the values - confirm against the consumer.
Sentiment = [
    'Sentiment',
    sentiment_wrapper,
    [str],
    [int, str, str]  # one entry per element of the tuples sentiment_wrapper yields
]

In [None]:
#| export
def truecase_wrapper(sentence: str) -> Iterator:
    """Yield `(token, span, truecase, truecaseText)` per item of `CoreNLPEngine.truecase`.

    Relies on the module-level `CoreNLPEngine` having been initialised.
    """
    yield from (
        (entry['token'], entry['span'], entry['truecase'], entry['truecaseText'])
        for entry in CoreNLPEngine.truecase(sentence)
    )

In [None]:
#| export
# Record describing truecase_wrapper: [name, callable, input types, output types].
# NOTE(review): field meanings are inferred from the values - confirm against the consumer.
TrueCase = [
    'TrueCase',
    truecase_wrapper,
    [str],
    [str, Span, str, str]  # one entry per element of the tuples truecase_wrapper yields
]

In [None]:
#|hide
# Runs nbdev's export step (nbdev.nbdev_export) whenever this cell is executed.
import nbdev; nbdev.nbdev_export()
     