In [3]:
# Install into the environment of the running kernel (%pip rather than !pip),
# pinned to the version this notebook was last run with.
%pip install morfessor==2.0.6

Collecting morfessor
  Downloading https://files.pythonhosted.org/packages/39/e6/7afea30be2ee4d29ce9de0fa53acbb033163615f849515c0b1956ad074ee/Morfessor-2.0.6-py3-none-any.whl
Installing collected packages: morfessor
Successfully installed morfessor-2.0.6


In [23]:
import os
from collections import defaultdict
from itertools import groupby

import nltk
from nltk.tag import StanfordNERTagger
from polyglot.text import Text

In [25]:
# Directory holding the plain-text documents, relative to the notebook.
CORPUS = os.path.join("../data", "item1")
## (Disabled) Encode UTF-8 and remove non-printable characters
#    document = filter(
#        lambda char: char in string.printable,
#        unicodedata.normalize('NFKD', document.decode('utf-8'))
#    )
# Raw string: `\.` must reach the regex engine as an escaped dot; in a normal
# string literal it is an invalid escape sequence and triggers a warning.
kddcorpus = nltk.corpus.PlaintextCorpusReader(CORPUS, r'.*\.txt')

In [26]:
def polyglot_entities(fileids=None, section = None, corpus=kddcorpus):
    """
    Extract entities from each file using polyglot.

    Parameters
    ----------
    fileids : list of str, optional
        File ids to process. When omitted (or empty), every id returned by
        ``corpus.fileids()`` is used.
    section : optional
        When not None, only the text obtained from
        ``sectpull([fileid], section=section)`` is analysed; otherwise the
        file's full raw text (``corpus.raw(fileid)``) is used.
        NOTE(review): ``sectpull`` is not defined in the visible notebook
        cells; it must exist in the namespace before a section is requested.
    corpus : optional
        Corpus reader providing ``fileids()`` and ``raw(fileid)``. Defaults to
        ``kddcorpus``.

    Returns
    -------
    defaultdict
        ``results[fileid][key]`` is a list of entity strings, where ``key`` is
        one of ``'persons'``, ``'organizations'``, ``'locations'`` or
        ``'other'``.
    """
    # polyglot labels entities I-PER / I-ORG / I-LOC; anything else is 'other'.
    tag_to_key = {
        'I-PER': 'persons',
        'I-ORG': 'organizations',
        'I-LOC': 'locations',
    }

    results = defaultdict(lambda: defaultdict(list))
    fileids = fileids or corpus.fileids()

    for fileid in fileids:
        if section is not None:
            # Uses element [1] of the first item produced by sectpull as the
            # text -- presumably (fileid, text) pairs; verify against sectpull.
            text = Text((list(sectpull([fileid],section=section))[0][1]))
        else:
            text = Text(corpus.raw(fileid))

        for entity in text.entities:
            # An entity is a sequence of words; join them into one string.
            etext = " ".join(entity)
            key = tag_to_key.get(entity.tag, 'other')
            results[fileid][key].append(etext)

    return results

In [27]:
def stanford_entities(model, jar, fileids=None, corpus=kddcorpus, section = None):
    """
    Extract entities using the Stanford NER tagger.
    Must pass in the path to the tagging model and jar as downloaded from the
    Stanford Core NLP website.

    Parameters
    ----------
    model : str
        Path to the Stanford NER classifier model.
    jar : str
        Path to the Stanford NER jar.
    fileids : list of str, optional
        File ids to process. When omitted (or empty), every id returned by
        ``corpus.fileids()`` is used.
    corpus : optional
        Corpus reader providing ``fileids()`` and ``words(fileid)``. Defaults
        to ``kddcorpus``.
    section : optional
        When not None, only the text obtained from
        ``sectpull([fileid], section=section)`` is tokenized and tagged;
        otherwise ``corpus.words(fileid)`` is tagged.
        NOTE(review): ``sectpull`` is not defined in the visible notebook
        cells; it must exist in the namespace before a section is requested.

    Returns
    -------
    defaultdict
        ``results[fileid][key]`` is a list of entity strings, where ``key`` is
        one of ``'persons'``, ``'organizations'``, ``'locations'`` or
        ``'other'``.

    Notes
    -----
    An entity is a maximal run of consecutive tokens sharing the same non-'O'
    tag. A run that reaches the end of the token stream is emitted too, and a
    change of tag starts a new entity instead of being folded into the
    previous one.
    """
    tag_to_key = {
        'PERSON': 'persons',
        'ORGANIZATION': 'organizations',
        'LOCATION': 'locations',
    }

    results = defaultdict(lambda: defaultdict(list))
    fileids = fileids or corpus.fileids()
    tagger  = StanfordNERTagger(model, jar)

    for fileid in fileids:
        if section is not None:
            # Uses element [1] of the first item produced by sectpull as the
            # text -- presumably (fileid, text) pairs; verify against sectpull.
            text = nltk.word_tokenize(list(sectpull([fileid],section=section))[0][1])
        else:
            text  = corpus.words(fileid)

        # Group consecutive (token, tag) pairs by tag; each non-'O' group is
        # one entity, including a group that ends the document.
        for tag, run in groupby(tagger.tag(text), key=lambda pair: pair[1]):
            if tag == 'O':
                continue

            etext = " ".join(token for token, _ in run)
            key = tag_to_key.get(tag, 'other')
            results[fileid][key].append(etext)

    return results

In [28]:
def nltk_entities(fileids=None, section = None,corpus=kddcorpus):
    """
    Collect named entities per file with NLTK's built-in NE chunker.

    Each file (or, when ``section`` is not None, the text obtained through
    ``sectpull`` for that section) is POS-tagged and chunked. Chunks labelled
    PERSON, ORGANIZATION and LOCATION are stored under 'persons',
    'organizations' and 'locations'; GPE chunks are stored under 'other';
    chunks with any other label are dropped.

    Returns a nested defaultdict: ``found[fileid][key]`` -> list of entity
    strings.
    """
    bucket_for_label = {
        'PERSON': 'persons',
        'ORGANIZATION': 'organizations',
        'LOCATION': 'locations',
        'GPE': 'other',
    }

    found = defaultdict(lambda: defaultdict(list))
    if not fileids:
        fileids = corpus.fileids()

    for fid in fileids:
        if section is None:
            tokens = corpus.words(fid)
        else:
            section_text = list(sectpull([fid],section=section))[0][1]
            tokens = nltk.word_tokenize(section_text)
        tagged = nltk.pos_tag(tokens)

        for node in nltk.ne_chunk(tagged):
            # Plain (word, tag) tuples are tokens outside any entity.
            if not isinstance(node, nltk.tree.Tree):
                continue

            bucket = bucket_for_label.get(node.label())
            if bucket is None:
                continue

            words = [word for word, _pos in node.leaves()]
            found[fid][bucket].append(" ".join(words))

    return found

In [29]:
# Only extract our annotated files.
fids  = ['msft-item1-2018.txt', 'goog-item1-2016.txt']

# NOTE(review): every call below passes section='top', which makes the
# extractor call `sectpull`. That name is not defined in the cells shown in
# this notebook, and the recorded output of this cell is
# "NameError: name 'sectpull' is not defined". Define or import `sectpull`
# before running this cell.

# NLTK Entities
nltkents = nltk_entities(fids, section='top')

# Polyglot Entities
polyents = polyglot_entities(fids, section='top')

# Stanford model and jar paths, expected under ~/models/stanford-ner-2014-01-04/
root  = os.path.expanduser('~/models/stanford-ner-2014-01-04/')
model = os.path.join(root, 'classifiers/english.muc.7class.distsim.crf.ser.gz')
jar   = os.path.join(root, 'stanford-ner-2014-01-04.jar')

# Stanford Entities
stanents = stanford_entities(model, jar, fids, section='top')

NameError: name 'sectpull' is not defined

In [None]:
## Reference: https://www.districtdatalabs.com/named-entity-recognition-and-classification-for-entity-extraction