In [1]:
import html
import pprint
import re
from html.parser import HTMLParser


class ReutersParser(HTMLParser):

    def __init__(self, encoding='latin-1'):

        html.parser.HTMLParser.__init__(self)
        self._reset()
        self.encoding = encoding

    def _reset(self):
        self.in_body = False
        self.in_topics = False
        self.in_topic_d = False
        self.body = ""
        self.topics = []
        self.topic_d = ""

    def parse(self, fd):

        self.docs = []
        for chunk in fd:
            self.feed(chunk.decode(self.encoding))
            for doc in self.docs:
                yield doc
            self.docs = []
        self.close()

    def handle_starttag(self, tag, attrs):

        if tag == "reuters":
            pass
        elif tag == "body":
            self.in_body = True
        elif tag == "topics":
            self.in_topics = True
        elif tag == "d":
            self.in_topic_d = True 

    def handle_endtag(self, tag):

        if tag == "reuters":
            self.body = re.sub(r'\s+', r' ', self.body)
            self.docs.append( (self.topics, self.body) )
            self._reset()
        elif tag == "body":
            self.in_body = False
        elif tag == "topics":
            self.in_topics = False
        elif tag == "d":
            self.in_topic_d = False
            self.topics.append(self.topic_d)
            self.topic_d = ""  

    def handle_data(self, data):

        if self.in_body:
            self.body += data
        elif self.in_topic_d:
            self.topic_d += data

        
if __name__ == "__main__":
    # Open the first Reuters data set and create the parser
    filename = "reut2-003.sgm"
    parser = ReutersParser()

    # Parse the document and force all generated docs into
    # a list so that it can be printed out to the console
    doc = parser.parse(open(filename, 'rb'))
#    pprint.pprint(list(doc))
    topic= [result for result in list(doc) if result[0]]
#    print(topic)
    list_acq= [topics for topics in topic if 'acq' in topics[0] and topics[1]!='']

In [None]:
# import subprocess
# subprocess.check_call(["python", '-m', 'pip', 'install', 'spacy']) # install pkg


In [None]:
# import subprocess
# subprocess.check_call(["python", '-m', 'spacy', 'download', 'en']) # install pkg


In [5]:
nlp = spacy.load('en')
doc1 = nlp(list_acq[12][1])
displacy.render(doc1,style='ent',jupyter=True)


"USAir Group Inc said Piedmont Aviation Inc has agreed to be acquired for 69 dlrs per share. The company, in a newspaper advertisement, said it has started a tender offer for all Piedmont shares at that price, and the Piedmont board, with two directors absent, has unanimously approved the bid. The offer and withdrawal rights are to expire April Three unless extended, and the bid is to be followed by a merger at the same price. USAir said Piedmont has granted it an irrevocable option to buy up to 3,491,030 new shares under certain circumstances. Piedmont now has about 18.6 mln shares outstanding. USAir said the tender is conditioned on receipt of enough shares to give USAir at least a 50.1 pct interest in Piedmont on a fully diluted basis and approval by the U.S. Department of Transportation of a voting trust agreement permitting USAir to buy and hold shares pending review of its application to gain control of Piedmont. The company said its merger agreement with Piedmont provides that t

In [6]:
for token in doc1:
    print('"'+token.text+ '"', token.idx)

"Allegheny" 0
"International" 10
"Inc" 24
"said" 28
"it" 33
"has" 36
"entered" 40
"into" 48
"an" 53
"agreement" 56
"to" 66
"merge" 69
"with" 75
"an" 80
"affiliate" 83
"of" 93
"First" 96
"Boston" 102
"Inc" 109
"'s" 112
"<" 115
"FPC" 116
">" 119
"First" 121
"Boston" 127
"Corp" 134
"in" 139
"a" 142
"transaction" 144
"valued" 156
"at" 163
"about" 166
"500" 172
"mln" 176
"dlrs" 180
"." 184
"Allegheny" 186
"said" 196
"the" 201
"agreement" 205
"calls" 215
"for" 221
"holders" 225
"of" 233
"its" 236
"common" 240
"to" 247
"receive" 250
"24.60" 258
"dlrs" 264
"a" 269
"share" 271
"." 276
"Holders" 278
"of" 286
"the" 289
"company" 293
"'s" 300
"2.19" 303
"dlrs" 308
"cumulative" 313
"preference" 324
"shares" 335
"will" 342
"receive" 347
"20" 355
"dlrs" 358
"a" 363
"share" 365
"and" 371
"those" 375
"owning" 381
"its" 388
"11.25" 392
"dlrs" 398
"convertible" 403
"preferred" 415
"will" 425
"receive" 430
"87.50" 438
"dlrs" 444
"a" 449
"share" 451
"." 456
"Allegheny" 458
"International" 468
"said" 482
"t

In [None]:
for sent in doc.sents:
    print()
    print(sent)

In [7]:
print([(token.text, token.tag_) for token in doc])

[]


In [8]:
for ent in doc.ents:
    print(ent.text, ent.label_)

AttributeError: 'generator' object has no attribute 'ents'

In [None]:
#doc = nlp(list_acq[2][1])
list_acq[2][1]


In [None]:
for ent in doc.ents:
    print(ent.text, ent.label_)

In [None]:
for sent in doc.sents:
    print()
    print(sent)

In [None]:
for ent in doc.ents:
    print(ent.text, ent.label_)

In [None]:
spacy.explain('GPE')

In [None]:
displacy.render(doc,style='ent',jupyter=True)

In [None]:
spacy.explain('ORG')

In [None]:
displacy.render(nlp(list_acq[12][1]),style='ent',jupyter=True)


In [None]:
for token in doc.ents:
    print(token.text, token.label_)

In [None]:
list_acq[12][1]
