In [22]:
import html
import pprint
import re
from html.parser import HTMLParser


class ReutersParser(HTMLParser):

    def __init__(self, encoding='latin-1'):

        html.parser.HTMLParser.__init__(self)
        self._reset()
        self.encoding = encoding

    def _reset(self):
        self.in_body = False
        self.in_topics = False
        self.in_topic_d = False
        self.body = ""
        self.topics = []
        self.topic_d = ""

    def parse(self, fd):

        self.docs = []
        for chunk in fd:
            self.feed(chunk.decode(self.encoding))
            for doc in self.docs:
                yield doc
            self.docs = []
        self.close()

    def handle_starttag(self, tag, attrs):

        if tag == "reuters":
            pass
        elif tag == "body":
            self.in_body = True
        elif tag == "topics":
            self.in_topics = True
        elif tag == "d":
            self.in_topic_d = True 

    def handle_endtag(self, tag):

        if tag == "reuters":
            self.body = re.sub(r'\s+', r' ', self.body)
            self.docs.append( (self.topics, self.body) )
            self._reset()
        elif tag == "body":
            self.in_body = False
        elif tag == "topics":
            self.in_topics = False
        elif tag == "d":
            self.in_topic_d = False
            self.topics.append(self.topic_d)
            self.topic_d = ""  

    def handle_data(self, data):

        if self.in_body:
            self.body += data
        elif self.in_topic_d:
            self.topic_d += data

        
if __name__ == "__main__":
    # Open the first Reuters data set and create the parser
    filename = "reut2-003.sgm"
    parser = ReutersParser()

    # Parse the document and force all generated docs into
    # a list so that it can be printed out to the console
    doc = parser.parse(open(filename, 'rb'))
#    pprint.pprint(list(doc))
    s= [result for result in list(doc)]
    print(s[13])


(['usa', 'ussr'], 'Senior U.S. Arms control officials said they were optimistic the United States and Soviet Union could reach agreement on ways to verify a pact to eliminate medium-range nuclear missiles in Europe. Chief U.S. Arms control negotiator Max Kampelman said on the NBC television network a fair pact would be hard to negotiate, but, "We are determined to do it." Assistant Secretary of Defence for international security policy Richard Perle said he thought the two sides could agree on a method to ensure each side was honouring a missile pact. President Reagan said on Friday that Secretary of State George Shultz would go to Moscow next month for talks on arms control and a possible U.S.-Soviet summit meeting. The decision to send Shultz to Moscow followed an announcement by Soviet leader Mikhael Gorbachev that he was willing to separate elimination of medium-range missiles in Europe from his demand for curbs on U.S. Development of a Strategic Defence Initiative (SDI) anti-missi

In [26]:
    print(s[12])

(['acq', 'usa'], "USAir Group Inc said Piedmont Aviation Inc has agreed to be acquired for 69 dlrs per share. The company, in a newspaper advertisement, said it has started a tender offer for all Piedmont shares at that price, and the Piedmont board, with two directors absent, has unanimously approved the bid. The offer and withdrawal rights are to expire April Three unless extended, and the bid is to be followed by a merger at the same price. USAir said Piedmont has granted it an irrevocable option to buy up to 3,491,030 new shares under certain circumstances. Piedmont now has about 18.6 mln shares outstanding. USAir said the tender is conditioned on receipt of enough shares to give USAir at least a 50.1 pct interest in Piedmont on a fully diluted basis and approval by the U.S. Department of Transportation of a voting trust agreement permitting USAir to buy and hold shares pending review of its application to gain control of Piedmont. The company said its merger agreement with Piedmon

In [21]:
    for i in len(s):
        if s[i][0][0]==""

['money-fx', 'interest', 'france']