In [43]:
import numpy as np
from os import listdir
from os.path import isfile, join
from bs4 import BeautifulSoup
import re
import urllib

### Reuters dataset: loading and tokenizing

In [44]:
# Load docs
# Parse every Reuters SGML file in the data directory and collect, in sorted
# file order, the joined text of each <text> element (docs) and the list of
# text nodes of each <topics> element (topic_str).
# NOTE(review): later cells pair docs[i] with topic_str[i]; that presumably
# relies on each article having exactly one TEXT and one TOPICS element —
# TODO confirm against the dataset.
docs = []
topic_str = []
reuters_dir = 'data/reuters/'
for datafile in sorted(listdir(reuters_dir)):
    datapath = join(reuters_dir, datafile)
    # Filter before opening: the same directory also holds the topic list and
    # the .npy files written further down, and a sub-directory would make
    # open() raise.
    if not (datafile.endswith('.sgm') and isfile(datapath)):
        continue
    print('Loading {}...'.format(datafile))
    # Read the raw bytes and release the handle before walking the parse tree.
    with open(datapath, 'rb') as f:
        soup = BeautifulSoup(f.read(), 'lxml')
    for node in soup.findAll('text'):
        docs.append(''.join(node.findAll(text=True)))
    for node in soup.findAll('topics'):
        topic_str.append(node.findAll(text=True))

Loading reut2-000.sgm...
Loading reut2-001.sgm...
Loading reut2-002.sgm...
Loading reut2-003.sgm...
Loading reut2-004.sgm...
Loading reut2-005.sgm...
Loading reut2-006.sgm...
Loading reut2-007.sgm...
Loading reut2-008.sgm...
Loading reut2-009.sgm...
Loading reut2-010.sgm...
Loading reut2-011.sgm...
Loading reut2-012.sgm...
Loading reut2-013.sgm...
Loading reut2-014.sgm...
Loading reut2-015.sgm...
Loading reut2-016.sgm...
Loading reut2-017.sgm...
Loading reut2-018.sgm...
Loading reut2-019.sgm...
Loading reut2-020.sgm...
Loading reut2-021.sgm...


In [45]:
# Preprocess topics
# Read the known topic names (one per line), number them in file order, and
# translate each document's topic strings into those ids.
with open('data/reuters/all-topics-strings.lc.txt') as topic_file:
    topiclist = [line.strip() for line in topic_file]
topic_ids = np.arange(len(topiclist))
topic2id = {name: topic_id for name, topic_id in zip(topiclist, topic_ids)}
# Inverse lookup: id -> topic name.
id2topic = {topic_id: name for name, topic_id in topic2id.items()}
# A topic string missing from the list raises KeyError here.
topics = [[topic2id[name] for name in doc_topics] for doc_topics in topic_str]

In [46]:
# Preprocess documents
# Reduce every document to its runs of ASCII letters (digits, punctuation and
# whitespace all act as separators), then build the corpus-wide vocabulary
# together with the number of occurrences of each word.
splitted_docs = []
for doc in docs:
    letters_only = re.sub('[^a-zA-Z]+', ' ', doc)
    # Leading/trailing separators leave empty strings after the split; drop them.
    splitted_docs.append([piece for piece in letters_only.split(' ') if piece != ''])

# Flatten all documents into one long word list.
words = []
for doc_words in splitted_docs:
    words.extend(doc_words)

unique_words, unique_words_c = np.unique(words, return_counts=True)

In [47]:
# Order the vocabulary by descending frequency. Pairs are compared as
# (count, word), so words with equal counts fall back to reverse word order.
count_word_pairs = list(zip(unique_words_c, unique_words))
count_word_pairs.sort(reverse=True)
# Unzip and keep only the words; the counts are discarded.
_, unique_words_sort = zip(*count_word_pairs)

In [48]:
# Tokenize
# Ids follow the frequency ranking: the most frequent word gets id 0.
vocab_ids = np.arange(len(unique_words_sort))
word2id = {word: word_id for word, word_id in zip(unique_words_sort, vocab_ids)}
# Reserve -1 for the unknown-word placeholder.
word2id['<unk>'] = -1
id2word = {word_id: word for word, word_id in word2id.items()}
# Map each document to its id sequence; words missing from the vocabulary
# fall back to -1.
tokenized = [[word2id.get(word, -1) for word in doc] for doc in splitted_docs]

In [49]:
# Save tokenized reuters
def _as_object_array(items):
    """Return a 1-D object array whose i-th element is ``items[i]``.

    ``topics`` and ``tokenized`` are lists of variable-length lists. Letting
    ``np.save`` convert such a ragged list implicitly is deprecated and raises
    ``ValueError`` on recent NumPy, so the container is built explicitly and
    filled element by element (which also keeps each inner list intact even if
    all of them happen to have the same length).
    """
    boxed = np.empty(len(items), dtype=object)
    for i, item in enumerate(items):
        boxed[i] = item
    return boxed


# The dicts are stored by np.save as 0-d object arrays; all six files are
# therefore pickled and need allow_pickle=True when read back with np.load.
np.save('data/reuters/reuters_topics.npy', _as_object_array(topics))
np.save('data/reuters/reuters_topic2id.npy', topic2id)
np.save('data/reuters/reuters_id2topic.npy', id2topic)
np.save('data/reuters/reuters_word2id.npy', word2id)
np.save('data/reuters/reuters_id2word.npy', id2word)
np.save('data/reuters/reuters_tokenized.npy', _as_object_array(tokenized))

In [50]:
# Load tokenized reuters
# Every file read here holds pickled Python objects (dicts, or arrays of
# lists), and np.load refuses those unless allow_pickle=True is passed.
# NOTE: unpickling can execute arbitrary code — only load files that were
# produced by the save cell of this notebook, never files from an untrusted
# source.
# The dicts come back wrapped in a 0-d object array; .item(0) unwraps them.
topic2id = np.load('data/reuters/reuters_topic2id.npy', allow_pickle=True).item(0)
id2topic = np.load('data/reuters/reuters_id2topic.npy', allow_pickle=True).item(0)
topics = list(np.load('data/reuters/reuters_topics.npy', allow_pickle=True))

word2id = np.load('data/reuters/reuters_word2id.npy', allow_pickle=True).item(0)
id2word = np.load('data/reuters/reuters_id2word.npy', allow_pickle=True).item(0)
tokenized = list(np.load('data/reuters/reuters_tokenized.npy', allow_pickle=True))

In [51]:
# Example document
# Decode the first document and its topic ids back into text and print both.
tokenized_doc = tokenized[0]
tokenized_topic = topics[0]

regular_doc = ' '.join(id2word[token] for token in tokenized_doc)
regular_topic = ' '.join(id2topic[topic_id] for topic_id in tokenized_topic)

print('[TOPIC]\n', regular_topic)
print('\n[DOC]\n', regular_doc)

[TOPIC]
 cocoa

[DOC]
 BAHIA COCOA REVIEW SALVADOR Feb Showers continued throughout the week in the Bahia cocoa zone alleviating the drought since early January and improving prospects for the coming temporao although normal humidity levels have not been restored Comissaria Smith said in its weekly review The dry period means the temporao will be late this year Arrivals for the week ended February were bags of kilos making a cumulative total for the season of mln against at the same stage last year Again it seems that cocoa delivered earlier on consignment was included in the arrivals figures Comissaria Smith said there is still some doubt as to how much old crop cocoa is still available as harvesting has practically come to an end With total Bahia crop estimates around mln bags and sales standing at almost mln there are a few hundred thousand bags still in the hands of farmers middlemen exporters and processors There are doubts as to how much of this cocoa would be fit for export as s