In [None]:
import numpy as np
from os import listdir
from os.path import isfile, join
from bs4 import BeautifulSoup
import re
import urllib

### Loading and tokenizing the Reuters dataset

In [None]:
# Load docs
# Parse every .sgm file under data/reuters/ and collect, in file order:
#   docs      -- the concatenated text of each <text> element
#   topic_str -- the list of strings found inside each <topics> element
docs = []
topic_str = []
for datafile in sorted(listdir('data/reuters/')):
    # Filter on the extension *before* opening: the directory also holds
    # non-SGML entries (topic list, saved .npy files, possibly subdirectories)
    # that should never be opened here.
    if not datafile.endswith('.sgm'):
        continue
    print('Loading {}...'.format(datafile))
    with open(join('data/reuters/', datafile), 'rb') as f:
        soup = BeautifulSoup(f.read(), 'lxml')
    for node in soup.find_all('text'):
        docs.append(''.join(node.find_all(string=True)))
    for node in soup.find_all('topics'):
        # Convert to plain str: NavigableString objects keep a reference to
        # the whole parse tree, which would keep every parsed file in memory.
        topic_str.append([str(s) for s in node.find_all(string=True)])

In [None]:
# Preprocess topics
# The topic list file holds one topic name per line; a topic's id is its
# position in that file.
with open('data/reuters/all-topics-strings.lc.txt') as f:
    topiclist = [line.strip() for line in f.readlines()]
topic_ids = np.arange(len(topiclist))
topic2id = {name: idx for name, idx in zip(topiclist, topic_ids)}
id2topic = dict((idx, name) for name, idx in topic2id.items())
# Replace each document's topic names with their integer ids.
topics = []
for doc_topics in topic_str:
    topics.append([topic2id[name] for name in doc_topics])

In [None]:
# Preprocess documents
# Every run of non-letter characters becomes a single space, so splitting on
# spaces yields only alphabetic words (plus empty strings, which are dropped).
splitted_docs = []
for doc in docs:
    letters_only = re.sub('[^a-zA-Z]+', ' ', doc)
    splitted_docs.append([word for word in letters_only.split(' ') if word != ''])
# Flatten the corpus to count how often each distinct word occurs.
words = []
for doc_words in splitted_docs:
    words.extend(doc_words)
unique_words, unique_words_c = np.unique(words, return_counts=True)

In [None]:
# Rank the vocabulary by descending count. Pairs are (count, word), so words
# with equal counts end up in descending word order.
count_word_pairs = list(zip(unique_words_c, unique_words))
count_word_pairs.sort(reverse=True)
_, unique_words_sort = zip(*count_word_pairs)

In [None]:
# Tokenize
# A word's id is its position in the frequency-ranked vocabulary.
word_ids = np.arange(len(unique_words_sort))
word2id = {word: idx for word, idx in zip(unique_words_sort, word_ids)}
id2word = dict((idx, word) for word, idx in word2id.items())
# Words missing from the vocabulary map to -1 (which has no id2word entry).
tokenized = []
for doc in splitted_docs:
    tokenized.append([word2id.get(word, -1) for word in doc])

In [None]:
# Save tokenized reuters
def _as_object_array(sequences):
    """Pack variable-length lists into a 1-D object array, one list per element.

    Recent NumPy versions raise ValueError when asked to build an array from
    ragged nested sequences implicitly, which is what np.save does with a raw
    list of lists. Filling a pre-allocated object array element by element
    works on every version and also behaves the same when all lists happen to
    have equal length.
    """
    packed = np.empty(len(sequences), dtype=object)
    for i, seq in enumerate(sequences):
        packed[i] = seq
    return packed


# Per-document id lists differ in length, so they are stored as object arrays.
np.save('data/reuters/reuters_topics.npy', _as_object_array(topics))
# Dicts are wrapped by np.save in a 0-d object array (pickled on disk).
np.save('data/reuters/reuters_topic2id.npy', topic2id)
np.save('data/reuters/reuters_id2topic.npy', id2topic)
np.save('data/reuters/reuters_word2id.npy', word2id)
np.save('data/reuters/reuters_id2word.npy', id2word)
np.save('data/reuters/reuters_tokenized.npy', _as_object_array(tokenized))

In [None]:
# Load tokenized reuters
# All of these files hold pickled Python objects (dicts, lists of ids), and
# np.load refuses object arrays unless allow_pickle=True is passed explicitly.
# NOTE: unpickling can execute arbitrary code -- only load files produced by
# the save cell of this notebook, never files from an untrusted source.
# The dicts were saved as 0-d object arrays; .item() unwraps the dict.
topic2id = np.load('data/reuters/reuters_topic2id.npy', allow_pickle=True).item()
id2topic = np.load('data/reuters/reuters_id2topic.npy', allow_pickle=True).item()
topics = list(np.load('data/reuters/reuters_topics.npy', allow_pickle=True))

word2id = np.load('data/reuters/reuters_word2id.npy', allow_pickle=True).item()
id2word = np.load('data/reuters/reuters_id2word.npy', allow_pickle=True).item()
tokenized = list(np.load('data/reuters/reuters_tokenized.npy', allow_pickle=True))

In [None]:
# Example document
# Decode the first document and its topic ids back into text to eyeball the
# id <-> string mappings.
tokenized_doc = tokenized[0]
regular_doc = ' '.join(id2word[token] for token in tokenized_doc)
tokenized_topic = topics[0]
regular_topic = ' '.join(id2topic[topic] for topic in tokenized_topic)

print('[TOPIC]\n', regular_topic)
print('\n[DOC]\n', regular_doc)

### Writing to files

In [None]:
# Write the decoded corpus as plain text: one document per line, words
# separated by single spaces.
doc_lines = []
for doc in tokenized:
    doc_lines.append(' '.join(id2word[word_id] for word_id in doc))
bigdocs = '\n'.join(doc_lines)
with open('data/reuters/raw.txt', 'w') as f:
    f.write(bigdocs)