In [289]:
data_folder = './reuters21578/'

sgml_number_of_files = 22
sgml_file_name_template = 'reut2-NNN.sgm'

# Category files
category_files = {
    'to_': ('Topics', 'all-topics-strings.lc.txt'),
    'pl_': ('Places', 'all-places-strings.lc.txt'),
    'pe_': ('People', 'all-people-strings.lc.txt'),
    'or_': ('Organizations', 'all-orgs-strings.lc.txt'),
    'ex_': ('Exchanges', 'all-exchanges-strings.lc.txt')
}

# Word2Vec number of features
num_features = 500
# Limit each newsline to a fixed number of words
# document_max_num_words = 100
# Selected categories
# selected_categories = ['pl_usa']

In [290]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [291]:
# Create category dataframe

# Read all categories
category_data = []

for category_prefix in category_files.keys():
    with open(data_folder + category_files[category_prefix][1], 'r') as file:
        for category in file.readlines():
            category_data.append([category_prefix + category.strip().lower(), 
                                  category_files[category_prefix][0]])

# Create category dataframe
news_categories = pd.DataFrame(data=category_data)

# print "category_data: ", category_data
(news_categories.values).tolist()

[['to_acq', 'Topics'],
 ['to_alum', 'Topics'],
 ['to_austdlr', 'Topics'],
 ['to_austral', 'Topics'],
 ['to_barley', 'Topics'],
 ['to_bfr', 'Topics'],
 ['to_bop', 'Topics'],
 ['to_can', 'Topics'],
 ['to_carcass', 'Topics'],
 ['to_castor-meal', 'Topics'],
 ['to_castor-oil', 'Topics'],
 ['to_castorseed', 'Topics'],
 ['to_citruspulp', 'Topics'],
 ['to_cocoa', 'Topics'],
 ['to_coconut', 'Topics'],
 ['to_coconut-oil', 'Topics'],
 ['to_coffee', 'Topics'],
 ['to_copper', 'Topics'],
 ['to_copra-cake', 'Topics'],
 ['to_corn', 'Topics'],
 ['to_corn-oil', 'Topics'],
 ['to_cornglutenfeed', 'Topics'],
 ['to_cotton', 'Topics'],
 ['to_cotton-meal', 'Topics'],
 ['to_cotton-oil', 'Topics'],
 ['to_cottonseed', 'Topics'],
 ['to_cpi', 'Topics'],
 ['to_cpu', 'Topics'],
 ['to_crude', 'Topics'],
 ['to_cruzado', 'Topics'],
 ['to_dfl', 'Topics'],
 ['to_dkr', 'Topics'],
 ['to_dlr', 'Topics'],
 ['to_dmk', 'Topics'],
 ['to_drachma', 'Topics'],
 ['to_earn', 'Topics'],
 ['to_escudo', 'Topics'],
 ['to_f-cattle', 'Top

In [292]:
import re
import xml.sax.saxutils as saxutils
from bs4 import BeautifulSoup
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/sarvat/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [293]:
def to_category_vector(categories):
    vector = zeros(len(categories)).astype(float32)
    
    for i in range(len(categories)):
        if target_categories[i] in categories:
            vector[i] = 1.0
    
    return vector

In [294]:
lemmatizer = WordNetLemmatizer()
strip_special_chars = re.compile("[^A-Za-z0-9 ]+")
stop_words = set(stopwords.words("english"))

def cleanUpSentence(r, stop_words = None):
    r = r.lower().replace("<br />", " ")
    r = re.sub(strip_special_chars, "", r.lower())
    if stop_words is not None:
        words = word_tokenize(r)
        filtered_sentence = []
        for w in words:
            w = lemmatizer.lemmatize(w)
            if w not in stop_words:
                filtered_sentence.append(w)
        return " ".join(filtered_sentence)
    else:
        return r

In [295]:
# Parse SGML files
document_X = {}
document_Y = {}
docid_traintest = {}
def strip_tags(text):
    return re.sub('<[^<]+?>', '', text).strip()

def unescape(text):
    return saxutils.unescape(text)

# Iterate all files
# for i in range(sgml_number_of_files):
#     if i < 10:
#         seq = '00' + str(i)
#     else:
#         seq = '0' + str(i)
        
#     file_name = sgml_file_name_template.replace('NNN', seq)
#     print('Reading file: %s' % file_name)
    #data_folder + file_name
with open(data_folder+'reut2-000.sgm', 'rb') as file:
    
    content = BeautifulSoup(file.read().lower(),'html.parser')

    for newsline in content('reuters'):
        document_categories = []

        # News-line Id
        document_id = newsline['newid']
#             print document_id,
        train_test = newsline['lewissplit']
        docid_traintest[document_id] = train_test
#             print "train_test: ",train_test

        # News-line text
        document_body = strip_tags(str(newsline('text')[0].body)).replace('reuter\n&#3;', '')
        doc_categories=strip_tags(str(newsline('topics')[0].body))
        doc_categories = unescape(doc_categories)

        document_body = unescape(document_body)

        # News-line categories
        topics = newsline.topics.contents
        places = newsline.places.contents
        people = newsline.people.contents
        orgs = newsline.orgs.contents
        exchanges = newsline.exchanges.contents

        for topic in topics:
            document_categories.append('to_' + strip_tags(str(topic)))

        for place in places:
            document_categories.append('pl_' + strip_tags(str(place)))

        for person in people:
            document_categories.append('pe_' + strip_tags(str(person)))

        for org in orgs:
            document_categories.append('or_' + strip_tags(str(org)))

        for exchange in exchanges:
            document_categories.append('ex_' + strip_tags(str(exchange)))
#             print "document_categories: ",document_categories
        # Create new document    
#             update_frequencies(document_categories)

        document_X[document_id] = document_body
        document_Y[document_id] = document_categories
# print(document_Y)
one_hot_label=[]
for key,v in document_Y.items():
    dict_temp={'Topics':0,'Places':0,'Peoples':0,'Exchanges':0,'Organizations':0}
    for i in v:
        string=i.split('_')
        category=string[0]
        if category=='to':
            dict_temp['Topics']+=1
        if category=='pl':
            dict_temp['Places']+=1
        if category=='ex':
            dict_temp['Exchanges']+=1
        if category=='or':
            dict_temp['Organizations']+=1
        if category=='pe':
            dict_temp['Peoples']+=1
    one_hot_label.append(dict_temp)
    
# print(one_hot_label)
ranking=[]
for i in one_hot_label:
    ranking.append(list(i.values()))
print(np.array(ranking).shape)

(1000, 5)


In [296]:
# data preprocessing

In [297]:
totalX = []
#totalY = np.array(document_Y)
nltk.download('punkt')
nltk.download('wordnet')
#print(document_X.keys())
for i, doc in document_X.items():
    #print(i)
    totalX.append(cleanUpSentence(doc, stop_words))


print(totalX[0])

[nltk_data] Downloading package punkt to /home/sarvat/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/sarvat/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


shower continued throughout week inthe bahia cocoa zone alleviating drought since earlyjanuary improving prospect coming temporaoalthough normal humidity level restoredcomissaria smith said weekly review dry period mean temporao late year arrival week ended february 22 155221 bagsof 60 kilo making cumulative total season 593mln 581 stage last year seemsthat cocoa delivered earlier consignment wa included thearrivals figure comissaria smith said still doubt howmuch old crop cocoa still available harvesting haspractically come end total bahia crop estimatesaround 64 mln bag sale standing almost 62 mln thereare hundred thousand bag still hand farmersmiddlemen exporter processor doubt much cocoa would fitfor export shipper experiencing dificulties inobtaining bahia superior certificate view lower quality recent week farmer havesold good part cocoa held consignment comissaria smith said spot bean price rose 340 350cruzados per arroba 15 kilo bean shipper reluctant offer nearby shipment ando

In [298]:
import numpy as np
max_vocab_size = 200
input_tokenizer = Tokenizer()# change accuracy....
input_tokenizer.fit_on_texts(totalX)
#print(input_tokenizer.word_counts)
encoded_docs = input_tokenizer.texts_to_matrix(totalX, mode='count')
print(encoded_docs.shape)
input_vocab_size = len(input_tokenizer.word_index) + 1
print("input_vocab_size:",input_vocab_size)
totalX = np.array(pad_sequences(input_tokenizer.texts_to_sequences(totalX)))
# print(input_tokenizer.word_counts)
# print(t.document_count)
# print(t.word_index)
# print(t.word_docs)
# print(totalX.counts)

(1000, 17601)
input_vocab_size: 17601


In [299]:
from keras.models import Sequential
from keras.layers import Dense,Flatten
nn = Sequential()
nn.add(Dense(10, activation="relu", input_shape=(17601,)))
#nn.add(Flatten())
# nn.Flatten()
nn.add(Dense(5,activation="sigmoid"))

In [300]:
nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [301]:
import math

def softmax(z):
    z_exp = [math.exp(i) for i in z]
    sum_z_exp = sum(z_exp)
    return [i / sum_z_exp for i in z_exp]

In [302]:
nn.fit(np.array(encoded_docs), np.array(ranking), batch_size=16, epochs=5,
          verbose=1, validation_split=0.1)

Instructions for updating:
Use tf.cast instead.
Train on 900 samples, validate on 100 samples
Epoch 1/5

  % delta_t_median)


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f2aab18d240>