In [154]:
# Directory holding the Reuters-21578 files. Paths are built by plain string
# concatenation, so the trailing slash is part of the value.
data_folder = './reuters21578/'

# 'NNN' in the template is substituted with the file index when reading.
sgml_file_name_template = 'reut2-NNN.sgm'
sgml_number_of_files = 21

# Category files: label prefix -> (category group name, file listing its labels)
category_files = dict(
    to_=('Topics', 'all-topics-strings.lc.txt'),
    pl_=('Places', 'all-places-strings.lc.txt'),
    pe_=('People', 'all-people-strings.lc.txt'),
    or_=('Organizations', 'all-orgs-strings.lc.txt'),
    ex_=('Exchanges', 'all-exchanges-strings.lc.txt'),
)

# Word2Vec number of features
num_features = 500
# Limit each newsline to a fixed number of words
# document_max_num_words = 100
# Selected categories
# selected_categories = ['pl_usa']

In [155]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [156]:
# Create category dataframe

# Read all categories: one [prefixed label, group name] row per line of each
# category listing file.
category_data = []

for prefix, (group_name, listing_file) in category_files.items():
    with open(data_folder + listing_file, 'r') as handle:
        category_data.extend(
            [prefix + line.strip().lower(), group_name] for line in handle
        )

# Create category dataframe
news_categories = pd.DataFrame(data=category_data)

# print "category_data: ", category_data
#(news_categories.values).tolist()

In [157]:
import re
import xml.sax.saxutils as saxutils
from bs4 import BeautifulSoup
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import nltk
# Fetch the NLTK stop-word corpus; the notebook later calls
# stopwords.words("english"), which reads it from the local NLTK data directory.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/sarvat/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [158]:
def to_category_vector(categories, targets=None):
    """Encode one document's categories as a multi-hot vector.

    Args:
        categories: collection of category labels attached to a document.
        targets: ordered sequence of labels that defines the vector positions.
            When omitted, the module-level ``target_categories`` is used.

    Returns:
        A float32 NumPy array with one entry per target label. Entry ``i`` is
        1.0 when ``targets[i]`` occurs in ``categories`` and 0.0 otherwise.
    """
    # Local import: the notebook only imports numpy in a later cell, and the
    # bare names `zeros` / `float32` were never brought into scope.
    import numpy as np

    if targets is None:
        targets = target_categories

    # The vector is sized by the target list, not by the document's own
    # category list, so position i always refers to targets[i].
    vector = np.zeros(len(targets), dtype=np.float32)
    for position, label in enumerate(targets):
        if label in categories:
            vector[position] = 1.0

    return vector

In [159]:
# Shared resources for cleanUpSentence, created once at cell level.
# English stop words as a set for fast membership tests.
stop_words = set(stopwords.words("english"))
# Matches runs of anything that is not an ASCII letter, digit or space.
strip_special_chars = re.compile("[^A-Za-z0-9 ]+")
lemmatizer = WordNetLemmatizer()

def cleanUpSentence(r, stop_words = None):
    """Normalise a raw document string before vectorisation.

    The text is lower-cased, ``<br />`` markers and whitespace runs become a
    single space, and every character other than ASCII letters, digits and
    spaces is removed.

    Args:
        r: raw document text.
        stop_words: optional collection of words to drop. When given, the text
            is tokenised, each token is lemmatised, and tokens found in this
            collection are discarded. When ``None`` the cleaned string is
            returned without tokenisation.

    Returns:
        The cleaned text as a single string.
    """
    text = r.lower().replace("<br />", " ")
    # Convert newlines/tabs to spaces BEFORE filtering: the filter deletes every
    # character that is not a letter, digit or space, so a line break would
    # otherwise fuse the words on either side of it ("in\nthe" -> "inthe").
    text = re.sub(r"\s+", " ", text)
    text = strip_special_chars.sub("", text)

    if stop_words is None:
        return text

    lemmas = (lemmatizer.lemmatize(token) for token in word_tokenize(text))
    return " ".join(lemma for lemma in lemmas if lemma not in stop_words)

In [160]:
# Parse SGML files
# Stores filled while parsing, all keyed by the document's 'newid' attribute:
#   document_X       -> body text
#   document_Y       -> list of prefixed category labels
#   docid_traintest  -> value of the 'lewissplit' attribute
document_X, document_Y, docid_traintest = {}, {}, {}
def strip_tags(text):
    """Remove every ``<...>`` tag from ``text`` and trim surrounding whitespace."""
    tag_pattern = re.compile('<[^<]+?>')
    without_tags = tag_pattern.sub('', text)
    return without_tags.strip()

def unescape(text):
    """Return ``text`` with the entities ``&amp;``, ``&lt;`` and ``&gt;`` decoded."""
    decoded = saxutils.unescape(text)
    return decoded

# Iterate all files
# Category tag name -> label prefix, in the order labels are appended per document.
category_tag_prefixes = (
    ('topics', 'to_'),
    ('places', 'pl_'),
    ('people', 'pe_'),
    ('orgs', 'or_'),
    ('exchanges', 'ex_'),
)

for i in range(sgml_number_of_files):
    # Zero-pad the index to three digits to fill the 'NNN' placeholder.
    seq = str(i).zfill(3)

    file_name = sgml_file_name_template.replace('NNN', seq)
    print('Reading file: %s' % file_name)
    with open(data_folder + file_name, 'rb') as file:
        # The raw bytes are lower-cased before parsing, so tag and attribute
        # names (and the text itself) are all lower case from here on.
        content = BeautifulSoup(file.read().lower(), 'html.parser')

    for newsline in content('reuters'):
        # News-line Id and its train/test assignment
        document_id = newsline['newid']
        train_test = newsline['lewissplit']
        docid_traintest[document_id] = train_test

        # News-line text. A document without a <body> element yields None
        # here; store an empty string instead of the literal text 'None'.
        body_tag = newsline('text')[0].body
        if body_tag is None:
            document_body = ''
        else:
            document_body = strip_tags(str(body_tag)).replace('reuter\n&#3;', '')
        document_body = unescape(document_body)

        # News-line categories: every child of each category element becomes
        # one label, prefixed with the element's group prefix.
        document_categories = []
        for tag_name, prefix in category_tag_prefixes:
            for entry in newsline.find(tag_name).contents:
                document_categories.append(prefix + strip_tags(str(entry)))

        # Create new document
        document_X[document_id] = document_body
        document_Y[document_id] = document_categories
# print(document_Y)
# Count, per document, how many of its labels belong to each category group.
# Labels look like '<prefix>_<name>'; only the part before the first '_' is used.
prefix_to_group = {
    'to': 'Topics',
    'pl': 'Places',
    'ex': 'Exchanges',
    'or': 'Organizations',
    'pe': 'Peoples',
}

one_hot_label = []
for doc_labels in document_Y.values():
    # Key order here fixes the column order of `ranking` below.
    group_counts = {'Topics': 0, 'Places': 0, 'Peoples': 0, 'Exchanges': 0, 'Organizations': 0}
    for label in doc_labels:
        group = prefix_to_group.get(label.split('_')[0])
        if group is not None:
            group_counts[group] += 1
    one_hot_label.append(group_counts)

# print(one_hot_label)
# One row of counts per document, in the dict's key order.
ranking = [list(counts.values()) for counts in one_hot_label]
#print(np.array(ranking).shape)

Reading file: reut2-000.sgm
Reading file: reut2-001.sgm
Reading file: reut2-002.sgm
Reading file: reut2-003.sgm
Reading file: reut2-004.sgm
Reading file: reut2-005.sgm
Reading file: reut2-006.sgm
Reading file: reut2-007.sgm
Reading file: reut2-008.sgm
Reading file: reut2-009.sgm
Reading file: reut2-010.sgm
Reading file: reut2-011.sgm
Reading file: reut2-012.sgm
Reading file: reut2-013.sgm
Reading file: reut2-014.sgm
Reading file: reut2-015.sgm
Reading file: reut2-016.sgm
Reading file: reut2-017.sgm
Reading file: reut2-018.sgm
Reading file: reut2-019.sgm
Reading file: reut2-020.sgm


In [161]:
# Spot-check the parsed body stored under document id '1'.
print(document_X['1'])

showers continued throughout the week in
the bahia cocoa zone, alleviating the drought since early
january and improving prospects for the coming temporao,
although normal humidity levels have not been restored,
comissaria smith said in its weekly review.
    the dry period means the temporao will be late this year.
    arrivals for the week ended february 22 were 155,221 bags
of 60 kilos making a cumulative total for the season of 5.93
mln against 5.81 at the same stage last year. again it seems
that cocoa delivered earlier on consignment was included in the
arrivals figures.
    comissaria smith said there is still some doubt as to how
much old crop cocoa is still available as harvesting has
practically come to an end. with total bahia crop estimates
around 6.4 mln bags and sales standing at almost 6.2 mln there
are a few hundred thousand bags still in the hands of farmers,
middlemen, exporters and processors.
    there are doubts as to how much of this cocoa would be fit
for export 

In [162]:
# data preprocessing
import numpy as np

# Fetch the NLTK resources behind word_tokenize ('punkt') and
# WordNetLemmatizer ('wordnet'), both of which cleanUpSentence relies on.
nltk.download('punkt')
nltk.download('wordnet')
def create_x_matrix(document_X, max_vocab_size=200, tokenizer=None):
    """Clean the documents and encode them as a word-count matrix.

    Args:
        document_X: mapping of document id -> raw document text. Rows of the
            result follow the mapping's iteration order.
        max_vocab_size: ``num_words`` given to the Tokenizer created when no
            ``tokenizer`` is supplied; it also sets the matrix width.
        tokenizer: optional, already-fitted Keras ``Tokenizer``. When given it
            is used as-is (not refitted), so several document sets can share
            one vocabulary. When ``None`` a new Tokenizer is fitted on the
            documents passed in, as before.

    Returns:
        ``(totalX, encoded_docs)``: the list of cleaned document strings and
        the matrix produced by ``texts_to_matrix(..., mode='count')``.
    """
    totalX = [cleanUpSentence(doc, stop_words) for doc in document_X.values()]

    if tokenizer is None:
        tokenizer = Tokenizer(num_words=max_vocab_size)
        tokenizer.fit_on_texts(totalX)

    encoded_docs = tokenizer.texts_to_matrix(totalX, mode='count')
    return totalX, encoded_docs

[nltk_data] Downloading package punkt to /home/sarvat/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/sarvat/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [163]:
# Clean every parsed document and encode it as a row of per-word counts.
totalX, encoded_docs = create_x_matrix(document_X)

In [172]:
from keras.models import Sequential
from keras.layers import Dense,Flatten, Dropout,Embedding
# Training arrays: one row of word counts per document, one row of
# per-group label counts per document.
train_X = np.array(encoded_docs)
train_Y = np.array(ranking)

# Derive the input width from the data instead of relying on a module-level
# max_vocab_size: no cell assigns one (create_x_matrix only uses the value
# internally), so on a freshly restarted kernel this cell raised NameError.
max_vocab_size = train_X.shape[1]

# NOTE(review): the Embedding layer receives word COUNTS, not token indices,
# and the targets are per-group counts rather than one-hot rows - confirm
# both are intended. The architecture is left exactly as it was.
nn = Sequential()
nn.add(Embedding(1000, 20, input_length=max_vocab_size))
nn.add(Dense(10, activation="relu", input_shape=(max_vocab_size,)))
nn.add(Dropout(0.15))
nn.add(Flatten())
nn.add(Dense(5,activation="softmax"))
# change binary_crossentropy
nn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
nn.fit(train_X, train_Y, batch_size=16, epochs=5,
          verbose=1, validation_split=0.2)

Train on 16800 samples, validate on 4200 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f2d98841e80>

In [165]:
# Parse the remaining file (index 21) into fresh stores.
# NOTE: this rebinds document_X / document_Y, so the documents parsed earlier
# are no longer reachable through these names after this cell runs.
document_X={}
document_Y={}

# Category tag name -> label prefix, in the order labels are appended per document.
category_tag_prefixes = (
    ('topics', 'to_'),
    ('places', 'pl_'),
    ('people', 'pe_'),
    ('orgs', 'or_'),
    ('exchanges', 'ex_'),
)

for i in range(21,22):
    # Zero-pad the index to three digits to fill the 'NNN' placeholder.
    seq = str(i).zfill(3)

    file_name = sgml_file_name_template.replace('NNN', seq)
    print('Reading file: %s' % file_name)
    with open(data_folder + file_name, 'rb') as file:
        # The raw bytes are lower-cased before parsing, so tag and attribute
        # names (and the text itself) are all lower case from here on.
        content = BeautifulSoup(file.read().lower(), 'html.parser')

    for newsline in content('reuters'):
        # News-line Id and its train/test assignment
        document_id = newsline['newid']
        train_test = newsline['lewissplit']
        docid_traintest[document_id] = train_test

        # News-line text. A document without a <body> element yields None
        # here; store an empty string instead of the literal text 'None'.
        body_tag = newsline('text')[0].body
        if body_tag is None:
            document_body = ''
        else:
            document_body = strip_tags(str(body_tag)).replace('reuter\n&#3;', '')
        document_body = unescape(document_body)

        # News-line categories: every child of each category element becomes
        # one label, prefixed with the element's group prefix.
        document_categories = []
        for tag_name, prefix in category_tag_prefixes:
            for entry in newsline.find(tag_name).contents:
                document_categories.append(prefix + strip_tags(str(entry)))

        # Create new document
        document_X[document_id] = document_body
        document_Y[document_id] = document_categories


Reading file: reut2-021.sgm


In [166]:
# test_text="Huge oil platforms dot the Gulf like beacons -- usually lit up like Christmas trees at night. One of them, sitting astride the Rostam offshore oilfield,was all but blown out of the water by U.S. Warships on Monday.    The Iranian platform, an unsightly mass of steel andconcrete, was a three-tier structure rising 200 feet (60metres) above the warm waters of the Gulf until four U.S.Destroyers pumped some 1,000 shells into it.    The U.S. Defense Department said just 10 pct of one section of the structure remained.    U.S. helicopters destroyed three Iranian gunboats after an American helicopter came under fire earlier this month and U.S.forces attacked, seized, and sank an Iranian ship they said had been caught laying mines.    But Iran was not deterred, according to U.S. defense officials, who said Iranian forces used Chinese-made Silkworm missiles to hit a U.S.-owned Liberian-flagged ship on Thursday and the Sea Isle City on Friday.    Both ships were hit in the territorial waters of Kuwait, a key backer of Iraq in its war with Iran.    Henry Schuler, a former U.S. diplomat in the Middle Eastnow with CSIS said Washington had agreed to escort Kuwaiti tankers in order to deter Iranian attacks on shipping.    But he said the deterrence policy had failed and the level of violence and threats to shipping had increased as a result of U.S. intervention and Iran's response.    The attack on the oil platform was the latest example of a U.S. \"tit-for-tat\" policy that gave Iran the initiative, said Harlan Ullman, an ex-career naval officer now with CSIS.    He said with this appraoch America would suffer \"the deathof one thousand cuts.\"    But for the United States to grab the initiative litarily, it must take warlike steps such as mining Iran's harbors or blockading the mouth of the Gulf through which itsshipping must pass, Schuler said.    He was among those advocating mining as a means of bringing Iran to the neogtiating table. 
# If vital supplies were cut off,Tehran could not continue the war with Iraq.    Ullman said Washington should join Moscow in a diplomatic initiative to end the war and the superpowers should impose anarms embargo against Tehran if it refused to negotiate.    He said the United States should also threaten to mine and blockade Iran if it continued fighting and must press Iraq to acknowledge responsibility for starting the war as part of asettlement.    Iranian and Western diplomats say Iraq started the war by invading Iran's territory in 1980. Iraq blames Iran for theoutbreak of hostilities, which have entailed World War I-stylenfantry attacks resulting in horrific casualties.    Each side has attacked the others' shipping." 
# Encode the documents currently held in document_X and score them with the model.
# NOTE(review): create_x_matrix fits its own Tokenizer on the documents it
# receives, so these columns are not guaranteed to refer to the same words as
# the matrix the model was trained on - confirm this is intended.
test_total_X, test_encoded_X = create_x_matrix(document_X)
y = nn.predict(test_encoded_X)

In [167]:
# Show the model output: one row per encoded document, five softmax scores each.
print(y)
# print(document_Y)

[[4.2206326e-01 4.9578655e-01 4.5901392e-02 9.9682007e-03 2.6280599e-02]
 [8.5214394e-01 1.4697693e-01 2.2689383e-04 3.3172313e-05 6.1893353e-04]
 [4.2206326e-01 4.9578655e-01 4.5901392e-02 9.9682007e-03 2.6280599e-02]
 ...
 [4.6937662e-01 5.2064669e-01 1.3368876e-03 3.4897355e-03 5.1501370e-03]
 [6.4680630e-01 3.4982586e-01 1.3416599e-03 5.3858594e-04 1.4876238e-03]
 [5.2861172e-01 4.7051844e-01 1.7207043e-04 2.1913134e-04 4.7857504e-04]]


In [168]:
# Attach the group name to each of the five scores in every prediction row.
# The name order matches the column order used to build the training targets.
score_names = ('Topics', 'Places', 'Peoples', 'Exchanges', 'Organizations')
output = [
    {name: scores[column] for column, name in enumerate(score_names)}
    for scores in y
]

In [169]:
# print(output[0])
# Replace each score with its position in ascending score order:
# the lowest-scoring group gets 0, the highest-scoring group gets the largest value.
ranked_output = []
for group_scores in output:
    ascending = sorted(group_scores.items(), key=lambda pair: pair[1])
    ranked_output.append(
        {name: position for position, (name, _score) in enumerate(ascending)}
    )

In [170]:
#print(ranked_output)