In [17]:
#read datafolder from Google Drive
from google.colab import drive
drive.mount('/content/drive')
data_folder = '/content/drive/My Drive/pg2k18/sem2/smai/smai_proj/reuters21578/'


sgml_number_of_files = 21
sgml_file_name_template = 'reut2-NNN.sgm'

# Category files
category_files = {
    'to_': ('Topics', 'all-topics-strings.lc.txt'),
    'pl_': ('Places', 'all-places-strings.lc.txt'),
    'pe_': ('People', 'all-people-strings.lc.txt'),
    'or_': ('Organizations', 'all-orgs-strings.lc.txt'),
    'ex_': ('Exchanges', 'all-exchanges-strings.lc.txt')
}

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [0]:
# Read all categories
category_data = []
category_dictionary={'Topics':[],'Places':[],'People':[],'Organizations':[],'Exchanges':[]}
for category_prefix in category_files.keys():
    with open(data_folder + category_files[category_prefix][1], 'r') as file:
        for category in file.readlines():
            category_data.append([category_prefix + category.strip().lower(), 
                                  category_files[category_prefix][0]])

# Create category dataframe
for i in category_data:
    category_dictionary[i[1]].append(i[0].split('_')[1])
news_categories = pd.DataFrame(data=category_data)


In [20]:
import re
import xml.sax.saxutils as saxutils
from bs4 import BeautifulSoup
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
lemmatizer = WordNetLemmatizer()

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;\\n]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def cleanUpSentence(r):#, stop_words = None#
    r = r.lower().replace("<br />", " ")
    r = REPLACE_BY_SPACE_RE.sub(' ', r)
    r = BAD_SYMBOLS_RE.sub('', r)

    r = ' '.join(word for word in r.split() if word not in STOPWORDS)

    words = word_tokenize(r)

    for w in words:
        w = lemmatizer.lemmatize(w)

    return r

In [0]:
# Parse SGML files
def strip_tags(text):
    return re.sub('<[^<]+?>', '', text).strip()

def unescape(text):
    return saxutils.unescape(text)
  
def makeDict(filename, document_X, document_Y):
  with open(filename, 'rb') as file:

    content = BeautifulSoup(file.read().lower(),'html.parser')

    for newsline in content('reuters'):
      document_categories = []

      document_id = newsline['newid']



      document_body = strip_tags(str(newsline('text')[0].body)).replace('reuter\n&#3;', '')
      if document_body == 'None':
        continue

      doc_categories = strip_tags(str(newsline('topics')[0].body))
      doc_categories = unescape(doc_categories)

      document_body = unescape(document_body)

      topics = newsline.topics.contents
      places = newsline.places.contents
      people = newsline.people.contents
      orgs = newsline.orgs.contents
      exchanges = newsline.exchanges.contents

      for topic in topics:
          document_categories.append('to_' + strip_tags(str(topic)))

      for place in places:
          document_categories.append('pl_' + strip_tags(str(place)))

      for person in people:
          document_categories.append('pe_' + strip_tags(str(person)))

      for org in orgs:
          document_categories.append('or_' + strip_tags(str(org)))

      for exchange in exchanges:
          document_categories.append('ex_' + strip_tags(str(exchange)))

      document_X[document_id] = document_body
      document_Y[document_id] = document_categories

def readFiles(test_data = False):
  document_X = {}
  document_Y = {}
  if test_data == True:
    file_name = sgml_file_name_template.replace('NNN', '021')
    filename = data_folder + file_name
    makeDict(filename, document_X, document_Y)
  else:
    for i in range(sgml_number_of_files):
      if i < 10:
        seq = '00' + str(i)
      else:
        seq = '0' + str(i)

      file_name = sgml_file_name_template.replace('NNN', seq)
      print('Reading file: %s' % file_name)
      filename = data_folder + file_name
      makeDict(filename, document_X, document_Y)
  return document_X, document_Y



In [23]:
document_X, document_Y = readFiles()

Reading file: reut2-000.sgm
Reading file: reut2-001.sgm
Reading file: reut2-002.sgm
Reading file: reut2-003.sgm
Reading file: reut2-004.sgm
Reading file: reut2-005.sgm
Reading file: reut2-006.sgm
Reading file: reut2-007.sgm
Reading file: reut2-008.sgm
Reading file: reut2-009.sgm
Reading file: reut2-010.sgm
Reading file: reut2-011.sgm
Reading file: reut2-012.sgm
Reading file: reut2-013.sgm
Reading file: reut2-014.sgm
Reading file: reut2-015.sgm
Reading file: reut2-016.sgm
Reading file: reut2-017.sgm
Reading file: reut2-018.sgm
Reading file: reut2-019.sgm
Reading file: reut2-020.sgm


In [24]:
# data preprocessing
import numpy as np

nltk.download('punkt')
nltk.download('wordnet')

def create_x_matrix(document_X):
    totalX = []
    for i, doc in document_X.items():
        totalX.append(cleanUpSentence(doc))
    max_vocab_size = 200
    input_tokenizer = Tokenizer(200)
    input_tokenizer.fit_on_texts(totalX)
    encoded_docs = input_tokenizer.texts_to_matrix(totalX, mode='count')
    return totalX,encoded_docs

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [0]:
totalX,encoded_docs=create_x_matrix(document_X)

In [26]:
print(totalX[0])

showers continued throughout week bahia cocoa zone alleviating drought since early january improving prospects coming temporao although normal humidity levels restored comissaria smith said weekly review dry period means temporao late year arrivals week ended february 22 155 221 bags 60 kilos making cumulative total season 593 mln 581 stage last year seems cocoa delivered earlier consignment included arrivals figures comissaria smith said still doubt much old crop cocoa still available harvesting practically come end total bahia crop estimates around 64 mln bags sales standing almost 62 mln hundred thousand bags still hands farmers middlemen exporters processors doubts much cocoa would fit export shippers experiencing dificulties obtaining +bahia superior+ certificates view lower quality recent weeks farmers sold good part cocoa held consignment comissaria smith said spot bean prices rose 340 350 cruzados per arroba 15 kilos bean shippers reluctant offer nearby shipment limited sales b

In [0]:
#Creating one-hot encode
words_in_body={}

for i in range(len(totalX)):
    words=totalX[i].split(' ')
    words_in_body[i]=words    

one_hot_label=[]
for key,v in words_in_body.items():
    dict_temp={'Topics':0,'Places':0,'People':0,'Exchanges':0,'Organizations':0}
    for i in v:
        if i in category_dictionary['Topics']:
            dict_temp['Topics']+=1
        if i in category_dictionary['Places']:
            dict_temp['Places']+=1
        if i in category_dictionary['People']:
            dict_temp['People']+=1
        if i in category_dictionary['Exchanges']:
            dict_temp['Exchanges']+=1
        if i in category_dictionary['Organizations']:
            dict_temp['Organizations']+=1
            
    one_hot_label.append(dict_temp)
    

one_hot_label_list = []
for i in one_hot_label:

    one_hot_label_list.append(list(i.values()))



In [28]:
from keras.models import Sequential
from keras.layers import Dense,Flatten, Dropout,Embedding
nn = Sequential()
max_vocab_size = 200
nn.add(Embedding(200, 20, input_length=max_vocab_size))
nn.add(Dense(10, activation="relu", input_shape=(max_vocab_size,)))
nn.add(Dropout(0.15))
nn.add(Flatten())
nn.add(Dense(5,activation="softmax"))
nn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
nn.fit(np.array(encoded_docs), np.array(one_hot_label_list), batch_size=16, epochs=10,
          verbose=1, validation_split=0.2)

Train on 14866 samples, validate on 3717 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f0aea578668>

In [29]:
test_document_X, test_document_Y = readFiles(test_data=True)
print(len(test_document_X))

460


In [30]:
test_total_X,test_encoded_X= create_x_matrix(test_document_X)
y = nn.predict(test_encoded_X)
print (y)

[[0.5740951  0.29711303 0.11903739 0.00575014 0.00400436]
 [0.18848616 0.5759137  0.04410791 0.03276887 0.15872331]
 [0.53887486 0.22211772 0.03371068 0.178308   0.02698878]
 ...
 [0.5454183  0.2936259  0.07274341 0.04950838 0.03870394]
 [0.46679986 0.14245215 0.32240778 0.03881722 0.02952303]
 [0.7624998  0.1556863  0.06086464 0.00761037 0.01333883]]


In [31]:
def getRankedOutput2(predicted_y):
    all_sorted_x = []
    for i in predicted_y:
        dict_temp={'Topics':i[0],'Places':i[1],'Peoples':i[2],'Exchanges':i[3],'Organizations':i[4]}
        sorted_x = sorted(dict_temp.items(), key=lambda kv: kv[1], reverse=True) 
        all_sorted_x.append(sorted_x)
    return all_sorted_x
   
ranked_output = getRankedOutput2(y)
print((ranked_output)) 

[[('Topics', 0.5740951), ('Places', 0.29711303), ('Peoples', 0.11903739), ('Exchanges', 0.005750142), ('Organizations', 0.0040043565)], [('Places', 0.5759137), ('Topics', 0.18848616), ('Organizations', 0.15872331), ('Peoples', 0.044107907), ('Exchanges', 0.03276887)], [('Topics', 0.53887486), ('Places', 0.22211772), ('Exchanges', 0.178308), ('Peoples', 0.03371068), ('Organizations', 0.026988778)], [('Topics', 0.4413671), ('Places', 0.2965116), ('Peoples', 0.21976028), ('Organizations', 0.034658466), ('Exchanges', 0.007702524)], [('Topics', 0.39966056), ('Exchanges', 0.37659654), ('Places', 0.15851277), ('Peoples', 0.036532696), ('Organizations', 0.028697468)], [('Topics', 0.579246), ('Places', 0.3068005), ('Peoples', 0.057579312), ('Organizations', 0.031028703), ('Exchanges', 0.025345523)], [('Topics', 0.7583613), ('Places', 0.16838542), ('Organizations', 0.029013667), ('Peoples', 0.022161221), ('Exchanges', 0.022078475)], [('Topics', 0.72361535), ('Places', 0.16878037), ('Organization