In [1]:
#read datafolder from Google Drive
from google.colab import drive
drive.mount('/content/drive')
data_folder = '/content/drive/My Drive/pg2k18/sem2/smai/smai_proj/reuters21578/'


sgml_number_of_files = 21
sgml_file_name_template = 'reut2-NNN.sgm'

# Category files
category_files = {
    'to_': ('Topics', 'all-topics-strings.lc.txt'),
    'pl_': ('Places', 'all-places-strings.lc.txt'),
    'pe_': ('People', 'all-people-strings.lc.txt'),
    'or_': ('Organizations', 'all-orgs-strings.lc.txt'),
    'ex_': ('Exchanges', 'all-exchanges-strings.lc.txt')
}


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [0]:
# Read all categories
category_data = []
category_dictionary={'Topics':[],'Places':[],'People':[],'Organizations':[],'Exchanges':[]}
for category_prefix in category_files.keys():
    with open(data_folder + category_files[category_prefix][1], 'r') as file:
        for category in file.readlines():
            category_data.append([category_prefix + category.strip().lower(), 
                                  category_files[category_prefix][0]])

# Create category dataframe
for i in category_data:
    category_dictionary[i[1]].append(i[0].split('_')[1])
news_categories = pd.DataFrame(data=category_data)


In [4]:
import re
import xml.sax.saxutils as saxutils
from bs4 import BeautifulSoup
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import nltk
nltk.download('stopwords')

Using TensorFlow backend.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
lemmatizer = WordNetLemmatizer()

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;\\n]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def cleanUpSentence(r):#, stop_words = None#
    r = r.lower().replace("<br />", " ")
    r = REPLACE_BY_SPACE_RE.sub(' ', r)
    r = BAD_SYMBOLS_RE.sub('', r)

    r = ' '.join(word for word in r.split() if word not in STOPWORDS)

    words = word_tokenize(r)

    for w in words:
        w = lemmatizer.lemmatize(w)

    return r

In [0]:
# Parse SGML files
def strip_tags(text):
    return re.sub('<[^<]+?>', '', text).strip()

def unescape(text):
    return saxutils.unescape(text)
  
def makeDict(filename, document_X):
  with open(filename, 'rb') as file:

    content = BeautifulSoup(file.read().lower(),'html.parser')

    for newsline in content('reuters'):
      document_categories = []

      document_id = newsline['newid']
      document_body = strip_tags(str(newsline('text')[0].body)).replace('reuter\n&#3;', '')
      if document_body == 'None':
        continue

      doc_categories = strip_tags(str(newsline('topics')[0].body))
      doc_categories = unescape(doc_categories)

      document_body = unescape(document_body)

      document_X[document_id] = document_body
      

def readFiles(test_data = False):
  document_X = {}
  
  if test_data == True:
    file_name = sgml_file_name_template.replace('NNN', '021')
    filename = data_folder + file_name
    makeDict(filename, document_X)
  else:
    for i in range(sgml_number_of_files):
      if i < 10:
        seq = '00' + str(i)
      else:
        seq = '0' + str(i)

      file_name = sgml_file_name_template.replace('NNN', seq)
      print('Reading file: %s' % file_name)
      filename = data_folder + file_name
      makeDict(filename, document_X)
  return document_X



In [7]:
document_X = readFiles()

Reading file: reut2-000.sgm
Reading file: reut2-001.sgm
Reading file: reut2-002.sgm
Reading file: reut2-003.sgm
Reading file: reut2-004.sgm
Reading file: reut2-005.sgm
Reading file: reut2-006.sgm
Reading file: reut2-007.sgm
Reading file: reut2-008.sgm
Reading file: reut2-009.sgm
Reading file: reut2-010.sgm
Reading file: reut2-011.sgm
Reading file: reut2-012.sgm
Reading file: reut2-013.sgm
Reading file: reut2-014.sgm
Reading file: reut2-015.sgm
Reading file: reut2-016.sgm
Reading file: reut2-017.sgm
Reading file: reut2-018.sgm
Reading file: reut2-019.sgm
Reading file: reut2-020.sgm


In [9]:
# data preprocessing
import numpy as np

nltk.download('punkt')
nltk.download('wordnet')
def create_x_matrix(document_X):
    totalX = []
    for i, doc in document_X.items():
        totalX.append(cleanUpSentence(doc))
    max_vocab_size = 200
    input_tokenizer = Tokenizer(max_vocab_size)
    input_tokenizer.fit_on_texts(totalX)
    encoded_docs = input_tokenizer.texts_to_matrix(totalX, mode='count')
    return totalX,encoded_docs

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [0]:
totalX,encoded_docs=create_x_matrix(document_X)

In [0]:
#Create one-hot encode
def makeOneHotEncoding(totalX):
  words_in_body={}

  for i in range(len(totalX)):
      words=totalX[i].split(' ')
      words_in_body[i]=words    

  one_hot_label=[]
  for key,v in words_in_body.items():
      dict_temp={'Topics':0,'Places':0,'People':0,'Exchanges':0,'Organizations':0}
      for i in v:
          if i in category_dictionary['Topics']:
              dict_temp['Topics']+=1
          if i in category_dictionary['Places']:
              dict_temp['Places']+=1
          if i in category_dictionary['People']:
              dict_temp['People']+=1
          if i in category_dictionary['Exchanges']:
              dict_temp['Exchanges']+=1
          if i in category_dictionary['Organizations']:
              dict_temp['Organizations']+=1

      one_hot_label.append(dict_temp)


  one_hot_label_list = []
  for i in one_hot_label:

      one_hot_label_list.append(list(i.values()))
  return one_hot_label_list

In [0]:
one_hot_label_list = makeOneHotEncoding(totalX)

In [16]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
!pip install scikit-multilearn
from skmultilearn.problem_transform import BinaryRelevance
X = np.array(encoded_docs)
Y = np.array(one_hot_label_list)
classifier = BinaryRelevance(MultinomialNB())
_ = classifier.fit(X,Y)




In [0]:
def getRankedOutput2(predicted_y):
  all_sorted_x = []
  for i in predicted_y:
    dict_temp={'Topics':i[0],'Places':i[1],'Peoples':i[2],'Exchanges':i[3],'Organizations':i[4]}
    sorted_x = sorted(dict_temp.items(), key=lambda kv: kv[1], reverse=True) 
    all_sorted_x.append(sorted_x)
  return all_sorted_x

In [0]:
def permissible(x, y):
  if (abs(x-y)) < 1:
    return True
  else:
    return False

def accuracy(predicted,actual):
    tp=0
    length = len(actual)
    for one_doc_idx in range(length):
        if permissible(predicted[one_doc_idx][0],actual[one_doc_idx][0]) and permissible(predicted[one_doc_idx][1],actual[one_doc_idx][1])\
          and permissible(predicted[one_doc_idx][2],actual[one_doc_idx][2]) and permissible(predicted[one_doc_idx][3],actual[one_doc_idx][3]):
            tp+=1
    return tp/float(length)

In [0]:
val_x_doc = readFiles(test_data = True)
totalX_test, encoded_val_x = create_x_matrix(val_x_doc)
ground_truth_val = makeOneHotEncoding(totalX_test)

In [0]:
X = np.array(encoded_val_x)
Y = np.array(ground_truth_val)

val_pred = classifier.predict(X)
val_pred_arr = val_pred.toarray()

In [30]:
validate_y_list = val_pred_arr.tolist()

newv = []
for i in range(len(validate_y_list)):
  validate_y_list_i = list(map(int, validate_y_list[i]))
  newv.append(validate_y_list_i)


print ("%f"%accuracy(newv, ground_truth_val))
pred_rank_list = getRankedOutput2(val_pred_arr)
actual_rank_list = getRankedOutput2(Y)


0.389130


In [33]:
pd.set_option('max_colwidth', 100)
my_df  = pd.DataFrame(columns = ['body', 'predicted ranking', 'actual ranking'])
my_df['body'] = totalX_test
my_df['predicted ranking'] = pred_rank_list
my_df['actual ranking'] = actual_rank_list
my_df.head()

Unnamed: 0,body,predicted ranking,actual ranking
0,huge oil platforms dot gulf like beacons usually lit like christmas trees night one sitting astr...,"[(Peoples, 14), (Topics, 2), (Places, 1), (Exchanges, 0), (Organizations, 0)]","[(Places, 12), (Topics, 3), (Peoples, 0), (Exchanges, 0), (Organizations, 0)]"
1,canadian auto workers union said accepted economic offer canadian division general motors corp g...,"[(Places, 9), (Topics, 5), (Peoples, 3), (Organizations, 3), (Exchanges, 1)]","[(Places, 2), (Topics, 1), (Peoples, 0), (Exchanges, 0), (Organizations, 0)]"
2,canada development corp said polysar ltd unit completed refinancing package worth 830 mln canadi...,"[(Topics, 0), (Places, 0), (Peoples, 0), (Exchanges, 0), (Organizations, 0)]","[(Topics, 4), (Places, 1), (Peoples, 0), (Exchanges, 0), (Organizations, 0)]"
3,us attack iranian oil platform gulf monday appeared titfortat raid carefully orchestrated provoc...,"[(Topics, 17), (Organizations, 15), (Peoples, 7), (Places, 2), (Exchanges, 1)]","[(Places, 6), (Peoples, 2), (Topics, 0), (Exchanges, 0), (Organizations, 0)]"
4,brown disc products co inc unit fo genevar enterprises inc said purchased ongoing business trade...,"[(Exchanges, 4), (Topics, 0), (Places, 0), (Peoples, 0), (Organizations, 0)]","[(Topics, 0), (Places, 0), (Peoples, 0), (Exchanges, 0), (Organizations, 0)]"
