In [1]:
#read datafolder from Google Drive
from google.colab import drive
drive.mount('/content/drive')
data_folder = '/content/drive/My Drive/SMAI/Assignments&Projects/Project/reuters21578/'


sgml_number_of_files = 21
sgml_file_name_template = 'reut2-NNN.sgm'

# Category files
category_files = {
    'to_': ('Topics', 'all-topics-strings.lc.txt'),
    'pl_': ('Places', 'all-places-strings.lc.txt'),
    'pe_': ('People', 'all-people-strings.lc.txt'),
    'or_': ('Organizations', 'all-orgs-strings.lc.txt'),
    'ex_': ('Exchanges', 'all-exchanges-strings.lc.txt')
}


ModuleNotFoundError: No module named 'google.colab'

In [None]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
# Read all categories
category_data = []
category_dictionary={'Topics':[],'Places':[],'People':[],'Organizations':[],'Exchanges':[]}
for category_prefix in category_files.keys():
    with open(data_folder + category_files[category_prefix][1], 'r') as file:
        for category in file.readlines():
            category_data.append([category_prefix + category.strip().lower(), 
                                  category_files[category_prefix][0]])

# Create category dataframe
for i in category_data:
    category_dictionary[i[1]].append(i[0].split('_')[1])
news_categories = pd.DataFrame(data=category_data)


In [None]:
import re
import xml.sax.saxutils as saxutils
from bs4 import BeautifulSoup
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import nltk
nltk.download('stopwords')

In [None]:
lemmatizer = WordNetLemmatizer()

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;\\n]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def cleanUpSentence(r):#, stop_words = None#
    r = r.lower().replace("<br />", " ")
    r = REPLACE_BY_SPACE_RE.sub(' ', r)
    r = BAD_SYMBOLS_RE.sub('', r)

    r = ' '.join(word for word in r.split() if word not in STOPWORDS)

    words = word_tokenize(r)

    for w in words:
        w = lemmatizer.lemmatize(w)

    return r

In [None]:
# Parse SGML files
def strip_tags(text):
    return re.sub('<[^<]+?>', '', text).strip()

def unescape(text):
    return saxutils.unescape(text)
  
def makeDict(filename, document_X):
  with open(filename, 'rb') as file:

    content = BeautifulSoup(file.read().lower(),'html.parser')

    for newsline in content('reuters'):
      document_categories = []

      document_id = newsline['newid']
      document_body = strip_tags(str(newsline('text')[0].body)).replace('reuter\n&#3;', '')
      if document_body == 'None':
        continue

      doc_categories = strip_tags(str(newsline('topics')[0].body))
      doc_categories = unescape(doc_categories)

      document_body = unescape(document_body)

      document_X[document_id] = document_body
      

def readFiles(test_data = False):
  document_X = {}
  
  if test_data == True:
    file_name = sgml_file_name_template.replace('NNN', '021')
    filename = data_folder + file_name
    makeDict(filename, document_X)
  else:
    for i in range(sgml_number_of_files):
      if i < 10:
        seq = '00' + str(i)
      else:
        seq = '0' + str(i)

      file_name = sgml_file_name_template.replace('NNN', seq)
      print('Reading file: %s' % file_name)
      filename = data_folder + file_name
      makeDict(filename, document_X)
  return document_X



In [None]:
document_X = readFiles()

In [None]:
# data preprocessing
import numpy as np

nltk.download('punkt')
nltk.download('wordnet')
def create_x_matrix(document_X):
    totalX = []
    for i, doc in document_X.items():
        totalX.append(cleanUpSentence(doc))
    max_vocab_size = 200
    input_tokenizer = Tokenizer(max_vocab_size)
    input_tokenizer.fit_on_texts(totalX)
    encoded_docs = input_tokenizer.texts_to_matrix(totalX, mode='count')
    return totalX,encoded_docs

In [None]:
totalX,encoded_docs=create_x_matrix(document_X)

In [None]:
#Create one-hot encode
def makeOneHotEncoding(totalX):
  words_in_body={}

  for i in range(len(totalX)):
      words=totalX[i].split(' ')
      words_in_body[i]=words    

  one_hot_label=[]
  for key,v in words_in_body.items():
      dict_temp={'Topics':0,'Places':0,'People':0,'Exchanges':0,'Organizations':0}
      for i in v:
          if i in category_dictionary['Topics']:
              dict_temp['Topics']+=1
          if i in category_dictionary['Places']:
              dict_temp['Places']+=1
          if i in category_dictionary['People']:
              dict_temp['People']+=1
          if i in category_dictionary['Exchanges']:
              dict_temp['Exchanges']+=1
          if i in category_dictionary['Organizations']:
              dict_temp['Organizations']+=1

      one_hot_label.append(dict_temp)


  one_hot_label_list = []
  for i in one_hot_label:

      one_hot_label_list.append(list(i.values()))
  return one_hot_label_list

In [None]:
one_hot_label_list = makeOneHotEncoding(totalX)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
!pip install scikit-multilearn
from skmultilearn.problem_transform import BinaryRelevance
X = np.array(encoded_docs)
Y = np.array(one_hot_label_list)
classifier = BinaryRelevance(MultinomialNB())
_ = classifier.fit(X,Y)


In [None]:
def getRankedOutput2(predicted_y):
  all_sorted_x = []
  for i in predicted_y:
    dict_temp={'Topics':i[0],'Places':i[1],'Peoples':i[2],'Exchanges':i[3],'Organizations':i[4]}
    sorted_x = sorted(dict_temp.items(), key=lambda kv: kv[1], reverse=True) 
    all_sorted_x.append(sorted_x)
  return all_sorted_x

In [None]:
def permissible(x, y):
  if (abs(x-y)) < 2:
    return True
  else:
    return False

def accuracy(predicted,actual):
    tp=0
    fp = 0
    tp_list = []
    length = len(actual)
    print("length: ",length)
    for one_doc_idx in range(length):
        if permissible(predicted[one_doc_idx][0],actual[one_doc_idx][0]) and permissible(predicted[one_doc_idx][1],actual[one_doc_idx][1])\
          and permissible(predicted[one_doc_idx][2],actual[one_doc_idx][2]) and permissible(predicted[one_doc_idx][3],actual[one_doc_idx][3]):
            tp+=1
            x = tp
        else:
          fp = 0
          x = fp
        tp_list.append(x)
    return tp/float(length), tp_list
  

In [None]:
val_x_doc = readFiles(test_data = True)
totalX_test, encoded_val_x = create_x_matrix(val_x_doc)
ground_truth_val = makeOneHotEncoding(totalX_test)

In [None]:
X = np.array(encoded_val_x)
Y = np.array(ground_truth_val)

val_pred = classifier.predict(X)
val_pred_arr = val_pred.toarray()

In [None]:
validate_y_list = val_pred_arr.tolist()

newv = []
for i in range(len(validate_y_list)):
  validate_y_list_i = list(map(int, validate_y_list[i]))
  newv.append(validate_y_list_i)

acc,tpl=accuracy(newv, ground_truth_val)
print ("%f"%acc)
pred_rank_list = getRankedOutput2(val_pred_arr)
actual_rank_list = getRankedOutput2(Y)


In [None]:
pd.set_option('max_colwidth', 100)
my_df  = pd.DataFrame(columns = ['body', 'predicted ranking', 'actual ranking', '_'])
my_df['body'] = totalX_test
my_df['predicted ranking'] = pred_rank_list
my_df['actual ranking'] = actual_rank_list
my_df['_'] = tpl
my_df = my_df.sort_values(by='_', ascending=False).reset_index(drop=True)
my_df[['body', 'predicted ranking', 'actual ranking']]