**Importing libraries**

In [7]:
import os, re
from bs4 import BeautifulSoup
import tensorflow as tf
tf.test.gpu_device_name()
import string
from nltk.tokenize import RegexpTokenizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from keras.layers.core import Dense, Activation
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from keras.utils import to_categorical
import pickle
from keras.optimizers import RMSprop
import numpy as np
from numpy import array

import pandas as pd
import matplotlib.pyplot as plt
import sklearn
# from sklearn.model_selection import train_test_split
# from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
# from sklearn.pipeline import Pipeline
# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, multilabel_confusion_matrix
import seaborn as sns
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

**Import data** (Reuters-21578 text categorization test collection corpus)

In [18]:
def minor_preprocess(file):

  with open(file, 'rb') as f:
    lines = f.readlines()
    utf8_safe_lines = [line.decode('utf-8', 'ignore') for line in lines]
    xml_safe_lines = [re.sub(r'&#\d*;', '', line) for line in utf8_safe_lines] # Get rid of problematic strings
    no_newlines = [line.replace('\n', ' ') for line in xml_safe_lines]
  f.close()

  return ''.join(no_newlines)

**Processing and Compiling the Corpus**

The dataset contains ".sgm" files. We convert the data into a single dataset of dictionaries with all the information. No preprocessing (other than that which is built into scikit-learn's CountVectorizer and TfidfVectorizer) was performed outside of ensuring that the contents of the data were utf-8 encoded, in xml-safe format for Beautiful Soup, and free of superfluous and otherwise undesirable symbols like '\n'.

The files contained 1000+ entries each, except for the last one. All entries were parsed into a single dataset, retaining all information, and converted into a dictionary.

In [19]:
def compile_data(datapath="sample_data/"):
  dataset = []
  for file in os.listdir(datapath):
    if file.endswith('.sgm'): # it is important for GoogleColab
      preprocessed_data = minor_preprocess(datapath + '/' + file)
      records = [record + '</REUTERS>' for record in preprocessed_data.split('</REUTERS>') if record] # Retain all original formatting
      dataset.extend(records)
  
  return dataset

**The following code chunks were copied from the gihub link sent by Bishal, but I think these are not necessary!**

In [10]:
def compile_dictionary(data):
  
  data_dict = {
      'REUTERS TOPICS': '', # Initialize each key with some value. Important for the try/except block.
      'TOPICS': 'none', # Consider empty topics as a new category, 'none'. See next section.
      'TITLE': '',
      'BODY': '',
      }

  # Grab the Reuters Topics between the following tags
  start = data.find('<REUTERS TOPICS="') + len('<REUTERS TOPICS="')
  end = data.find('" LEWISSPLIT=')
  data_dict['REUTERS TOPICS'] = data[start:end]


  soup = BeautifulSoup(data, 'xml')

  # Use a try/except block to grab Topics, Title, and Body in case they are empty
  # If empty, the default value remains unchanged
  try:
    if soup.TOPICS.contents:
      data_dict['TOPICS'] = soup.TOPICS.D.contents[0]

    if soup.TITLE.contents:
      data_dict['TITLE'] = soup.TITLE.contents[0]

    if soup.BODY.contents:
      body = soup.BODY.contents[0]
      data_dict['BODY'] = soup.BODY.contents[0]
  except AttributeError:
    pass

  return data_dict

In [11]:
# Get a list of all documents
dataset = compile_data()

In [12]:
# Convert the list into a dictionary with fields of interest
dataset_dicts = [compile_dictionary(data) for data in dataset]

In [21]:
dataset_dicts

[{'REUTERS TOPICS': 'YES',
  'TOPICS': 'earn',
  'TITLE': 'GANTOS INC <GTOS> 4TH QTR JAN 31 NET',
  'BODY': 'Shr 43 cts vs 37 cts     Net 2,276,000 vs 1,674,000     Revs 32.6 mln vs 24.4 mln     Year     Shr 90 cts vs 69 cts     Net 4,508,000 vs 3,096,000     Revs 101.0 mln vs 76.9 mln     Avg shrs 5,029,000 vs 4,464,000     NOTE: 1986 fiscal year ended Feb 1, 1986  Reuter '},
 {'REUTERS TOPICS': 'YES',
  'TOPICS': 'acq',
  'TITLE': 'CHEMLAWN CORP, ECHOLAB INC SIGN DEFINITIVE MERGER AGREEMENT ',
  'BODY': 'CHEMLAWN CORP, ECHOLAB INC SIGN DEFINITIVE MERGER AGREEMENT '},
 {'REUTERS TOPICS': 'YES',
  'TOPICS': 'grain',
  'TITLE': 'LDC FOOD AID NEEDS DECLINE IN 1986/87 - USDA',
  'BODY': 'Total food aid needs in 69 of the least developed countries declined in 1986/87, as requirments fell in many countries in Africa, the Middle East and Asia, the U.S. Agriculture Department said.     In a summary of its World Agriculture Report, the department said grain production in sub-Saharan Africa was

We have **selected the following 20 'TOPICS' out of 135**that we want to use for our prediction exercise: 
1.)  "trade"
2.)  "earn"
3.)  "grain"
4.)  "money-fx"
5.)  "coffee"
6.)  "gold"
7.)  "acq"
8.)  "wheat"
9.)  "veg-oil"
10.) "nat-gas"
11.) "cooper"
12.) "ship"
13.) "dlr"
14.) "crude"
15.) "interest"
16.) "meal-feed"
17.) "alum"
18.) "money-supply"
19.) "cocoa"
20.) "livestock"

In [None]:
documents = []
for file in os.listdir("sample_data/"): # original: "reuters_data/"
    if '.sgm' in file: # I needed to add it because of GoogleColab specific
        filename = os.path.join("sample_data", file) # original: "reuters_data"
        f = open(filename, 'r', encoding='utf-8', errors='ignore')
        dataFile = f.read()
        
        soup = BeautifulSoup(dataFile, 'html.parser')
        contents = soup.findAll('body')
        
        for content in contents:
            documents.append(content.text)

In [None]:
print('Number of documents: {}'.format(len(documents)))

Number of documents: 19043


In [None]:
print(documents[0])

Shr 43 cts vs 37 cts
    Net 2,276,000 vs 1,674,000
    Revs 32.6 mln vs 24.4 mln
    Year
    Shr 90 cts vs 69 cts
    Net 4,508,000 vs 3,096,000
    Revs 101.0 mln vs 76.9 mln
    Avg shrs 5,029,000 vs 4,464,000
    NOTE: 1986 fiscal year ended Feb 1, 1986
 Reuter



### Remove 'Reauter and ' from each document

In [None]:
new_doc = []
for d in documents:
    new_doc.append(d[:-8])
print(new_doc[0])

Shr 43 cts vs 37 cts
    Net 2,276,000 vs 1,674,000
    Revs 32.6 mln vs 24.4 mln
    Year
    Shr 90 cts vs 69 cts
    Net 4,508,000 vs 3,096,000
    Revs 101.0 mln vs 76.9 mln
    Avg shrs 5,029,000 vs 4,464,000
    NOTE: 1986 fiscal year ended Feb 1, 1986
 


### Join the documents

In [None]:
data = ""
for d in new_doc:
    data += d
len(data)

15775658

In [None]:
data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '')
print('Data length:', len(data))
print(data[:100])

Data length: 15466792
Shr 43 cts vs 37 cts    Net 2,276,000 vs 1,674,000    Revs 32.6 mln vs 24.4 mln    Year    Shr 90 ct


### Tokenization

In [None]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
encoded_data= tokenizer.texts_to_sequences([data])[0]
encoded_data

[71,
 814,
 32,
 16,
 674,
 32,
 48,
 30,
 4166,
 20,
 16,
 15,
 9392,
 20,
 114,
 501,
 64,
 8,
 16,
 324,
 44,
 8,
 26,
 71,
 530,
 32,
 16,
 1107,
 32,
 48,
 44,
 4039,
 20,
 16,
 39,
 9017,
 20,
 114,
 1092,
 77,
 8,
 16,
 1124,
 74,
 8,
 269,
 275,
 41,
 6685,
 20,
 16,
 44,
 5175,
 20,
 157,
 53,
 378,
 26,
 311,
 1743,
 15,
 53,
 140,
 706,
 925,
 1125,
 5,
 1107,
 2,
 24629,
 1469,
 218,
 478,
 5,
 53,
 256,
 34,
 50358,
 5,
 446,
 218,
 5,
 1196,
 1,
 1525,
 837,
 7,
 1585,
 994,
 29,
 437,
 262,
 4,
 5,
 6,
 4772,
 2,
 24,
 179,
 437,
 223,
 3495,
 4,
 558,
 152,
 5,
 5063,
 9018,
 1196,
 27,
 4167,
 273,
 5,
 53,
 25,
 689,
 5,
 1011,
 1487,
 563,
 249,
 706,
 1125,
 5,
 244,
 845,
 147,
 17704,
 50359,
 730,
 1413,
 7,
 2972,
 20399,
 169,
 332,
 152,
 5,
 53,
 256,
 13,
 3677,
 1694,
 50360,
 9,
 706,
 3,
 6,
 51,
 273,
 7,
 296,
 50361,
 49,
 73,
 1192,
 545,
 717,
 1703,
 50362,
 6,
 153,
 665,
 10,
 4,
 249,
 79,
 1049,
 218,
 5,
 8654,
 50363,
 245,
 34,
 5063,
 9018,


In [None]:
unique_words = np.unique(encoded_data)
print('Unique words length: ', len(unique_words))
unique_word_index = dict((c, i) for i, c in enumerate(unique_words))

Unique words length:  133585


### Feature engineering

In [None]:
WORD_LENGTH = 5
prev_words = []
next_words = []
    
for i in range(len(encoded_data) - WORD_LENGTH):
    prev_words.append(encoded_data[i:i + WORD_LENGTH])
    next_words.append(encoded_data[i + WORD_LENGTH])
print(prev_words[0])
print(next_words[0])

[71, 814, 32, 16, 674]
32


### Define X and Y

In [None]:
X = np.zeros((len(prev_words), WORD_LENGTH, len(unique_words)), dtype=bool)
Y = np.zeros((len(next_words), len(unique_words)), dtype=bool)

In [None]:
Y = to_categorical(Y, num_classes=len(unique_words))

### Build the model

In [None]:
model = Sequential()
model.add(LSTM(32, input_shape=(WORD_LENGTH, len(unique_words))))
model.add(Dense(len(unique_words)))
model.add(Activation('softmax'))

### Traing the model

In [None]:
optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
history = model.fit(X, Y, validation_split=0.05, batch_size=50, epochs=20, shuffle=True).history