In [None]:
########################################################################################################################
# Filename: Text_Preprocessing.ipynb
#
# Purpose: Preprocess text data using both bag-of-words and sequence of integers approaches.
#
# Author(s): Bobby (Robert) Lumpkin
#
# Library Dependencies: numpy, pandas, scipy, scikit-learn, sklearn_json, nltk, tensorflow, bpmll
#
# NOTES: (1) Much of the data cleaning performed here follows a great tutorial written by Patrick Loeber,
#        which can be found at: https://github.com/python-engineer/tensorflow-course/blob/master/11_NLP.ipynb
#        (2) The code organizing the Reuters-21578 dataset into a pandas dataframe came from Kaggle
#        and can be found at: https://www.kaggle.com/boldy717/reutersnltk
########################################################################################################################

# Text Preprocessing for Multilabel Classification

In [16]:
from scipy.io import arff
import numpy as np
import json
import pandas as pd
from bpmll import bp_mll_loss
import sklearn_json as skljson
from sklearn.model_selection import train_test_split
# Corpus reader for the reduced Reuters-21578 dataset.
# NOTE(review): the import by itself presumably does not fetch any data -- if the corpus is not
# available locally, run `nltk.download('reuters')` once beforehand. Confirm against your NLTK setup.
from nltk.corpus import reuters

In [17]:
# Sanity check: show the first 14 tokens of the document with file id 'training/9865'
reuters.words('training/9865')[:14]

['FRENCH',
 'FREE',
 'MARKET',
 'CEREAL',
 'EXPORT',
 'BIDS',
 'DETAILED',
 'French',
 'operators',
 'have',
 'requested',
 'licences',
 'to',
 'export']

In [18]:
# Show the category labels attached to the same document ('training/9865')
reuters.categories('training/9865')

['barley', 'corn', 'grain', 'wheat']

In [19]:
# Ids of every document in the Reuters corpus
fileids = reuters.fileids()

# For each document id, gather its category labels and its raw text.
# Both lists are index-aligned with `fileids`.
categories = [reuters.categories(doc_id) for doc_id in fileids]
text = [reuters.raw(doc_id) for doc_id in fileids]

# Assemble the final dataframe, reutersDf: one row per document.
reutersDf = pd.DataFrame(
    {
        'ids': fileids,
        'categories': categories,
        'text': text,
    }
)

In [21]:
# Inspect the assembled dataframe: one row per document with its id, category labels, and raw text
reutersDf

Unnamed: 0,ids,categories,text
0,test/14826,[trade],ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RI...
1,test/14828,[grain],CHINA DAILY SAYS VERMIN EAT 7-12 PCT GRAIN STO...
2,test/14829,"[crude, nat-gas]",JAPAN TO REVISE LONG-TERM ENERGY DEMAND DOWNWA...
3,test/14832,"[corn, grain, rice, rubber, sugar, tin, trade]",THAI TRADE DEFICIT WIDENS IN FIRST QUARTER\n ...
4,test/14833,"[palm-oil, veg-oil]",INDONESIA SEES CPO PRICE RISING SHARPLY\n Ind...
...,...,...,...
10783,training/999,"[interest, money-fx]",U.K. MONEY MARKET SHORTAGE FORECAST REVISED DO...
10784,training/9992,[earn],KNIGHT-RIDDER INC &lt;KRN> SETS QUARTERLY\n Q...
10785,training/9993,[earn],TECHNITROL INC &lt;TNL> SETS QUARTERLY\n Qtly...
10786,training/9994,[earn],NATIONWIDE CELLULAR SERVICE INC &lt;NCEL> 4TH ...
