In [None]:
########################################################################################################################
# Filename: Text_Preprocessing.ipynb
#
# Purpose: Preprocess text data using both bag-of-words and sequence of integers approaches.
#
# Author(s): Bobby (Robert) Lumpkin
#
# Library Dependencies: numpy, pandas, scipy, scikit-learn, sklearn_json, nltk, tensorflow, bpmll
#
# NOTES: (1) Much of the data cleaning performed here is adapted from a great tutorial written by Patrick Loeber,
#        which can be found at: https://github.com/python-engineer/tensorflow-course/blob/master/11_NLP.ipynb
#        (2) The code organizing the Reuters-21578 dataset into a pandas dataframe came from Kaggle
#        and can be found at: https://www.kaggle.com/boldy717/reutersnltk
########################################################################################################################

# Text Preprocessing for Multilabel Classification

In [23]:
import html
import json
import re
import string

import numpy as np
import pandas as pd
import sklearn_json as skljson
from bpmll import bp_mll_loss
from nltk.corpus import reuters    ## This downloads the reduced Reuters-21578 dataset
from scipy.io import arff
from sklearn.model_selection import train_test_split

## Loading the Reuters-21578 Dataset

In [17]:
# Peek at the first 14 word tokens of one training document
reuters.words('training/9865')[:14]

['FRENCH',
 'FREE',
 'MARKET',
 'CEREAL',
 'EXPORT',
 'BIDS',
 'DETAILED',
 'French',
 'operators',
 'have',
 'requested',
 'licences',
 'to',
 'export']

In [18]:
# Category labels attached to that same document (a document can carry several)
reuters.categories('training/9865')

['barley', 'corn', 'grain', 'wheat']

In [81]:
# All document ids in the corpus, e.g. 'test/14826' or 'training/1'
fileids = reuters.fileids()

# Per-document category labels and raw article text, both aligned with `fileids`
categories = [reuters.categories(fileid) for fileid in fileids]
text = [reuters.raw(fileid) for fileid in fileids]

# Assemble one row per document. reutersDF is the dataframe the following cells work on.
reutersDF = pd.DataFrame({'ids': fileids, 'categories': categories, 'text': text})

In [76]:
# Preview the first rows: one document per row with its id, category list and raw text
reutersDF.head()

Unnamed: 0,ids,categories,text
0,test/14826,[trade],ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RI...
1,test/14828,[grain],CHINA DAILY SAYS VERMIN EAT 7-12 PCT GRAIN STO...
2,test/14829,"[crude, nat-gas]",JAPAN TO REVISE LONG-TERM ENERGY DEMAND DOWNWA...
3,test/14832,"[corn, grain, rice, rubber, sugar, tin, trade]",THAI TRADE DEFICIT WIDENS IN FIRST QUARTER\n ...
4,test/14833,"[palm-oil, veg-oil]",INDONESIA SEES CPO PRICE RISING SHARPLY\n Ind...


## Data Cleaning

In [77]:
## Check if there are any urls in articles
## Stops at the first article that contains a URL and shows that article, each URL found
## in it, and the article with the URLs stripped out.
pattern = re.compile(r"https?://(\S+|www)\.\S+")
for t in reutersDF.text:
    # finditer + group(0) yields each complete URL. findall would return only the text
    # captured by the pattern's parenthesised group, i.e. a fragment of the URL.
    matches = [m.group(0) for m in pattern.finditer(t)]
    if matches:
        print(t)
        for match in matches:
            print(match)
        print(pattern.sub(r"", t))
        break

In [78]:
## Define a function to remove punctuation from documents
def remove_punct(text):
    """Return `text` with every character listed in `string.punctuation` deleted.

    Parameters
    ----------
    text : str
        The document to clean.

    Returns
    -------
    str
        A copy of `text` without ASCII punctuation; all other characters
        (letters, digits, whitespace) are left untouched.
    """
    # Mapping a character to None in a translation table deletes it.
    punctuation_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(punctuation_table)

# Display the exact set of characters that remove_punct strips
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [83]:
## Decode HTML entities, then remove punctuation from documents.
## The raw articles contain escaped markup such as "&lt;CPML>". Stripping punctuation
## without unescaping first would turn that into the junk token "ltCPML".
reutersDF["text"] = reutersDF.text.apply(html.unescape).apply(remove_punct)

In [84]:
# Spot-check the text of the first document (index label 0)
reutersDF.text[0]



In [64]:
## Split the data into train and test sets
## The file ids carry their split as a prefix ('training/<n>' or 'test/<n>'), so a
## vectorized substring match on the `ids` column selects each subset without
## looking rows up one at a time.
is_training = reutersDF.ids.str.contains('training', regex=False)
is_test = reutersDF.ids.str.contains('test', regex=False)

# Keep the index labels as plain lists, as before, for any later cell that reuses them.
training_indices = reutersDF.index[is_training].tolist()
test_indices = reutersDF.index[is_test].tolist()

reuters_training = reutersDF.loc[training_indices]
reuters_test = reutersDF.loc[test_indices]

# Every document must land in exactly one of the two subsets.
assert len(reuters_training) + len(reuters_test) == len(reutersDF), "train/test split does not partition reutersDF"

reuters_training.head()

Unnamed: 0,ids,categories,text
3019,training/1,[cocoa],BAHIA COCOA REVIEW\n Showers continued throug...
3020,training/10,[acq],COMPUTER TERMINAL SYSTEMS &lt;CPML> COMPLETES ...
3021,training/100,[money-supply],N.Z. TRADING BANK DEPOSIT GROWTH RISES SLIGHTL...
3022,training/1000,[acq],NATIONAL AMUSEMENTS AGAIN UPS VIACOM &lt;VIA> ...
3023,training/10000,[earn],ROGERS &lt;ROG> SEES 1ST QTR NET UP SIGNIFICAN...


## Generating TF-IDF Feature Vectors

## Generating Sequence of Integers Feature Vectors (for RNNs)