In [None]:
########################################################################################################################
# Filename: Text_Preprocessing.ipynb
#
# Purpose: Preprocess text data using both bag-of-words and sequence of integers approaches.
#
# Author(s): Bobby (Robert) Lumpkin
#
# Library Dependencies: numpy, pandas, tensorflow, bpmll
#
# NOTES: (1) A lot of the data cleaning performed here is due to a great tutorial written by Patrick Loeber
#        which can be found at: https://github.com/python-engineer/tensorflow-course/blob/master/11_NLP.ipynb
#        (2) The code organizing the Reuters-21578 dataset into a pandas dataframe came from Kaggle
#        and can be found at: https://www.kaggle.com/boldy717/reutersnltk
########################################################################################################################

# Text Preprocessing for Multilabel Classification

In [92]:
from scipy.io import arff
import re
import string
import numpy as np
import json
import pandas as pd
from bpmll import bp_mll_loss
import sklearn_json as skljson
from sklearn.model_selection import train_test_split
from nltk.corpus import reuters    ## This downloads the reduced Reuters-21578 dataset
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

## Loading the Reuters-21578 Dataset

In [17]:
reuters.words('training/9865')[:14]

['FRENCH',
 'FREE',
 'MARKET',
 'CEREAL',
 'EXPORT',
 'BIDS',
 'DETAILED',
 'French',
 'operators',
 'have',
 'requested',
 'licences',
 'to',
 'export']

In [18]:
reuters.categories('training/9865')

['barley', 'corn', 'grain', 'wheat']

In [81]:
# Extract fileids from the reuters corpus
fileids = reuters.fileids()

# Initialize empty lists to store categories and raw text
categories = []
text = []

# Loop through each file id and collect each files categories and raw text
for file in fileids:
    categories.append(reuters.categories(file))
    text.append(reuters.raw(file))

# Combine lists into pandas dataframe. reutersDf is the final dataframe. 
reutersDF = pd.DataFrame({'ids':fileids, 'categories':categories, 'text':text})

In [76]:
reutersDF.head()

Unnamed: 0,ids,categories,text
0,test/14826,[trade],ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RI...
1,test/14828,[grain],CHINA DAILY SAYS VERMIN EAT 7-12 PCT GRAIN STO...
2,test/14829,"[crude, nat-gas]",JAPAN TO REVISE LONG-TERM ENERGY DEMAND DOWNWA...
3,test/14832,"[corn, grain, rice, rubber, sugar, tin, trade]",THAI TRADE DEFICIT WIDENS IN FIRST QUARTER\n ...
4,test/14833,"[palm-oil, veg-oil]",INDONESIA SEES CPO PRICE RISING SHARPLY\n Ind...


## Data Cleaning

In [77]:
## Check if there are any urls in articles
pattern = re.compile(r"https?://(\S+|www)\.\S+")
for t in reutersDF.text:
    matches = pattern.findall(t)
    for match in matches:
        print(t)
        print(match)
        print(pattern.sub(r"", t))
    if len(matches) > 0:
        break

In [78]:
## Define a function to remove punctuation from documents
def remove_punct(text):
    translator = str.maketrans("", "", string.punctuation)
    return text.translate(translator)

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [83]:
## Remove punctuation from documents
reutersDF["text"] = reutersDF.text.apply(remove_punct)

In [87]:
# Define a function to remove stopwords
nltk.download('stopwords')

# Stop Words: A stop word is a commonly used word (such as “the”, “a”, “an”, “in”) that a search engine
# has been programmed to ignore, both when indexing entries for searching and when retrieving them 
# as the result of a search query.
stop = set(stopwords.words("english"))

def remove_stopwords(text):
    filtered_words = [word.lower() for word in text.split() if word.lower() not in stop]
    return " ".join(filtered_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rober\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [89]:
## Remove stopwords
reutersDF["text"] = reutersDF.text.apply(remove_stopwords)

## Generating TF-IDF Feature Vectors

In [95]:
## Generate tf-idf vectors for each instance
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(reutersDF.text)
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
tf_idf_DF = pd.DataFrame(denselist, columns=feature_names)

MemoryError: Unable to allocate 84.3 KiB for an array with shape (10788,) and data type uint64

In [None]:
tf_idf_DF.head()

## Generating Sequence of Integers Feature Vectors (for RNNs)

In [91]:
## Split the data into train and test sets
training_indices = [index for index in reutersDF.index if 'training' in reutersDF.loc[index,].ids]
test_indices = [index for index in reutersDF.index if 'test' in reutersDF.loc[index,].ids]

reuters_training = reutersDF.loc[training_indices,]
reuters_test = reutersDF.loc[test_indices]

reuters_training.head()

Unnamed: 0,ids,categories,text
3019,training/1,[cocoa],bahia cocoa review showers continued throughou...
3020,training/10,[acq],computer terminal systems ltcpml completes sal...
3021,training/100,[money-supply],nz trading bank deposit growth rises slightly ...
3022,training/1000,[acq],national amusements ups viacom ltvia bid viaco...
3023,training/10000,[earn],rogers ltrog sees 1st qtr net significantly ro...
