# Text classification with Reuters-21578 datasets

In [1]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib


In [2]:
import copy
import re
import xml.sax.saxutils as saxutils
import sklearn.ensemble
import sklearn.pipeline

from bs4 import BeautifulSoup

from gensim.models.word2vec import Word2Vec

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, LSTM

from multiprocessing import cpu_count

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer, sent_tokenize
from nltk.stem import WordNetLemmatizer

from pandas import DataFrame

from sklearn.cross_validation import train_test_split
from sklearn.model_selection import cross_val_score


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## General constants (modify them according to you environment)

In [3]:
# Set Numpy random seed
import numpy as np
import random
random.seed(1000)

# Newsline folder and format
data_folder = 'C:\\semdoc\\data\\reuters\\'

sgml_number_of_files = 22
sgml_file_name_template = 'reut2-NNN.sgm'

# Category files
category_files = {
    'to_': ('Topics', 'all-topics-strings.lc.txt'),
    'pl_': ('Places', 'all-places-strings.lc.txt'),
    'pe_': ('People', 'all-people-strings.lc.txt'),
    'or_': ('Organizations', 'all-orgs-strings.lc.txt'),
    'ex_': ('Exchanges', 'all-exchanges-strings.lc.txt')
}

# Word2Vec number of features
num_features = 500
# Limit each newsline to a fixed number of words
document_max_num_words = 100
# Selected categories
selected_categories = ['to_earn']

## Prepare documents and categories

In [4]:
# Create category dataframe

# Read all categories
category_data = []

for category_prefix in category_files.keys():
    with open(data_folder + category_files[category_prefix][1], 'r') as file:
        for category in file.readlines():
            category_data.append([category_prefix + category.strip().lower(), 
                                  category_files[category_prefix][0], 
                                  0])

# Create category dataframe
news_categories = DataFrame(data=category_data, columns=['Name', 'Type', 'Newslines'])

In [5]:
def update_frequencies(categories):
    for category in categories:
        idx = news_categories[news_categories.Name == category].index[0]
        f = news_categories.get_value(idx, 'Newslines')
        news_categories.set_value(idx, 'Newslines', f+1)
    
def to_category_vector(categories, target_categories):
    vector = np.zeros(1).astype(np.float32)
    
    #vector = np.zeros(len(target_categories)).astype(np.float32)
    is_true_label = 0.0
    for i in range(len(target_categories)):
        if target_categories[i] in categories:
            is_true_label = 1.0
            break
            #vector[i] = 1.0
    vector[0] = is_true_label
    return vector

In [6]:
# Parse SGML files
document_X = {}
document_Y = {}

def strip_tags(text):
    return re.sub('<[^<]+?>', '', text).strip()

def unescape(text):
    return saxutils.unescape(text)

# Iterate all files
for i in range(sgml_number_of_files):
    if i < 10:
        seq = '00' + str(i)
    else:
        seq = '0' + str(i)
        
    file_name = sgml_file_name_template.replace('NNN', seq)
    print('Reading file: %s' % file_name)
    
    with open(data_folder + file_name, 'r') as file:
        content = BeautifulSoup(file.read().lower())
        n_non_neg_docs = 0
        n_neg_docs = 0
        for newsline in content('reuters'):
            document_categories = []
            
            # News-line Id
            document_id = newsline['newid']
            
            # News-line text
            document_body = strip_tags(str(newsline('text')[0].body)).replace('reuter\n&#3;', '')
            document_body = unescape(document_body)
            
            # News-line categories
            topics = newsline.topics.contents
            places = newsline.places.contents
            people = newsline.people.contents
            orgs = newsline.orgs.contents
            exchanges = newsline.exchanges.contents
            
            for topic in topics:
                document_categories.append('to_' + strip_tags(str(topic)))
                
            for place in places:
                document_categories.append('pl_' + strip_tags(str(place)))
                
            for person in people:
                document_categories.append('pe_' + strip_tags(str(person)))
                
            for org in orgs:
                document_categories.append('or_' + strip_tags(str(org)))
                
            for exchange in exchanges:
                document_categories.append('ex_' + strip_tags(str(exchange)))
                
            # Create new document    
            update_frequencies(document_categories)
            
            document_X[document_id] = document_body
            document_Y[document_id] = to_category_vector(document_categories, selected_categories)
            if (document_Y[document_id][0] == 1):
                n_non_neg_docs += 1
            else:
                n_neg_docs += 1
            #rint(document_Y[document_id])
        print(n_non_neg_docs, n_neg_docs)

Reading file: reut2-000.sgm




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))
  after removing the cwd from sys.path.
  """


193 807
Reading file: reut2-001.sgm
227 773
Reading file: reut2-002.sgm
238 762
Reading file: reut2-003.sgm
163 837
Reading file: reut2-004.sgm
207 793
Reading file: reut2-005.sgm
230 770
Reading file: reut2-006.sgm
286 714
Reading file: reut2-007.sgm
251 749
Reading file: reut2-008.sgm
222 778
Reading file: reut2-009.sgm
220 780
Reading file: reut2-010.sgm
192 808
Reading file: reut2-011.sgm
204 796
Reading file: reut2-012.sgm
161 839
Reading file: reut2-013.sgm
55 945
Reading file: reut2-014.sgm
80 920
Reading file: reut2-015.sgm
236 764
Reading file: reut2-016.sgm
240 760
Reading file: reut2-017.sgm
48 952
Reading file: reut2-018.sgm
58 942
Reading file: reut2-019.sgm
67 933
Reading file: reut2-020.sgm
258 742
Reading file: reut2-021.sgm
151 427


## Top 20 categories (by number of newslines)

In [7]:
news_categories.sort_values(by='Newslines', ascending=False, inplace=True)
news_categories.head(20)

Unnamed: 0,Name,Type,Newslines
296,pl_usa,Places,12542
35,to_earn,Topics,3987
0,to_acq,Topics,2448
293,pl_uk,Places,1489
219,pl_japan,Places,1138
166,pl_canada,Places,1104
73,to_money-fx,Topics,801
28,to_crude,Topics,634
45,to_grain,Topics,628
302,pl_west-germany,Places,567


## Tokenize newsline documents

In [8]:
# Load stop-words
stop_words = set(stopwords.words('english'))

# Initialize tokenizer
# It's also possible to try with a stemmer or to mix a stemmer and a lemmatizer
tokenizer = RegexpTokenizer('[\'a-zA-Z]+')

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Tokenized document collection
newsline_documents = []

In [9]:
def tokenize(document):
    words = []

    for sentence in sent_tokenize(document):
        tokens = [lemmatizer.lemmatize(t.lower()) for t in tokenizer.tokenize(sentence) if t.lower() not in stop_words]
        words += tokens

    return words

# Tokenize
for key in document_X.keys():
    newsline_documents.append(tokenize(document_X[key]))

number_of_documents = len(document_X)

## Word2Vec Model


In [10]:
# Load an existing Word2Vec model
#w2v_model = Word2Vec.load(data_folder + 'news1.word2vec')

In [11]:
# Create new Gensim Word2Vec model
w2v_model = Word2Vec(newsline_documents, size=num_features, min_count=1, window=10, workers=cpu_count())
w2v_model.init_sims(replace=True)
w2v_model.save(data_folder + 'news.word2vec')

## Vectorize each document

In [12]:
#import numpy
#num_categories = len(selected_categories)
#X = numpy.zeros(shape=(number_of_documents, document_max_num_words, num_features)).astype(numpy.float32)
#Y = numpy.zeros(shape=(number_of_documents, num_categories)).astype(numpy.float32)

#empty_word = numpy.zeros(num_features).astype(numpy.float32)

#for idx, document in enumerate(newsline_documents):
#    for jdx, word in enumerate(document):
#        if jdx == document_max_num_words:
#            break
            
#        else:
#            if word in w2v_model:
#                X[idx, jdx, :] = w2v_model[word]
#            else:
#                X[idx, jdx, :] = empty_word

#for idx, key in enumerate(document_Y.keys()):
#    Y[idx, :] = document_Y[key]

import numpy
import copy
num_categories = len(selected_categories)
X = numpy.zeros(shape=(number_of_documents, num_features)).astype(numpy.float32)
Y = numpy.zeros(shape=(number_of_documents, num_categories)).astype(numpy.float32)

empty_word = numpy.zeros(num_features).astype(numpy.float32)


zero_vec = numpy.zeros(shape = (1, num_features) )
for idx, document in enumerate(newsline_documents):
    vec = copy.deepcopy(zero_vec)
    n_words = 0
    for jdx, word in enumerate(document):
        #if jdx == document_max_num_words:
        #    break
            
        #else:
        if word in w2v_model:
            vec += w2v_model[word]
            #X[idx, jdx, :] = w2v_model[word]
        else:
            vec += empty_word
            #X[idx, jdx, :] = empty_word
        n_words += 1
    vec /= n_words
    X[idx, :] = copy.deepcopy(vec) 
            
for idx, key in enumerate(document_Y.keys()):
    Y[idx, :] = document_Y[key]





## Split training and test sets

In [13]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

## Create Keras model

In [14]:
# Keras.
#model = Sequential()

#model.add(LSTM(int(document_max_num_words*1.5), input_shape=(document_max_num_words, num_features)))
#model.add(Dropout(0.3))
#model.add(Dense(num_categories))
#model.add(Activation('sigmoid'))

#model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Gradient Boosting
#model = sklearn.ensemble.GradientBoostingClassifier(n_estimators = 127, learning_rate = 0.001)
ctg = sklearn.ensemble.GradientBoostingClassifier()

## Train and evaluate model

In [15]:
# Keras.
# Train model
# model.fit(X_train, Y_train, batch_size=128, nb_epoch=5, validation_data=(X_test, Y_test))

# Evaluate model
#score, acc = model.evaluate(X_test, Y_test, batch_size=128)
    
#print('Score: %1.4f' % score)
#print('Accuracy: %1.4f' % acc)

# Gradient Boosting

#model.fit(X, Y)
#scores = cross_val_score(model, X, Y, cv=3)
#accuracy = scores.mean()
#print("Accuracy: %.3f" % (accuracy))

model_selection_mdl = sklearn.model_selection
pipeline_mdl = sklearn.pipeline
pipeline_list = [("categorizer", ctg)]
pipeline = pipeline_mdl.Pipeline(pipeline_list)
param_grid = [
    {

        'categorizer__n_estimators': [63, 127, 255, 511],
        'categorizer__learning_rate':[0.1, 0.01, 0.001],
    }
]

grid_searcher = model_selection_mdl.GridSearchCV(pipeline, cv=3, n_jobs=1, param_grid=param_grid, verbose =3)
grid_searcher.fit(X, Y)
accuracy = grid_searcher.best_score_
print("Accuracy: %.3f" % (accuracy))

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] categorizer__learning_rate=0.1, categorizer__n_estimators=63 ....


  y = column_or_1d(y, warn=True)


[CV]  categorizer__learning_rate=0.1, categorizer__n_estimators=63, score=0.8152370360072293, total=  26.3s
[CV] categorizer__learning_rate=0.1, categorizer__n_estimators=63 ....


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   26.4s remaining:    0.0s
  y = column_or_1d(y, warn=True)


[CV]  categorizer__learning_rate=0.1, categorizer__n_estimators=63, score=0.8152370360072293, total=  19.4s
[CV] categorizer__learning_rate=0.1, categorizer__n_estimators=63 ....


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   46.0s remaining:    0.0s
  y = column_or_1d(y, warn=True)


[CV]  categorizer__learning_rate=0.1, categorizer__n_estimators=63, score=0.8152113459399333, total=  10.4s
[CV] categorizer__learning_rate=0.1, categorizer__n_estimators=127 ...


  y = column_or_1d(y, warn=True)


[CV]  categorizer__learning_rate=0.1, categorizer__n_estimators=127, score=0.8152370360072293, total=  26.6s
[CV] categorizer__learning_rate=0.1, categorizer__n_estimators=127 ...


  y = column_or_1d(y, warn=True)


[CV]  categorizer__learning_rate=0.1, categorizer__n_estimators=127, score=0.8152370360072293, total=  24.9s
[CV] categorizer__learning_rate=0.1, categorizer__n_estimators=127 ...


  y = column_or_1d(y, warn=True)


[CV]  categorizer__learning_rate=0.1, categorizer__n_estimators=127, score=0.8152113459399333, total=  25.8s
[CV] categorizer__learning_rate=0.1, categorizer__n_estimators=255 ...


  y = column_or_1d(y, warn=True)


[CV]  categorizer__learning_rate=0.1, categorizer__n_estimators=255, score=0.8152370360072293, total=  44.9s
[CV] categorizer__learning_rate=0.1, categorizer__n_estimators=255 ...


  y = column_or_1d(y, warn=True)


[CV]  categorizer__learning_rate=0.1, categorizer__n_estimators=255, score=0.8152370360072293, total=  40.9s
[CV] categorizer__learning_rate=0.1, categorizer__n_estimators=255 ...


  y = column_or_1d(y, warn=True)


[CV]  categorizer__learning_rate=0.1, categorizer__n_estimators=255, score=0.8152113459399333, total=  43.5s
[CV] categorizer__learning_rate=0.1, categorizer__n_estimators=511 ...


  y = column_or_1d(y, warn=True)


[CV]  categorizer__learning_rate=0.1, categorizer__n_estimators=511, score=0.8152370360072293, total= 1.6min
[CV] categorizer__learning_rate=0.1, categorizer__n_estimators=511 ...


  y = column_or_1d(y, warn=True)


[CV]  categorizer__learning_rate=0.1, categorizer__n_estimators=511, score=0.8152370360072293, total= 1.6min
[CV] categorizer__learning_rate=0.1, categorizer__n_estimators=511 ...


  y = column_or_1d(y, warn=True)


[CV]  categorizer__learning_rate=0.1, categorizer__n_estimators=511, score=0.8152113459399333, total= 1.9min
[CV] categorizer__learning_rate=0.01, categorizer__n_estimators=63 ...


  y = column_or_1d(y, warn=True)


[CV]  categorizer__learning_rate=0.01, categorizer__n_estimators=63, score=0.8152370360072293, total=  12.7s
[CV] categorizer__learning_rate=0.01, categorizer__n_estimators=63 ...


  y = column_or_1d(y, warn=True)


[CV]  categorizer__learning_rate=0.01, categorizer__n_estimators=63, score=0.8152370360072293, total=  11.1s
[CV] categorizer__learning_rate=0.01, categorizer__n_estimators=63 ...


  y = column_or_1d(y, warn=True)


[CV]  categorizer__learning_rate=0.01, categorizer__n_estimators=63, score=0.8152113459399333, total=  11.4s
[CV] categorizer__learning_rate=0.01, categorizer__n_estimators=127 ..


  y = column_or_1d(y, warn=True)


[CV]  categorizer__learning_rate=0.01, categorizer__n_estimators=127, score=0.8152370360072293, total=  20.8s
[CV] categorizer__learning_rate=0.01, categorizer__n_estimators=127 ..


  y = column_or_1d(y, warn=True)


[CV]  categorizer__learning_rate=0.01, categorizer__n_estimators=127, score=0.8152370360072293, total=  20.3s
[CV] categorizer__learning_rate=0.01, categorizer__n_estimators=127 ..


  y = column_or_1d(y, warn=True)


[CV]  categorizer__learning_rate=0.01, categorizer__n_estimators=127, score=0.8152113459399333, total=  20.6s
[CV] categorizer__learning_rate=0.01, categorizer__n_estimators=255 ..


  y = column_or_1d(y, warn=True)


[CV]  categorizer__learning_rate=0.01, categorizer__n_estimators=255, score=0.8152370360072293, total=  41.0s
[CV] categorizer__learning_rate=0.01, categorizer__n_estimators=255 ..


  y = column_or_1d(y, warn=True)


[CV]  categorizer__learning_rate=0.01, categorizer__n_estimators=255, score=0.8152370360072293, total=  41.5s
[CV] categorizer__learning_rate=0.01, categorizer__n_estimators=255 ..


  y = column_or_1d(y, warn=True)


[CV]  categorizer__learning_rate=0.01, categorizer__n_estimators=255, score=0.8152113459399333, total=  41.2s
[CV] categorizer__learning_rate=0.01, categorizer__n_estimators=511 ..


  y = column_or_1d(y, warn=True)


[CV]  categorizer__learning_rate=0.01, categorizer__n_estimators=511, score=0.8152370360072293, total= 1.6min
[CV] categorizer__learning_rate=0.01, categorizer__n_estimators=511 ..


  y = column_or_1d(y, warn=True)


[CV]  categorizer__learning_rate=0.01, categorizer__n_estimators=511, score=0.8152370360072293, total= 1.5min
[CV] categorizer__learning_rate=0.01, categorizer__n_estimators=511 ..


  y = column_or_1d(y, warn=True)


[CV]  categorizer__learning_rate=0.01, categorizer__n_estimators=511, score=0.8152113459399333, total= 1.5min
[CV] categorizer__learning_rate=0.001, categorizer__n_estimators=63 ..


  y = column_or_1d(y, warn=True)


[CV]  categorizer__learning_rate=0.001, categorizer__n_estimators=63, score=0.8152370360072293, total=  11.7s
[CV] categorizer__learning_rate=0.001, categorizer__n_estimators=63 ..


  y = column_or_1d(y, warn=True)


[CV]  categorizer__learning_rate=0.001, categorizer__n_estimators=63, score=0.8152370360072293, total=  12.1s
[CV] categorizer__learning_rate=0.001, categorizer__n_estimators=63 ..


  y = column_or_1d(y, warn=True)


[CV]  categorizer__learning_rate=0.001, categorizer__n_estimators=63, score=0.8152113459399333, total=  12.5s
[CV] categorizer__learning_rate=0.001, categorizer__n_estimators=127 .


  y = column_or_1d(y, warn=True)


[CV]  categorizer__learning_rate=0.001, categorizer__n_estimators=127, score=0.8152370360072293, total=  23.4s
[CV] categorizer__learning_rate=0.001, categorizer__n_estimators=127 .


  y = column_or_1d(y, warn=True)


[CV]  categorizer__learning_rate=0.001, categorizer__n_estimators=127, score=0.8152370360072293, total=  22.8s
[CV] categorizer__learning_rate=0.001, categorizer__n_estimators=127 .


  y = column_or_1d(y, warn=True)


[CV]  categorizer__learning_rate=0.001, categorizer__n_estimators=127, score=0.8152113459399333, total=  29.2s
[CV] categorizer__learning_rate=0.001, categorizer__n_estimators=255 .


  y = column_or_1d(y, warn=True)


[CV]  categorizer__learning_rate=0.001, categorizer__n_estimators=255, score=0.8152370360072293, total=  41.9s
[CV] categorizer__learning_rate=0.001, categorizer__n_estimators=255 .


  y = column_or_1d(y, warn=True)


[CV]  categorizer__learning_rate=0.001, categorizer__n_estimators=255, score=0.8152370360072293, total=  45.6s
[CV] categorizer__learning_rate=0.001, categorizer__n_estimators=255 .


  y = column_or_1d(y, warn=True)


[CV]  categorizer__learning_rate=0.001, categorizer__n_estimators=255, score=0.8152113459399333, total=  50.7s
[CV] categorizer__learning_rate=0.001, categorizer__n_estimators=511 .


  y = column_or_1d(y, warn=True)


[CV]  categorizer__learning_rate=0.001, categorizer__n_estimators=511, score=0.8152370360072293, total= 1.5min
[CV] categorizer__learning_rate=0.001, categorizer__n_estimators=511 .


  y = column_or_1d(y, warn=True)


[CV]  categorizer__learning_rate=0.001, categorizer__n_estimators=511, score=0.8152370360072293, total= 1.5min
[CV] categorizer__learning_rate=0.001, categorizer__n_estimators=511 .


  y = column_or_1d(y, warn=True)


[CV]  categorizer__learning_rate=0.001, categorizer__n_estimators=511, score=0.8152113459399333, total= 1.7min


[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed: 26.7min finished
  y = column_or_1d(y, warn=True)


Accuracy: 0.815
