In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import re
import xml.sax.saxutils as saxutils

from bs4 import BeautifulSoup

from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import GRU
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from pandas import DataFrame

from random import random
import numpy as np
import nltk
# Set Numpy random seed
np.random.seed(1)

Using TensorFlow backend.


In [3]:
data_folder = './reuters/'

sgml_number_of_files = 22
sgml_file_name_template = 'reut2-{}.sgm'

# Category files
category_files = {
    'to_': ('Topics', 'all-topics-strings.lc.txt'),
    'pl_': ('Places', 'all-places-strings.lc.txt'),
    'pe_': ('People', 'all-people-strings.lc.txt'),
    'or_': ('Organizations', 'all-orgs-strings.lc.txt'),
    'ex_': ('Exchanges', 'all-exchanges-strings.lc.txt')
}

In [4]:
# Prepare documents and categories
# Read all categories
category_data = []

for category_prefix in category_files.keys():
    with open(data_folder + category_files[category_prefix][1], 'r') as file:
        for category in file.readlines():
            category_data.append([category_prefix + category.strip().lower(), 
                                  category_files[category_prefix][0], 
                                  0])

# Create category dataframe
news_categories = DataFrame(data=category_data, columns=['Name', 'Type', 'Newslines'])

In [5]:
def update_frequencies(categories):
    for category in categories:
        idx = news_categories[news_categories.Name == category].index[0]
        f = news_categories.get_value(idx, 'Newslines')
        news_categories.set_value(idx, 'Newslines', f+1)
    
def to_category_vector(categories, target_categories):
    vector = zeros(len(target_categories)).astype(float32)
    
    for i in range(len(target_categories)):
        if target_categories[i] in categories:
            vector[i] = 1.0
    
    return vector

In [6]:
# Those are the top 20 categories we will use for the classification
selected_categories = ['pl_usa', 'to_earn', 'to_acq', 'pl_uk', 'pl_japan', 'pl_canada', 'to_money-fx',
 'to_crude', 'to_grain', 'pl_west-germany', 'to_trade', 'to_interest',
 'pl_france', 'or_ec', 'pl_brazil', 'to_wheat', 'to_ship', 'pl_australia',
 'to_corn', 'pl_china']

In [7]:
# Parse SGML files
document_X = []
document_Y = []

def strip_tags(text):
    return re.sub('<[^<]+?>', '', text).strip()

def unescape(text):
    return saxutils.unescape(text)

In [8]:
# Iterate all files
for i in range(sgml_number_of_files):
    file_name = sgml_file_name_template.format(str(i).zfill(3))
    print('Reading file: %s' % file_name)
    
    with open(data_folder + file_name, 'rb') as file:
        content = BeautifulSoup(file.read().lower(), "lxml")
        
        for newsline in content('reuters'):
            document_categories = []
            
            # News-line Id
            document_id = newsline['newid']
            
            # News-line text
            document_body = strip_tags(str(newsline('text')[0].text)).replace('reuter\n&#3;', '')
            document_body = unescape(document_body)
            
            # News-line categories
            topics = newsline.topics.contents
            places = newsline.places.contents
            people = newsline.people.contents
            orgs = newsline.orgs.contents
            exchanges = newsline.exchanges.contents
            
            for topic in topics:
                document_categories.append('to_' + strip_tags(str(topic)))
                
            for place in places:
                document_categories.append('pl_' + strip_tags(str(place)))
                
            for person in people:
                document_categories.append('pe_' + strip_tags(str(person)))
                
            for org in orgs:
                document_categories.append('or_' + strip_tags(str(org)))
                
            for exchange in exchanges:
                document_categories.append('ex_' + strip_tags(str(exchange)))
                
            # Create new document    
            update_frequencies(document_categories)
            
            document_X.append(document_body)
            document_Y.append(to_category_vector(document_categories, selected_categories))

Reading file: reut2-000.sgm


  after removing the cwd from sys.path.
  """


Reading file: reut2-001.sgm
Reading file: reut2-002.sgm
Reading file: reut2-003.sgm
Reading file: reut2-004.sgm
Reading file: reut2-005.sgm
Reading file: reut2-006.sgm
Reading file: reut2-007.sgm
Reading file: reut2-008.sgm
Reading file: reut2-009.sgm
Reading file: reut2-010.sgm
Reading file: reut2-011.sgm
Reading file: reut2-012.sgm
Reading file: reut2-013.sgm
Reading file: reut2-014.sgm
Reading file: reut2-015.sgm
Reading file: reut2-016.sgm
Reading file: reut2-017.sgm
Reading file: reut2-018.sgm
Reading file: reut2-019.sgm
Reading file: reut2-020.sgm
Reading file: reut2-021.sgm


In [9]:
# Select top 20 categories (by number of newslines)
news_categories.sort_values(by='Newslines', ascending=False, inplace=True)
# Selected categories
selected_categories = np.array(news_categories["Name"].head(20))
num_categories = 20
news_categories.head(num_categories)

Unnamed: 0,Name,Type,Newslines
296,pl_usa,Places,12542
35,to_earn,Topics,3987
0,to_acq,Topics,2448
293,pl_uk,Places,1489
219,pl_japan,Places,1138
166,pl_canada,Places,1104
73,to_money-fx,Topics,801
28,to_crude,Topics,634
45,to_grain,Topics,628
302,pl_west-germany,Places,567


In [10]:
# Take a look at the input and output data
print(document_X[220])
print(document_Y[220])

average yen cd rates fall in latest week
    tokyo, feb 27 - average interest rates on yen certificates
of deposit, cd, fell to 4.27 pct in the week ended february 25
from 4.32 pct the previous week, the bank of japan said.
    new rates (previous in brackets), were -
    average cd rates all banks 4.27 pct (4.32)
    money market certificate, mmc, ceiling rates for the week
starting from march 2          3.52 pct (3.57)
    average cd rates of city, trust and long-term banks
    less than 60 days          4.33 pct (4.32)
    60-90 days                 4.13 pct (4.37)
    average cd rates of city, trust and long-term banks
    90-120 days             4.35 pct (4.30)
    120-150 days            4.38 pct (4.29)
    150-180 days            unquoted (unquoted)
    180-270 days            3.67 pct (unquoted)
    over 270 days           4.01 pct (unquoted)
    average yen bankers' acceptance rates of city, trust and
long-term banks
    30 to less than 60 days unquoted (4.13)
    60-90 days  

In [11]:
# Clean up the data
lemmatizer = WordNetLemmatizer()
strip_special_chars = re.compile("[^A-Za-z0-9 ]+")
stop_words = set(nltk.download('stopwords').words("english"))

def cleanUpSentence(r, stop_words = None):
    r = r.lower().replace("<br />", " ")
    r = re.sub(strip_special_chars, "", r.lower())
    if stop_words is not None:
        words = word_tokenize(r)
        filtered_sentence = []
        for w in words:
            w = lemmatizer.lemmatize(w)
            if w not in stop_words:
                filtered_sentence.append(w)
        return " ".join(filtered_sentence)
    else:
        return r

[nltk_data] Downloading package stopwords to /home/anna/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


AttributeError: 'bool' object has no attribute 'words'