# Load articles from RSS feeds

In [1]:
import feedparser
import eatiht.v2 as v2

def load_articles(feed_url, target, n=1):
    """Store every article of an RSS feed as a numbered ``.dat`` file.

    Each entry of the feed at ``feed_url`` is written to the directory
    ``target`` as ``NNNN.dat``, where ``NNNN`` is a zero-padded counter
    starting at ``n``. The file holds the entry title immediately followed
    by the text extracted from the linked page, both UTF-8 encoded.

    An entry that cannot be fetched, extracted or written is reported with
    ``print`` and skipped. The counter is not advanced for it, so the next
    entry reuses its file name.
    """
    d = feedparser.parse(feed_url)
    for e in d.entries:
        name = str(n).zfill(4) + '.dat'
        try:
            # Build the complete payload before touching the disk, so that a
            # failed extraction does not leave an empty or title-only file
            # behind to be picked up later as a document.
            content = e.title.encode('utf8') + v2.extract(e.link).encode('utf8')
            # The payload is already encoded, hence binary mode; the `with`
            # block guarantees the handle is closed even if the write fails.
            with open(target + '/' + name, 'wb') as out:
                out.write(content)
            n = n + 1
        except Exception:
            print('error ' + e.link)

In [2]:
import os

# Directory that receives one sub-directory of downloaded articles per category.
root = 'examples'

# RSS feed to download for each category; the category name is also used as
# the name of the sub-directory the articles are stored in.
feeds_for_category = {
    'sports': 'http://estaticos.elmundo.es/elmundodeporte/rss/futbol.xml',
    'economy': 'http://estaticos.elmundo.es/elmundo/rss/economia.xml',
    'science': 'http://estaticos.elmundo.es/elmundo/rss/ciencia.xml'
}

# .items() is available on both Python 2 and Python 3 dictionaries, whereas
# .iteritems() only exists on Python 2.
for category, feed in feeds_for_category.items():
    subdir = '/'.join([root, category])
    if not os.path.exists(subdir):
        os.makedirs(subdir)
    load_articles(feed, subdir)

error http://www.elmundo.es/comunidad-valenciana/2016/02/15/56c1b79aca4741fb3e8b45f9.html


# Read all documents in a directory and its subdirectories

In [3]:
def read_all_documents(root):
    """Read every file below ``root`` (recursively) together with a label.

    Returns a dict with two parallel lists:

    * ``'docs'``   -- the full text of each file;
    * ``'labels'`` -- for each file, the path of its directory with the
      leading ``root`` removed (the separator is kept, so a file in
      ``examples/sports`` read with ``root='examples'`` is labelled
      ``'/sports'``).
    """
    labels = []
    docs = []
    for directory, _subdirs, filenames in os.walk(root):
        # os.walk builds every directory path by prefixing it with `root`, so
        # slicing removes exactly that prefix. str.replace() would also strip
        # any later occurrence of `root` inside the path and corrupt the label.
        label = directory[len(root):]
        for filename in filenames:
            with open(os.path.join(directory, filename), "r") as f:
                docs.append(f.read())
            labels.append(label)
    return {'docs': docs, 'labels': labels}

In [4]:
# Load the training corpus and split it into texts and their category labels.
data = read_all_documents('examples')
documents, labels = data['docs'], data['labels']

# Count word frequencies in documents

In [5]:
import re
from collections import defaultdict

def tokens(doc):
    """Return a generator over the lower-cased word tokens of ``doc``.

    A token is a maximal run of regex word characters; everything else
    (whitespace, punctuation) acts as a separator and is dropped.
    """
    words = re.findall(r"\w+", doc)
    return (word.lower() for word in words)

def frequency(tokens):
    """Count how many times each item occurs in the iterable ``tokens``.

    Returns a ``defaultdict(int)`` mapping each distinct item to its number
    of occurrences.
    """
    counts = defaultdict(int)
    for word in tokens:
        counts[word] = counts[word] + 1
    return counts

def tokens_frequency(doc):
    """Return the token counts of ``doc``: ``tokens`` piped into ``frequency``."""
    doc_tokens = tokens(doc)
    return frequency(doc_tokens)

# Extract features from documents

## Symbolic feature names

In [6]:
from sklearn.feature_extraction import DictVectorizer, FeatureHasher

# Fit a DictVectorizer on the per-document token counts; only the fitted
# vectorizer is kept, the transformed matrix is discarded.
vectorizer = DictVectorizer()
word_counts = (tokens_frequency(document) for document in documents)
vectorizer.fit_transform(word_counts)

# Last expression of the cell: display the feature names the vectorizer learned.
vectorizer.get_feature_names()



['0',
 '00',
 '000',
 '02',
 '024',
 '03',
 '042',
 '05',
 '068',
 '069',
 '072',
 '086',
 '095',
 '1',
 '10',
 '100',
 '101',
 '102',
 '105',
 '106',
 '107',
 '11',
 '110',
 '113',
 '115',
 '12',
 '120',
 '127',
 '13',
 '130',
 '134',
 '135',
 '139',
 '14',
 '140',
 '146',
 '148',
 '15',
 '150',
 '155',
 '157',
 '15km',
 '16',
 '160',
 '166',
 '1662',
 '16m',
 '17',
 '173',
 '176',
 '1781',
 '1799',
 '18',
 '180',
 '185',
 '1859',
 '186',
 '1864',
 '1872',
 '1880',
 '189',
 '1891',
 '19',
 '192',
 '194',
 '1943',
 '1945',
 '1948',
 '1957',
 '1958',
 '1961',
 '1970',
 '1978',
 '1979',
 '1980',
 '1981',
 '1982',
 '1983',
 '1984',
 '1985',
 '1986',
 '1987',
 '1990',
 '1991',
 '1992',
 '1993',
 '1994',
 '1995',
 '1997',
 '1998',
 '1999',
 '1a',
 '2',
 '20',
 '200',
 '2000',
 '2001',
 '2002',
 '2003',
 '2004',
 '2005',
 '2007',
 '2008',
 '2009',
 '201',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '2019',
 '2019urrutia',
 '2020',
 '2021',
 '2024',
 '202

## Sparse matrices

In [7]:
# Alternative kept for reference: hash the pre-computed {token: count} dicts
# instead of the raw token streams.
#hasher = FeatureHasher(n_features=2**8)
#X = hasher.transform(tokens_frequency(d) for d in documents)

# Hash each document's raw token stream into a fixed 2**8-column sparse matrix.
hasher = FeatureHasher(n_features=2**8, input_type="string")
X = hasher.transform(tokens(d) for d in documents)

print(X.toarray())

[[  2.  -3.   0. ...,   2.   0.  -5.]
 [  2. -10.  -1. ...,   2.   0. -12.]
 [  1.   0.   3. ...,   0.   1. -12.]
 ..., 
 [  2.  -1.  -2. ...,   0.   0. -12.]
 [  4.  -2.  -1. ...,   1.   0. -13.]
 [ -1.  -2.  -2. ...,   6.   0.  -8.]]


# Train a text classifier using k-nearest neighbors
See http://scikit-learn.org/stable/auto_examples/text/document_classification_20newsgroups.html

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors

# Spanish function words, grouped by kind, to be ignored by the vectorizer.
prepositions = [
    'a', 'ante', 'bajo', 'cabe', 'con', 'contra', 'de', 'desde', 'en', 'entre',
    'hacia', 'hasta', 'para', 'por', 'según', 'sin', 'so', 'sobre', 'tras',
]
prep_alike = ['durante', 'mediante', 'excepto', 'salvo', 'incluso', 'más', 'menos']
adverbs = ['no', 'si', 'sí']
articles = [
    'el', 'la', 'los', 'las', 'un', 'una', 'unos', 'unas',
    'este', 'esta', 'estos', 'estas', 'aquel', 'aquella', 'aquellos', 'aquellas',
]
aux_verbs = [
    'he', 'has', 'ha', 'hemos', 'habéis', 'han',
    'había', 'habías', 'habíamos', 'habíais', 'habían',
]

# All groups concatenated form the stop-word list of the TF-IDF vectorizer.
spanish_stop_words = prepositions + prep_alike + adverbs + articles + aux_verbs
tfid = TfidfVectorizer(stop_words=spanish_stop_words)

# Training set: TF-IDF matrix of the loaded documents and their directory labels.
X_train = tfid.fit_transform(documents)
y_train = labels

# Classify by majority vote among the 10 nearest training documents.
clf = KNeighborsClassifier(n_neighbors=10)
clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform')

# Predict categories for new articles

In [19]:
# Held-out articles, vectorized with the vocabulary fitted on the training set.
test = read_all_documents('examples2')
X_test, y_test = tfid.transform(test['docs']), test['labels']
pred = clf.predict(X_test)

# Mean accuracy on the held-out set, followed by the individual predictions.
accuracy = clf.score(X_test, y_test)
print('accuracy score %0.3f' % accuracy)
print(pred)

accuracy score 0.922
['/science' '/science' '/science' '/science' '/science' '/science'
 '/science' '/science' '/science' '/science' '/science' '/science'
 '/sports' '/science' '/science' '/science' '/science' '/economy'
 '/science' '/science' '/science' '/sports' '/science' '/science'
 '/economy' '/science' '/science' '/science' '/science' '/science'
 '/sports' '/science' '/science' '/science' '/science' '/sports' '/science'
 '/sports' '/sports' '/sports' '/sports' '/sports' '/sports' '/sports'
 '/sports' '/sports' '/sports' '/sports' '/sports' '/sports' '/sports'
 '/sports' '/sports' '/sports' '/sports' '/sports' '/sports' '/sports'
 '/sports' '/sports' '/sports' '/sports' '/sports' '/sports' '/sports'
 '/sports' '/sports' '/sports' '/sports' '/sports' '/sports' '/sports'
 '/sports' '/sports' '/sports' '/sports' '/sports' '/sports' '/sports'
 '/sports' '/sports' '/sports' '/sports' '/sports' '/sports' '/sports'
 '/sports' '/economy' '/sports' '/economy' '/sports' '/economy' '/economy

## Try with new, fresh articles

In [30]:
def predict_category(url, classifier):
    """Predict the category of the article published at ``url``.

    The article text is extracted from the page, UTF-8 encoded, vectorized
    with the notebook-level ``tfid`` vectorizer and handed to ``classifier``.
    Returns the single predicted label.
    """
    article = v2.extract(url).encode('utf8')
    X_test = tfid.transform([article])
    # Use the classifier the caller passed in rather than the global `clf`,
    # so the function really works with any fitted classifier.
    return classifier.predict(X_test)[0]

def show_predicted_categories(urls, classifier):
    """Print the category predicted by ``classifier`` for each URL in ``urls``.

    One line of the form ``predicted category: <label>`` is printed per URL,
    in the order given.
    """
    for url in urls:
        # Forward the caller's classifier instead of the global `clf`.
        print('predicted category: ' + predict_category(url, classifier))

# Articles from sites other than the one the training feeds came from.
fresh_article_urls = [
    'http://www.abc.es/economia/abci-bufetes-intentan-accionistas-bankia-vayan-juicio-201602190746_noticia.html',
    'http://www.elconfidencial.com/deportes/futbol/2016-02-19/torres-atletico-cope_1154857/',
]
show_predicted_categories(fresh_article_urls, clf)

predicted category: /economy
predicted category: /sports
