# Read all documents in a directory and its subdirectories

In [3]:
import os

def read_all_documents(root):
    labels = []
    docs = []
    for r, dirs, files in os.walk(root):
        for file in files:
            with open(os.path.join(r, file), "r") as f:
                docs.append(f.read())     
            labels.append(r.replace(root, ''))
    return dict([('docs', docs), ('labels', labels)])

In [20]:
data = read_all_documents('training')
documents = data['docs']
labels = data['labels']

# Count words frequency in documents

In [5]:
import re
from collections import defaultdict

def tokens(doc):
    return (tok.lower() for tok in re.findall(r"\w+", doc))

def frequency(tokens):
    f = defaultdict(int)
    for token in tokens:
        f[token] += 1
    return f

def tokens_frequency(doc):
    return frequency(tokens(doc))

# Extract features from documents

## Symbolic features names

In [9]:
from sklearn.feature_extraction import DictVectorizer, FeatureHasher

vectorizer = DictVectorizer()
vectorizer.fit_transform(tokens_frequency(d) for d in documents)

vectorizer.get_feature_names()

['0',
 '00',
 '000',
 '013',
 '015',
 '02',
 '047',
 '05',
 '053',
 '06',
 '09',
 '1',
 '10',
 '100',
 '105',
 '108',
 '109',
 '11',
 '110',
 '115',
 '12',
 '120',
 '123',
 '124',
 '13',
 '130',
 '131',
 '132',
 '133',
 '137',
 '13h',
 '14',
 '1420',
 '144',
 '15',
 '150',
 '152',
 '154',
 '1543',
 '155',
 '156',
 '1582',
 '15km',
 '16',
 '160',
 '1610',
 '1616',
 '1623',
 '1633',
 '165',
 '1650',
 '166',
 '17',
 '170',
 '1729',
 '175',
 '1750',
 '176',
 '18',
 '1835',
 '1859',
 '187',
 '1872',
 '1876',
 '1878',
 '1880',
 '1884',
 '1886',
 '1887',
 '189',
 '1890',
 '1891',
 '19',
 '190',
 '1900',
 '1937',
 '1946',
 '1949',
 '1950',
 '1955',
 '1956',
 '1957',
 '1958',
 '1960',
 '1961',
 '1964',
 '1971',
 '1972',
 '1975',
 '1976',
 '1977',
 '1978',
 '1979',
 '1980',
 '1981',
 '1982',
 '1985',
 '1986',
 '1987',
 '1988',
 '1990',
 '1991',
 '1992',
 '1996',
 '1998',
 '1998aporta',
 '1999',
 '1a',
 '2',
 '20',
 '200',
 '2000',
 '2001',
 '2002',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007',
 '

## Sparse matrices

In [10]:
http://www.abc.es/economia/abci-bufetes-intentan-accionistas-bankia-vayan-juicio-201602190746_noticia.html#hasher = FeatureHasher(n_features=2**8)
#X = hasher.transform(tokens_frequency(d) for d in documents)

hasher = FeatureHasher(n_features=2**8, input_type="string")
X = hasher.transform(tokens(d) for d in documents)

print(X.toarray())

SyntaxError: invalid syntax (<ipython-input-10-9e1be0c9eebd>, line 1)

# Train a text classifier using K-Means clustering
See http://scikit-learn.org/stable/auto_examples/text/document_classification_20newsgroups.html

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors

prepositions =['a','ante','bajo','cabe','con','contra','de','desde','en','entre','hacia','hasta','para','por','según','sin','so','sobre','tras']
prep_alike = ['durante','mediante','excepto','salvo','incluso','más','menos']
adverbs = ['no','si','sí']
articles = ['el','la','los','las','un','una','unos','unas','este','esta','estos','estas','aquel','aquella','aquellos','aquellas']
aux_verbs = ['he','has','ha','hemos','habéis','han','había','habías','habíamos','habíais','habían']
tfid = TfidfVectorizer(stop_words=prepositions+prep_alike+adverbs+articles+aux_verbs)

X_train = tfid.fit_transform(documents)
y_train = labels

clf = KNeighborsClassifier(n_neighbors=10)
clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform')

# Predict categories for new articles

In [19]:
test = read_all_documents('test')
X_test = tfid.transform(test['docs'])
y_test = test['labels']
pred = clf.predict(X_test)

print('accuracy score %0.3f' % clf.score(X_test, y_test))

accuracy score 0.922


## Try with new, fresh articles

In [18]:
import eatiht.v2 as v2

def predict_category(url, classifier):
    article = v2.extract(url).encode('utf8')
    X_test = tfid.transform([article])
    return clf.predict(X_test)[0]

def show_predicted_categories(urls, classifier):
    for url in urls:
        print('predicted category: ' + predict_category(url, clf))

show_predicted_categories(
    [
        'http://www.abc.es/economia/abci-bufetes-intentan-accionistas-bankia-vayan-juicio-201602190746_noticia.html',
        'http://www.elconfidencial.com/deportes/futbol/2016-02-19/torres-atletico-cope_1154857/'],
    clf)

predicted category: /economy
predicted category: /science
