In [258]:
from sklearn.datasets import load_files 

import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3

from __future__ import print_function

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.externals import joblib
from sklearn.cluster import KMeans


In [278]:
# preparar funciones de procesamiento de texto
def tokenize_and_stem(text):
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [279]:
# e.g.:
tokenize_and_stem('cats are running')

[u'cat', u'are', u'run']

**Texto de prueba**

In [261]:
text1 = 'Far out in the uncharted backwaters of the unfashionable end of \
  the  western  spiral  arm  of  the  Galaxy  lies  a  small  unregarded \
  yellow sun.'
text2 = 'In a hole in the ground there lived a hobbit. Not a nasty, dirty, \
  wet hole, filled with the ends of worms and an oozy smell, nor yet a dry, bare, \
  sandy hole with nothing in it to sit down on or to eat:  it  was  a  \
  hobbit-hole,  and  that  means comfort.'
text3 = 'In the beginning was the Word, and the Word was with God, and the Word was God.\
  The same was in the beginning with God. \
  All things were made by him; and without him was not any thing made that was made.'
text4 = 'There was Eru, the One, who in Arda is called Iluvatar; and he made first the Ainur, \
  the Holy Ones, that were the offspring of his thought, and they were with him before aught \
  else was made. And he spoke to them, propounding to them themes of music; and they sang \
  before him, and he was glad. But for a long while they sang only each alone, or but few together, \
  while the rest hearkened; for each comprehended only that part of me mind of Ilúvatar \
  from which he came, and in the understanding of their brethren they grew but slowly. \
  Yet ever as they listened they came to deeper understanding, and increased in unison and harmony.'
text5 = 'The morning had dawned clear and cold, with a crispness that hinted at the end of summer. \
  They set forth at daybreak to see a man beheaded, twenty in all, and Bran rode among them, \
  nervous with excitement. This was the first time he had been deemed old enough to go with \
  his lord father and his brothers to see the kings justice done. It was the ninth year of summer, \
  and the seventh of Brans life.'

text = []
text.append(text1.decode('utf-8').strip())
text.append(text2.decode('utf-8').strip())
text.append(text3.decode('utf-8').strip())
text.append(text4.decode('utf-8').strip())
text.append(text5.decode('utf-8').strip())

In [262]:
text

[u'Far out in the uncharted backwaters of the unfashionable end of   the  western  spiral  arm  of  the  Galaxy  lies  a  small  unregarded   yellow sun.',
 u'In a hole in the ground there lived a hobbit. Not a nasty, dirty,   wet hole, filled with the ends of worms and an oozy smell, nor yet a dry, bare,   sandy hole with nothing in it to sit down on or to eat:  it  was  a    hobbit-hole,  and  that  means comfort.',
 u'In the beginning was the Word, and the Word was with God, and the Word was God.  The same was in the beginning with God.   All things were made by him; and without him was not any thing made that was made.',
 u'There was Eru, the One, who in Arda is called Iluvatar; and he made first the Ainur,   the Holy Ones, that were the offspring of his thought, and they were with him before aught   else was made. And he spoke to them, propounding to them themes of music; and they sang   before him, and he was glad. But for a long while they sang only each alone, or but few togeth

**Separar en palabras y usar las raíces de los vocablos**

In [263]:
palabras = []

for i in text:
    p = tokenize_and_stem(i)
    palabras.extend(p)

**Limpiar las "palabras vacias"**

In [264]:
# lista de "stopwords"
from nltk.corpus import stopwords
stopwords = nltk.corpus.stopwords.words('english')
print(stopwords)

[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u"you're", u"you've", u"you'll", u"you'd", u'your', u'yours', u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u"she's", u'her', u'hers', u'herself', u'it', u"it's", u'its', u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what', u'which', u'who', u'whom', u'this', u'that', u"that'll", u'these', u'those', u'am', u'is', u'are', u'was', u'were', u'be', u'been', u'being', u'have', u'has', u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an', u'the', u'and', u'but', u'if', u'or', u'because', u'as', u'until', u'while', u'of', u'at', u'by', u'for', u'with', u'about', u'against', u'between', u'into', u'through', u'during', u'before', u'after', u'above', u'below', u'to', u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over', u'under', u'again', u'further', u'then', u'once', u'here', u'there', u'when', u'where', u'why', u'how', u'all', u'any', u'both', u'eac

In [265]:
f_text = [word for word in palabras if word not in stopwords]
print(f_text)

[u'far', u'unchart', u'backwat', u'unfashion', u'end', u'western', u'spiral', u'arm', u'galaxi', u'lie', u'small', u'unregard', u'yellow', u'sun', u'hole', u'ground', u'live', u'hobbit', u'nasti', u'dirti', u'wet', u'hole', u'fill', u'end', u'worm', u'oozi', u'smell', u'yet', u'dri', u'bare', u'sandi', u'hole', u'noth', u'sit', u'eat', u'hobbit-hol', u'mean', u'comfort', u'begin', u'word', u'word', u'god', u'word', u'god', u'begin', u'god', u'thing', u'made', u'without', u'ani', u'thing', u'made', u'made', u'eru', u'one', u'arda', u'call', u'iluvatar', u'made', u'first', u'ainur', u'holi', u'one', u'offspr', u'thought', u'befor', u'aught', u'els', u'made', u'spoke', u'propound', u'theme', u'music', u'sang', u'befor', u'glad', u'long', u'sang', u'onli', u'alon', u'togeth', u'rest', u'hearken', u'comprehend', u'onli', u'part', u'mind', u'il\xfavatar', u'came', u'understand', u'brethren', u'grew', u'slowli', u'yet', u'ever', u'listen', u'came', u'deeper', u'understand', u'increas', u'unis

**Crear una tabla de palabras**

In [266]:
vocab_frame = pd.DataFrame({'words': f_text}, index = range(len(f_text)))
print(vocab_frame[0:10])

       words
0        far
1    unchart
2    backwat
3  unfashion
4        end
5    western
6     spiral
7        arm
8     galaxi
9        lie


In [276]:
# Compute the term frequency-inverse document frequency matrix
tfidf_vectorizer = TfidfVectorizer(max_df=0.98, max_features=200000,
                                 min_df=0.01, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

tfidf_matrix = tfidf_vectorizer.fit_transform(text)
print("La matrix tiene %i filas (documentos) y %i columnas (palabras)\n" % tfidf_matrix.shape)



La matrix tiene 5 filas (documentos) y 324 columnas (palabras)



In [277]:
terms = tfidf_vectorizer.get_feature_names()
print("Hay en total %i palabras:\n" % len(terms))
print(terms)

Hay en total 324 palabras:

[u'ainur', u'ainur holi', u'ainur holi offspr', u'alon', u'alon togeth', u'alon togeth rest', u'ani', u'ani thing', u'arda', u'arda iluvatar', u'arda iluvatar ainur', u'arm', u'arm galaxi', u'arm galaxi lie', u'aught', u'aught els', u'aught els spoke', u'backwat', u'backwat unfashion', u'backwat unfashion end', u'bare', u'bare sandi', u'bare sandi hole', u'befor', u'befor aught', u'befor aught els', u'befor glad', u'befor glad long', u'begin', u'begin god', u'begin god thing', u'begin word', u'begin word word', u'behead', u'behead twenti', u'behead twenti bran', u'bran', u'bran life', u'bran rode', u'bran rode nervous', u'brethren', u'brethren grew', u'brethren grew slowli', u'brother', u'brother king', u'brother king justic', u'came', u'came deeper', u'came deeper understand', u'came understand', u'came understand brethren', u'clear', u'clear cold', u'clear cold crisp', u'cold', u'cold crisp', u'cold crisp hint', u'comfort', u'comprehend', u'comprehend onli

## Buscando grupos de documentos con K-Means

In [272]:
num_clusters = 3
km = KMeans(n_clusters=num_clusters)
km.fit(tfidf_matrix)
clusters = km.labels_.tolist()

print(clusters)

[1, 1, 0, 2, 1]


In [273]:
# Recuento del número de elementos en cada cluster
for i in range(num_clusters):
    print ('El cluster %i tiene %i elementos' % (i, clusters.count(i)))

El cluster 0 tiene 1 elementos
El cluster 1 tiene 3 elementos
El cluster 2 tiene 1 elementos


In [274]:
dist = 1 - cosine_similarity(tfidf_matrix)

In [287]:
print("Top terms per cluster:")
print()
#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1]     
        
for i in range(num_clusters):
    print("[[ Cluster %d ]]" % i, end='\n\n')
    
    print("   WORDS /// ", end='')
    
    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        print(terms[ind], end=' / ')
    print('\n\n')


Top terms per cluster:

[[ Cluster 0 ]]

   WORDS /// god / word / word god / begin / thing / begin god / 


[[ Cluster 1 ]]

   WORDS /// hole / end / summer / bran / yellow sun / small unregard yellow / 


[[ Cluster 2 ]]

   WORDS /// sang / onli / came / understand / befor / ainur / 


