# Import necessary dependencies and settings

In [34]:
import pandas as pd
import numpy as np
import re
import nltk

nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /Users/erika/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Sample corpus of text documents

In [36]:
# Toy corpus: every entry pairs a short document with its topic label.
_labelled_docs = [
    ('The sky is blue and beautiful.', 'weather'),
    ('Love this blue and beautiful sky!', 'weather'),
    ('The quick brown fox jumps over the lazy dog.', 'animals'),
    ('The brown fox is quick and the blue dog is lazy!', 'animals'),
    ('The sky is very blue and the sky is very beautiful today', 'weather'),
    ('The dog is lazy but the brown fox is quick!', 'animals'),
]
corpus = [text for text, _ in _labelled_docs]
labels = [topic for _, topic in _labelled_docs]

# Load the documents and their categories into a DataFrame.
df = pd.DataFrame(dict(Document=corpus, Category=labels))

# Explicit two-column selection (same columns, same order as `df`).
corpus_df = df[["Document", "Category"]]

# Simple text pre-processing

In [37]:
corpus_df  # display the documents alongside their categories

Unnamed: 0,Document,Category
0,The sky is blue and beautiful.,weather
1,Love this blue and beautiful sky!,weather
2,The quick brown fox jumps over the lazy dog.,animals
3,The brown fox is quick and the blue dog is lazy!,animals
4,The sky is very blue and the sky is very beaut...,weather
5,The dog is lazy but the brown fox is quick!,animals


In [49]:
# Use nltk.WordPunctTokenizer() as the tokenizer


wpt = nltk.WordPunctTokenizer()
# English stop-word list from the NLTK "stopwords" corpus
stop_words= nltk.corpus.stopwords.words("english")


def normalize_document(doc):
    """Normalize a single document before vectorization.

    The text is cleaned in this order:

    1. every character that is not a letter (a-z / A-Z), a digit or
       whitespace is removed;
    2. the text is lowercased and surrounding whitespace is stripped;
    3. the text is tokenized with the module-level ``wpt`` tokenizer;
    4. tokens found in the module-level ``stop_words`` list are dropped.

    Parameters
    ----------
    doc : str
        Raw document text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    # `flags` must be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so `re.sub(pattern, "", doc, re.I)` was read as
    # count=2 (re.I == 2) and only the first two special characters were
    # removed from each document.
    doc = re.sub(r"[^a-zA-Z0-9\s]", "", doc, flags=re.I)
    doc = doc.lower().strip()

    tokens = wpt.tokenize(doc)

    # Drop stop words, then rebuild the document from the remaining tokens.
    filtered_tokens = [token for token in tokens if token not in stop_words]
    return " ".join(filtered_tokens)


# np.vectorize lets the scalar function be applied element-wise to a whole
# list/array of documents.
normalize_corpus = np.vectorize(normalize_document)

In [50]:
# Normalize one sample sentence to check normalize_document
frase = "The sky is blue and beatiful"

frase_n=normalize_document(frase)
frase_n

'sky blue beatiful'

Veamos qué hace np.vectorize().

Imaginemos una función que acepta un número y devuelve True si el número es par y False si no lo es.

In [51]:
# Probemos la función con un par de valores:



Esta función acepta un escalar, por lo que intentar usarla con un array NumPy devolverá un error. Pero podemos "vectorizarla" con la función np.vectorize

In [52]:
# Ahora ya es posible usarla con arrays


Volvamos a procesamiento de Texto.

In [53]:
# muestra el corpus

In [54]:
# Normalize a single sentence
frase = "The sky is blue and beatiful"

frase_n=normalize_document(frase)
frase_n

'sky blue beatiful'

In [55]:
# Normalize the whole corpus with the vectorized function
norm_corpus=normalize_corpus(corpus)
norm_corpus

array(['sky blue beautiful', 'love blue beautiful sky',
       'quick brown fox jumps lazy dog', 'brown fox quick blue dog lazy',
       'sky blue sky beautiful today', 'dog lazy brown fox quick'],
      dtype='<U30')

In [65]:
# NOTE: this cell reads `cv` and `cv_matrix`, which are created in the
# "Bag of Words Model" cell further down the notebook; run that cell first.
#
# get_feature_names() was removed in scikit-learn 1.2. Prefer
# get_feature_names_out() and fall back to the old method on older releases.
if hasattr(cv, "get_feature_names_out"):
    vocab = cv.get_feature_names_out().tolist()
else:
    vocab = cv.get_feature_names()
print(vocab)
# One row per document, one column per vocabulary term.
pd.DataFrame(cv_matrix, columns=vocab)

['beautiful', 'blue', 'brown', 'dog', 'fox', 'jumps', 'lazy', 'love', 'quick', 'sky', 'today']


Unnamed: 0,beautiful,blue,brown,dog,fox,jumps,lazy,love,quick,sky,today
0,1,1,0,0,0,0,0,0,0,1,0
1,1,1,0,0,0,0,0,1,0,1,0
2,0,0,1,1,1,1,1,0,1,0,0
3,0,1,1,1,1,0,1,0,1,0,0
4,1,1,0,0,0,0,0,0,0,2,1
5,0,0,1,1,1,0,1,0,1,0,0


# Bag of Words Model

In [57]:
from sklearn.feature_extraction.text import CountVectorizer
# Apply CountVectorizer to the normalized corpus.
#
# min_df / max_df are the document-frequency cut-offs for the vocabulary.
# With 0.0 and 1.0 nothing is filtered out: both the terms that appear
# very rarely and the terms that appear very often are kept.
cv = CountVectorizer(min_df=0., max_df=1.)

# Learn the vocabulary, count the terms and convert the result to an array.
cv_matrix = cv.fit_transform(norm_corpus).toarray()
cv_matrix

array([[1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0],
       [0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0],
       [0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0],
       [1, 1, 0, 0, 0, 0, 0, 0, 0, 2, 1],
       [0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0]])

In [None]:

# muestra el vocabulario en una matriz

# Bag of N-Grams Model

In [74]:
# Compute bigrams: ngram_range=(2, 2) asks for 2-grams only.
bv = CountVectorizer(ngram_range=(2,2))
bv_matrix = bv.fit_transform(norm_corpus)
bv_matrix = bv_matrix.toarray()
# get_feature_names() was removed in scikit-learn 1.2. Prefer
# get_feature_names_out() and fall back to the old method on older releases.
if hasattr(bv, "get_feature_names_out"):
    vocab = bv.get_feature_names_out().tolist()
else:
    vocab = bv.get_feature_names()
# One row per document, one column per bigram.
bigrams = pd.DataFrame(bv_matrix, columns=vocab)

In [75]:
bigrams  # show the bigrams

Unnamed: 0,beautiful sky,beautiful today,blue beautiful,blue dog,blue sky,brown fox,dog lazy,fox jumps,fox quick,jumps lazy,lazy brown,lazy dog,love blue,quick blue,quick brown,sky beautiful,sky blue
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,1,0,1,0,1,0,1,0,0,1,0,0
3,0,0,0,1,0,1,1,0,1,0,0,0,0,1,0,0,0
4,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1
5,0,0,0,0,0,1,1,0,1,0,1,0,0,0,0,0,0


# TF-IDF Model

In [69]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Apply TF-IDF to the normalized corpus.
tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
tv_matrix = tv.fit_transform(norm_corpus)
tv_matrix = tv_matrix.toarray()
# get_feature_names() was removed in scikit-learn 1.2. Prefer
# get_feature_names_out() and fall back to the old method on older releases.
if hasattr(tv, "get_feature_names_out"):
    vocab = tv.get_feature_names_out().tolist()
else:
    vocab = tv.get_feature_names()
# Weights rounded to two decimals for display only; tv_matrix is unchanged.
pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)

Unnamed: 0,beautiful,blue,brown,dog,fox,jumps,lazy,love,quick,sky,today
0,0.6,0.52,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.0
1,0.46,0.39,0.0,0.0,0.0,0.0,0.0,0.66,0.0,0.46,0.0
2,0.0,0.0,0.38,0.38,0.38,0.54,0.38,0.0,0.38,0.0,0.0
3,0.0,0.36,0.42,0.42,0.42,0.0,0.42,0.0,0.42,0.0,0.0
4,0.36,0.31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.72,0.52
5,0.0,0.0,0.45,0.45,0.45,0.0,0.45,0.0,0.45,0.0,0.0


# Document Similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# usa la similitud de coseno en el tf-idf


## Clustering documents using similarity features

In [None]:
from sklearn.cluster import KMeans

# haz un kmeans con 2 clusters
# haz un fit_transform de la similitud del coseno anterior

# Topic models

In [None]:
from sklearn.decomposition import LatentDirichletAllocation


# usa LatentDirichletAllocation y calcula 2 topics del tf-idf

## Show topics and their weights

In [None]:
# imprimo el vocabulario perteneciente a cada tópico

## Clustering documents using topic model features

In [None]:
# ahora agrupo en clusters usando los tópicos, no la similaridad basada en cosenos

# Word Embeddings

In [None]:
# Ahora voy a mapear palabras a vectores

from gensim.models import word2vec



size (renamed `vector_size` in gensim 4.0): The number of dimensions of the embeddings; the default is 100.

window: The maximum distance between a target word and words around the target word. The default window is 5.

min_count: The minimum count of words to consider when training the model; words with occurrence less than this count will be ignored. The default for min_count is 5.

In [None]:

# de palabra a vector

In [None]:
# These helpers were previously disabled inside a triple-quoted string.
# Defining them has no side effects, so they are enabled for the cells that
# build document-level features from word embeddings.


def average_word_vectors(words, model, vocabulary, num_features):
    """Return the mean embedding of the in-vocabulary words of one document.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single document.
    model : mapping
        Object indexable by word (``model[word]``) that yields a vector of
        length ``num_features``.
    vocabulary : container of str
        Words for which ``model`` has a vector; other words are skipped.
    num_features : int
        Dimensionality of the word vectors.

    Returns
    -------
    numpy.ndarray
        float64 array of shape ``(num_features,)``. It is all zeros when no
        word of the document is in ``vocabulary``.
    """
    feature_vector = np.zeros((num_features,), dtype="float64")
    nwords = 0.

    for word in words:
        if word in vocabulary:
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model[word])

    # Only divide when at least one word was found (avoids dividing by zero).
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)

    return feature_vector


def averaged_word_vectorizer(corpus, model, num_features):
    """Build one averaged word-vector per tokenized document.

    Parameters
    ----------
    corpus : iterable of list of str
        Tokenized documents.
    model : object
        Trained model exposing its keyed vectors as ``model.wv``.
    num_features : int
        Dimensionality of the word vectors.

    Returns
    -------
    numpy.ndarray
        Array with one row per document and ``num_features`` columns.
    """
    keyed_vectors = model.wv
    # gensim 4 renamed `index2word` to `index_to_key`; accept either so the
    # helper works with both major versions.
    try:
        known_words = keyed_vectors.index_to_key
    except AttributeError:
        known_words = keyed_vectors.index2word
    vocabulary = set(known_words)

    features = [average_word_vectors(tokenized_sentence, keyed_vectors, vocabulary, num_features)
                for tokenized_sentence in corpus]
    return np.array(features)

In [None]:
from sklearn.cluster import AffinityPropagation

# en affinitypropagation no se especifica el número de clusters, lo encuentra él

