<a href="https://colab.research.google.com/github/joseogg/textanalytics/blob/main/extraccion_caracteristicas_texto.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Librerías 

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt

# Corpus etiquetado

In [None]:
# Labeled toy corpus. Each document is written next to its category so the
# text and its label cannot drift out of alignment.
documentos_etiquetados = [
    ('The sky is blue and beautiful.', 'weather'),
    ('Love this blue and beautiful sky!', 'weather'),
    ('The quick brown fox jumps over the lazy dog.', 'animals'),
    ("A king's breakfast has sausages, ham, bacon, eggs, toast and beans", 'food'),
    ('I love green eggs, ham, sausages and bacon!', 'food'),
    ('The brown fox is quick and the blue dog is lazy!', 'animals'),
    ('The sky is very blue and the sky is very beautiful today', 'weather'),
]

# `corpus` is a NumPy array of documents; `etiquetas` is a plain list of labels.
corpus = np.array([texto for texto, _ in documentos_etiquetados])
etiquetas = [categoria for _, categoria in documentos_etiquetados]

# One row per document, with its text and its category.
df_corpus = pd.DataFrame({"documento": corpus, "categoria": etiquetas})
df_corpus

Unnamed: 0,documento,categoria
0,The sky is blue and beautiful.,weather
1,Love this blue and beautiful sky!,weather
2,The quick brown fox jumps over the lazy dog.,animals
3,"A king's breakfast has sausages, ham, bacon, e...",food
4,"I love green eggs, ham, sausages and bacon!",food
5,The brown fox is quick and the blue dog is lazy!,animals
6,The sky is very blue and the sky is very beaut...,weather


# Pre-procesamiento


In [None]:
# Fetch the NLTK "stopwords" package so the stopword list read below is
# available locally.
nltk.download("stopwords")
# Tokenizer instance that normaliza_documento uses to split a document.
wpt = nltk.WordPunctTokenizer()
# English stopword list that normaliza_documento filters tokens against.
stop_words = nltk.corpus.stopwords.words("english")

def normaliza_documento(doc):
    """Normalize one document for bag-of-words feature extraction.

    Steps: remove every character that is not an ASCII letter or whitespace,
    lowercase, strip surrounding whitespace, tokenize with the module-level
    ``wpt`` tokenizer, drop tokens present in the module-level ``stop_words``
    and re-join the surviving tokens with single spaces.

    Args:
        doc: Text of the document.

    Returns:
        str: The normalized document (an empty string if no token survives).
    """
    # The flags must be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so passing the flags positionally used them as a
    # replacement limit and applied no flags at all.
    doc = re.sub(r"[^a-zA-Z\s]", "", doc, flags=re.I | re.A)
    doc = doc.lower().strip()
    tokens = wpt.tokenize(doc)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    return " ".join(filtered_tokens)


# Element-wise version: normalizes every document of an array.
normaliza_corpus = np.vectorize(normaliza_documento)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Normalize every document of the corpus with the vectorized normalizer.
corpus_normalizado = normaliza_corpus(corpus)
# Bare last expression so the notebook displays the normalized array.
corpus_normalizado

array(['sky blue beautiful', 'love blue beautiful sky',
       'quick brown fox jumps lazy dog',
       'kings breakfast sausages ham bacon eggs toast beans',
       'love green eggs ham sausages bacon',
       'brown fox quick blue dog lazy', 'sky blue sky beautiful today'],
      dtype='<U51')

# Modelo de Bolsa de Palabras

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag of words as a sparse matrix (one row per document, one column per term).
# min_df / max_df are document-frequency bounds given as proportions; with
# 0.0 and 1.0 no term should be pruned (the output reports 20 features).
count_vectorizer = CountVectorizer(min_df=0.0, max_df=1.0)
# Learn the vocabulary from the normalized corpus and count term occurrences.
matriz_conteo = count_vectorizer.fit_transform(corpus_normalizado)
matriz_conteo

<7x20 sparse matrix of type '<class 'numpy.int64'>'
	with 37 stored elements in Compressed Sparse Row format>

In [None]:
# Show the non-zero entries of the sparse matrix as "(row, column)  count".
# print() is used on purpose: the bare repr shown by the previous cell only
# summarizes the matrix, while print() lists its stored values.
print(matriz_conteo)

  (0, 17)	1
  (0, 3)	1
  (0, 2)	1
  (1, 17)	1
  (1, 3)	1
  (1, 2)	1
  (1, 14)	1
  (2, 15)	1
  (2, 5)	1
  (2, 8)	1
  (2, 11)	1
  (2, 13)	1
  (2, 6)	1
  (3, 12)	1
  (3, 4)	1
  (3, 16)	1
  (3, 10)	1
  (3, 0)	1
  (3, 7)	1
  (3, 18)	1
  (3, 1)	1
  (4, 14)	1
  (4, 16)	1
  (4, 10)	1
  (4, 0)	1
  (4, 7)	1
  (4, 9)	1
  (5, 3)	1
  (5, 15)	1
  (5, 5)	1
  (5, 8)	1
  (5, 13)	1
  (5, 6)	1
  (6, 17)	2
  (6, 3)	1
  (6, 2)	1
  (6, 19)	1


In [None]:
# Show the dense representation of the count matrix.
# The guard keeps this cell idempotent: after the first run `matriz_conteo`
# is already a NumPy array, which has no `.toarray()`, so re-running the
# cell without the guard would raise AttributeError.
if hasattr(matriz_conteo, "toarray"):
    matriz_conteo = matriz_conteo.toarray()
matriz_conteo

array([[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0],
       [1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0],
       [1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0],
       [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1]])

In [None]:
# Get all the unique terms of the corpus (the feature names learned by the vectorizer).
vocabulario = count_vectorizer.get_feature_names_out()
# Show the document feature vectors: one row per document, one column per term.
pd.DataFrame(matriz_conteo, columns=vocabulario)

Unnamed: 0,bacon,beans,beautiful,blue,breakfast,brown,dog,eggs,fox,green,ham,jumps,kings,lazy,love,quick,sausages,sky,toast,today
0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0
2,0,0,0,0,0,1,1,0,1,0,0,1,0,1,0,1,0,0,0,0
3,1,1,0,0,1,0,0,1,0,0,1,0,1,0,0,0,1,0,1,0
4,1,0,0,0,0,0,0,1,0,1,1,0,0,0,1,0,1,0,0,0
5,0,0,0,1,0,1,1,0,1,0,0,0,0,1,0,1,0,0,0,0
6,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,1


# N-gramas

In [None]:
# Extract bigrams only: ngram_range=(2, 2) sets both the minimum and the
# maximum n-gram size to 2. Use ngram_range=(1, 2) instead to obtain
# unigrams as well as bigrams.
# NOTE: this rebinds `count_vectorizer` (and `vocabulario` below) to the
# bigram model.
count_vectorizer = CountVectorizer(ngram_range=(2, 2))
matriz_n_gramas = count_vectorizer.fit_transform(corpus_normalizado).toarray()

vocabulario = count_vectorizer.get_feature_names_out()
df = pd.DataFrame(matriz_n_gramas, columns=vocabulario)
# Column-wise sum: total number of occurrences of each bigram in the corpus.
df.sum()

bacon eggs            1
beautiful sky         1
beautiful today       1
blue beautiful        2
blue dog              1
blue sky              1
breakfast sausages    1
brown fox             2
dog lazy              1
eggs ham              1
eggs toast            1
fox jumps             1
fox quick             1
green eggs            1
ham bacon             1
ham sausages          1
jumps lazy            1
kings breakfast       1
lazy dog              1
love blue             1
love green            1
quick blue            1
quick brown           1
sausages bacon        1
sausages ham          1
sky beautiful         1
sky blue              2
toast beans           1
dtype: int64

# Modelo TF-IDF 
* TF: term-frequency 
* TF-IDF: (term-frequency) * (inverse document-frequency)

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

# Fit a fresh unigram vectorizer and keep its counts. Both the vocabulary and
# the matrix fed to the TF-IDF transformer come from this fit, so they stay
# aligned regardless of what earlier cells left in `count_vectorizer` or
# `matriz_conteo`.
count_vectorizer = CountVectorizer(min_df=0.0, max_df=1.0)
conteos_unigramas = count_vectorizer.fit_transform(corpus_normalizado)

# The TF-IDF transformation starts here: raw term counts -> TF-IDF weights.
transformador = TfidfTransformer()
matriz_transformada = transformador.fit_transform(conteos_unigramas).toarray()
vocabulario = count_vectorizer.get_feature_names_out()
# Rounded to two decimals for display only; `matriz_transformada` keeps full precision.
pd.DataFrame(np.round(matriz_transformada, 2), columns=vocabulario)

Unnamed: 0,bacon,beans,beautiful,blue,breakfast,brown,dog,eggs,fox,green,ham,jumps,kings,lazy,love,quick,sausages,sky,toast,today
0,0.0,0.0,0.6,0.52,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0.0
1,0.0,0.0,0.49,0.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.58,0.0,0.0,0.49,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.39,0.39,0.0,0.39,0.0,0.0,0.47,0.0,0.39,0.0,0.39,0.0,0.0,0.0,0.0
3,0.32,0.38,0.0,0.0,0.38,0.0,0.0,0.32,0.0,0.0,0.32,0.0,0.38,0.0,0.0,0.0,0.32,0.0,0.38,0.0
4,0.39,0.0,0.0,0.0,0.0,0.0,0.0,0.39,0.0,0.47,0.39,0.0,0.0,0.0,0.39,0.0,0.39,0.0,0.0,0.0
5,0.0,0.0,0.0,0.31,0.0,0.42,0.42,0.0,0.42,0.0,0.0,0.0,0.0,0.42,0.0,0.42,0.0,0.0,0.0,0.0
6,0.0,0.0,0.36,0.31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.72,0.0,0.51


# Similitud entre documentos

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the TF-IDF document vectors. Called with
# a single matrix, so every row is compared against every row of that matrix.
matriz_similaridad = cosine_similarity(matriz_transformada)
# Wrap in a DataFrame for display; row/column i corresponds to document i.
df_similaridad = pd.DataFrame(matriz_similaridad)
df_similaridad

Unnamed: 0,0,1,2,3,4,5,6
0,1.0,0.817309,0.0,0.0,0.0,0.164799,0.813073
1,0.817309,1.0,0.0,0.0,0.226856,0.134692,0.664532
2,0.0,0.0,1.0,0.0,0.0,0.835548,0.0
3,0.0,0.0,0.0,1.0,0.502932,0.0,0.0
4,0.0,0.226856,0.0,0.502932,1.0,0.0,0.0
5,0.164799,0.134692,0.835548,0.0,0.0,1.0,0.098298
6,0.813073,0.664532,0.0,0.0,0.0,0.098298,1.0


In [None]:
# Same documents and labels as the "Corpus etiquetado" section, re-declared
# here as plain Python lists (`corpus` is not converted to a NumPy array).
# zip() over the (document, label) pairs yields first all the documents and
# then all the labels.
corpus, etiquetas = map(list, zip(
    ('The sky is blue and beautiful.', 'weather'),
    ('Love this blue and beautiful sky!', 'weather'),
    ('The quick brown fox jumps over the lazy dog.', 'animals'),
    ("A king's breakfast has sausages, ham, bacon, eggs, toast and beans", 'food'),
    ('I love green eggs, ham, sausages and bacon!', 'food'),
    ('The brown fox is quick and the blue dog is lazy!', 'animals'),
    ('The sky is very blue and the sky is very beautiful today', 'weather'),
))