# Vectorización TF-IDF y similaridad

In [1]:
# Download the NLTK 'stopwords' corpus; the Spanish stop-word list loaded
# later in this notebook (stopwords.words('spanish')) is read from it.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
# librerías para procesar
import numpy as np
import pandas as pd

# expresiones regulares
import re

# librerías para graficar
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# librería de PLN en español
from nltk.corpus import stopwords
stopwords_sp = stopwords.words('spanish')

from nltk.stem.snowball import SnowballStemmer
spanishStemmer=SnowballStemmer("spanish")

# Vectorizador TF-IDF y distancia del coseno (scikit-learn)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances

# Algunos elementos de preprocesamiento: escalamiento y selección de k en k-means
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_samples, silhouette_score

# Para los algoritmos de machine learning
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.decomposition import PCA

#Para omitir los warnings
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Stemming demo: inflected forms of the same Spanish verb reduce to one stem.
stemmer = SnowballStemmer("spanish")
palabras = ["correr", "corriendo", "corre", "programar", "programando","programa"]

# NOTE: despite its name, `lema` holds stems (output of stemmer.stem), not lemmas.
lema = list(map(stemmer.stem, palabras))
print(lema)

['corr', 'corr', 'corr', 'program', 'program', 'program']


Cargamos los datos

In [4]:
# Address of the raw CSV file that is read into `datos` with pandas.
url = "https://raw.githubusercontent.com/Fabian830348/Bases_Datos/master/arenita.csv"

In [5]:
# Read the CSV at `url` into a DataFrame, decoding the file as UTF-8.
datos = pd.read_csv(url,encoding='utf-8')

In [6]:
# Preview the first rows of the loaded data.
datos.head()

Unnamed: 0,Personaje,Descripción
0,Bob Esponja Pantalones Cuadrados,Bob Esponja es una esponja de mar con forma re...
1,Patricio Estrella,"Es una estrella de mar de color rosa, el mejor..."
2,Calamardo Tentáculos,Calamardo Tentáculos es un personaje principal...
3,Arenita Mejillas,Arenita Mejillas (Sandy Cheeks en inglés) en u...
4,Eugenio H. Cangrejo,Don Eugenio H. Cangrejo (normalmente llamado S...


In [7]:
# (number of rows, number of columns) of the loaded frame.
datos.shape

(10, 2)

In [8]:
# Plain Python list holding every value of the 'Descripción' column.
datos1 = datos["Descripción"].tolist()

In [9]:
def pre_procesado(texto, stop_words=None):
    """Normalize a text for bag-of-words / TF-IDF vectorization.

    The text is lower-cased, every run of non-word characters, digits and
    underscores is collapsed into a single space, and stop words are dropped.

    Args:
        texto: Raw text. Non-string values (e.g. the float NaN pandas uses for
            a missing cell) are treated as an empty document.
        stop_words: Optional iterable of lower-case words to remove. When
            omitted, the module-level Spanish list ``stopwords_sp`` is used.

    Returns:
        The remaining tokens joined by single spaces ("" if nothing is left).
    """
    # A missing description has no tokens; return an empty document instead of
    # failing on `.lower()`.
    if not isinstance(texto, str):
        return ""
    if stop_words is None:
        stop_words = stopwords_sp
    # Build a set once so each token lookup is O(1) instead of scanning a list.
    excluidas = set(stop_words)
    limpio = re.sub(r"[\W\d_]+", " ", texto.lower())
    return " ".join(palabra for palabra in limpio.split() if palabra not in excluidas)

In [10]:
# Store the cleaned version of every description in the 'pre-procesado' column.
datos['pre-procesado'] = datos['Descripción'].apply(pre_procesado)
datos

Unnamed: 0,Personaje,Descripción,pre-procesado
0,Bob Esponja Pantalones Cuadrados,Bob Esponja es una esponja de mar con forma re...,bob esponja esponja mar forma rectangular colo...
1,Patricio Estrella,"Es una estrella de mar de color rosa, el mejor...",estrella mar color rosa mejor amigo bob esponj...
2,Calamardo Tentáculos,Calamardo Tentáculos es un personaje principal...,calamardo tentáculos personaje principal bob e...
3,Arenita Mejillas,Arenita Mejillas (Sandy Cheeks en inglés) en u...,arenita mejillas sandy cheeks inglés personaje...
4,Eugenio H. Cangrejo,Don Eugenio H. Cangrejo (normalmente llamado S...,don eugenio h cangrejo normalmente llamado señ...
5,Sheldon J. Plankton,"Sheldon J. Plankton, o simplemente Plankton, e...",sheldon j plankton simplemente plankton dos an...
6,Karen Plankton,Karen Plankton es uno de los dos principales a...,karen plankton dos principales antagonistas bo...
7,Perlita Cangrejo,"Perla ""Perlita"" Cangrejo es un personaje princ...",perla perlita cangrejo personaje principal bob...
8,Sra. Puff,La Señora Puff es un personaje principal de Bo...,señora puff personaje principal bob esponja ma...
9,Gary el Caracol,Gary el Caracol es un personaje principal de B...,gary caracol personaje principal bob esponja q...


In [11]:
# Fit TF-IDF on the cleaned descriptions (TfidfVectorizer is imported in the
# notebook's import cell).
tfidf_vec = TfidfVectorizer()
tfidf = tfidf_vec.fit_transform(datos['pre-procesado'].values)

# Label rows with the 'Personaje' values and columns with the vocabulary
# terms, then transpose so terms are rows and characters are columns.
tfidf_matrix = pd.DataFrame(
    tfidf.toarray(),
    index=datos["Personaje"],
    columns=tfidf_vec.get_feature_names_out(),
).T

tfidf_matrix.round(3)

Personaje,Bob Esponja Pantalones Cuadrados,Patricio Estrella,Calamardo Tentáculos,Arenita Mejillas,Eugenio H. Cangrejo,Sheldon J. Plankton,Karen Plankton,Perlita Cangrejo,Sra. Puff,Gary el Caracol
abajo,0.059,0.0,0.000,0.000,0.000,0.0,0.000,0.0,0.000,0.000
aborrece,0.000,0.0,0.091,0.000,0.000,0.0,0.000,0.0,0.000,0.000
acaba,0.000,0.0,0.091,0.000,0.000,0.0,0.000,0.0,0.000,0.000
accidente,0.050,0.0,0.000,0.000,0.046,0.0,0.000,0.0,0.000,0.000
aceptó,0.000,0.0,0.000,0.000,0.000,0.0,0.064,0.0,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...
zapatos,0.044,0.0,0.000,0.000,0.000,0.0,0.000,0.0,0.092,0.069
árbol,0.000,0.0,0.000,0.081,0.000,0.0,0.000,0.0,0.000,0.000
éste,0.000,0.0,0.000,0.000,0.000,0.0,0.000,0.0,0.000,0.093
única,0.059,0.0,0.000,0.000,0.000,0.0,0.000,0.0,0.000,0.000


In [12]:
# Display the TF-IDF matrix without rounding.
tfidf_matrix

Personaje,Bob Esponja Pantalones Cuadrados,Patricio Estrella,Calamardo Tentáculos,Arenita Mejillas,Eugenio H. Cangrejo,Sheldon J. Plankton,Karen Plankton,Perlita Cangrejo,Sra. Puff,Gary el Caracol
abajo,0.058705,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000
aborrece,0.000000,0.0,0.090712,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000
acaba,0.000000,0.0,0.090712,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000
accidente,0.049905,0.0,0.000000,0.000000,0.045528,0.0,0.000000,0.0,0.000000,0.000000
aceptó,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.063905,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...
zapatos,0.043661,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.091678,0.069085
árbol,0.000000,0.0,0.000000,0.080677,0.000000,0.0,0.000000,0.0,0.000000,0.000000
éste,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.092890
única,0.058705,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000


Calculamos la distancia del coseno

$$similaridad = \cos(\theta) = \dfrac{u \cdot v}{\|u\| \|v\|} = \dfrac{\sum \limits_{i=1}^{n}u_iv_i}{\left(\sum \limits_{i=1}^{n} u_i^2 \right)^{1/2}  \left(\sum \limits_{i=1}^{n} v_i^2 \right)^{1/2}}  $$

$$cosineDistance = 1- cosineSimilarity$$

In [13]:
# Pairwise cosine distance between characters. The matrix is transposed back
# so each row passed to cosine_distances is one character's TF-IDF vector;
# rows and columns of the result are both labelled with the character names.
dist_cos = pd.DataFrame(
    cosine_distances(tfidf_matrix.T.to_numpy()),
    index=tfidf_matrix.columns,
    columns=tfidf_matrix.columns,
)
dist_cos

Personaje,Bob Esponja Pantalones Cuadrados,Patricio Estrella,Calamardo Tentáculos,Arenita Mejillas,Eugenio H. Cangrejo,Sheldon J. Plankton,Karen Plankton,Perlita Cangrejo,Sra. Puff,Gary el Caracol
Personaje,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Bob Esponja Pantalones Cuadrados,0.0,0.850507,0.927257,0.871921,0.88122,0.901743,0.95792,0.916038,0.882641,0.931071
Patricio Estrella,0.850507,0.0,0.808538,0.893063,0.89373,0.968058,0.961623,0.936298,0.954744,0.922686
Calamardo Tentáculos,0.927257,0.808538,0.0,0.943984,0.873121,0.921201,0.96945,0.97291,0.968216,0.914684
Arenita Mejillas,0.871921,0.893063,0.943984,0.0,0.927035,0.982037,0.961451,0.982508,0.983039,0.930316
Eugenio H. Cangrejo,0.88122,0.89373,0.873121,0.927035,0.0,0.704685,0.90382,0.829929,0.938435,0.928366
Sheldon J. Plankton,0.901743,0.968058,0.921201,0.982037,0.704685,0.0,0.80837,0.921811,0.990016,0.971474
Karen Plankton,0.95792,0.961623,0.96945,0.961451,0.90382,0.80837,0.0,0.984115,0.991454,0.978322
Perlita Cangrejo,0.916038,0.936298,0.97291,0.982508,0.829929,0.921811,0.984115,0.0,0.8954,0.967919
Sra. Puff,0.882641,0.954744,0.968216,0.983039,0.938435,0.990016,0.991454,0.8954,0.0,0.963591
Gary el Caracol,0.931071,0.922686,0.914684,0.930316,0.928366,0.971474,0.978322,0.967919,0.963591,0.0


In [18]:
# Distance from each character to its nearest *other* character.
# Only the diagonal (self-distance) is hidden. Filtering with `dist_cos > 0`
# would also discard two distinct characters whose distance is exactly 0
# (e.g. identical descriptions), which are the true nearest neighbours.
dist_cos.mask(np.eye(len(dist_cos), dtype=bool)).min()

Personaje
Bob Esponja Pantalones Cuadrados    0.850507
Patricio Estrella                   0.808538
Calamardo Tentáculos                0.808538
Arenita Mejillas                    0.871921
Eugenio H. Cangrejo                 0.704685
Sheldon J. Plankton                 0.704685
Karen Plankton                      0.808370
Perlita Cangrejo                    0.829929
Sra. Puff                           0.882641
Gary el Caracol                     0.914684
dtype: float64

In [15]:
# Name of the nearest *other* character for each character.
# Only the diagonal (self-distance) is hidden. Filtering with `dist_cos > 0`
# would also discard two distinct characters whose distance is exactly 0
# (e.g. identical descriptions), which are the true nearest neighbours.
dist_cos.mask(np.eye(len(dist_cos), dtype=bool)).idxmin()

Personaje
Bob Esponja Pantalones Cuadrados                   Patricio Estrella
Patricio Estrella                               Calamardo Tentáculos
Calamardo Tentáculos                               Patricio Estrella
Arenita Mejillas                    Bob Esponja Pantalones Cuadrados
Eugenio H. Cangrejo                              Sheldon J. Plankton
Sheldon J. Plankton                              Eugenio H. Cangrejo
Karen Plankton                                   Sheldon J. Plankton
Perlita Cangrejo                                 Eugenio H. Cangrejo
Sra. Puff                           Bob Esponja Pantalones Cuadrados
Gary el Caracol                                 Calamardo Tentáculos
dtype: object

In [16]:
# Largest cosine distance found in each column of the distance matrix.
dist_cos.max()

Personaje
Bob Esponja Pantalones Cuadrados    0.957920
Patricio Estrella                   0.968058
Calamardo Tentáculos                0.972910
Arenita Mejillas                    0.983039
Eugenio H. Cangrejo                 0.938435
Sheldon J. Plankton                 0.990016
Karen Plankton                      0.991454
Perlita Cangrejo                    0.984115
Sra. Puff                           0.991454
Gary el Caracol                     0.978322
dtype: float64

In [19]:
# Row label of the largest cosine distance in each column.
# Skipping NaN is idxmax's default behaviour, so the flag is not spelled out.
dist_cos.idxmax()

Personaje
Bob Esponja Pantalones Cuadrados         Karen Plankton
Patricio Estrella                   Sheldon J. Plankton
Calamardo Tentáculos                   Perlita Cangrejo
Arenita Mejillas                              Sra. Puff
Eugenio H. Cangrejo                           Sra. Puff
Sheldon J. Plankton                           Sra. Puff
Karen Plankton                                Sra. Puff
Perlita Cangrejo                         Karen Plankton
Sra. Puff                                Karen Plankton
Gary el Caracol                          Karen Plankton
dtype: object