1. Pre-procesamiento

In [1]:
import re
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
stopwords_sp = stopwords.words('spanish')

from nltk.stem.snowball import SnowballStemmer
spanishStemmer=SnowballStemmer("spanish")

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances

In [2]:
# Leer archivo

doc = pd.read_csv("C:/Users/eliza/OneDrive/Desktop/Claus/Konrad/NLP/bob_esponja.csv")
doc

Unnamed: 0,Personaje,Descripción
0,Bob Esponja Pantalones Cuadrados,Bob Esponja es una esponja de mar con forma re...
1,Patricio Estrella,"Es una estrella de mar de color rosa, el mejor..."
2,Calamardo Tentáculos,Calamardo Tentáculos es un personaje principal...
3,Arenita Mejillas,Arenita Mejillas (Sandy Cheeks en inglés) en u...
4,Eugenio H. Cangrejo,Don Eugenio H. Cangrejo (normalmente llamado S...
5,Sheldon J. Plankton,"Sheldon J. Plankton, o simplemente Plankton, e..."
6,Karen Plankton,Karen Plankton es uno de los dos principales a...
7,Perlita Cangrejo,"Perla ""Perlita"" Cangrejo es un personaje princ..."
8,Sra. Puff,La Señora Puff es un personaje principal de Bo...
9,Gary el Caracol,Gary el Caracol es un personaje principal de B...


In [3]:
#Nueva columna con modificaciones

def pre_procesado(texto):
    texto = texto.lower()                                                         #pasar a minúsculas
    texto = re.sub(r"[\W\d_´]+", " ", texto)                                      #eliminar caracteres
    texto = [palabra for palabra in texto.split() if palabra not in stopwords_sp] #quitar palabras vacías
    texto = [spanishStemmer.stem(texto) for texto in texto]                       #stemming de las palabras
    texto = [texto.lower() for texto in texto if len(texto) > 2 and texto.isalpha()] #quitar palabras de menos de 2 caracteres
    texto = " ".join(texto)
    return (texto)

doc['pre_procesado']=doc['Descripción'].apply(lambda texto: pre_procesado(texto))
doc

Unnamed: 0,Personaje,Descripción,pre_procesado
0,Bob Esponja Pantalones Cuadrados,Bob Esponja es una esponja de mar con forma re...,bob esponj esponj mar form rectangul color ama...
1,Patricio Estrella,"Es una estrella de mar de color rosa, el mejor...",estrell mar color ros mejor amig bob esponj ju...
2,Calamardo Tentáculos,Calamardo Tentáculos es un personaje principal...,calamard tentacul personaj principal bob espon...
3,Arenita Mejillas,Arenita Mejillas (Sandy Cheeks en inglés) en u...,arenit mejill sandy cheeks ingles personaj pri...
4,Eugenio H. Cangrejo,Don Eugenio H. Cangrejo (normalmente llamado S...,don eugeni cangrej normal llam señor cangrej e...
5,Sheldon J. Plankton,"Sheldon J. Plankton, o simplemente Plankton, e...",sheldon plankton simplement plankton dos antag...
6,Karen Plankton,Karen Plankton es uno de los dos principales a...,kar plankton dos principal antagon bob esponj ...
7,Perlita Cangrejo,"Perla ""Perlita"" Cangrejo es un personaje princ...",perl perlit cangrej personaj principal bob esp...
8,Sra. Puff,La Señora Puff es un personaje principal de Bo...,señor puff personaj principal bob esponj maest...
9,Gary el Caracol,Gary el Caracol es un personaje principal de B...,gary caracol personaj principal bob esponj que...


2. Matriz TF-IDF

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer()
tfidf = tfidf_vect.fit_transform(doc['pre_procesado'].values)

tfidf_matrix = pd.DataFrame(tfidf.toarray(), columns=tfidf_vect.get_feature_names())
tfidf_matrix.index = doc.index

tfidf_matrix = tfidf_matrix.T.round(3)
tfidf_matrix.columns = doc.Personaje.values
tfidf_matrix

Unnamed: 0,Bob Esponja Pantalones Cuadrados,Patricio Estrella,Calamardo Tentáculos,Arenita Mejillas,Eugenio H. Cangrejo,Sheldon J. Plankton,Karen Plankton,Perlita Cangrejo,Sra. Puff,Gary el Caracol
abaj,0.058,0.00,0.000,0.000,0.000,0.0,0.000,0.0,0.000,0.000
aborrec,0.000,0.00,0.091,0.000,0.000,0.0,0.000,0.0,0.000,0.000
acab,0.000,0.00,0.091,0.000,0.000,0.0,0.000,0.0,0.000,0.000
accident,0.049,0.00,0.000,0.000,0.045,0.0,0.000,0.0,0.000,0.000
acept,0.000,0.00,0.000,0.000,0.000,0.0,0.064,0.0,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...
viv,0.000,0.05,0.060,0.108,0.070,0.0,0.000,0.0,0.000,0.000
vol,0.058,0.00,0.000,0.000,0.000,0.0,0.000,0.0,0.000,0.000
voz,0.197,0.00,0.000,0.139,0.000,0.0,0.000,0.0,0.000,0.000
vuelv,0.173,0.00,0.000,0.000,0.000,0.0,0.000,0.0,0.000,0.000


3. Medidas de Similitud

In [5]:
# distancia del coseno entre personajes
from sklearn.metrics.pairwise import cosine_distances

dist_cos = cosine_distances(tfidf_matrix.T.values)
dist_cos = pd.DataFrame(dist_cos, columns = tfidf_matrix.columns, index = tfidf_matrix.columns)
dist_cos

Unnamed: 0,Bob Esponja Pantalones Cuadrados,Patricio Estrella,Calamardo Tentáculos,Arenita Mejillas,Eugenio H. Cangrejo,Sheldon J. Plankton,Karen Plankton,Perlita Cangrejo,Sra. Puff,Gary el Caracol
Bob Esponja Pantalones Cuadrados,0.0,0.831677,0.918198,0.836849,0.867014,0.88235,0.938282,0.884641,0.856903,0.923351
Patricio Estrella,0.831677,0.0,0.790027,0.840653,0.830567,0.971415,0.929954,0.89113,0.952511,0.894189
Calamardo Tentáculos,0.918198,0.790027,0.0,0.919676,0.83083,0.91676,0.949913,0.967253,0.968111,0.851343
Arenita Mejillas,0.836849,0.840653,0.919676,0.0,0.907361,0.967271,0.948252,0.961946,0.972006,0.900317
Eugenio H. Cangrejo,0.867014,0.830567,0.83083,0.907361,0.0,0.688275,0.891333,0.79792,0.895439,0.920153
Sheldon J. Plankton,0.88235,0.971415,0.91676,0.967271,0.688275,0.0,0.792585,0.921823,0.986654,0.968991
Karen Plankton,0.938282,0.929954,0.949913,0.948252,0.891333,0.792585,0.0,0.966145,0.984919,0.965109
Perlita Cangrejo,0.884641,0.89113,0.967253,0.961946,0.79792,0.921823,0.966145,0.0,0.873508,0.969351
Sra. Puff,0.856903,0.952511,0.968111,0.972006,0.895439,0.986654,0.984919,0.873508,0.0,0.964104
Gary el Caracol,0.923351,0.894189,0.851343,0.900317,0.920153,0.968991,0.965109,0.969351,0.964104,0.0


In [6]:
def color_red(val):
    color = 'green' if val ==True else 'red'
    return f'color: {color}'
    
t = dist_cos>0.95
t.style.applymap(color_red)

Unnamed: 0,Bob Esponja Pantalones Cuadrados,Patricio Estrella,Calamardo Tentáculos,Arenita Mejillas,Eugenio H. Cangrejo,Sheldon J. Plankton,Karen Plankton,Perlita Cangrejo,Sra. Puff,Gary el Caracol
Bob Esponja Pantalones Cuadrados,False,False,False,False,False,False,False,False,False,False
Patricio Estrella,False,False,False,False,False,True,False,False,True,False
Calamardo Tentáculos,False,False,False,False,False,False,False,True,True,False
Arenita Mejillas,False,False,False,False,False,True,False,True,True,False
Eugenio H. Cangrejo,False,False,False,False,False,False,False,False,False,False
Sheldon J. Plankton,False,True,False,True,False,False,False,False,True,True
Karen Plankton,False,False,False,False,False,False,False,True,True,True
Perlita Cangrejo,False,False,True,True,False,False,True,False,False,True
Sra. Puff,False,True,True,True,False,True,True,False,False,True
Gary el Caracol,False,False,False,False,False,True,True,True,True,False


In [7]:
# Mayor similitud entre personajes

np.fill_diagonal(dist_cos.values, 999999)
resultado = round(dist_cos.min().min(),6)
resultado

0.688275

In [8]:
sim = dist_cos.where(dist_cos==resultado).dropna(how='all').dropna(axis=1)
sim

Unnamed: 0,Bob Esponja Pantalones Cuadrados,Patricio Estrella,Calamardo Tentáculos,Arenita Mejillas,Eugenio H. Cangrejo,Sheldon J. Plankton,Karen Plankton,Perlita Cangrejo,Sra. Puff,Gary el Caracol


In [9]:
# Mayor Diferencia entre personajes

np.fill_diagonal(dist_cos.values, -999999)
round(dist_cos.max().max(),6)

0.986654

In [10]:
dif = dist_cos.where(dist_cos==round(dist_cos.max().max(),6)).dropna(how='all').dropna(axis=1)
dif

Unnamed: 0,Bob Esponja Pantalones Cuadrados,Patricio Estrella,Calamardo Tentáculos,Arenita Mejillas,Eugenio H. Cangrejo,Sheldon J. Plankton,Karen Plankton,Perlita Cangrejo,Sra. Puff,Gary el Caracol
