**Danilo Plazas Irreño - Cód.: 616202048**

In [1]:
import re
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
stopwords_sp = stopwords.words('spanish')

from nltk.stem.snowball import SnowballStemmer
spanishStemmer=SnowballStemmer("spanish")

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances

# Punto 1: Pre-Procesamiento

In [2]:
# Load the character descriptions from the CSV file.
# NOTE(review): absolute, machine-specific path; the cell only runs where this file exists.
df = pd.read_csv(
    filepath_or_buffer="C:/Users/Danilo P/Documents/KonradLorenz/NLP/Taller5/bob_esponja.csv"
)
df

Unnamed: 0,Personaje,Descripción
0,Bob Esponja Pantalones Cuadrados,Bob Esponja es una esponja de mar con forma re...
1,Patricio Estrella,"Es una estrella de mar de color rosa, el mejor..."
2,Calamardo Tentáculos,Calamardo Tentáculos es un personaje principal...
3,Arenita Mejillas,Arenita Mejillas (Sandy Cheeks en inglés) en u...
4,Eugenio H. Cangrejo,Don Eugenio H. Cangrejo (normalmente llamado S...
5,Sheldon J. Plankton,"Sheldon J. Plankton, o simplemente Plankton, e..."
6,Karen Plankton,Karen Plankton es uno de los dos principales a...
7,Perlita Cangrejo,"Perla ""Perlita"" Cangrejo es un personaje princ..."
8,Sra. Puff,La Señora Puff es un personaje principal de Bo...
9,Gary el Caracol,Gary el Caracol es un personaje principal de B...


In [3]:
def pre_procesado(texto):
    """Normalize a Spanish text into a space-separated string of stems.

    Steps: lowercase the text, replace every run of non-word characters,
    digits and underscores with a single space, split on whitespace, drop
    the words found in ``stopwords_sp`` and stem the remaining ones with
    ``spanishStemmer``.

    Parameters
    ----------
    texto : str
        Raw text. Any non-string value (for example ``None`` or a float
        ``NaN`` standing in for a missing description) is treated as
        empty text instead of raising ``AttributeError``.

    Returns
    -------
    str
        The stems joined by single spaces; ``""`` when nothing is left.
    """
    # Non-string values have no .lower(); treat them as "no text".
    if not isinstance(texto, str):
        return ""

    texto = re.sub(r"[\W\d_]+", " ", texto.lower())
    palabras = texto.split()  # tokenization
    # Stop words are removed before stemming, so the comparison is made
    # against the original (unstemmed) word.
    raices = [
        spanishStemmer.stem(palabra)
        for palabra in palabras
        if palabra not in stopwords_sp
    ]
    return " ".join(raices)

In [4]:
# New column: each description normalized by pre_procesado.
df['pre_procesado'] = df['Descripción'].apply(pre_procesado)
df

Unnamed: 0,Personaje,Descripción,pre_procesado
0,Bob Esponja Pantalones Cuadrados,Bob Esponja es una esponja de mar con forma re...,bob esponj esponj mar form rectangul color ama...
1,Patricio Estrella,"Es una estrella de mar de color rosa, el mejor...",estrell mar color ros mejor amig bob esponj ju...
2,Calamardo Tentáculos,Calamardo Tentáculos es un personaje principal...,calamard tentacul personaj principal bob espon...
3,Arenita Mejillas,Arenita Mejillas (Sandy Cheeks en inglés) en u...,arenit mejill sandy cheeks ingles personaj pri...
4,Eugenio H. Cangrejo,Don Eugenio H. Cangrejo (normalmente llamado S...,don eugeni h cangrej normal llam señor cangrej...
5,Sheldon J. Plankton,"Sheldon J. Plankton, o simplemente Plankton, e...",sheldon j plankton simplement plankton dos ant...
6,Karen Plankton,Karen Plankton es uno de los dos principales a...,kar plankton dos principal antagon bob esponj ...
7,Perlita Cangrejo,"Perla ""Perlita"" Cangrejo es un personaje princ...",perl perlit cangrej personaj principal bob esp...
8,Sra. Puff,La Señora Puff es un personaje principal de Bo...,señor puff personaj principal bob esponj maest...
9,Gary el Caracol,Gary el Caracol es un personaje principal de B...,gary caracol personaj principal bob esponj que...


# Punto 2: TF-IDF

In [5]:
# TF-IDF matrix: fit the vectorizer on the pre-processed descriptions.
tfidf_vec = TfidfVectorizer()
tfidf = tfidf_vec.fit_transform(df['pre_procesado'].values)

tfidf_matrix = pd.DataFrame(tfidf.toarray())
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
tfidf_matrix.columns = tfidf_vec.get_feature_names_out()
tfidf_matrix.index = df['Personaje']

# Transpose so that rows are terms and columns are characters.
# The 3-decimal rounding is stored in tfidf_matrix itself, so any later
# computation that reads tfidf_matrix works with the rounded weights.
tfidf_matrix = tfidf_matrix.T.round(3)

tfidf_matrix

Personaje,Bob Esponja Pantalones Cuadrados,Patricio Estrella,Calamardo Tentáculos,Arenita Mejillas,Eugenio H. Cangrejo,Sheldon J. Plankton,Karen Plankton,Perlita Cangrejo,Sra. Puff,Gary el Caracol
abaj,0.057,0.00,0.000,0.000,0.000,0.0,0.000,0.0,0.000,0.000
aborrec,0.000,0.00,0.091,0.000,0.000,0.0,0.000,0.0,0.000,0.000
acab,0.000,0.00,0.091,0.000,0.000,0.0,0.000,0.0,0.000,0.000
accident,0.049,0.00,0.000,0.000,0.045,0.0,0.000,0.0,0.000,0.000
acept,0.000,0.00,0.000,0.000,0.000,0.0,0.064,0.0,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...
viv,0.000,0.05,0.060,0.108,0.070,0.0,0.000,0.0,0.000,0.000
vol,0.057,0.00,0.000,0.000,0.000,0.0,0.000,0.0,0.000,0.000
voz,0.195,0.00,0.000,0.139,0.000,0.0,0.000,0.0,0.000,0.000
vuelv,0.172,0.00,0.000,0.000,0.000,0.0,0.000,0.0,0.000,0.000


# Punto 3: Distancia del coseno

In [6]:
# Pairwise cosine distance between every pair of characters, labelled on
# both axes with the character names (the columns of tfidf_matrix).
dist_cos = pd.DataFrame(
    data=cosine_distances(tfidf_matrix.T.to_numpy()),
    index=tfidf_matrix.columns,
    columns=tfidf_matrix.columns,
)
dist_cos

Personaje,Bob Esponja Pantalones Cuadrados,Patricio Estrella,Calamardo Tentáculos,Arenita Mejillas,Eugenio H. Cangrejo,Sheldon J. Plankton,Karen Plankton,Perlita Cangrejo,Sra. Puff,Gary el Caracol
Personaje,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Bob Esponja Pantalones Cuadrados,0.0,0.826052,0.918644,0.838317,0.868204,0.872076,0.938609,0.88581,0.858077,0.924158
Patricio Estrella,0.826052,0.0,0.79053,0.841122,0.831634,0.966449,0.930045,0.891502,0.952583,0.89479
Calamardo Tentáculos,0.918644,0.79053,0.0,0.919857,0.831344,0.917606,0.949913,0.967265,0.968111,0.851644
Arenita Mejillas,0.838317,0.841122,0.919857,0.0,0.908202,0.967774,0.948323,0.962015,0.97212,0.900641
Eugenio H. Cangrejo,0.868204,0.831634,0.831344,0.908202,0.0,0.68738,0.891897,0.796723,0.89608,0.920664
Sheldon J. Plankton,0.872076,0.966449,0.917606,0.967774,0.68738,0.0,0.794981,0.922577,0.986733,0.969239
Karen Plankton,0.938609,0.930045,0.949913,0.948323,0.891897,0.794981,0.0,0.966088,0.984919,0.965252
Perlita Cangrejo,0.88581,0.891502,0.967265,0.962015,0.796723,0.922577,0.966088,0.0,0.873832,0.969504
Sra. Puff,0.858077,0.952583,0.968111,0.97212,0.89608,0.986733,0.984919,0.873832,0.0,0.964177
Gary el Caracol,0.924158,0.89479,0.851644,0.900641,0.920664,0.969239,0.965252,0.969504,0.964177,0.0


In [7]:
# Same pairwise distances kept as a plain NumPy array; its overall maximum
# is the largest distance between any two characters.
dist_cos2 = cosine_distances(tfidf_matrix.T.to_numpy())
dist_cos2.max()

0.986733085245766

In [8]:
# Largest cosine distance found in each character's row.
difer = dist_cos.max(axis=1)
df2 = pd.DataFrame({'Personaje':difer.index, 'DistanciaCoseno':difer.values})
# Take the maximum of the distance column only. Reducing the whole frame
# and picking element [1] also reduces the string column and depends on
# positional indexing of a label-indexed Series, which pandas has deprecated.
df_difer = df2[df2["DistanciaCoseno"] == df2["DistanciaCoseno"].max()]
df_difer

Unnamed: 0,Personaje,DistanciaCoseno
5,Sheldon J. Plankton,0.986733
8,Sra. Puff,0.986733


In [9]:
# Which characters are the most different? (names held in df_difer)
print(f'Los personajes mas diferentes son: {df_difer["Personaje"].values}' )

Los personajes mas diferentes son: ['Sheldon J. Plankton' 'Sra. Puff']


In [10]:
# Smallest distance between two *different* characters. The diagonal (each
# character against itself) is masked out by position rather than with a
# magic threshold, so a genuine near-zero distance between two distinct
# characters is not discarded.
dist_cos2[~np.eye(len(dist_cos2), dtype=bool)].min()

0.687380491002058

In [11]:
# Smallest distance in each character's row, ignoring the diagonal (the
# character against itself). The diagonal is masked by position instead of
# with a magic threshold, so a genuine near-zero distance between two
# distinct characters is kept.
parecidos = dist_cos.where(~np.eye(len(dist_cos), dtype=bool)).min(axis=1)
df3 = pd.DataFrame({'Personaje':parecidos.index, 'DistanciaCoseno':parecidos.values})
# Take the minimum of the distance column only. Reducing the whole frame
# and picking element [1] also reduces the string column and depends on
# positional indexing of a label-indexed Series, which pandas has deprecated.
df_parecidos = df3[df3["DistanciaCoseno"] == df3["DistanciaCoseno"].min()]
df_parecidos

Unnamed: 0,Personaje,DistanciaCoseno
4,Eugenio H. Cangrejo,0.68738
5,Sheldon J. Plankton,0.68738


In [12]:
# Which characters are the most similar? (names held in df_parecidos)
print(f'Los personajes mas parecidos son: {df_parecidos["Personaje"].values}' )

Los personajes mas parecidos son: ['Eugenio H. Cangrejo' 'Sheldon J. Plankton']
