In [12]:
from sentence_transformers import SentenceTransformer
import os
import numpy as np
from sklearn.preprocessing import Normalizer
import json
import pandas as pd
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
from tqdm import tqdm

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\josea\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#Cargamos el modelo de hiiamsid; 
model = SentenceTransformer("./modelo")

In [15]:
#Cargamos los archivos con los datos
with open("./DATA/tickets.json", 'r') as ticket_file:
    tickets = json.load(ticket_file)

with open("./DATA/articles.json", 'r') as article_file:
    articles = json.load(article_file)


 <center><h3>Preprocesamiento de texto</h3></center>

(1) <s> Eliminación de signos de puntuación </s>

(2) <s> Eliminación de stopwords </s> 

(3) <s> Eliminación URLs </s>

(4) <s> Eliminación correos </s>

(5) <s> Eliminación archivos .png </s>

(6)  Corrección ortográfica 

In [14]:
def elimina_puntuacion(cadena):
    return " ".join("".join(["" if p in string.punctuation else p for p in cadena]).split())

def elimina_stopwords(cadena):
    lista_stop = stopwords.words('spanish')
    return ' '.join([word for word in cadena.split() if word not in lista_stop])

def elimina_urls(cadena):
    return re.compile(r'https?://\S+|www\.\S+').sub('',cadena)

def elimina_correos(cadena):
    return re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b').sub('',cadena)

def elimina_imagenes(cadena):
    return re.compile(r'\b[A-Za-z0-9]+\.+png\b').sub('',cadena)


In [17]:
#El orden que hay que seguir para aplicarselo al texto es: 
# 1º ---> eliminar correos / urls / eliminar imagenes >> eliminar puntuacion >> elimina stopword
for a in tqdm(articles):
    a['body'] = elimina_stopwords(elimina_puntuacion(elimina_imagenes(elimina_urls(elimina_correos(a['body'])))))

100%|██████████| 310/310 [00:00<00:00, 723.27it/s]


In [19]:
#Calculamos los embeddings y comparamos, para la prueba solo los articulos
articles_emb = []
normalizer = Normalizer(norm='l2')

for article in articles:
    article['emb_body'] = normalizer.transform(model.encode([article['body']])).tolist()
    articles_emb.append(article)

In [20]:
def calculaDistancia(pregunta, limit):
    df = pd.DataFrame(columns = ['id','title', 'emb'])
    print(pregunta)
    for a in articles_emb:
        dic = {}
        dic['id'] = a['id']
        dic['title'] = a['title']
        dic['emb'] = a['emb_body']
        #df = df.append(dic, ignore_index = True)
        df = pd.concat([df, pd.DataFrame(dic, index=[0])], ignore_index=True)

    #convertimos la pregunta a embedding:
    emb_preg = normalizer.transform(model.encode([pregunta]))
    df['distancia'] = df['emb'].apply(lambda x: np.array(x) @ emb_preg[0])
    df = df.sort_values('distancia', ascending = False)
    return df.head(limit)
    

In [21]:
calculaDistancia('Cómo fidelizar a tus compradores de Hot Sale', 10)

Cómo fidelizar a tus compradores de Hot Sale


Unnamed: 0,id,title,emb,distancia
0,8764508351644,Workflows Hot Sale 2023,"[0.03203045576810837, -0.04343095421791077, 0....",0.468583
73,6508957523730,Pack newsletters,"[0.027412476018071175, -0.020317386835813522, ...",0.422809
5,7752237540508,[LBD] Curso: arranca tus primeras estrategias ...,"[-0.001850444939918816, -0.02464452013373375, ...",0.404683
306,360014428839,Lead scoring: asignar puntos a un contacto,"[0.03397448733448982, 0.01221911795437336, 0.0...",0.394848
222,7869332050588,[LBD] Reactiva a tus compradores dormidos,"[0.019675597548484802, -0.022051917389035225, ...",0.381216
8,7087098296604,Lead Nurturing Preventa,"[0.014185058884322643, -0.014065049588680267, ...",0.377712
28,6054245431954,Nuevos workflows del mes,"[0.04314965754747391, -0.0017913426272571087, ...",0.376813
7,7116448367260,Lead Nurturing Postventa,"[0.013755285181105137, -0.027920132502913475, ...",0.373746
26,6454162374034,Campaña de productos recomendados según RFM,"[0.019641388207674026, -0.023620642721652985, ...",0.356227
138,360018401679,Integrar formulario de suscripción a newslette...,"[-0.00726541830226779, 0.036456137895584106, 0...",0.347204
