In [1]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from time import time
import pandas as pd
import re
from unidecode import unidecode
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import joblib


# Descargar stopwords
nltk.download('stopwords')

# Lista de stopwords en español
stop_words = set(stopwords.words('Portuguese'))

class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, stop_words):
        self.stop_words = stop_words
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X.apply(self._preprocess_text)
    
    def _preprocess_text(self, text):
        # Convertir a minúsculas
        text = text.lower()
        # Eliminar acentos
        text = unidecode(text)
        # Eliminar caracteres especiales
        text = re.sub(r'[^\w\s]', '', text)
        # Eliminar stopwords
        text = ' '.join([word for word in text.split() if word not in self.stop_words])
        return text
    
def calculate_similarity_pairs(df_dev, cosine_sim_sparse, similarity_threshold=0.8):
    start_time = time()  # Start time measurement
        
    # Filtrar similitudes mayores que el threshol
    rows, cols = cosine_sim_sparse.nonzero()
    filtered_pairs = [(i, j, cosine_sim_sparse[i, j]) for i, j in zip(rows, cols) if i < j and cosine_sim_sparse[i, j] > similarity_threshold]

    pairs = [{
        'ITE_ITEM_TITLE_1': df_dev['ITE_ITEM_TITLE'].iloc[i],
        'ITE_ITEM_TITLE_2': df_dev['ITE_ITEM_TITLE'].iloc[j],
        'Score Similitud (0,1)': similarity
    } for i, j, similarity in filtered_pairs]

    # Ordenar los pares por la similitud en orden descendente
    pairs = sorted(pairs, key=lambda x: x['Score Similitud (0,1)'], reverse=True)

    end_time = time()
    time_taken = end_time - start_time
    return pd.DataFrame(pairs), time_taken

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\longo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df_dev = pd.read_csv('archivos_ml/items_titles.csv')
df_test = pd.read_csv('archivos_ml/items_titles_test.csv')

print(df_dev.shape, df_test.shape)
print(df_dev.head(1))
print(df_test.head(1))

(30000, 1) (10000, 1)
                                      ITE_ITEM_TITLE
0  Tênis Ascension Posh Masculino - Preto E Verme...
                                     ITE_ITEM_TITLE
0  Tênis Olympikus Esporte Valente - Masculino Kids


# Preprosamiento

In [3]:
pipeline = Pipeline([
    ('preprocessor', TextPreprocessor(stop_words)),  ### - limpieza
    ('tfidf', TfidfVectorizer())  ### - vectorizacion
])


In [4]:
pipeline.fit(df_dev['ITE_ITEM_TITLE'])

# Calculo de similitud

In [5]:
tfidf_matrix = pipeline.transform(df_dev['ITE_ITEM_TITLE'])
cosine_sim_sparse = cosine_similarity(tfidf_matrix, dense_output=False)

In [6]:
pairs = []
n_items = df_dev.shape[0]
#for i in range(n_items):
for i in [0,1]:
    for j in range(i + 1, n_items):
        similarity = cosine_sim_sparse[i, j]
        if similarity > 0:  # Ajusta este umbral según sea necesario
            pairs.append({
                'ITE_ITEM_TITLE_1': df_dev['ITE_ITEM_TITLE'].iloc[i],
                'ITE_ITEM_TITLE_2': df_dev['ITE_ITEM_TITLE'].iloc[j],
                'Score Similitud (0,1)': similarity
            })
            
# Ordena por puntaje de similitud
pairs = sorted(pairs, key=lambda x: x['Score Similitud (0,1)'], reverse=True)
similarities_df = pd.DataFrame(pairs)

In [7]:
similarities_df.head()

Unnamed: 0,ITE_ITEM_TITLE_1,ITE_ITEM_TITLE_2,"Score Similitud (0,1)"
0,Tenis Para Caminhada Super Levinho Spider Corr...,Tênis Masculino Caminhada Levinho Spider Acade...,0.757001
1,Tenis Para Caminhada Super Levinho Spider Corr...,Sapatênis Tênis Masculino Spider Casual Levinh...,0.685372
2,Tenis Para Caminhada Super Levinho Spider Corr...,Tênis Feminino Super Leve Spider Caminhada Aca...,0.625899
3,Tenis Para Caminhada Super Levinho Spider Corr...,Tênis Feminino Esportivo Levinho Academia Cami...,0.604302
4,Tenis Para Caminhada Super Levinho Spider Corr...,Tênis Feminino Caminhada Academia Spider Super...,0.592381


# 3. Guardado Pipeline

In [8]:
joblib.dump(pipeline, 'title_processing_pipeline.pkl')

['title_processing_pipeline.pkl']

# Cargar el pipeline desde el archivo .pkl

In [9]:
loaded_pipeline = joblib.load('title_processing_pipeline.pkl')

transformed_data = loaded_pipeline.transform(df_test['ITE_ITEM_TITLE'])

cosine_sim_sparse = cosine_similarity(transformed_data, dense_output=False)

In [10]:
similarities_df, time_taken = calculate_similarity_pairs(df_test, cosine_sim_sparse)

print(f"Time taken to calculate pairs: {time_taken:.4f} seconds")

Time taken to calculate pairs: 2645.1486 seconds


In [11]:
similarities_df.tail()

Unnamed: 0,ITE_ITEM_TITLE_1,ITE_ITEM_TITLE_2,"Score Similitud (0,1)"
1703,Tênis Usthemp Volare Temático - Armani Vira-lata,Tênis Usthemp Long Temático - Armani Vira-lata 2,0.800165
1704,Molekinho 2136.130 Tênis Iate Street Infantil ...,Tênis Infantil Masculino Molekinho Iate Napa C...,0.800145
1705,Tênis Sapatênis Masculino Casual Velluti Origi...,Tênis Sapatênis Masculino Velluti Original Com...,0.800087
1706,Tênis Masculino Olympikus Index / 767,Tênis Masculino Olympikus Index 767 Azul/preto,0.800064
1707,Tênis Rainha,"Tênis Rainha Stone - Feminino, Rosa/roxo - Rai...",0.800001


In [15]:
similarities_df.shape

(1708, 3)

# Analisis Tiempo de Ejecucion y Escalabilidad

In [19]:
print(f"minutos en procesar {2645/60}")
## solucion poco escalable, cada producto adicional genera una matriz de comparacion aun mayor, quiza se podria hacer una 
## clasificacion previa con un Kmeans o una reduccion de escalabilidad

minutos en procesar 44.083333333333336
