In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans


In [3]:
# Read the raw Olist product catalogue from disk.
df = pd.read_csv('datos/olist_products_dataset.csv')

# Work on an independent copy so the frame as loaded stays untouched.
df_copia = df.copy(deep=True)

In [6]:
# Preview the first rows of the working copy to check that the columns loaded as expected.
df_copia.head()

Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40.0,287.0,1.0,225.0,16.0,10.0,14.0
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,44.0,276.0,1.0,1000.0,30.0,18.0,20.0
2,96bd76ec8810374ed1b65e291975717f,esporte_lazer,46.0,250.0,1.0,154.0,18.0,9.0,15.0
3,cef67bcfe19066a932b7673e239eb23d,bebes,27.0,261.0,1.0,371.0,26.0,4.0,26.0
4,9dc1a7de274444849c219cff195d0b71,utilidades_domesticas,37.0,402.0,4.0,625.0,20.0,17.0,13.0


In [4]:
# Summarize column dtypes and non-null counts of the working copy.
df_copia.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32951 entries, 0 to 32950
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   product_id                  32951 non-null  object 
 1   product_category_name       32341 non-null  object 
 2   product_name_lenght         32341 non-null  float64
 3   product_description_lenght  32341 non-null  float64
 4   product_photos_qty          32341 non-null  float64
 5   product_weight_g            32949 non-null  float64
 6   product_length_cm           32949 non-null  float64
 7   product_height_cm           32949 non-null  float64
 8   product_width_cm            32949 non-null  float64
dtypes: float64(7), object(2)
memory usage: 2.3+ MB


In [5]:
# Count the missing values in each column.
df_copia.isna().sum()


product_id                      0
product_category_name         610
product_name_lenght           610
product_description_lenght    610
product_photos_qty            610
product_weight_g                2
product_length_cm               2
product_height_cm               2
product_width_cm                2
dtype: int64

In [7]:
# Impute missing values.
# The category is text, so it gets a placeholder label instead of a mean.
df_copia['product_category_name'] = df_copia['product_category_name'].fillna('Desconocido')

# Every numeric feature is filled with the mean of its own column.
columnas_numericas = [
    'product_name_lenght',
    'product_description_lenght',
    'product_photos_qty',
    'product_weight_g',
    'product_length_cm',
    'product_height_cm',
    'product_width_cm',
]
for columna in columnas_numericas:
    # The mean is computed before filling, so it only reflects observed values.
    df_copia[columna] = df_copia[columna].fillna(df_copia[columna].mean())

In [8]:
# Map every product category name to an integer code.
# NOTE(review): integer codes imply an ordering/distance between categories;
# a distance-based model fed with this column will treat close codes as
# similar categories — confirm this is intended.
encoder = LabelEncoder()
codigos_categoria = encoder.fit_transform(df_copia['product_category_name'])
df_copia['categoria_codificada'] = codigos_categoria

# Standardize the product weight (the only numeric feature scaled here).
scaler = StandardScaler()
peso_original = df_copia[['product_weight_g']]
df_copia['peso_scaled'] = scaler.fit_transform(peso_original)

In [11]:
# Number of groups requested from k-means (named so the value is stated once
# and the comments cannot drift away from the code).
N_CLUSTERS = 75

# Features used for clustering: the encoded category and the scaled weight.
X = df_copia[['categoria_codificada', 'peso_scaled']]

# Fit k-means and store the cluster label assigned to every product.
# random_state is fixed so repeated runs start from the same initialization.
# NOTE(review): n_init is left at the library default, which differs between
# scikit-learn releases — pin it explicitly if results must match across versions.
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42)
df_copia['cluster'] = kmeans.fit_predict(X)

In [12]:
# Choose the query product and look up the cluster it was assigned to.
# NOTE(review): `producto_id` is used as a row label of df_copia (through .loc),
# not as a value of the 'product_id' column.
producto_id = 1
cluster_producto = df_copia.loc[producto_id, 'cluster']

# Every row that shares that cluster is reported as a recommendation; the
# query row matches the mask as well, so it is part of the result.
mismo_cluster = df_copia['cluster'] == cluster_producto
productos_recomendados = df_copia[mismo_cluster]
print(productos_recomendados[['product_id', 'product_category_name']])

                             product_id product_category_name
1      3aa071139cb16b67ca9e5dea641aaa2f                 artes
129    cbaff83e2c00a2b731d27276bbf52082                 artes
233    aedb7e30007f6051c5b3f97156b0848a                 artes
593    32f186a3f6239888c37adf90db857098     alimentos_bebidas
737    33202a8e7a645388c41ed714203d7131             alimentos
...                                 ...                   ...
32149  cba233cdf732bfe917cc13a00836a969             alimentos
32278  774e2ab4466ae169c99921fb8c12a390             alimentos
32810  3eb3d83d082ce242d4eae1f57e62640d     alimentos_bebidas
32841  092be1e8336fc404c57bd5970d056886             alimentos
32932  0bf1dea484fbb8cdfa09e2767ce30574             alimentos

[240 rows x 2 columns]
