In [1]:
import pandas as pd

#Apertura del dataset
df = pd.read_json('../datasets/shelflife.json')
df.head(5)

Unnamed: 0,barcode,brand,product_name,category_off,shelf_life_pantry_days,shelf_life_fridge_days,shelf_life_freezer_days,source_rule,country,confidence,generated_on
0,7501234567897,Jumex,Néctar de durazno 1 L,juice-box,270.0,10.0,60.0,Heuristic: Juice boxed unopened ~6–9 m; opened...,Mexico,0.55,2025-10-08
1,7507890123451,Dolores,Sardinas en salsa 155 g,canned-fish,1095.0,,,FoodKeeper: Canned fish unopened ~2–5 y,Mexico,0.8,2025-10-08
2,7505678901233,La Villita,Queso gouda 300 g,cheese-hard,,56.0,180.0,FoodKeeper: Hard cheeses ~3–6 w; freezer up to...,Mexico,0.75,2025-10-08
3,7500123456788,La Sierra,Frijol pinto 1 kg,beans-dry,730.0,,,FoodKeeper: Dry beans 1–2 y,Mexico,0.85,2025-10-08
4,7506789012342,San Juan,Huevo blanco 12 pzas,eggs,,28.0,,FoodKeeper: Eggs shell (fridge ~3–5 w),Mexico,0.9,2025-10-08


In [2]:
#Preparando el dataset para hacer la clasificacion 
df_clasificado = df.copy()
columnas_usar = ['product_name', 'category_off', 'shelf_life_pantry_days', 'shelf_life_fridge_days', 'shelf_life_freezer_days']
df_clasificado = df_clasificado[columnas_usar]

#Rellenar los valores null con ceros
columnas_numericas = ['shelf_life_pantry_days', 'shelf_life_fridge_days', 'shelf_life_freezer_days']
df_clasificado[columnas_usar] = df_clasificado[columnas_usar].fillna(0)

df_clasificado.head(15)

Unnamed: 0,product_name,category_off,shelf_life_pantry_days,shelf_life_fridge_days,shelf_life_freezer_days
0,Néctar de durazno 1 L,juice-box,270.0,10.0,60.0
1,Sardinas en salsa 155 g,canned-fish,1095.0,0.0,0.0
2,Queso gouda 300 g,cheese-hard,0.0,56.0,180.0
3,Frijol pinto 1 kg,beans-dry,730.0,0.0,0.0
4,Huevo blanco 12 pzas,eggs,0.0,28.0,0.0
5,Azúcar blanca 1 kg,sugar-white,1825.0,0.0,0.0
6,Frijoles refritos 430 g,canned-beans,1095.0,3.0,60.0
7,Sal fina 750 g,salt,3650.0,0.0,0.0
8,Tortillas de maíz 1 kg,tortillas-corn,3.0,7.0,60.0
9,Aguacate Hass,avocado,5.0,7.0,120.0


In [3]:
# Clasificacion en perecedero = 1 o no_perecedero = 0

def clasificar_perecedero(fila):
    categorias_no_perecederas = [
        'juice-box', 'canned-fish', 'beans-dry', 'sugar-white', 'salt',
        'soft-drinks', 'tea-bags', 'oil-vegetable', 'chips', 'rice-white-dry',
        'pasta-dry', 'canned-tomato', 'canned-meat', 'cookies', 'cereal-box',
        'coffee-ground', 'milk-uht', 'bottled-water', 'salsa-jarred',
        'toothpaste', 'shampoo', 'household-detergent'
    ]

    # Si pertenece a una categoría no perecedera 0
    if fila['category_off'] in categorias_no_perecederas:
        return 0
    # Si requiere de estar en un ambiente frio es 1
    elif (fila['shelf_life_fridge_days'] > 0 or fila['shelf_life_freezer_days'] > 0):
        return 1  # Perecedero (Se beneficia del frío)

    # Condición 2: El producto no va en frío pero su vida útil en despensa es muy corta
    elif fila['shelf_life_pantry_days'] < 20:
        return 1  # Perecedero (Vida útil corta en general)

    # De lo contrario, es no perecedero
    else:
        return 0

# Aplicar la clasificación
df_clasificado['perecedero'] = df.apply(clasificar_perecedero, axis=1)


df_clasificado.tail(100)

Unnamed: 0,product_name,category_off,shelf_life_pantry_days,shelf_life_fridge_days,shelf_life_freezer_days,perecedero
100,Leche semidescremada UHT 1L,milk-uht,90.0,0.0,0.0,0
101,Pasta dental 75 ml,toothpaste,0.0,0.0,0.0,0
102,Salsa verde 210 g,salsa-jarred,540.0,30.0,120.0,0
103,Queso gouda 300 g,cheese-hard,0.0,56.0,180.0,1
104,Yoghurt griego 150 g,yogurt,0.0,14.0,0.0,1
...,...,...,...,...,...,...
195,Azúcar estándar 2 kg,sugar-white,1825.0,0.0,0.0,0
196,Pasta fusilli 500 g,pasta-dry,730.0,0.0,0.0,0
197,Totopos 200 g,chips,150.0,0.0,0.0,0
198,Galletas de chocolate 110 g,cookies,365.0,0.0,0.0,0


In [4]:
df_clasificado.to_csv('../datasets/productos_clasificados.csv')

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

X = df_clasificado[['category_off', 'shelf_life_pantry_days', 'shelf_life_fridge_days', 'shelf_life_freezer_days']]
y = df_clasificado['perecedero']

# Conversion de categorias a numeros para que el modelo pueda ejecutar el algoritmo
encoder = LabelEncoder()
X['category_off'] = encoder.fit_transform(X['category_off'])

# Separar los datos para entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Entrenar modelo
modelo = RandomForestClassifier(n_estimators=500, random_state=42)
modelo.fit(X_train, y_train)

# Evaluar modelo
score = modelo.score(X_test, y_test)
print(f"Precisión: {score:.2f}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['category_off'] = encoder.fit_transform(X['category_off'])


Precisión: 1.00


In [6]:
#Matriz de confusion
from sklearn.metrics import confusion_matrix, classification_report

y_pred = modelo.predict(X_test)

#Generacion de la matriz
matriz_confusion = confusion_matrix(y_test, y_pred)
print('Matriz de confusin :')
print(matriz_confusion)

reporte = classification_report(y_test, y_pred)
print('Reporte de clasificacion: ')
print(reporte)

Matriz de confusin :
[[23  0]
 [ 0 27]]
Reporte de clasificacion: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        23
           1       1.00      1.00      1.00        27

    accuracy                           1.00        50
   macro avg       1.00      1.00      1.00        50
weighted avg       1.00      1.00      1.00        50



In [7]:
#Haciendo pruebas del modelo con datos nnunca vistos (completamnete nuevos)

# Los nuevos productos
productos_nuevos = [
    ["Plátano tabasco 1 kg", "fruit-fresh", 5, 0, 0],
    ["Tomate saladet 1 kg", "vegetable-fresh", 4, 0, 0],
    ["Limón agrio 1 kg", "fruit-fresh", 7, 0, 0],
    ["Aguacate hass 1 kg", "fruit-fresh", 3, 0, 0],
    ["Pan dulce 1 pza", "bakery-fresh", 3, 0, 0],
    ["Tortillas de maíz 1 kg", "tortilla", 5, 10, 30],
    ["Jamón de pavo 250 g", "meat-processed", 0, 10, 60],
    ["Yogur de fresa 1L", "yogurt", 0, 15, 0],
    ["Queso panela 400 g", "cheese-soft", 0, 14, 90],
    ["Refresco de cola 600 ml", "soda", 365, 0, 0],
    ["Aceite vegetal 1L", "oil-vegetable", 730, 0, 0],
    ["Café molido 250 g", "coffee-ground", 540, 0, 0],
    ["Sal yodada 1 kg", "salt", 1825, 0, 0],
    ["Harina de trigo 1 kg", "flour", 365, 0, 0],
    ["Frijol negro 1 kg", "beans-dry", 365, 0, 0],
    ["Pollo entero crudo 1 kg", "meat-raw", 0, 5, 180],
    ["Pescado mojarra 1 kg", "fish-fresh", 0, 3, 150],
    ["Atún enlatado 140 g", "canned-fish", 720, 0, 0],
    ["Cereal de maíz 500 g", "cereal", 365, 0, 0],
    ["Mantequilla 200 g", "butter", 0, 90, 180],
]

df_nuevos = pd.DataFrame(productos_nuevos, columns=[
    "product_name", "category_off", "shelf_life_pantry_days",
    "shelf_life_fridge_days", "shelf_life_freezer_days"
])

In [8]:
X_nuevos = df_nuevos[['category_off', 'shelf_life_pantry_days', 'shelf_life_fridge_days', 'shelf_life_freezer_days']]

X_nuevos['category_off'] = encoder.fit_transform(X_nuevos['category_off'])

predicciones = modelo.predict(X_nuevos)
df_nuevos['perecedero_prediccion'] = predicciones

df_nuevos.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_nuevos['category_off'] = encoder.fit_transform(X_nuevos['category_off'])


Unnamed: 0,product_name,category_off,shelf_life_pantry_days,shelf_life_fridge_days,shelf_life_freezer_days,perecedero_prediccion
0,Plátano tabasco 1 kg,fruit-fresh,5,0,0,0
1,Tomate saladet 1 kg,vegetable-fresh,4,0,0,0
2,Limón agrio 1 kg,fruit-fresh,7,0,0,0
3,Aguacate hass 1 kg,fruit-fresh,3,0,0,0
4,Pan dulce 1 pza,bakery-fresh,3,0,0,0
5,Tortillas de maíz 1 kg,tortilla,5,10,30,1
6,Jamón de pavo 250 g,meat-processed,0,10,60,1
7,Yogur de fresa 1L,yogurt,0,15,0,1
8,Queso panela 400 g,cheese-soft,0,14,90,1
9,Refresco de cola 600 ml,soda,365,0,0,0


In [17]:
# Agregar las etiquetas reales
etiquetas_reales = [1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,1,1,0,0,1]
df_nuevos['perecedero_real'] = etiquetas_reales

# Comparar
df_nuevos['acierto'] = df_nuevos['perecedero_prediccion'] == df_nuevos['perecedero_real']

# Mostrar resultados
print(df_nuevos[['product_name', 'category_off', 'perecedero_real', 'perecedero_prediccion', 'acierto']])

# Calcular precisión con productos nuevos
precision_nuevos = df_nuevos['acierto'].mean()
print(f"Precisión con productos nuevos: {precision_nuevos:.2f}")


               product_name     category_off  perecedero_real  \
0      Plátano tabasco 1 kg      fruit-fresh                1   
1       Tomate saladet 1 kg  vegetable-fresh                1   
2          Limón agrio 1 kg      fruit-fresh                1   
3        Aguacate hass 1 kg      fruit-fresh                1   
4           Pan dulce 1 pza     bakery-fresh                1   
5    Tortillas de maíz 1 kg         tortilla                1   
6       Jamón de pavo 250 g   meat-processed                1   
7         Yogur de fresa 1L           yogurt                1   
8        Queso panela 400 g      cheese-soft                1   
9   Refresco de cola 600 ml             soda                0   
10        Aceite vegetal 1L    oil-vegetable                0   
11        Café molido 250 g    coffee-ground                0   
12          Sal yodada 1 kg             salt                0   
13     Harina de trigo 1 kg            flour                0   
14        Frijol negro 1 