https://www.kaggle.com/datasets/acostasg/random-shopping-cart

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import networkx as nx

In [4]:
# Location of the raw shopping-cart CSV, relative to the notebook.
ruta_data = './DataSetAsociacion/dataset_group.csv'

# Column titles to assign: registration date, purchase id, product name.
column_names = [
    'Fecha_Registro',
    'ID_Compra',
    'Producto',
]

# Because `names=` is supplied, pandas treats the first line of the file as
# data rather than as a header; `header=None` only makes that explicit.
df = pd.read_csv(
    ruta_data,
    header=None,
    names=column_names,
)
df

Unnamed: 0,Fecha_Registro,ID_Compra,Producto
0,2000-01-01,1,yogurt
1,2000-01-01,1,pork
2,2000-01-01,1,sandwich bags
3,2000-01-01,1,lunch meat
4,2000-01-01,1,all- purpose
...,...,...,...
22338,2002-02-26,1139,soda
22339,2002-02-26,1139,laundry detergent
22340,2002-02-26,1139,vegetables
22341,2002-02-26,1139,shampoo


In [5]:
# Summarise the raw frame: index range, column names, non-null counts,
# dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22343 entries, 0 to 22342
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Fecha_Registro  22343 non-null  object
 1   ID_Compra       22343 non-null  int64 
 2   Producto        22343 non-null  object
dtypes: int64(1), object(2)
memory usage: 523.8+ KB


In [6]:
# Build one basket per purchase: every product sharing an ID_Compra is
# gathered into a single list, giving a list of lists.
products_by_purchase = df.groupby('ID_Compra')['Producto'].apply(list)
transactions = products_by_purchase.tolist()

# Learn the set of distinct products, then encode each basket as a row of
# booleans (one column per product).
te = TransactionEncoder()
te.fit(transactions)
te_ary = te.transform(transactions)

# Wrap the boolean matrix in a DataFrame labelled with the product names.
df_encoded = pd.DataFrame(data=te_ary, columns=te.columns_)

# Show the encoded frame.
print(df_encoded)

      all- purpose  aluminum foil  bagels   beef  butter  cereals  cheeses  \
0             True           True   False   True    True    False    False   
1            False           True   False  False   False     True     True   
2            False          False    True  False   False     True     True   
3             True          False   False  False   False     True    False   
4             True          False   False  False   False    False    False   
...            ...            ...     ...    ...     ...      ...      ...   
1134          True          False   False   True   False     True     True   
1135         False          False   False  False   False     True     True   
1136         False          False    True   True   False    False    False   
1137          True          False   False   True   False    False     True   
1138         False          False   False  False   False    False    False   

      coffee/tea  dinner rolls  dishwashing liquid/detergent  .

In [7]:
# Summarise the one-hot encoded frame: number of baskets, one column per
# product, non-null counts and dtypes.
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1139 entries, 0 to 1138
Data columns (total 38 columns):
 #   Column                        Non-Null Count  Dtype
---  ------                        --------------  -----
 0   all- purpose                  1139 non-null   bool 
 1   aluminum foil                 1139 non-null   bool 
 2   bagels                        1139 non-null   bool 
 3   beef                          1139 non-null   bool 
 4   butter                        1139 non-null   bool 
 5   cereals                       1139 non-null   bool 
 6   cheeses                       1139 non-null   bool 
 7   coffee/tea                    1139 non-null   bool 
 8   dinner rolls                  1139 non-null   bool 
 9   dishwashing liquid/detergent  1139 non-null   bool 
 10  eggs                          1139 non-null   bool 
 11  flour                         1139 non-null   bool 
 12  fruits                        1139 non-null   bool 
 13  hand soap                     113

In [8]:
# Minimum support an itemset must reach to be kept by Apriori.
MIN_SUPPORT = 0.05

# Mine the frequent itemsets from the encoded baskets; `use_colnames=True`
# reports itemsets by product name instead of column index.
frequent_itemsets = apriori(
    df_encoded,
    min_support=MIN_SUPPORT,
    use_colnames=True,
)

# Show the frequent itemsets.
print(frequent_itemsets)

        support                                        itemsets
0      0.374890                                  (all- purpose)
1      0.384548                                 (aluminum foil)
2      0.385426                                        (bagels)
3      0.374890                                          (beef)
4      0.367867                                        (butter)
...         ...                                             ...
15744  0.058824            (yogurt, vegetables, waffles, sugar)
15745  0.055312  (toilet paper, tortillas, vegetables, waffles)
15746  0.053556   (toilet paper, yogurt, tortillas, vegetables)
15747  0.064969     (toilet paper, yogurt, vegetables, waffles)
15748  0.059701        (yogurt, tortillas, vegetables, waffles)

[15749 rows x 2 columns]


In [9]:
# Minimum confidence a rule must reach to be kept.
MIN_CONFIDENCE = 0.5

# Derive association rules from the frequent itemsets, filtering on the
# confidence metric.
rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=MIN_CONFIDENCE,
)

# Show the rules.
print(rules)

                              antecedents   consequents  antecedent support  \
0                          (all- purpose)  (vegetables)            0.374890   
1                         (aluminum foil)  (vegetables)            0.384548   
2                                (bagels)  (vegetables)            0.385426   
3                                  (beef)  (vegetables)            0.374890   
4                                (butter)  (vegetables)            0.367867   
...                                   ...           ...                 ...   
11265            (yogurt, waffles, sugar)  (vegetables)            0.069359   
11266  (toilet paper, tortillas, waffles)  (vegetables)            0.068481   
11267   (toilet paper, tortillas, yogurt)  (vegetables)            0.064969   
11268     (toilet paper, yogurt, waffles)  (vegetables)            0.082529   
11269        (tortillas, yogurt, waffles)  (vegetables)            0.067603   

       consequent support   support  confidence    

In [13]:
# Destination workbook for the mined association rules.
ruta_out = './resultados_association_ShoppingCart.xlsx'


def _itemset_to_text(itemset):
    """Render an itemset as a comma-separated, alphabetically sorted string.

    Sorting makes the text deterministic, since set iteration order is not.
    """
    return ', '.join(sorted(map(str, itemset)))


# `antecedents` / `consequents` hold frozenset objects, which Excel cannot
# store natively: they would be written as Python reprs such as
# "frozenset({'yogurt', 'sugar'})" (or rejected, depending on the writer).
# Convert them to readable text on an export-only copy so that `rules`
# itself keeps its frozensets for any further analysis.
rules_export = rules.assign(
    antecedents=rules['antecedents'].map(_itemset_to_text),
    consequents=rules['consequents'].map(_itemset_to_text),
)

# Save the results to an Excel file, without the positional index column.
rules_export.to_excel(ruta_out, index=False)