In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
import sys

# Add the relative ../src/ directory to the module search path so that the
# local load_data module imported below can be resolved.
sys.path.append('../src/')

import load_data


from load_data import load_instacart_data

In [None]:
# Load the Instacart tables once, then bind the three used in this notebook
# to their own names.
data = load_instacart_data()

orders, order_products_prior, products = (
    data[table] for table in ('orders', 'order_products_prior', 'products')
)



In [None]:
# Preview the first rows of the products table.
products.head()

In [None]:
# Attach product_name to every prior order line; the left join keeps all
# order lines even when a product_id has no match in the products table.
product_names = products[['product_id', 'product_name']]
orders_products = pd.merge(
    order_products_prior,
    product_names,
    on='product_id',
    how='left',
)

In [None]:
# Order frequency per product (value_counts lists the most-ordered first).
product_counts = orders_products['product_id'].value_counts()

# Keep only the top_n most frequently ordered products.
top_n = 200
top_products = product_counts.index[:top_n]

is_top_product = orders_products['product_id'].isin(top_products)
filtered_orders_products = orders_products[is_top_product]

filtered_orders_products.head()

In [None]:
# Share of order lines still in scope after the top_n product filter.
# Despite the name, the value is a fraction (rows kept / rows in the prior
# table), not a number out of 100.
rows_kept = filtered_orders_products.shape[0]
rows_total = order_products_prior.shape[0]
percentage_transactions = rows_kept / rows_total
print(percentage_transactions)

In [None]:
# Basket size here is the number of top-product lines per order, because it is
# computed after the product filter. Keep orders with 4 to 19 such lines to
# drop outliers at both ends.
order_sizes = filtered_orders_products.groupby('order_id').size()

size_in_range = order_sizes.between(4, 19)
valid_orders = order_sizes[size_in_range].index
filtered_orders = filtered_orders_products[
    filtered_orders_products['order_id'].isin(valid_orders)
]

In [None]:
# Number of order lines remaining after the basket-size filter.
len(filtered_orders)

In [None]:
# One row per order: gather its product names into a list stored under 'products'.
basket_list = (
    filtered_orders
    .groupby('order_id')['product_name']
    .apply(list)
    .rename('products')
    .reset_index()
)

In [None]:
# Preview the per-order product lists.
basket_list.head()

In [None]:
# Number of baskets (one row per order_id).
len(basket_list)

In [None]:
# Draw a reproducible random subset of baskets (fixed random_state),
# presumably to keep the Apriori step fast -- confirm the intended size.
# The request is capped at the number of available baskets: DataFrame.sample
# raises ValueError when n exceeds the population and replace is False.
sample_size = 10000
basket_sample = basket_list.sample(
    n=min(sample_size, len(basket_list)),
    random_state=42,
)

In [None]:
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

# One-hot encode the sampled baskets: one row per basket, one column per
# distinct item seen by the encoder.
sample_baskets = basket_sample['products']
te = TransactionEncoder()
te.fit(sample_baskets)
te_ary = te.transform(sample_baskets)
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)

# Itemsets whose support is at least 1% of the encoded baskets.
frequent_itemsets = apriori(df_encoded, min_support=0.01, use_colnames=True)

In [None]:
# Preview the encoded basket matrix built from the sampled baskets.
df_encoded.head()

In [None]:
# Derive association rules with confidence of at least 0.3 from the Apriori itemsets.
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.3)

rule_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
print(rules[rule_columns].head(10))

# The same rules ranked by lift, highest first.
rules_sorted = rules.sort_values(by='lift', ascending=False)
print(rules_sorted[rule_columns].head(10))

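Support = fraction of baskets that contain both the antecedent (A) and the consequent (B), $P(A \cap B)$
Confidence = share of the baskets containing the antecedent that also contain the consequent, $P(B \mid A) = P(A \cap B) / P(A)$
Lift = how much more often A and B occur together than expected if they were independent, $P(A \cap B) / (P(A)\,P(B)) = P(B \mid A) / P(B)$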

TEST ECLAT

In [None]:
from collections import defaultdict

# Build the transaction database: one entry per basket, keyed "T<row index>",
# whose value is that basket's list of product names.
transactions = {
    f"T{idx}": basket for idx, basket in basket_list['products'].items()
}

# Absolute support threshold: 0.5% of the transactions, truncated to an int.
# Clamp it to at least 1 so that a small transaction set (fewer than 200
# baskets) cannot produce a threshold of 0, which would make itemsets that
# never occur together count as frequent.
min_support = max(1, int(0.005 * len(transactions)))

# generate tidsets
def generate_tidsets(transactions):
    """Invert a transaction mapping into an item -> transaction-id index.

    Parameters
    ----------
    transactions : mapping
        Maps a transaction id to an iterable of the items it contains.

    Returns
    -------
    defaultdict(set)
        Maps each item to the set of ids of the transactions containing it.
        Items appear in order of first occurrence.
    """
    tidsets = defaultdict(set)
    # Flatten to (item, tid) pairs in transaction order, then bucket by item.
    memberships = (
        (item, tid)
        for tid, basket in transactions.items()
        for item in basket
    )
    for item, tid in memberships:
        tidsets[item].add(tid)
    return tidsets

def eclat(prefix, items, min_support, frequent_itemsets):
    """Depth-first ECLAT search for frequent itemsets.

    Parameters
    ----------
    prefix : list
        Items already fixed for this branch ([] at the top level).
    items : list of (item, tidset) pairs
        Candidate extensions of ``prefix``; each tidset is the set of
        transaction ids containing ``prefix`` plus that item. The list is
        not modified.
    min_support : int
        Minimum number of transactions an itemset must appear in.
    frequent_itemsets : dict
        Output argument, updated in place with
        ``frozenset(itemset) -> support count``.

    Returns
    -------
    None
    """
    # Work on a private copy: the loop consumes the list with pop(), and the
    # caller's list must not be emptied as a side effect.
    pending = list(items)
    while pending:
        item, tidset = pending.pop()
        support = len(tidset)
        if support < min_support:
            continue

        new_itemset = prefix + [item]
        frequent_itemsets[frozenset(new_itemset)] = support

        # Only the items not yet popped may extend this itemset, so every
        # combination is explored exactly once.
        suffix = []
        for other_item, other_tidset in pending:
            intersection = tidset & other_tidset
            if len(intersection) >= min_support:
                suffix.append((other_item, intersection))
        if suffix:
            eclat(new_itemset, suffix, min_support, frequent_itemsets)

# Item -> ids of the transactions that contain it.
item_tidset = generate_tidsets(transactions)

# Ascending by tidset size; eclat pops candidates from the end of the list,
# so the items with the largest tidsets are expanded first.
items = list(item_tidset.items())
items.sort(key=lambda pair: len(pair[1]))

# NOTE: this rebinds frequent_itemsets (previously the Apriori result) to a
# dict of frozenset -> absolute support count filled in by eclat.
frequent_itemsets = {}
eclat(prefix=[], items=items, min_support=min_support, frequent_itemsets=frequent_itemsets)

In [None]:
def _display_order(entry):
    """Sort key: larger itemsets first, then higher support, then item names."""
    itemset, support = entry
    return (-len(itemset), -support, sorted(itemset))

for itemset, support in sorted(frequent_itemsets.items(), key=_display_order):
    print(list(itemset), "=>", support)

In [None]:
# Reshape the ECLAT result into an itemsets/support frame to feed mlxtend's
# association_rules; the absolute counts become fractions of all transactions.
n_transactions = len(transactions)
itemset_records = []
for itemset, count in frequent_itemsets.items():
    itemset_records.append({'itemsets': itemset, 'support': count / n_transactions})

eclat_df = pd.DataFrame(itemset_records)
eclat_df.head()

In [None]:
# Generate association rules
from mlxtend.frequent_patterns import association_rules

# Rules with confidence of at least 0.3, derived from the ECLAT itemsets.
rules = association_rules(eclat_df, metric="confidence", min_threshold=0.3)

# Rank by lift, highest first.
rules_sorted = rules.sort_values(by='lift', ascending=False)

def extract_names(frozenset_items):
    """Render a collection of item names as one alphabetically sorted string.

    The names are joined with ', '. Note that this separator is ambiguous if
    a name itself contains ', ': the joined string cannot then be split back
    into the original names.
    """
    names = list(frozenset_items)
    names.sort()
    return ', '.join(names)

# Plain-text copies of the frozenset columns so the CSV holds readable names.
rules_sorted['antecedent'] = rules_sorted['antecedents'].apply(extract_names)
rules_sorted['consequent'] = rules_sorted['consequents'].apply(extract_names)

# Export without frozenset columns
export_columns = ['antecedent', 'consequent', 'support', 'confidence', 'lift']
rules_clean = rules_sorted[export_columns]

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (no-op when it is already there).
rules_path = Path('../data/processed/rules_clean.csv')
rules_path.parent.mkdir(parents=True, exist_ok=True)
rules_clean.to_csv(rules_path, index=False)

print(f"✅ {len(rules_clean)} règles exportées dans rules_clean.csv")
print(rules_clean.head(10))

In [None]:

# Load the exported rules CSV
rules = pd.read_csv('../data/processed/rules_clean.csv')

# Collect every distinct product named on either side of a rule.
# The names were joined with ', ' when the rules were exported, so split on
# the same separator (a product name that itself contains ', ' would be
# split into several pieces here).
all_products = set()
for side in ('antecedent', 'consequent'):
    for joined_names in rules[side]:
        all_products.update(joined_names.split(', '))

# Build a one-column DataFrame of the names, sorted alphabetically
products_list = pd.DataFrame({'product_name': sorted(all_products)})

# Export
products_list.to_csv('../data/processed/products_in_rules.csv', index=False)

print(f"✅ {len(products_list)} produits uniques exportés")
print(products_list.head(10))