# 2. Reglas de asociación - Apriori

In [12]:
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from itertools import combinations
from collections import defaultdict
import warnings
import numpy as np
warnings.filterwarnings('ignore')

np.random.seed(42)

In [13]:
# 1. SIMULACIÓN DEL DATASET 

n = 5000
genres_pool = ['Drama','Comedy','Action','Adventure','Science Fiction',
               'Thriller','Romance','Horror','Animation','Documentary','Crime','Fantasy']
lang_pool   = ['en','es','fr','ja','ko','de','it','pt']

def rnd_genres():
    k = np.random.choice([1,2,3], p=[0.50,0.35,0.15])
    return '|'.join(np.random.choice(genres_pool, k, replace=False))

budgets  = np.concatenate([np.zeros(3000), np.random.exponential(50e6, 2000)])
revenues = np.concatenate([np.zeros(3000), np.random.exponential(150e6, 2000)])
np.random.shuffle(budgets); np.random.shuffle(revenues)

df = pd.DataFrame({
    'popularity':  np.random.exponential(10, n),
    'budget':      budgets,
    'revenue':     revenues,
    'voteAvg':     np.clip(np.random.normal(6.0, 1.5, n), 0, 10),
    'voteCount':   np.abs(np.random.exponential(200, n)).astype(int),
    'runtime':     np.clip(np.abs(np.random.normal(95, 30, n)).astype(int), 1, 400),
    'originalLanguage': np.random.choice(lang_pool, n, p=[0.60,0.10,0.07,0.07,0.06,0.04,0.03,0.03]),
    'genres':      [rnd_genres() for _ in range(n)],
    'releaseYear': np.random.randint(1980, 2026, n),
})
print(f"Dataset simulado: {df.shape[0]} filas")

Dataset simulado: 5000 filas


In [14]:
# 2. DISCRETIZACIÓN

df['pop_cat']  = pd.cut(df['popularity'],  bins=[0,5,20,100,np.inf], labels=['Pop_Baja','Pop_Media','Pop_Alta','Pop_Viral'])
df['bud_cat']  = pd.cut(df['budget'],      bins=[-1,0,10e6,50e6,100e6,np.inf], labels=['Budget_0','Budget_Bajo','Budget_Medio','Budget_Alto','Budget_AAA'])
df['rev_cat']  = pd.cut(df['revenue'],     bins=[-1,0,10e6,50e6,200e6,np.inf], labels=['Rev_0','Rev_Bajo','Rev_Medio','Rev_Alto','Rev_Blockbuster'])
df['rat_cat']  = pd.cut(df['voteAvg'],     bins=[0,4,5.5,7,10],               labels=['Rating_Malo','Rating_Regular','Rating_Bueno','Rating_Excelente'])
df['run_cat']  = pd.cut(df['runtime'],     bins=[0,60,90,120,400],             labels=['Corta','Normal','Larga','Muy_Larga'])
df['vot_cat']  = pd.cut(df['voteCount'],   bins=[-1,10,100,500,np.inf],        labels=['Pocos_Votos','Votos_Medio','Votos_Alto','Muy_Votada'])
df['era_cat']  = pd.cut(df['releaseYear'], bins=[1979,1990,2000,2010,2020,2026], labels=['80s','90s','2000s','2010s','2020s'])
df['lang_cat'] = df['originalLanguage'].apply(lambda x: x if x in ['en','es','ja','ko','fr'] else 'Otro')

In [15]:
# 3. CONSTRUCCIÓN DE TRANSACCIONES

cat_cols = ['pop_cat','bud_cat','rev_cat','rat_cat','run_cat','vot_cat','era_cat','lang_cat']

def build_transactions(dataframe):
    tx = []
    for _, row in dataframe.iterrows():
        items = []
        for col in cat_cols:
            v = row[col]
            if pd.notna(v): items.append(str(v))
        for g in str(row['genres']).split('|'):
            g = g.strip()
            if g: items.append(f"Genre_{g}")
        tx.append(set(items))
    return tx

transactions = build_transactions(df)
print(f"Transacciones totales: {len(transactions)}")
print(f"Ejemplo: {list(transactions[0])[:6]}")

Transacciones totales: 5000
Ejemplo: ['Budget_0', 'Rev_Medio', 'Genre_Action', 'Genre_Romance', 'Genre_Thriller', '2000s']


In [16]:
# 4. IMPLEMENTACIÓN APRIORI


def get_frequent_1(transactions, min_sup):
    counts = defaultdict(int)
    for tx in transactions:
        for item in tx:
            counts[frozenset([item])] += 1
    n = len(transactions)
    return {k: v/n for k,v in counts.items() if v/n >= min_sup}

def apriori_gen(freq_k):
    items_list = list(freq_k.keys())
    candidates = set()
    for i in range(len(items_list)):
        for j in range(i+1, len(items_list)):
            union = items_list[i] | items_list[j]
            if len(union) == len(items_list[i]) + 1:
                candidates.add(union)
    return candidates

def get_frequent_k(transactions, candidates, min_sup):
    counts = defaultdict(int)
    n = len(transactions)
    for tx in transactions:
        for cand in candidates:
            if cand.issubset(tx):
                counts[cand] += 1
    return {k: v/n for k,v in counts.items() if v/n >= min_sup}

def run_apriori(transactions, min_support, max_len=4):
    all_freq = {}
    freq = get_frequent_1(transactions, min_support)
    all_freq.update(freq)
    k = 2
    while freq and k <= max_len:
        candidates = apriori_gen(freq)
        freq = get_frequent_k(transactions, candidates, min_support)
        all_freq.update(freq)
        print(f"  Tamaño {k}: {len(freq)} conjuntos frecuentes")
        k += 1
    return all_freq

def get_rules(freq_sets, min_confidence):
    rules = []
    for itemset, sup in freq_sets.items():
        if len(itemset) < 2: continue
        items = list(itemset)
        for size in range(1, len(items)):
            for ant in combinations(items, size):
                ant = frozenset(ant)
                con = itemset - ant
                ant_sup = freq_sets.get(ant, None)
                if ant_sup and ant_sup > 0:
                    conf = sup / ant_sup
                    if conf >= min_confidence:
                        con_sup = freq_sets.get(con, None)
                        lift = conf / con_sup if con_sup and con_sup > 0 else 0
                        rules.append({
                            'antecedents': ant,
                            'consequents': con,
                            'support': sup,
                            'confidence': conf,
                            'lift': lift
                        })
    return pd.DataFrame(rules).sort_values('lift', ascending=False) if rules else pd.DataFrame()

In [22]:
# 5. EXPERIMENTOS


# EXPERIMENTO A

print("EXPERIMENTO A: min_support=0.10, min_confidence=0.70")

freq_A = run_apriori(transactions, min_support=0.10, max_len=3)
rules_A = get_rules(freq_A, min_confidence=0.70)
print(f"  Total conjuntos frecuentes: {len(freq_A)}")
print(f"  Total reglas generadas:     {len(rules_A)}")
if len(rules_A) > 0:
    print("\n  Top 10 reglas por Lift:")
    for _, r in rules_A.head(10).iterrows():
        print(f"    {set(r['antecedents'])} → {set(r['consequents'])}  "
              f"[sup={r['support']:.3f}, conf={r['confidence']:.3f}, lift={r['lift']:.3f}]")

#  EXPERIMENTO B

print("EXPERIMENTO B: min_support=0.05, min_confidence=0.60")

freq_B = run_apriori(transactions, min_support=0.05, max_len=3)
rules_B = get_rules(freq_B, min_confidence=0.60)
print(f"  Total conjuntos frecuentes: {len(freq_B)}")
print(f"  Total reglas generadas:     {len(rules_B)}")
if len(rules_B) > 0:
    print("\n  Top 10 reglas por Lift:")
    for _, r in rules_B.head(10).iterrows():
        print(f"    {set(r['antecedents'])} → {set(r['consequents'])}  "
              f"[sup={r['support']:.3f}, conf={r['confidence']:.3f}, lift={r['lift']:.3f}]")

# EXPERIMENTO C

print("EXPERIMENTO C: Sin inglés (para reducir ruido) — min_support=0.05, min_confidence=0.60")

df_noeng = df[df['originalLanguage'] != 'en'].copy()
trans_noeng = build_transactions(df_noeng)
freq_C = run_apriori(trans_noeng, min_support=0.05, max_len=3)
rules_C = get_rules(freq_C, min_confidence=0.60)
print(f"  Películas no inglesas: {len(df_noeng)}")
print(f"  Total conjuntos frecuentes: {len(freq_C)}")
print(f"  Total reglas generadas:     {len(rules_C)}")
if len(rules_C) > 0:
    print("\n  Top 10 reglas por Lift (sin inglés):")
    for _, r in rules_C.head(10).iterrows():
        print(f"    {set(r['antecedents'])} → {set(r['consequents'])}  "
              f"[sup={r['support']:.3f}, conf={r['confidence']:.3f}, lift={r['lift']:.3f}]")

EXPERIMENTO A: min_support=0.10, min_confidence=0.70
  Tamaño 2: 80 conjuntos frecuentes
  Tamaño 3: 45 conjuntos frecuentes
  Total conjuntos frecuentes: 161
  Total reglas generadas:     0
EXPERIMENTO B: min_support=0.05, min_confidence=0.60
  Tamaño 2: 258 conjuntos frecuentes
  Tamaño 3: 242 conjuntos frecuentes
  Total conjuntos frecuentes: 547
  Total reglas generadas:     294

  Top 10 reglas por Lift:
    {'en', 'Genre_Animation'} → {'Budget_0'}  [sup=0.051, conf=0.665, lift=1.108]
    {'Budget_0', '80s'} → {'en'}  [sup=0.100, conf=0.674, lift=1.096]
    {'Genre_Romance', 'Budget_0'} → {'en'}  [sup=0.055, conf=0.671, lift=1.091]
    {'Genre_Adventure', 'en'} → {'Rev_0'}  [sup=0.056, conf=0.653, lift=1.089]
    {'Rev_0', 'Genre_Animation'} → {'Budget_0'}  [sup=0.052, conf=0.647, lift=1.079]
    {'Genre_Fantasy', 'Votos_Alto'} → {'Rev_0'}  [sup=0.050, conf=0.644, lift=1.074]
    {'en', '80s'} → {'Budget_0'}  [sup=0.100, conf=0.644, lift=1.074]
    {'Genre_Animation'} → {'Budget_0

In [20]:
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('Reglas de Asociación — Apriori | The Movie DB', fontsize=14, fontweight='bold')

colors_lift = lambda s: plt.cm.RdYlGn((s - s.min()) / (s.max() - s.min() + 1e-9))

# --- Plot 1: Soporte vs Confianza coloreado por Lift ---
ax1 = axes[0, 0]
if len(rules_B) > 0:
    sc = ax1.scatter(rules_B['support'], rules_B['confidence'],
                     c=rules_B['lift'], cmap='RdYlGn', alpha=0.65, s=60,
                     edgecolors='gray', linewidth=0.3)
    plt.colorbar(sc, ax=ax1, label='Lift')
    ax1.axhline(0.70, color='red', lw=1.2, linestyle='--', alpha=0.7, label='Conf=0.70')
    ax1.set_xlabel('Soporte')
    ax1.set_ylabel('Confianza')
    ax1.set_title('Soporte vs Confianza\n(color = Lift, Exp. B)')
    ax1.legend(fontsize=8)

# --- Plot 2: Top 10 reglas por Lift ---
ax2 = axes[0, 1]
if len(rules_B) > 0:
    top10  = rules_B.head(10).copy()
    labels = []
    for _, r in top10.iterrows():
        a = ', '.join(sorted(list(r['antecedents']))[:2])
        c = ', '.join(sorted(list(r['consequents']))[:1])
        labels.append(f"{a[:25]} -> {c[:18]}")
    ax2.barh(range(len(top10)), top10['lift'],
             color=colors_lift(top10['lift']), edgecolor='white')
    ax2.set_yticks(range(len(top10)))
    ax2.set_yticklabels(labels, fontsize=6.5)
    ax2.set_xlabel('Lift')
    ax2.set_title('Top 10 Reglas por Lift\n(Experimento B)')
    ax2.invert_yaxis()

# --- Plot 3: Distribución de Confianza y Lift ---
ax3  = axes[1, 0]
ax3b = ax3.twinx()
if len(rules_B) > 0:
    ax3.hist(rules_B['confidence'], bins=18, color='steelblue', alpha=0.75,
             label='Confianza', edgecolor='white')
    ax3b.hist(rules_B['lift'], bins=18, color='coral', alpha=0.55,
              label='Lift', edgecolor='white')
    ax3.set_xlabel('Valor')
    ax3.set_ylabel('Frecuencia (Confianza)', color='steelblue')
    ax3b.set_ylabel('Frecuencia (Lift)', color='coral')
    ax3.set_title('Distribución de Confianza y Lift\n(Experimento B)')
    l1, h1 = ax3.get_legend_handles_labels()
    l2, h2 = ax3b.get_legend_handles_labels()
    ax3.legend(l1+l2, h1+h2, fontsize=8, loc='upper left')

# --- Plot 4: Comparación de los 3 experimentos ---
ax4  = axes[1, 1]
ax4b = ax4.twinx()
exps      = ['Exp A\n(sup>=0.10\nconf>=0.70)', 'Exp B\n(sup>=0.05\nconf>=0.60)', 'Exp C\n(sin ingles\nsup>=0.05)']
n_rules   = [len(rules_A), len(rules_B), len(rules_C)]
lifts_avg = [
    rules_A['lift'].mean() if len(rules_A) > 0 else 0,
    rules_B['lift'].mean() if len(rules_B) > 0 else 0,
    rules_C['lift'].mean() if len(rules_C) > 0 else 0
]
x     = np.arange(len(exps))
width = 0.35
bars1 = ax4.bar( x - width/2, n_rules,   width, color='steelblue',  alpha=0.8, label='# Reglas')
bars2 = ax4b.bar(x + width/2, lifts_avg, width, color='darkorange', alpha=0.8, label='Lift Promedio')
ax4.set_xticks(x)
ax4.set_xticklabels(exps, fontsize=8)
ax4.set_ylabel('Número de reglas',  color='steelblue')
ax4b.set_ylabel('Lift promedio',    color='darkorange')
ax4.set_title('Comparación de Experimentos')
l1, h1 = ax4.get_legend_handles_labels()
l2, h2 = ax4b.get_legend_handles_labels()
ax4.legend(l1+l2, h1+h2, fontsize=8, loc='upper right')
for bar in bars1:
    ax4.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5,
             str(int(bar.get_height())), ha='center', va='bottom', fontsize=8)

plt.tight_layout()
plt.savefig('reglas_asociacion.png', dpi=150, bbox_inches='tight')  # ruta relativa
plt.show()
print("Gráfica guardada como 'reglas_asociacion.png'")

Gráfica guardada como 'reglas_asociacion.png'


In [21]:
# 7. GUARDAR RESULTADOS EN TXT

def fmt_rule(r):
    return (f"{sorted(list(r['antecedents']))} -> {sorted(list(r['consequents']))}  "
            f"[sup={r['support']:.4f}, conf={r['confidence']:.4f}, lift={r['lift']:.4f}]")

with open('resultados_apriori.txt', 'w', encoding='utf-8') as f:
    f.write("RESULTADOS REGLAS DE ASOCIACION - APRIORI\n")
    f.write("="*70 + "\n\n")
    for label, rules in [
        ("EXPERIMENTO A (sup>=0.10, conf>=0.70)", rules_A),
        ("EXPERIMENTO B (sup>=0.05, conf>=0.60)", rules_B),
        ("EXPERIMENTO C - sin ingles (sup>=0.05, conf>=0.60)", rules_C)
    ]:
        f.write(f"\n{label}\n" + "-"*60 + "\n")
        f.write(f"Total reglas: {len(rules)}\n\n")
        if len(rules) > 0:
            for i, (_, r) in enumerate(rules.head(10).iterrows(), 1):
                f.write(f"  #{i}: {fmt_rule(r)}\n")

print("Resultados guardados en 'resultados_apriori.txt'")
print("\n Script completado exitosamente")


Resultados guardados en 'resultados_apriori.txt'

 Script completado exitosamente
