In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
import sys
sys.path.append('../src')
from cluster_library import RuleBasedCustomerClusterer
import warnings
warnings.filterwarnings('ignore')

# Set plotting style.
# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8-*".
# Try the new name first and fall back to the pre-3.6 name so this cell also runs
# on older installations; if neither is registered the default style is kept.
for style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
sns.set_palette("husl")

## 1. Load Data

In [None]:
# Read the three processed CSV files (rules, cleaned transactions, reference clusters)
# in that order and bind each to its own DataFrame.
print("Loading data...")
rules_df, cleaned_df, clusters_ref = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed/rules_apriori_filtered.csv',
        '../data/processed/cleaned_uk_data.csv',
        '../data/processed/customer_clusters_from_rules.csv',
    )
)

# Quick sanity report: row counts and the span of the 'lift' column.
n_rules = len(rules_df)
n_customers = len(clusters_ref)
print(f"Total rules available: {n_rules}")
print(f"Total customers: {n_customers}")
lift_low = rules_df['lift'].min()
lift_high = rules_df['lift'].max()
print(f"Lift range: {lift_low:.2f} - {lift_high:.2f}")

## 2. Initialize Clusterer

In [None]:
# Names of the columns in the cleaned transactions frame that the clusterer is told to use.
clusterer_columns = {
    "customer_col": "CustomerID",
    "invoice_col": "InvoiceNo",
    "item_col": "Description",
    "quantity_col": "Quantity",
}

# Create the clusterer on top of the cleaned transactions.
clusterer = RuleBasedCustomerClusterer(df_clean=cleaned_df, **clusterer_columns)

# Build the customer-item matrix (this only needs to be done once).
print("Building customer-item matrix...")
clusterer.build_customer_item_matrix()
matrix_shape = clusterer.customer_item_bool.shape
print(f"Customer-item matrix shape: {matrix_shape}")

## 3. Compute RFM (sẽ thêm vào features)

In [None]:
# Ask the clusterer for the RFM table, then report its size and summary statistics.
rfm_df = clusterer.compute_rfm()
n_rfm_rows = len(rfm_df)
print(f"RFM computed for {n_rfm_rows} customers")
rfm_stats = rfm_df.describe()
print(rfm_stats)

## 4. Experiment với các giá trị Top-K

In [None]:
# Top-K values to experiment with
TOP_K_VALUES = [50, 100, 200, 300, 500, 1000]
K_CLUSTERS = 2  # Chosen number of clusters
RANDOM_STATE = 42
RFM_COLUMNS = ['Recency', 'Frequency', 'Monetary']

# One dict of metrics per tested Top-K value; turned into a DataFrame after the loop.
results = []

for top_k in TOP_K_VALUES:
    print(f"\n{'='*60}")
    print(f"Testing Top-K = {top_k}")
    print(f"{'='*60}")

    # Select top K rules by lift
    rules_top_k = rules_df.nlargest(top_k, 'lift').reset_index(drop=True)

    # nlargest() returns every row when top_k exceeds the number of rules, so say so
    # instead of silently labelling a smaller rule set as "Top-K = top_k".
    if len(rules_top_k) < top_k:
        print(f"WARNING: only {len(rules_top_k)} rules available; "
              f"Top-K = {top_k} uses all of them")

    # Ensure str columns exist
    if 'antecedents_str' not in rules_top_k.columns:
        rules_top_k['antecedents_str'] = rules_top_k['antecedents'].astype(str)
    if 'consequents_str' not in rules_top_k.columns:
        rules_top_k['consequents_str'] = rules_top_k['consequents'].astype(str)

    # Assign rules to clusterer
    clusterer.rules_df_ = rules_top_k

    # Build rule feature matrix (weighted by lift)
    X_rules = clusterer.build_rule_feature_matrix(
        weighting='lift',
        min_antecedent_len=1
    )

    # Add RFM features, aligned row-for-row with the clusterer's customer order
    customers_list = clusterer.customers_
    rfm_subset = rfm_df[rfm_df['CustomerID'].isin(customers_list)].set_index('CustomerID')
    rfm_subset = rfm_subset.reindex(customers_list)

    # reindex() inserts all-NaN rows for customers without an RFM record. Fail here
    # with a clear message rather than later inside KMeans.
    n_missing_rfm = int(rfm_subset[RFM_COLUMNS].isna().any(axis=1).sum())
    if n_missing_rfm:
        raise ValueError(
            f"{n_missing_rfm} customers in the rule feature matrix have missing RFM values"
        )

    # Scale RFM
    scaler = StandardScaler()
    rfm_scaled = scaler.fit_transform(rfm_subset[RFM_COLUMNS])

    # Combine features
    X_combined = np.hstack([X_rules, rfm_scaled])
    n_samples, n_features = X_combined.shape
    samples_per_feature = n_samples / n_features

    print(f"Feature matrix shape: {X_combined.shape}")
    print(f"Samples/Features ratio: {samples_per_feature:.2f}")

    # Run K-Means
    kmeans = KMeans(n_clusters=K_CLUSTERS, random_state=RANDOM_STATE, n_init=10)
    labels = kmeans.fit_predict(X_combined)

    # Calculate Silhouette Score
    sil_score = silhouette_score(X_combined, labels)

    # Analyze lift range
    lift_min = rules_top_k['lift'].min()
    lift_max = rules_top_k['lift'].max()
    lift_mean = rules_top_k['lift'].mean()

    # Count cluster sizes for every cluster id in 0..K_CLUSTERS-1, stored as plain ints
    # so the printed dict stays readable regardless of NumPy's scalar repr.
    cluster_sizes = np.bincount(labels, minlength=K_CLUSTERS)
    cluster_dist = {cluster_id: int(size) for cluster_id, size in enumerate(cluster_sizes)}

    result = {
        'top_k': top_k,
        'n_features': n_features,
        'samples_per_feature': samples_per_feature,
        'silhouette_score': sil_score,
        'lift_min': lift_min,
        'lift_max': lift_max,
        'lift_mean': lift_mean,
    }
    # One size column per cluster (cluster_0_size, cluster_1_size, ...), so changing
    # K_CLUSTERS does not silently drop clusters from the results.
    result.update(
        {f'cluster_{cluster_id}_size': size for cluster_id, size in cluster_dist.items()}
    )

    results.append(result)

    print(f"Silhouette Score: {sil_score:.4f}")
    print(f"Lift range: {lift_min:.2f} - {lift_max:.2f} (mean: {lift_mean:.2f})")
    print(f"Cluster distribution: {cluster_dist}")

# Convert to DataFrame
results_df = pd.DataFrame(results)
print("\n" + "="*80)
print("SUMMARY OF ALL EXPERIMENTS")
print("="*80)
print(results_df.to_string(index=False))

## 5. Visualization: So sánh Silhouette Score

In [None]:
# Top-K value highlighted as the chosen setting in every panel. Defined once so the
# guide lines, legend labels and bar colours always refer to the same value.
CHOSEN_TOP_K = 200
LABEL_STYLE = dict(fontsize=12, fontweight='bold')
TITLE_STYLE = dict(fontsize=14, fontweight='bold')

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Plot 1: Silhouette Score vs Top-K
ax1 = axes[0, 0]
ax1.plot(results_df['top_k'], results_df['silhouette_score'],
         marker='o', linewidth=2, markersize=10, color='#0284c7')
ax1.axvline(x=CHOSEN_TOP_K, color='red', linestyle='--', linewidth=2,
            label=f'Top-K = {CHOSEN_TOP_K} (Chosen)')
# Horizontal guide at the chosen value's score. Skipped when that value was not part
# of the experiment, because indexing an empty selection would raise IndexError.
chosen_scores = results_df.loc[results_df['top_k'] == CHOSEN_TOP_K, 'silhouette_score']
if not chosen_scores.empty:
    ax1.axhline(y=chosen_scores.iloc[0], color='red', linestyle=':', alpha=0.5)
ax1.set_xlabel('Top-K (Number of Rules)', **LABEL_STYLE)
ax1.set_ylabel('Silhouette Score', **LABEL_STYLE)
ax1.set_title('Silhouette Score vs Top-K\n(Higher is Better)', **TITLE_STYLE)
ax1.grid(True, alpha=0.3)
ax1.legend(fontsize=10)

# Annotate the Top-K value with the highest measured silhouette score
best_idx = results_df['silhouette_score'].idxmax()
best_k = results_df.loc[best_idx, 'top_k']
best_score = results_df.loc[best_idx, 'silhouette_score']
ax1.annotate(f'Best: {best_k}\nScore: {best_score:.4f}',
             xy=(best_k, best_score), xytext=(best_k+100, best_score-0.05),
             arrowprops=dict(arrowstyle='->', color='red', lw=2),
             fontsize=11, fontweight='bold', color='red')

# Plot 2: Samples per Feature
ax2 = axes[0, 1]
ax2.plot(results_df['top_k'], results_df['samples_per_feature'],
         marker='s', linewidth=2, markersize=10, color='#10b981')
ax2.axhline(y=10, color='orange', linestyle='--', linewidth=2, label='Min Threshold (10:1)')
ax2.axvline(x=CHOSEN_TOP_K, color='red', linestyle='--', linewidth=2,
            label=f'Top-K = {CHOSEN_TOP_K}')
ax2.set_xlabel('Top-K (Number of Rules)', **LABEL_STYLE)
ax2.set_ylabel('Samples per Feature Ratio', **LABEL_STYLE)
ax2.set_title('Curse of Dimensionality Check\n(Should be > 10)', **TITLE_STYLE)
ax2.grid(True, alpha=0.3)
ax2.legend(fontsize=10)

# Plot 3: Mean Lift by Top-K
ax3 = axes[1, 0]
ax3.plot(results_df['top_k'], results_df['lift_mean'],
         marker='^', linewidth=2, markersize=10, color='#8b5cf6')
ax3.axvline(x=CHOSEN_TOP_K, color='red', linestyle='--', linewidth=2,
            label=f'Top-K = {CHOSEN_TOP_K}')
ax3.set_xlabel('Top-K (Number of Rules)', **LABEL_STYLE)
ax3.set_ylabel('Mean Lift Value', **LABEL_STYLE)
ax3.set_title('Mean Lift vs Top-K\n(Higher means stronger rules)', **TITLE_STYLE)
ax3.grid(True, alpha=0.3)
ax3.legend(fontsize=10)

# Plot 4: Feature Count (categorical x axis; the chosen Top-K bar is drawn in red)
ax4 = axes[1, 1]
bar_colors = ['red' if k == CHOSEN_TOP_K else '#64748b' for k in results_df['top_k']]
ax4.bar(results_df['top_k'].astype(str), results_df['n_features'],
        color=bar_colors, alpha=0.7)
ax4.set_xlabel('Top-K (Number of Rules)', **LABEL_STYLE)
ax4.set_ylabel('Total Features (Rules + RFM)', **LABEL_STYLE)
ax4.set_title('Feature Dimensionality\n(Red = Chosen value)', **TITLE_STYLE)
ax4.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('../data/processed/top_k_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n📊 Visualization saved to: ../data/processed/top_k_comparison.png")

## 6. Summary Table

In [None]:
# Top-K value this notebook recommends, and the samples-per-feature threshold the
# recommendation text refers to.
CHOSEN_TOP_K = 200
SAMPLES_PER_FEATURE_MIN = 10


def label_top_k(top_k):
    """Return the qualitative label shown next to a Top-K value.

    The label is decided from ``top_k`` alone (fixed size bands around the chosen
    value); it is not derived from the measured scores.
    """
    if top_k == CHOSEN_TOP_K:
        return '⭐ BEST'
    if top_k in [100, 300]:
        return '✅ Good'
    if top_k < 100:
        return '⚠️ Too Small'
    return '❌ Too Large'


# Create summary with rankings
summary = results_df.copy()
# method='min' gives tied scores the same whole-number rank. The default 'average'
# yields fractional ranks (e.g. 1.5) that astype(int) would silently truncate.
summary['sil_rank'] = (
    summary['silhouette_score'].rank(ascending=False, method='min').astype(int)
)
summary['quality'] = summary['top_k'].apply(label_top_k)

print("\n" + "="*100)
print("FINAL COMPARISON TABLE")
print("="*100)
print(summary[['top_k', 'silhouette_score', 'sil_rank', 'samples_per_feature',
               'lift_mean', 'quality']].to_string(index=False))

print("\n" + "="*100)
print("RECOMMENDATION")
print("="*100)
chosen_rows = summary[summary['top_k'] == CHOSEN_TOP_K]
if chosen_rows.empty:
    raise ValueError(f"Top-K = {CHOSEN_TOP_K} was not part of the experiment")
best_row = chosen_rows.iloc[0]

print(f"🎯 Chọn Top-K = {CHOSEN_TOP_K} vì:")
# Only claim "highest silhouette score" when the data actually says so; otherwise
# report the real rank.
if best_row['sil_rank'] == 1:
    print(f"   1. Silhouette Score cao nhất: {best_row['silhouette_score']:.4f}")
else:
    print(f"   1. Silhouette Score: {best_row['silhouette_score']:.4f} "
          f"(xếp hạng {best_row['sil_rank']}/{len(summary)}, không phải cao nhất)")
# Likewise, only state that the ratio clears the threshold when it does.
if best_row['samples_per_feature'] > SAMPLES_PER_FEATURE_MIN:
    print(f"   2. Samples/Feature tốt: {best_row['samples_per_feature']:.2f} "
          f"(> {SAMPLES_PER_FEATURE_MIN} threshold)")
else:
    print(f"   2. Samples/Feature: {best_row['samples_per_feature']:.2f} "
          f"(<= {SAMPLES_PER_FEATURE_MIN} threshold)")
print(f"   3. Mean Lift: {best_row['lift_mean']:.2f} (đủ mạnh)")
print("   4. Cân bằng giữa information (capture 80% patterns) và noise (chỉ 10%)")
print("   5. Không bị curse of dimensionality như Top-K lớn (500, 1000)")

## 7. Export Results

In [None]:
# Save the per-Top-K experiment metrics to CSV (without the index column).
# The path is held in one variable so the file written and the path reported
# in the status message cannot diverge.
results_csv_path = '../data/processed/top_k_experiment_results.csv'
results_df.to_csv(results_csv_path, index=False)
print(f"✅ Results exported to: {results_csv_path}")