In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score

from src.data_loader import DataLoader
from src.clustering import CustomerSegmentation

# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*"
# and 3.8 removed the old names, so plt.style.use('seaborn') raises OSError on
# current releases. Prefer the new name and fall back on older installs.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

In [None]:
# Fetch the customer, product and transaction tables through the project loader.
loader = DataLoader()
raw_tables = loader.load_all_data()
customers_df, products_df, transactions_df = raw_tables

# Build the segmentation helper from the three tables, then derive the
# feature matrix that the clustering cells below operate on.
segmentation = CustomerSegmentation(customers_df, products_df, transactions_df)
features = segmentation.prepare_features()

# Report the size of the feature matrix and the names of its columns.
feature_names = features.columns.tolist()
print("Feature matrix shape:", features.shape)
print("\nFeatures included:", feature_names)

In [None]:
# Score candidate cluster counts with the Davies-Bouldin index.
# NOTE(review): the x-axis and the "+ 2" offset below assume db_scores[0]
# belongs to k = 2 -- confirm against CustomerSegmentation.find_optimal_clusters.
db_scores = segmentation.find_optimal_clusters(max_clusters=10)
k_values = range(2, len(db_scores) + 2)

# Draw the score curve on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_values, db_scores, marker='o')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Davies-Bouldin Index')
ax.set_title('Davies-Bouldin Index vs Number of Clusters')
ax.grid(True)

# The cluster count with the smallest score is taken as the optimum.
best_position = np.argmin(db_scores)
optimal_k = best_position + 2
print(f"Optimal number of clusters: {optimal_k}")

In [None]:
# Perform clustering with optimal k.
# The labels are written back into `features` below, so on a re-run of this
# cell a 'Cluster' column from the previous run would otherwise be scaled and
# clustered as if it were a feature. Drop it (if present) before fitting so
# the cell gives the same result every time it is executed.
feature_matrix = features.drop(columns='Cluster', errors='ignore')
scaled_features = StandardScaler().fit_transform(feature_matrix)
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
clusters = kmeans.fit_predict(scaled_features)

# Add cluster labels to features (overwrites labels from any earlier run).
features['Cluster'] = clusters

# Analyze clusters: per-cluster mean of every feature column.
cluster_summary = features.groupby('Cluster').mean()
print("\nCluster Summary:")
print(cluster_summary)

# Collect the headline numbers in memory. 'db_index' is the score recorded
# for optimal_k during the scan, not a score recomputed from the fit above.
# NOTE(review): indexing with optimal_k-2 assumes db_scores[0] is k = 2 -- confirm.
results = {
    'optimal_k': optimal_k,
    'db_index': db_scores[optimal_k-2],
    'cluster_summary': cluster_summary.to_dict()
}

# Save to PDF.
# A bare plt.savefig() writes whatever pyplot considers the "current figure".
# Under Jupyter's inline backend the figure drawn in an earlier cell is closed
# when that cell finishes, so the call would write an empty page. Rebuild the
# Davies-Bouldin plot here and save it through its own handle instead.
report_fig, report_ax = plt.subplots(figsize=(10, 6))
report_ax.plot(range(2, len(db_scores) + 2), db_scores, marker='o')
report_ax.set_xlabel('Number of Clusters')
report_ax.set_ylabel('Davies-Bouldin Index')
report_ax.set_title('Davies-Bouldin Index vs Number of Clusters')
report_ax.grid(True)
report_fig.savefig('../reports/clustering_results.pdf')
# Close the figure so it is not rendered a second time below this cell.
plt.close(report_fig)
print("\nResults saved to '../reports/clustering_results.pdf'")