In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score, silhouette_score, calinski_harabasz_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')

# Read and preprocess data
def load_and_preprocess_data():
    # Read datasets
    customers_df = pd.read_csv('Customers.csv')
    transactions_df = pd.read_csv('Transactions.csv')
    
    # Convert dates
    customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'])
    transactions_df['TransactionDate'] = pd.to_datetime(transactions_df['TransactionDate'])
    
    # Calculate customer metrics
    customer_metrics = transactions_df.groupby('CustomerID').agg({
        'TransactionID': 'count',
        'TotalValue': ['sum', 'mean', 'std'],
        'Quantity': ['sum', 'mean']
    }).round(2)
    
    # Flatten column names
    customer_metrics.columns = [
        'transaction_count', 'total_spend', 'avg_transaction_value', 
        'std_transaction_value', 'total_quantity', 'avg_quantity'
    ]
    
    # Calculate recency and frequency
    latest_date = transactions_df['TransactionDate'].max()
    recency = transactions_df.groupby('CustomerID')['TransactionDate'].max()
    recency = (latest_date - recency).dt.days
    
    # Customer account age
    account_age = (latest_date - customers_df['SignupDate']).dt.days
    
    # Combine features
    features_df = pd.concat([
        customer_metrics,
        recency.rename('recency'),
        account_age.rename('account_age')
    ], axis=1).fillna(0)
    
    return features_df, customers_df

# Evaluate clustering performance
def evaluate_clustering(X, labels):
    metrics = {
        'Davies-Bouldin Index': davies_bouldin_score(X, labels),
        'Silhouette Score': silhouette_score(X, labels),
        'Calinski-Harabasz Index': calinski_harabasz_score(X, labels)
    }
    return metrics

# Plot elbow curve
def plot_elbow_curve(X, max_clusters=10):
    inertias = []
    db_scores = []
    
    for k in range(2, max_clusters + 1):
        kmeans = KMeans(n_clusters=k, random_state=42)
        kmeans.fit(X)
        inertias.append(kmeans.inertia_)
        db_scores.append(davies_bouldin_score(X, kmeans.labels_))
    
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
    
    # Elbow curve
    ax1.plot(range(2, max_clusters + 1), inertias, marker='o')
    ax1.set_xlabel('Number of clusters (k)')
    ax1.set_ylabel('Inertia')
    ax1.set_title('Elbow Method')
    
    # Davies-Bouldin scores
    ax2.plot(range(2, max_clusters + 1), db_scores, marker='o', color='orange')
    ax2.set_xlabel('Number of clusters (k)')
    ax2.set_ylabel('Davies-Bouldin Index')
    ax2.set_title('Davies-Bouldin Index vs. Number of Clusters')
    
    plt.tight_layout()
    plt.savefig('elbow_curve.png')
    plt.close()
    
    return inertias, db_scores

# Visualize clusters
def visualize_clusters(X, labels, features_df):
    # PCA for dimensionality reduction
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X)
    
    # Scatter plot of clusters
    plt.figure(figsize=(10, 6))
    scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=labels, cmap='viridis')
    plt.title('Customer Segments (PCA)')
    plt.xlabel('First Principal Component')
    plt.ylabel('Second Principal Component')
    plt.colorbar(scatter)
    plt.savefig('cluster_visualization.png')
    plt.close()
    
    # Feature distribution across clusters
    cluster_df = features_df.copy()
    cluster_df['Cluster'] = labels
    
    # Calculate cluster profiles
    cluster_profiles = cluster_df.groupby('Cluster').mean()
    
    # Heatmap of cluster profiles
    plt.figure(figsize=(12, 8))
    sns.heatmap(cluster_profiles.T, cmap='YlOrRd', center=0, annot=True, fmt='.2f')
    plt.title('Cluster Profiles')
    plt.savefig('cluster_profiles.png')
    plt.close()
    
    return cluster_profiles

# Main analysis
def main():
    # Load and preprocess data
    features_df, customers_df = load_and_preprocess_data()
    
    # Scale features
    scaler = StandardScaler()
    X = scaler.fit_transform(features_df)
    
    # Plot elbow curve and get scores
    inertias, db_scores = plot_elbow_curve(X)
    
    # Find optimal number of clusters (minimum DB score)
    optimal_k = db_scores.index(min(db_scores)) + 2  # Add 2 because we started from k=2
    
    # Perform clustering with optimal k
    kmeans = KMeans(n_clusters=optimal_k, random_state=42)
    labels = kmeans.fit_predict(X)
    
    # Evaluate clustering
    metrics = evaluate_clustering(X, labels)
    
    # Visualize results
    cluster_profiles = visualize_clusters(X, labels, features_df)
    
    # Generate report
    report = f"""
    Customer Segmentation Analysis Report
    
    1. Optimal Number of Clusters: {optimal_k}
    
    2. Clustering Metrics:
    {'='* 50}
    Davies-Bouldin Index: {metrics['Davies-Bouldin Index']:.4f}
    Silhouette Score: {metrics['Silhouette Score']:.4f}
    Calinski-Harabasz Index: {metrics['Calinski-Harabasz Index']:.4f}
    
    3. Cluster Sizes:
    {'='* 50}
    {pd.Series(labels).value_counts().sort_index().to_string()}
    
    4. Key Characteristics of Each Cluster:
    {'='* 50}
    """
    
    # Add cluster characteristics
    for cluster in range(optimal_k):
        report += f"\nCluster {cluster}:\n"
        cluster_features = cluster_profiles.loc[cluster].sort_values(ascending=False)[:3]
        report += f"Top features: {', '.join(f'{feat}: {val:.2f}' for feat, val in cluster_features.items())}\n"
    
    # Save report
    with open('clustering_report.txt', 'w') as f:
        f.write(report)
    
    return optimal_k, metrics, cluster_profiles

if __name__ == "__main__":
    optimal_k, metrics, cluster_profiles = main()
    print("Analysis complete. Check clustering_report.txt for detailed results.")
    print(f"\nOptimal number of clusters: {optimal_k}")
    print(f"Davies-Bouldin Index: {metrics['Davies-Bouldin Index']:.4f}")

Analysis complete. Check clustering_report.txt for detailed results.

Optimal number of clusters: 2
Davies-Bouldin Index: 0.5812
