# Task 3: Customer Segmentation / Clustering 

# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score, silhouette_score
from sklearn.decomposition import PCA

# Load the three input tables from CSV files in the working directory.
customers_df = pd.read_csv('Customers.csv')
transactions_df = pd.read_csv('Transactions.csv')
products_df = pd.read_csv('Products.csv')

# Join customers to their transactions (on CustomerID), then join that result
# to the product table (on ProductID). Both joins are inner joins, so rows
# without a match on the join key are dropped.
customer_transactions = (
    customers_df
    .merge(transactions_df, on='CustomerID', how='inner')
    .merge(products_df, on='ProductID', how='inner')
)

# Collapse the merged table to one row per CustomerID:
#   TotalSpend       - sum of TotalValue
#   TransactionCount - count of non-null TransactionID values
#   TotalQuantity    - sum of Quantity
customer_features = (
    customer_transactions
    .groupby('CustomerID')
    .agg(
        TotalSpend=('TotalValue', 'sum'),
        TransactionCount=('TransactionID', 'count'),
        TotalQuantity=('Quantity', 'sum'),
    )
    .reset_index()
)

# Preview the first few aggregated rows.
print(customer_features.head())

# Standardise the three per-customer features with StandardScaler so they are
# on a comparable scale before clustering.
feature_columns = ['TotalSpend', 'TransactionCount', 'TotalQuantity']
scaler = StandardScaler()
features_normalized = scaler.fit_transform(customer_features[feature_columns])

# Fit K-Means for every candidate cluster count and score each partition with
# the Davies-Bouldin index and the silhouette score.
#
# Both metrics require 2 <= number of labels <= n_samples - 1, so the upper
# bound of the search (10) is capped for small datasets instead of letting
# scikit-learn fail part-way through the sweep.
min_clusters = 2
max_clusters = min(10, features_normalized.shape[0] - 1)
if max_clusters < min_clusters:
    raise ValueError(
        'Need at least 3 customers to evaluate clusterings, got '
        f'{features_normalized.shape[0]}'
    )

cluster_results = []
for n_clusters in range(min_clusters, max_clusters + 1):
    # random_state fixes the seed used for centroid initialisation.
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    cluster_labels = kmeans.fit_predict(features_normalized)

    # Calculate Davies-Bouldin Index and Silhouette Score for this partition.
    db_index = davies_bouldin_score(features_normalized, cluster_labels)
    silhouette_avg = silhouette_score(features_normalized, cluster_labels)

    cluster_results.append({
        'n_clusters': n_clusters,
        'DB_Index': db_index,
        'Silhouette_Score': silhouette_avg,
        # Labels are stored so the chosen partition can be reused later
        # without refitting the model.
        'Cluster_Labels': cluster_labels,
    })

# Summary table: one row per candidate cluster count holding the two quality
# metrics (the per-sample label arrays are left out of the table).
summary_columns = ('n_clusters', 'DB_Index', 'Silhouette_Score')
cluster_results_df = pd.DataFrame(
    [
        {column: entry[column] for column in summary_columns}
        for entry in cluster_results
    ]
)

# The winning configuration is the one with the smallest Davies-Bouldin index.
best_result = min(cluster_results, key=lambda entry: entry['DB_Index'])

# Report the winner, followed by every candidate ordered by DB index.
print(f"Best result: {best_result['n_clusters']} clusters, DB Index: {best_result['DB_Index']}")
print(cluster_results_df.sort_values(by='DB_Index'))

# Project the standardised features onto two principal components so the
# selected clustering can be drawn in 2-D.
pca = PCA(n_components=2)
reduced_features = pca.fit_transform(features_normalized)
best_labels = best_result['Cluster_Labels']

# Draw one scatter series per cluster; legend entries are numbered from 1.
fig, ax = plt.subplots(figsize=(10, 6))
for cluster in range(best_result['n_clusters']):
    cluster_points = reduced_features[best_labels == cluster]
    ax.scatter(
        cluster_points[:, 0],
        cluster_points[:, 1],
        label=f'Cluster {cluster + 1}',
        alpha=0.7,
    )

ax.set_title('Customer Clusters Visualization (PCA Reduced)', fontsize=14)
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
ax.legend()
ax.grid(True)
plt.show()
