<a href="https://colab.research.google.com/github/A-P-Dharanya/Zeotap/blob/main/DHARANYA_AP_Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Task 3: Customer Segmentation / Clustering

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw customer and transaction tables from /content.
customers_df = pd.read_csv('/content/Customers.csv')
transactions_df = pd.read_csv('/content/Transactions.csv')

# Inner join on CustomerID: only transactions whose customer exists in the
# customer table (and only customers with at least one transaction) survive.
merged_data = pd.merge(
    left=transactions_df,
    right=customers_df,
    how='inner',
    on='CustomerID',
)

# Build one row per customer with three RFM-style features:
#   total_spend    - sum of TotalValue over the customer's transactions
#   purchase_count - number of transactions
#   recency        - whole days between the customer's last purchase and the
#                    most recent purchase in the whole data set
#
# Dates are parsed *before* aggregating so that 'max' selects the
# chronologically latest purchase instead of the lexicographically largest
# string (the two only coincide for ISO-like date formats).
purchase_dates = pd.to_datetime(merged_data['TransactionDate'])

customer_profile = (
    merged_data.assign(TransactionDate=purchase_dates)  # leaves merged_data untouched
    .groupby('CustomerID')
    .agg(
        total_spend=('TotalValue', 'sum'),
        purchase_count=('TotalValue', 'count'),
        last_purchase_date=('TransactionDate', 'max'),
    )
    .reset_index()
)

# Anchor recency to the latest transaction in the data rather than to the wall
# clock. A wall-clock reference carries a time-of-day component, so the floored
# day counts of different customers tick over at different moments and the
# standardised features (and therefore the clusters) could differ between runs.
snapshot_date = purchase_dates.max()
customer_profile['recency'] = (snapshot_date - customer_profile['last_purchase_date']).dt.days

# The raw timestamp is only needed to derive recency.
customer_profile.drop(columns=['last_purchase_date'], inplace=True)

# Standardise the three profile features so they are on a comparable scale
# before clustering.
profile_features = customer_profile[['total_spend', 'purchase_count', 'recency']]
scaler = StandardScaler()
scaled_features = scaler.fit_transform(profile_features)

# Elbow method: fit K-Means for every k in 2..10 and record the inertia
# (within-cluster sum of squares) of each fit, then plot inertia against k.
cluster_counts = range(2, 11)
wcss = []
for i in cluster_counts:
    kmeans = KMeans(
        n_clusters=i,
        init='k-means++',
        max_iter=300,
        n_init=10,
        random_state=42,
    ).fit(scaled_features)
    wcss.append(kmeans.inertia_)

elbow_fig, elbow_ax = plt.subplots(figsize=(8, 6))
elbow_ax.plot(cluster_counts, wcss, marker='o')
elbow_ax.set_title('Elbow Method for Optimal Clusters')
elbow_ax.set_xlabel('Number of Clusters')
elbow_ax.set_ylabel('WCSS (Within-Cluster Sum of Squares)')
plt.show()
# Final model. The cluster count is fixed by hand here (presumably read off
# the elbow plot - it is not derived programmatically).
optimal_clusters = 4
kmeans = KMeans(
    n_clusters=optimal_clusters,
    init='k-means++',
    max_iter=300,
    n_init=10,
    random_state=42,
)
customer_profile['Cluster'] = kmeans.fit_predict(scaled_features)

# Cluster-quality metrics computed on the standardised features.
cluster_labels = customer_profile['Cluster']
db_index = davies_bouldin_score(scaled_features, cluster_labels)
silhouette_avg = silhouette_score(scaled_features, cluster_labels)

for report_line in (
    f"Optimal Number of Clusters: {optimal_clusters}",
    f"Davies-Bouldin Index: {db_index}",
    f"Silhouette Score: {silhouette_avg}",
):
    print(report_line)

# Visualization of the clusters: project the standardised features onto their
# first two principal components and colour each customer by assigned cluster.
pca = PCA(n_components=2)
pca_components = pca.fit_transform(scaled_features)

segments_fig, segments_ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x=pca_components[:, 0],
    y=pca_components[:, 1],
    hue=customer_profile['Cluster'],
    palette='Set1',
    s=100,
    alpha=0.7,
    ax=segments_ax,
)
segments_ax.set_title('Customer Segments Visualized with PCA')
segments_ax.set_xlabel('PCA Component 1')
segments_ax.set_ylabel('PCA Component 2')
plt.show()
# Map the K-Means centroids into the same 2-D PCA space as the customers and
# redraw the scatter plot with the centroids overlaid as black X markers.
centers_pca = pca.transform(kmeans.cluster_centers_)

centroid_fig, centroid_ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x=pca_components[:, 0],
    y=pca_components[:, 1],
    hue=customer_profile['Cluster'],
    palette='Set1',
    s=100,
    alpha=0.7,
    ax=centroid_ax,
)
sns.scatterplot(
    x=centers_pca[:, 0],
    y=centers_pca[:, 1],
    s=200,
    color='black',
    marker='X',
    label='Centroids',
    ax=centroid_ax,
)
centroid_ax.set_title('Clusters and Their Centroids in PCA Space')
centroid_ax.set_xlabel('PCA Component 1')
centroid_ax.set_ylabel('PCA Component 2')
centroid_ax.legend()
plt.show()
