In [None]:
import pandas as pd
from datetime import datetime

# Read the raw customer and transaction tables from disk
customers_df = pd.read_csv('Customers.csv')
transactions_df = pd.read_csv('Transactions.csv')

# Attach customer attributes to every transaction row. pandas' default
# join is used, so only CustomerIDs present in both tables survive.
customer_transactions_df = transactions_df.merge(customers_df, on='CustomerID')

# Parse SignupDate into datetimes, then express account age as the number
# of whole days elapsed between signup and the moment this cell runs
signup_dates = pd.to_datetime(customer_transactions_df['SignupDate'])
customer_transactions_df['SignupDate'] = signup_dates
account_age = datetime.now() - signup_dates
customer_transactions_df['DaysSinceSignup'] = account_age.dt.days

# Collapse the transaction-level table to one row per customer.
# Named aggregation produces flat, explicitly labelled columns directly,
# in the same order the features are listed here.
customer_features = (
    customer_transactions_df
    .groupby('CustomerID')
    .agg(
        TotalSpending=('TotalValue', 'sum'),
        AvgSpending=('TotalValue', 'mean'),
        NumTransactions=('TransactionID', 'count'),
        DaysSinceSignup=('DaysSinceSignup', 'first'),
        Region=('Region', 'first'),
    )
    .reset_index()
)

# Expand the categorical Region column into one indicator column per region
customer_features = pd.get_dummies(customer_features, columns=['Region'])

# Standardize every feature column before clustering
from sklearn.preprocessing import StandardScaler

# CustomerID is an identifier, not a feature, so it is excluded from scaling
feature_matrix = customer_features.drop(columns='CustomerID')

scaler = StandardScaler()
customer_features_scaled = scaler.fit_transform(feature_matrix)

from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Determine the optimal number of clusters using the Elbow Method.
# The candidate range is defined once so the fitting loop and the plot's
# x-axis cannot drift apart.
k_values = range(2, 11)
inertia = []
for k in k_values:
    # n_init is pinned explicitly: scikit-learn changed its default from 10
    # to 'auto' (a single run for k-means++ initialisation), so relying on
    # the default makes inertia depend on the installed version and emits a
    # FutureWarning on the transition releases.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(customer_features_scaled)
    inertia.append(kmeans.inertia_)

# Plot the Elbow Method graph (inertia as a function of cluster count)
plt.figure(figsize=(8, 5))
plt.plot(k_values, inertia, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.title('Elbow Method')
plt.show()

# Based on the Elbow Method, choose the number of clusters
n_clusters = 4  # Example, choose based on the elbow point
# Same explicit n_init as the elbow scan, so the final model is fitted
# under the settings that produced the curve it was chosen from.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
# Fit the final model and store each customer's cluster label as a column
customer_features['Cluster'] = kmeans.fit_predict(customer_features_scaled)

from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import silhouette_score

# Both metrics score the same assignment: the scaled feature matrix
# against the cluster label stored for each customer
cluster_labels = customer_features['Cluster']

# Davies-Bouldin index of the clustering
db_index = davies_bouldin_score(customer_features_scaled, cluster_labels)
print(f'Davies-Bouldin Index: {db_index}')

# Mean silhouette coefficient over all customers
silhouette_avg = silhouette_score(customer_features_scaled, cluster_labels)
print(f'Silhouette Score: {silhouette_avg}')

from sklearn.decomposition import PCA

# Project the scaled features onto their first two principal components
# so the clusters can be drawn on a plane
pca = PCA(n_components=2)
customer_features_pca = pca.fit_transform(customer_features_scaled)

# Scatter the customers in PCA space, coloured by assigned cluster
fig, ax = plt.subplots(figsize=(10, 7))
points = ax.scatter(
    customer_features_pca[:, 0],
    customer_features_pca[:, 1],
    c=customer_features['Cluster'],
    cmap='viridis',
    marker='o',
)
ax.set_title('Customer Clusters')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
fig.colorbar(points, ax=ax, label='Cluster')
plt.show()