# In [7]:
from IPython import get_ipython
from IPython.display import display
# %%
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
import plotly.express as px

# ---------------------------------------------------------------------------
# K-Means clustering pipeline:
#   load CSVs -> merge on CustomerID -> scale numeric columns -> elbow plot
#   -> fit final model -> Davies-Bouldin score -> scatter matrix -> CSV export
# ---------------------------------------------------------------------------

# Load data
customers_df = pd.read_csv('/content/Customers.csv')
transactions_df = pd.read_csv('/content/Transactions.csv')

# Inner join: only rows whose CustomerID appears in both files survive.
data = pd.merge(transactions_df, customers_df, on='CustomerID', how='inner')

# NOTE(review): every row of the merged frame is clustered individually.
# Presumably Transactions.csv holds one row per transaction, in which case the
# clusters describe transactions rather than customers -- confirm whether a
# per-customer aggregation is wanted before relying on the segments.

# Select numerical features for clustering. CustomerID is excluded from the
# feature matrix (an identifier carries no distance information) but is kept
# in `data` so the exported rows can still be traced back to a customer.
features = data.drop(columns=['CustomerID']).select_dtypes(
    include=['float64', 'int64']
)
if features.empty:
    # Fail early with a readable message instead of an opaque scikit-learn
    # error about a zero-sized array.
    raise ValueError(
        "No numeric data available for clustering: the merged frame has "
        f"{features.shape[0]} rows and {features.shape[1]} numeric columns."
    )

# Standardise so that columns on large scales do not dominate the Euclidean
# distances K-Means relies on.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Find the optimal number of clusters using the elbow method.
# A single range object drives both the fits and the plot axis so the two
# cannot drift apart.
cluster_range = range(2, 11)
inertia = [
    # n_init is pinned because its default differs between scikit-learn
    # releases; leaving it implicit makes results version-dependent.
    KMeans(n_clusters=k, random_state=42, n_init=10).fit(scaled_features).inertia_
    for k in cluster_range
]

fig = px.line(
    x=list(cluster_range),
    y=inertia,
    markers=True,
    labels={'x': 'Number of Clusters', 'y': 'Inertia'},
    title='Elbow Method for Optimal Clusters',
)
fig.show()

# Choose the optimal number of clusters (e.g., 4) -- read off the elbow plot.
optimal_clusters = 4
kmeans = KMeans(n_clusters=optimal_clusters, random_state=42, n_init=10)
labels = kmeans.fit_predict(scaled_features)

# Add cluster labels to the original data (kept as integers for the export).
data['Cluster'] = labels

# Davies-Bouldin Index: lower values indicate better-separated clusters.
db_index = davies_bouldin_score(scaled_features, labels)

# Visualize clusters (scatter matrix for up to the first 4 numerical features).
# Slicing already copes with fewer than four columns, so no length check needed.
scatter_features = list(features.columns[:4])

# Plotly treats a numeric `color` column as continuous and draws a colour bar;
# plotting from a copy with string labels gives one discrete colour per cluster
# without altering the integer labels that are written to disk.
plot_data = data.assign(Cluster=data['Cluster'].astype(str))
fig = px.scatter_matrix(
    plot_data,
    dimensions=scatter_features,
    color='Cluster',
    category_orders={'Cluster': [str(i) for i in range(optimal_clusters)]},
    title='Scatter Matrix of Clusters',
    labels={col: col for col in scatter_features},
)
fig.update_traces(diagonal_visible=False)
fig.show()

# Save results to a CSV file
data.to_csv('/content/Clustered_Customers.csv', index=False)

# Generate a report
report = f"""Clustering Report:

Number of clusters: {optimal_clusters}
Davies-Bouldin Index: {db_index:.2f}

"""
print(report)

# Output (recorded from an earlier run of the cell above):
# Clustering Report:
#
# Number of clusters: 4
# Davies-Bouldin Index: 0.74


