In [12]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score, silhouette_score, calinski_harabasz_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from fpdf import FPDF

In [13]:
customers_df = pd.read_csv('Customers.csv')
transactions_df = pd.read_csv('Transactions.csv')

In [14]:
transaction_features = transactions_df.groupby('CustomerID').agg({
    'TransactionID': 'count',
    'TotalValue': ['sum', 'mean', 'std'],
    'Quantity': ['sum', 'mean']
})

In [15]:
transaction_features.columns = [f"{col[0]}_{col[1]}" for col in transaction_features.columns]

In [16]:
features = transaction_features.fillna(0)

In [17]:
scaler = StandardScaler()
normalized_features = scaler.fit_transform(features)

In [18]:
scores = []
best_score, best_k = float('inf'), 2
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(normalized_features)
    score = davies_bouldin_score(normalized_features, labels)
    scores.append(score)
    if score < best_score:
        best_score, best_k = score, k



In [19]:
final_kmeans = KMeans(n_clusters=best_k, random_state=42)
clusters = final_kmeans.fit_predict(normalized_features)



In [20]:
pca = PCA(n_components=2)
coords = pca.fit_transform(normalized_features)
plt.scatter(coords[:, 0], coords[:, 1], c=clusters, cmap='viridis')
plt.title(f"Customer Segments (k={best_k})")
plt.xlabel("First Principal Component")
plt.ylabel("Second Principal Component")
plt.colorbar(label='Cluster')
plt.savefig('clusters.png')
plt.close()

In [10]:
silhouette = silhouette_score(normalized_features, clusters)
calinski = calinski_harabasz_score(normalized_features, clusters)

In [21]:
pdf = FPDF()
pdf.set_auto_page_break(auto=True, margin=15)
pdf.add_page()


pdf.set_font('Arial', 'B', 16)
pdf.cell(0, 10, 'Customer Segmentation Clustering Report', ln=True, align='C')


pdf.set_font('Arial', '', 12)
pdf.ln(10)  

pdf.cell(0, 10, f"Number of clusters formed: {best_k}", ln=True)
pdf.cell(0, 10, f"Davies-Bouldin Index (DB Index): {best_score:.3f}", ln=True)
pdf.cell(0, 10, f"Silhouette Score: {silhouette:.3f}", ln=True)
pdf.cell(0, 10, f"Calinski-Harabasz Score: {calinski:.3f}", ln=True)


pdf.ln(10) 
pdf.cell(0, 10, 'Cluster visualization:', ln=True)
pdf.image('clusters.png', x=10, y=pdf.get_y(), w=180)


pdf.output('Customer_Segmentation_Report.pdf')

print(f"Clustering complete. Number of clusters: {best_k}, DB Index: {best_score:.3f}")
print("Clustering report saved as 'Customer_Segmentation_Report.pdf'.")

Clustering complete. Number of clusters: 7, DB Index: 1.120
Clustering report saved as 'Customer_Segmentation_Report.pdf'.


  pdf.set_font('Arial', 'B', 16)
  pdf.cell(0, 10, 'Customer Segmentation Clustering Report', ln=True, align='C')
  pdf.set_font('Arial', '', 12)
  pdf.cell(0, 10, f"Number of clusters formed: {best_k}", ln=True)
  pdf.cell(0, 10, f"Davies-Bouldin Index (DB Index): {best_score:.3f}", ln=True)
  pdf.cell(0, 10, f"Silhouette Score: {silhouette:.3f}", ln=True)
  pdf.cell(0, 10, f"Calinski-Harabasz Score: {calinski:.3f}", ln=True)
  pdf.cell(0, 10, 'Cluster visualization:', ln=True)
