In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import davies_bouldin_score
from sklearn.decomposition import PCA

# --- Load the raw tables and attach customer attributes to each transaction ---
customers = pd.read_csv('/content/Customers.csv')
transactions = pd.read_csv('/content/Transactions.csv')
merged_data = transactions.merge(customers, on='CustomerID')

# --- Build one feature row per customer ---
# TotalTransactions: number of transactions, TotalSpending: summed TotalValue,
# UniqueProducts: count of distinct ProductID values bought.
customer_features = (
    merged_data
    .groupby('CustomerID')
    .agg(
        TotalTransactions=('TransactionID', 'count'),
        TotalSpending=('TotalValue', 'sum'),
        UniqueProducts=('ProductID', lambda x: len(set(x))),
    )
    .reset_index()
)

# --- Standardise the three numeric features before clustering ---
feature_columns = ['TotalTransactions', 'TotalSpending', 'UniqueProducts']
scaler = StandardScaler()
features_scaled = scaler.fit_transform(customer_features[feature_columns])

# --- Elbow sweep: record the KMeans inertia for k = 2..10 ---
inertia = []
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, random_state=42).fit(features_scaled)
    inertia.append(kmeans.inertia_)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(2, 11), inertia, marker='o')
ax.set_title('Elbow Method for Optimal k')
ax.set_xlabel('Number of Clusters (k)')
ax.set_ylabel('Inertia')
plt.show()

# --- Fit the final model with the chosen k and label every customer ---
optimal_k = 2
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
customer_features['Cluster'] = kmeans.fit_predict(features_scaled)

# Davies-Bouldin index of the final labelling, computed on the scaled features.
db_index = davies_bouldin_score(features_scaled, customer_features['Cluster'])
print(f'Davies-Bouldin Index: {db_index}')

# --- Project the scaled features to 2-D with PCA for plotting ---
pca = PCA(n_components=2)
features_pca = pca.fit_transform(features_scaled)
pca_df = pd.DataFrame(features_pca, columns=['PCA1', 'PCA2']).assign(
    Cluster=customer_features['Cluster']
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=pca_df, x='PCA1', y='PCA2', hue='Cluster', palette='Set1', s=100, ax=ax)
ax.set_title('Customer Segmentation Clusters')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
ax.legend(title='Cluster')
plt.show()

# --- Persist the per-customer features together with their cluster label ---
customer_features.to_csv('Customer_Segmentation_Results.csv', index=False)

In [None]:
# Silhouette sweep: rebuild the per-customer feature matrix, then print the
# silhouette score of a KMeans fit for every k from 2 to 10.

# Imported in this cell because the notebook's first import cell does not bring
# in silhouette_score; without it this cell fails with NameError when the
# notebook is run top to bottom on a fresh kernel.
from sklearn.metrics import silhouette_score

customers = pd.read_csv('/content/Customers.csv')
transactions = pd.read_csv('/content/Transactions.csv')

# Attach customer attributes to each transaction, then aggregate per customer:
# transaction count, summed TotalValue and number of distinct products.
merged_data = transactions.merge(customers, on='CustomerID')
customer_features = merged_data.groupby('CustomerID').agg({
    'TransactionID': 'count',
    'TotalValue': 'sum',
    'ProductID': lambda x: len(set(x))
}).reset_index()

customer_features.columns = ['CustomerID', 'TotalTransactions', 'TotalSpending', 'UniqueProducts']

# Standardise the features so no single column dominates the distance metric.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(customer_features[['TotalTransactions', 'TotalSpending', 'UniqueProducts']])

# Fixed random_state keeps the sweep reproducible between runs.
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(features_scaled)
    score = silhouette_score(features_scaled, labels)
    print(f'Silhouette Score for k={k}: {score}')

In [None]:
# DBSCAN sweep: print the Davies-Bouldin index of the labelling produced by
# each candidate eps (min_samples fixed at 5) on the scaled feature matrix.

# Imported in this cell because the notebook's first import cell only imports
# KMeans; without it this cell fails with NameError on a fresh kernel.
from sklearn.cluster import DBSCAN

for eps in [0.3, 0.5, 0.7, 1.0]:
    dbscan = DBSCAN(eps=eps, min_samples=5)
    labels = dbscan.fit_predict(features_scaled)

    # DBSCAN marks noise points with the label -1; the metric below treats
    # every distinct label, including -1, as its own group.
    n_labels = len(set(labels))

    # davies_bouldin_score raises ValueError unless
    # 2 <= number of labels <= n_samples - 1, which happens e.g. when a small
    # eps marks every point as noise. Report and move on instead of aborting.
    if not 1 < n_labels < len(labels):
        print(f'Davies-Bouldin Score for DBSCAN (eps={eps}): undefined ({n_labels} distinct label(s))')
        continue

    db_index = davies_bouldin_score(features_scaled, labels)
    print(f'Davies-Bouldin Score for DBSCAN (eps={eps}): {db_index}')


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import davies_bouldin_score, silhouette_score
from sklearn.decomposition import PCA

# Final comparison: fit DBSCAN (eps=1.0) and KMeans (k=2) on the same scaled
# features, score both, plot both labellings in PCA space and print a summary.

customers = pd.read_csv('/content/Customers.csv')
transactions = pd.read_csv('/content/Transactions.csv')
merged_data = transactions.merge(customers, on='CustomerID')

# One row per customer: transaction count, summed TotalValue, distinct products.
customer_features = merged_data.groupby('CustomerID').agg({
    'TransactionID': 'count',
    'TotalValue': 'sum',
    'ProductID': lambda x: len(set(x))
}).reset_index()

customer_features.columns = ['CustomerID', 'TotalTransactions', 'TotalSpending', 'UniqueProducts']

scaler = StandardScaler()
features_scaled = scaler.fit_transform(customer_features[['TotalTransactions', 'TotalSpending', 'UniqueProducts']])

# NOTE(review): eps=1.0 is presumably the value picked by hand from the eps
# sweep earlier in the notebook - confirm it is still the preferred setting.
best_eps = 1.0
dbscan = DBSCAN(eps=best_eps, min_samples=5)
dbscan_labels = dbscan.fit_predict(features_scaled)

customer_features['DBSCAN_Cluster'] = dbscan_labels

# Compute the DBSCAN Davies-Bouldin index from the fitted labels instead of
# hard-coding it, so the printed summary always matches the data. The metric
# requires 2 <= distinct labels <= n_samples - 1; otherwise report NaN.
n_dbscan_labels = len(set(dbscan_labels))
if 1 < n_dbscan_labels < len(dbscan_labels):
    best_db_index = davies_bouldin_score(features_scaled, dbscan_labels)
else:
    best_db_index = float('nan')

kmeans = KMeans(n_clusters=2, random_state=42)
customer_features['KMeans_Cluster'] = kmeans.fit_predict(features_scaled)

# Same for KMeans: derive the index from the actual labelling.
db_index_kmeans = davies_bouldin_score(features_scaled, customer_features['KMeans_Cluster'])

silhouette_avg = silhouette_score(features_scaled, customer_features['KMeans_Cluster'])
print(f'Silhouette Score: {silhouette_avg}')

# Project to two principal components purely for visualisation.
pca = PCA(n_components=2)
features_pca = pca.fit_transform(features_scaled)

pca_df = pd.DataFrame(data=features_pca, columns=['PCA1', 'PCA2'])

pca_df['DBSCAN_Cluster'] = customer_features['DBSCAN_Cluster']
pca_df['KMeans_Cluster'] = customer_features['KMeans_Cluster']

plt.figure(figsize=(10, 6))
sns.scatterplot(data=pca_df, x='PCA1', y='PCA2', hue='DBSCAN_Cluster', palette='Set1', s=100)
plt.title(f'DBSCAN Customer Segmentation (eps={best_eps})')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.legend(title='DBSCAN Cluster')
plt.show()

plt.figure(figsize=(10, 6))
sns.scatterplot(data=pca_df, x='PCA1', y='PCA2', hue='KMeans_Cluster', palette='Set1', s=100)
plt.title('KMeans Customer Segmentation (k=2)')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.legend(title='KMeans Cluster')
plt.show()

print(f"Best DBSCAN Model: eps={best_eps}, Davies-Bouldin Index={best_db_index}")
print(f"KMeans Model: k=2, Davies-Bouldin Index={db_index_kmeans}")


In [None]:
# Report how many customers fall into each cluster label, ordered by label,
# first for the DBSCAN labelling and then for the KMeans labelling.
dbscan_cluster_sizes = customer_features['DBSCAN_Cluster'].value_counts().sort_index()
print("DBSCAN Cluster Sizes:", dbscan_cluster_sizes, sep="\n")

kmeans_cluster_sizes = customer_features['KMeans_Cluster'].value_counts().sort_index()
print("\nKMeans Cluster Sizes:", kmeans_cluster_sizes, sep="\n")
