In [None]:
!pip install hdbscan scikit-learn pandas umap-learn

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

# Load the raw product table and derive an integer code for each category.
df = pd.read_csv('/content/Electronics_Data.csv')
df['category_encoded'] = df['category'].astype('category').cat.codes

# Feature matrix used for both outlier detection and clustering.
# NOTE(review): category_encoded is a nominal code that gets scaled and used
# as if it were numeric — confirm this is intended.
feature_columns = ['price', 'sales', 'popularity_score', 'category_encoded']
X = df[feature_columns]

# Standardise every feature to zero mean / unit variance.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Isolation Forest labels each row 1 (inlier) or -1 (outlier); with
# contamination=0.03 roughly 3% of the rows are flagged as outliers.
iso = IsolationForest(contamination=0.03, random_state=42)
iso.fit(X_scaled)
outliers = iso.predict(X_scaled)

# Keep only the inlier rows of the scaled matrix.
inlier_mask = outliers == 1
X_filtered = X_scaled[inlier_mask]

In [None]:
import umap
from itertools import product
from sklearn.metrics import silhouette_score
import hdbscan
import umap

# Project the inlier rows onto a single UMAP component; HDBSCAN is then run
# on this 1-D embedding. random_state pins the embedding between runs.
# NOTE(review): n_neighbors=2 with n_components=1 is a very local, very lossy
# projection (the recorded run below ends up with 100+ clusters) — confirm
# these values are intentional.
umap_model = umap.UMAP(n_neighbors=2, min_dist=0.03, n_components=1, random_state=42)
X_umap = umap_model.fit_transform(X_filtered)

# Hyper-parameter grid explored for HDBSCAN.
param_grid = {
    'min_cluster_size': [3, 5, 10, 15, 20, 25, 30],
    'min_samples': [1, 2, 3, 4, 5, 6]
}

# Start below any attainable silhouette value so the first valid candidate
# is always accepted, and initialise every "best_*" name so later cells never
# hit an undefined variable.
best_score = -np.inf
best_model = None
best_labels = None

for min_cluster_size, min_samples in product(param_grid['min_cluster_size'], param_grid['min_samples']):
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        cluster_selection_epsilon=0.1
    )
    labels = clusterer.fit_predict(X_umap)

    # HDBSCAN marks noise with -1; noise is excluded from the evaluation.
    non_noise_mask = labels != -1
    n_points = int(non_noise_mask.sum())
    n_clusters = np.unique(labels[non_noise_mask]).size

    # silhouette_score requires 2 <= n_labels <= n_samples - 1. Counting the
    # noise label as a cluster (as a plain len(set(labels)) check would) lets
    # a "one cluster + noise" result through and makes silhouette_score raise.
    if n_clusters < 2 or n_clusters >= n_points:
        continue

    score = silhouette_score(X_umap[non_noise_mask], labels[non_noise_mask])
    if score > best_score:
        best_score = score
        best_model = clusterer
        best_labels = labels

if best_model is None:
    # Fail loudly here instead of printing a meaningless score and crashing
    # later when best_model is used.
    raise RuntimeError(
        "No HDBSCAN configuration produced at least two non-noise clusters; "
        "adjust the UMAP settings or the parameter grid."
    )

# NOTE(review): the score ignores noise points, so a configuration that
# discards many points as noise can still rank highly.
print(f"✅ Best Silhouette Score: {best_score:.4f}")

✅ Best Silhouette Score: 0.8368

In [None]:
# The parameter search leaves best_model as None when no configuration
# qualified; stop with a clear message instead of an AttributeError.
if best_model is None:
    raise RuntimeError(
        "No HDBSCAN model was selected; run the parameter search cell first."
    )

# best_model was already fitted on X_umap during the search, so its labels_
# attribute holds exactly the labels that were scored — no need to refit.
final_labels = best_model.labels_

# Attach the labels to the inlier rows of the original frame (same boolean
# mask that produced X_filtered, so row order lines up with the labels).
clustered_data = df[outliers == 1].copy()
clustered_data['cluster'] = final_labels

# Persist the labelled dataset for the visualisation / inspection cells.
clustered_data.to_csv('dataset_dbms.csv', index=False)

In [None]:
# Summarise the clustering: rows per cluster label (ordered by label) and the
# distinct product categories that appear inside each cluster.
cluster_column = clustered_data['cluster']
cluster_counts = cluster_column.value_counts().sort_index()

categories_by_cluster = clustered_data.groupby('cluster')['category']
unique_categories_per_cluster = categories_by_cluster.unique()

print("\nNumber of entries per cluster:\n", cluster_counts)
print("\nUnique categories per cluster:\n", unique_categories_per_cluster)


Number of entries per cluster:
 cluster
-1      7
 0      3
 1      4
 2      4
 3      9
       ..
 101    3
 102    3
 103    3
 104    3
 105    4
Name: count, Length: 107, dtype: int64

Unique categories per cluster:
 cluster
-1      [Smartwatch, Laptop, Tablet]
 0                      [Smartwatch]
 1              [Camera, Headphones]
 2              [Camera, Headphones]
 3      [Laptop, Tablet, Headphones]
                    ...             
 101            [Tablet, Smartwatch]
 102            [Laptop, Smartphone]
 103                        [Laptop]
 104            [Headphones, Camera]
 105                        [Camera]
Name: category, Length: 107, dtype: object


In [1]:
import pandas as pd
import umap
import plotly.express as px

# Reload the labelled dataset written by the clustering stage.
df = pd.read_csv("dataset_dbms.csv")

features = df[['price', 'sales', 'popularity_score', 'category_encoded']]

# Standardise each feature before embedding. The clustering stage worked on
# standardised features; feeding raw values to UMAP would let the
# largest-magnitude columns dominate the distances and produce a picture
# unrelated to the space the clusters were found in.
# ddof=0 matches the population standard deviation used by StandardScaler;
# constant columns get a divisor of 1 so they map to 0 instead of NaN.
# NOTE(review): statistics are computed on the rows in this file (outliers
# already removed), so values differ slightly from the original scaler's.
feature_std = features.std(ddof=0)
feature_std = feature_std.mask(feature_std == 0, 1.0)
features_scaled = (features - features.mean()) / feature_std

# 3-D UMAP embedding used purely for visualisation.
umap_3d = umap.UMAP(n_components=3, random_state=42)
embedding = umap_3d.fit_transform(features_scaled)

# Store the three embedding coordinates alongside the original columns.
df['X'] = embedding[:, 0]
df['Y'] = embedding[:, 1]
df['Z'] = embedding[:, 2]

# Interactive 3-D scatter coloured by cluster label, with product details
# shown on hover.
# NOTE(review): 'cluster' is an integer column, so the colouring is presumably
# a continuous scale rather than one colour per cluster — confirm that is
# the desired look.
fig = px.scatter_3d(
    df,
    x='X',
    y='Y',
    z='Z',
    color='cluster',
    hover_data=['name','price', 'sales', 'popularity_score','quantity', 'category']
)

fig.show()

  from .autonotebook import tqdm as notebook_tqdm
  warn(


In [3]:
# Show five random rows as a sanity check; random_state keeps the displayed
# sample identical across "Restart & Run All" executions.
df.sample(5, random_state=42)

Unnamed: 0,product_id,name,category,quantity,price,sales,popularity_score,category_encoded,cluster,supplier
177,178,Asus ROG Zephyrus G14 v8,Laptop,402,1131.18,209,0.68,2,23,Asus
354,356,Bose QuietComfort 45 v26,Headphones,206,1502.26,744,0.99,1,23,Sennheiser
481,483,Nikon Z6 II v89,Camera,204,1265.48,607,0.44,0,44,Fujifilm
312,314,Galaxy Watch 6 v35,Smartwatch,91,1902.06,788,0.81,4,48,Samsung
75,76,Canon EOS R6 v66,Camera,422,1744.46,865,0.03,0,38,Fujifilm


In [4]:
# Choose a cluster label and print every row that was assigned to it.
cluster_number = 5

in_cluster = df['cluster'].eq(cluster_number)
cluster_data = df.loc[in_cluster]

print(f"Data for Cluster {cluster_number}:\n{cluster_data}")

Data for Cluster 5:
     product_id                name    category  quantity    price  sales  \
300         302   MacBook Air M3 v2      Laptop       210  1796.08    830   
348         350  MacBook Air M3 v88      Laptop       394  1837.38    927   
393         395   Xiaomi 13 Pro v31  Smartphone       388  1905.45    859   
456         458       iPhone 15 v88  Smartphone       459  1969.76    949   

     popularity_score  category_encoded  cluster supplier  
300              0.52                 2        5     Dell  
348              0.65                 2        5     Dell  
393              0.98                 3        5  OnePlus  
456              0.77                 3        5   Xiaomi  
