# Preprocessing

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import tensorflow as tf

(X_train, y_train), (X_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()

X = np.concatenate((X_train, X_test), axis=0)
y = np.concatenate((y_train, y_test), axis=0)

2025-02-13 17:51:48.629213: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-13 17:51:49.004688: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-13 17:51:49.007790: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
X = X.reshape(70000, 784)
X.shape

(70000, 784)

In [4]:
y.shape

(70000,)

In [6]:
sample_ind = np.array([])

for l in range(10):
    sample_ind = np.concatenate((sample_ind, np.where(y == l)[0][:500]))


In [7]:
sample_ind = sample_ind.astype(int)

In [8]:
sample_x = X[sample_ind]
sample_x

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

# DBSCAN Implementation

In [9]:
df = pd.DataFrame(sample_x)
df1 = df.copy()

In [60]:
def euclidean_distances(mat):
    from tqdm import tqdm
    import numpy as np
    import pandas as pd

    if isinstance(mat, (pd.DataFrame, pd.Series)):
        mat = mat.to_numpy()

    dot_product = np.dot(mat, mat.T)
    xi2 = np.sum(mat**2, axis=1)
    n = len(xi2)
    distances = np.zeros((n, n))

    for i in tqdm(range(n)):
        for j in range(n):
            distances[i, j] = np.sqrt(max(xi2[i] + xi2[j] - (2 * dot_product[i, j]), 0))

    return distances


In [61]:
''' Given a single point from the dataframe, this function will traverse all the neigbhours 
 of the given node (until it cannot find new unique neigbhours). '''
 
from tqdm import tqdm

def dfs(df, node_id, visited, current_cluster):
    stack = [node_id]
    
    with tqdm(total=len(df), desc="DFS Progress") as pbar:
        while len(stack) != 0:
            node_id = stack.pop()
            df.at[node_id, 'cluster'] = current_cluster
            
            if visited[node_id]:
                continue
            
            visited[node_id] = 1
            neigbhours = [int(i) for i in df['neighbors'][node_id].split(',')]

            for neigbhour in neigbhours:
                stack.append(neigbhour)

            pbar.update(1) 

    return df

        
        
        

In [62]:
from tqdm import tqdm
import numpy as np

def create_neighbors(ep, df):
    dist_matrix = euclidean_distances(df)  

    n = dist_matrix.shape[0]

    neighbors = []

    # Using tqdm to track progress of the loop
    for i in tqdm(range(n), desc="Processing Neighbors", unit="point"):
        current_neighbors = np.where(dist_matrix[i] < ep)[0]  
        current_neighbors = ",".join(map(str, current_neighbors.tolist()))  
        if len(current_neighbors) == 0:
             current_neighbors = str(i)
        neighbors.append(current_neighbors) 

    return np.array(neighbors)


In [91]:
from tqdm import tqdm
import numpy as np

def dbscan(ep, minpts, df):
    n = df.shape[0]
    neighbors = create_neighbors(ep, df)
    
    df1 = df.copy()
    df1['neighbors'] = neighbors
    df1['cluster'] = np.array([0] * n)

    curr_cluster = 1

    with tqdm(total=n, desc="DBSCAN Progress") as pbar:
        for pt in range(n):
            curr_pt = pt

            if df1.at[curr_pt, 'cluster'] != 0:
                pbar.update(1)
                continue
            
            neigbhours = [int(i) for i in df1['neighbors'][curr_pt].split(',')]

            if len(neigbhours) >= minpts:
                df1 = dfs(df1.copy(), curr_pt, np.array([0] * n), curr_cluster)
                curr_cluster += 1
            else:
                for neigbhour in neigbhours:
                    if len(neigbhours) >= minpts:
                        df1 = dfs(df1.copy(), curr_pt, np.array([0] * n), curr_cluster)
                        curr_cluster += 1 

            pbar.update(1)

    return df1


# DBSCAN Fashion

In [77]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2) 
X_pca = pca.fit_transform(df2)

In [79]:
X_pca = pd.DataFrame(X_pca)

In [80]:
df2 = df.copy()
df2 = dbscan(8, 4, X_pca)

100%|██████████| 5000/5000 [00:43<00:00, 115.09it/s]
Processing Neighbors: 100%|██████████| 5000/5000 [00:00<00:00, 92236.43point/s]
DFS Progress:   0%|          | 4/5000 [00:00<00:01, 4527.04it/s]
DFS Progress:   0%|          | 5/5000 [00:00<00:01, 4729.71it/s]
DFS Progress:   0%|          | 5/5000 [00:00<00:00, 5315.97it/s]
DFS Progress:   0%|          | 4/5000 [00:00<00:01, 4682.45it/s]
DFS Progress:   0%|          | 7/5000 [00:00<00:00, 5819.65it/s]
DFS Progress:   0%|          | 5/5000 [00:00<00:00, 5123.75it/s]
DFS Progress:   0%|          | 5/5000 [00:00<00:00, 5108.77it/s]
DFS Progress:   0%|          | 4/5000 [00:00<00:01, 3773.55it/s]
DFS Progress:   0%|          | 5/5000 [00:00<00:01, 4441.24it/s]
DFS Progress:   0%|          | 5/5000 [00:00<00:01, 4545.19it/s]68it/s]
DBSCAN Progress: 100%|██████████| 5000/5000 [00:00<00:00, 39847.01it/s]


In [81]:
df2

Unnamed: 0,0,1,neighbors,cluster
0,1413.172416,-431.196167,0,0
1,-703.493467,-1123.301283,1,0
2,831.186554,-1184.010088,2,0
3,998.676650,-850.216681,31889,0
4,1302.288306,-828.127455,4,0
...,...,...,...,...
4995,-894.665635,1595.566640,4995,0
4996,96.251509,1072.076267,4996,0
4997,-991.786203,1380.728076,4997,0
4998,-504.438955,1656.264722,4998,0


In [82]:
labels = df2['cluster'].unique()

array([ 0,  1,  5,  2,  3,  4,  6,  7,  8,  9, 10])

In [83]:
from sklearn.metrics import silhouette_score
score = silhouette_score(X_pca, labels, metric='euclidean')

In [84]:
df2['cluster'].value_counts()

cluster
0     4951
5        7
2        5
3        5
6        5
7        5
9        5
10       5
1        4
4        4
8        4
Name: count, dtype: int64

In [85]:
score

-0.411636906409945

# DBSCAN 20NG

In [2]:
def read_20ng_and_convert(file_path = '20ng.csv'):
    from sklearn.feature_extraction.text import TfidfVectorizer
    import pandas as pd

    df = pd.read_csv(file_path)
    df['index'] = df['index'].str.replace(r'\d+$', '', regex=True)
    classes = ["alt.atheism", "sci.med", "sci.electronics", "comp.graphics", "talk.politics.guns", "sci.crypt"]
    df = df[df['index'].isin(classes)]
    vectorizer = TfidfVectorizer(use_idf=False)
    # vectorizer = TfidfVectorizer(use_idf=False)
    sparse_matrix = vectorizer.fit_transform(df['content'])
    sparse_df = pd.DataFrame.sparse.from_spmatrix(sparse_matrix, columns=vectorizer.get_feature_names_out())
    sparse_df_mean = sparse_df.mean(axis=0)
    sparse_std_sub = sparse_df.sparse.to_dense().std(axis=0)
    sparse_df = (sparse_df - sparse_df_mean)/sparse_std_sub
    sparse_df['index'] = df['index'].to_numpy()
    return sparse_df

In [3]:
df_20ng = read_20ng_and_convert()

In [4]:
df_20ng_x = df_20ng.drop('index', axis=1)

In [5]:
df_20ng_x_without = df_20ng_x.drop(['cluster', 'neighbors'], axis=1)

In [13]:
df_20ng_x_without1 = df_20ng_x_without.copy()
df_20ng_x_without1 = dbscan(10, 2, df_20ng_x_without1)

100%|██████████| 6000/6000 [01:03<00:00, 94.05it/s] 
Processing Neighbors: 100%|██████████| 6000/6000 [00:00<00:00, 33130.41point/s]
DFS Progress:   0%|          | 2/6000 [00:00<00:01, 4134.36it/s]
DFS Progress:   0%|          | 2/6000 [00:00<00:01, 4375.90it/s]it/s]
DFS Progress:   0%|          | 2/6000 [00:00<00:01, 4277.72it/s]it/s]
DFS Progress:   0%|          | 2/6000 [00:00<00:06, 979.29it/s]9it/s]
DFS Progress:   0%|          | 2/6000 [00:00<00:01, 4038.81it/s]it/s]
DFS Progress:   0%|          | 2/6000 [00:00<00:01, 4282.09it/s]it/s]
DFS Progress:   0%|          | 2/6000 [00:00<00:01, 4382.76it/s]t/s] 
DBSCAN Progress: 100%|██████████| 6000/6000 [00:14<00:00, 408.02it/s]


In [14]:
df_20ng_x_without1['cluster'].unique()

array([0, 1, 2, 3, 4, 5, 6, 7])

In [15]:
labels = df_20ng_x_without1['cluster']

In [18]:
from sklearn.metrics import silhouette_score
df_20ng_x_without_sl = df_20ng_x_without.to_numpy() 
score = silhouette_score(df_20ng_x_without_sl, labels, metric='euclidean')

In [19]:
score

-0.18244887952253033

# DBSCAN Households

In [9]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7
Note: you may need to restart the kernel to use updated packages.


In [2]:
from ucimlrepo import fetch_ucirepo 
  
individual_household_electric_power_consumption = fetch_ucirepo(id=235) 
X = individual_household_electric_power_consumption.data.features 
y = individual_household_electric_power_consumption.data.targets 


  df = pd.read_csv(data_url)


In [4]:
X = X.drop(['Date', 'Time'], axis=1)

In [6]:
X.isnull().sum()

Global_active_power          0
Global_reactive_power        0
Voltage                      0
Global_intensity             0
Sub_metering_1               0
Sub_metering_2               0
Sub_metering_3           25979
dtype: int64

In [8]:
X = X.dropna()

In [12]:
import random
random_numbers = random.sample(range(1, 200000), 5000)

In [15]:
X_sub = X.iloc[random_numbers, :]

In [98]:
from sklearn.cluster import DBSCAN

db = DBSCAN(eps=3.05, min_samples=5)  # Adjust eps & min_samples as needed
labels = db.fit_predict(X_sub)


In [78]:
X_sub1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5000 entries, 138677 to 37641
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Global_active_power    5000 non-null   float64
 1   Global_reactive_power  5000 non-null   float64
 2   Voltage                5000 non-null   float64
 3   Global_intensity       5000 non-null   float64
 4   Sub_metering_1         5000 non-null   float64
 5   Sub_metering_2         5000 non-null   float64
 6   Sub_metering_3         5000 non-null   float64
dtypes: float64(7)
memory usage: 312.5 KB


In [79]:
X_sub = X_sub.apply(pd.to_numeric, errors='coerce')

In [88]:
X_sub = X_sub.reset_index()
# X_sub.drop('index', inplace=True)

In [102]:
X_sub.drop('level_0', axis=1, inplace=True)

In [103]:
X_sub

Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
0,1.372,0.058,243.46,5.6,0.0,0.0,18.0
1,0.474,0.170,244.44,2.0,0.0,0.0,0.0
2,0.344,0.184,243.10,1.6,0.0,0.0,0.0
3,1.360,0.086,240.14,5.6,0.0,1.0,18.0
4,0.294,0.090,241.94,1.2,0.0,0.0,0.0
...,...,...,...,...,...,...,...
4995,4.174,0.086,233.68,17.8,0.0,37.0,17.0
4996,3.964,0.306,234.66,16.8,38.0,0.0,16.0
4997,2.364,0.000,244.86,9.6,0.0,0.0,0.0
4998,0.296,0.108,241.61,1.2,0.0,1.0,0.0


In [89]:
X_sub.drop('index', axis=1, inplace=True)

In [104]:
X_sub1 = X_sub.copy()
X_sub1 = dbscan(3.05, 5, X_sub1)

100%|██████████| 5000/5000 [00:39<00:00, 127.42it/s]
Processing Neighbors: 100%|██████████| 5000/5000 [00:00<00:00, 6766.23point/s]
DFS Progress:  33%|███▎      | 1631/5000 [00:11<00:23, 141.98it/s]
DFS Progress:  60%|██████    | 3008/5000 [00:56<00:37, 53.22it/s]it]
DFS Progress:   1%|          | 28/5000 [00:00<00:00, 5281.07it/s]it]
DFS Progress:   2%|▏         | 86/5000 [00:00<00:01, 2980.19it/s]
DFS Progress:   1%|          | 51/5000 [00:00<00:01, 3710.74it/s]
DFS Progress:   1%|          | 31/5000 [00:00<00:01, 4427.38it/s]
DFS Progress:   0%|          | 7/5000 [00:00<00:00, 5933.74it/s]
DFS Progress:   0%|          | 8/5000 [00:00<00:00, 6637.87it/s]
DFS Progress:   0%|          | 6/5000 [00:00<00:00, 6297.75it/s]
DFS Progress:   0%|          | 7/5000 [00:00<00:00, 6645.57it/s]/s] 
DFS Progress:   0%|          | 7/5000 [00:00<00:00, 6896.91it/s]
DBSCAN Progress: 100%|██████████| 5000/5000 [01:08<00:00, 73.36it/s]


In [105]:
labels = X_sub1['cluster']

In [106]:
from sklearn.metrics import silhouette_score
score = silhouette_score(X_sub, labels, metric='euclidean')

In [107]:
score

0.6722807221147779

# Observation

It works well on household data due to its low dimensionality. This makes the data much dense and also creates the proper gap between the clusters. Whereas on the other side, in MNIST and 20NG which have high number of dimensions (784 and 50000+) this makes more uniform and hence less dense. The other reason for low performance on MNIST and 20NG is that its high dimensionality creates overlapping clusters which makes the DBSCAN much harder to classify the clusters. 

DBSCAN works well for specific values of epsilon and minpts because as data becomes more dense we need to decrease the minpts value (as more data will come in small area, this is for given epsilon value).