In [None]:
import numpy as np
from scipy.spatial import KDTree

class DBSCANScratch:
    """Density-based clustering (DBSCAN) built on a SciPy KD-tree.

    Parameters
    ----------
    eps : float
        Radius of the neighbourhood searched around each point.
    min_samples : int
        Minimum neighbourhood size for a point to count as a core point.
        The point itself is part of its own neighbourhood, so it is included
        in this count.

    Attributes
    ----------
    labels_ : numpy.ndarray of int, shape (n_samples,)
        Set by ``fit``. Cluster ids start at 0; ``-1`` marks noise.
    """

    def __init__(self, eps, min_samples):
        self.eps = eps
        self.min_samples = min_samples

    def fit(self, X):
        """Cluster the rows of ``X`` and store the result in ``labels_``.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Input points. Converted to a float32 NumPy array, so lists and
            other array-likes are accepted as well as ndarrays.

        Returns
        -------
        DBSCANScratch
            ``self``, to allow call chaining.
        """
        # asarray (rather than X.astype) also accepts plain Python sequences.
        X = np.asarray(X, dtype=np.float32)
        n = X.shape[0]
        labels = np.full(n, -1, dtype=int)

        # Nothing to cluster: skip building a tree over zero points.
        if n == 0:
            self.labels_ = labels
            return self

        tree = KDTree(X)
        visited = np.zeros(n, dtype=bool)
        cluster_id = 0

        for i in range(n):
            if visited[i]:
                continue

            visited[i] = True
            neighbors = tree.query_ball_point(X[i], self.eps)

            # A non-core point keeps its initial -1 label (noise) unless a
            # later cluster reaches it and claims it as a border point.
            if len(neighbors) >= self.min_samples:
                self._expand_cluster(i, neighbors, labels, visited, tree, cluster_id, X)
                cluster_id += 1

        self.labels_ = labels
        return self

    def fit_predict(self, X):
        """Fit on ``X`` and return the resulting ``labels_`` array."""
        return self.fit(X).labels_

    def _expand_cluster(self, point, neighbors, labels, visited, tree, cluster_id, X):
        """Grow cluster ``cluster_id`` outward from the core point ``point``.

        ``neighbors`` is the eps-neighbourhood of ``point``. ``labels`` and
        ``visited`` are updated in place; ``neighbors`` itself is left
        untouched.
        """
        labels[point] = cluster_id

        # Work list of indices still to examine. `queued` guarantees each
        # index is appended at most once; appending whole neighbourhoods
        # unchecked would fill the list with duplicates on dense data.
        queue = list(neighbors)
        queued = set(queue)

        i = 0
        while i < len(queue):
            p = queue[i]
            i += 1

            if not visited[p]:
                visited[p] = True
                new_neighbors = tree.query_ball_point(X[p], self.eps)
                # Only core points propagate the cluster further.
                if len(new_neighbors) >= self.min_samples:
                    for q in new_neighbors:
                        if q not in queued:
                            queued.add(q)
                            queue.append(q)

            # Unlabelled points and points previously marked as noise both
            # carry -1; either way they now belong to this cluster.
            if labels[p] == -1:
                labels[p] = cluster_id

In [None]:
import pandas as pd 

# Read the engineered fire data from its CSV export.
df_engineered = pd.read_csv("/content/content/fire_data_engineered.csv")

# Collect the names of any columns that are still object/category typed.
non_numeric_cols = df_engineered.select_dtypes(include=['object', 'category']).columns

# One print call with a newline separator yields the same two output lines.
print("Non-numeric columns still present:", non_numeric_cols, sep="\n")

In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler

# Build the feature frame in one pass while it is still a DataFrame:
#   - drop the excluded column(s) if present,
#   - keep numeric columns only,
#   - turn +/-inf into NaN,
#   - discard every row that still contains a NaN.
# Alternative to dropping rows, if too many are lost: impute instead with
#   X_df = X_df.fillna(X_df.median())
exclude_cols = ['class']
X_df = (
    df_engineered
    .drop(columns=exclude_cols, errors='ignore')
    .select_dtypes(include=[np.number])
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

# Standardize the NumPy view of the features; the float32 downcast happens
# only after scaling.
X = X_df.to_numpy()
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X).astype(np.float32)

# Sanity checks on the final clustering input.
print("Final DBSCAN input shape:", X_scaled.shape)
print("Any NaNs:", np.isnan(X_scaled).any())
print("Any infs:", np.isinf(X_scaled).any())
print("Max abs value:", np.max(np.abs(X_scaled)))


In [None]:
# Cluster the standardized features with the from-scratch implementation.
# fit() returns the estimator itself, so construction and fitting are chained.
dbscan_scratch = DBSCANScratch(eps=0.5, min_samples=5).fit(X_scaled)
labels_scratch = dbscan_scratch.labels_


In [None]:
from sklearn.cluster import DBSCAN

# Reference run: scikit-learn's DBSCAN with the same eps / min_samples as the
# scratch version, Euclidean distance, and n_jobs=-1 for parallel neighbour search.
dbscan_sklearn = DBSCAN(eps=0.5, min_samples=5, metric="euclidean", n_jobs=-1)
labels_sklearn = dbscan_sklearn.fit_predict(X_scaled)


In [None]:
def summarize(labels, name):
    """Print the number of clusters and noise points in a label vector.

    Parameters
    ----------
    labels : array-like of int
        Per-sample cluster labels in which ``-1`` marks noise.
    name : str
        Heading printed above the statistics.

    Returns
    -------
    None
        The summary is written to stdout only.
    """
    # Coerce up front so plain lists work too: comparing a list with -1 gives
    # a single bool, which would make the noise count silently 0.
    labels = np.asarray(labels)
    noise_mask = labels == -1

    n_noise = int(np.count_nonzero(noise_mask))
    # Distinct labels once noise is excluded.
    n_clusters = int(np.unique(labels[~noise_mask]).size)

    print(f"{name}")
    print("Clusters:", n_clusters)
    print("Noise points:", n_noise)
    print()

# Report cluster / noise counts for both label vectors, one after the other.
summarize(labels=labels_scratch, name="Scratch DBSCAN")
summarize(labels=labels_sklearn, name="Sklearn DBSCAN")
