## Libraries and Utilities

In [20]:
pip install kneed

In [21]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# preprocessing
from sklearn.preprocessing import MinMaxScaler

# modelling
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# metrics evaluation
from sklearn.metrics import silhouette_score
from sklearn.metrics import v_measure_score
from kneed import KneeLocator

import os
# Print the full path of every file found under the Kaggle input directory,
# one path per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, name) for name in filenames]
    if full_paths:
        print('\n'.join(full_paths))

### Load Data

In [22]:
# Locations of the two raw CSV files, relative to the notebook.
petabencana_path = '../input/petabencana/Petabencana.csv'
bnpb_path = '../input/badan-nasional-penanggulangan-bencana/BNPB.csv'

# The Petabencana file is decoded with the 'unicode_escape' codec.
petabencana_df = pd.read_csv(petabencana_path, encoding='unicode_escape')

# The BNPB file is semicolon-delimited; malformed rows are skipped and the
# 'No.' column is used as the index.
bnpb_read_options = dict(sep=';', index_col='No.', on_bad_lines='skip')
bnpb_df = pd.read_csv(bnpb_path, **bnpb_read_options)

In [23]:
# Preview the first rows of the BNPB frame.
bnpb_df.head()

In [24]:
# Summarise the BNPB frame's columns, dtypes and non-null counts.
bnpb_df.info()

### Pre-Processing: MinMaxScaler

In [25]:
# Select the impact features here so that this cell does not depend on a cell
# that appears later in the notebook; on a fresh kernel "Run All" would
# otherwise fail with a NameError on `impact_df`.
impact_df = bnpb_df[['Meninggal', 'Hilang', 'Terluka', 'Rumah Rusak', 'Fasum Rusak']]

# MinMaxScaler rescales every column independently, so a single fit_transform
# over the whole frame yields the same values as scaling column by column,
# and leaves `scaler` fitted on all features rather than only the last one.
scaler = MinMaxScaler()
impact_df_normalized = pd.DataFrame(
    scaler.fit_transform(impact_df),
    columns=impact_df.columns,
)

# Modelling | Unsupervised Case
**Features**
* `Meninggal`
* `Hilang`
* `Terluka`
* `Rumah Rusak`
* `Fasum Rusak`

In [26]:
# Keep only the five columns used as clustering features, then preview them.
impact_features = ['Meninggal', 'Hilang', 'Terluka', 'Rumah Rusak', 'Fasum Rusak']
impact_df = bnpb_df[impact_features]
impact_df.head()

## K-Means

In [27]:
# Fixed seed: K-Means initialisation is random, so without it the SSE curve
# (and any elbow read off it) changes from run to run.
RANDOM_STATE = 42

# Fit K-Means for k = 1..9 and record the inertia of each fit; the values are
# plotted afterwards to look for an elbow.
k_rng = range(1, 10)
sse = []

for k in k_rng:
    km = KMeans(n_clusters=k, random_state=RANDOM_STATE)
    km.fit(impact_df_normalized)
    sse.append(km.inertia_)

In [36]:
# Elbow plot: SSE (K-Means inertia) against the number of clusters K.
fig, ax = plt.subplots(figsize=(12, 8))
sns.lineplot(x=list(k_rng), y=sse, ax=ax)
ax.set_xlabel('K')
ax.set_ylabel('Sum of squared error')
ax.set_title('Searching for Elbow')

# Read the SSE of each candidate elbow from the fitted results instead of
# pasting floats from an earlier run, so the arrows always touch the curve.
sse_by_k = dict(zip(k_rng, sse))
for elbow_k in (3, 5):
    elbow_sse = sse_by_k[elbow_k]
    ax.annotate(
        'Possible Elbow Point',
        xy=(elbow_k, elbow_sse),
        # Place the label above the point, proportionally to its height.
        xytext=(elbow_k, elbow_sse * 1.8),
        xycoords='data',
        arrowprops=dict(arrowstyle='->', connectionstyle='arc3', color='blue', lw=2),
    )

plt.show()

## DBSCAN

### Default Parameters

In [29]:
# Preview the scaled feature frame that DBSCAN is fitted on.
impact_df_normalized.head()

In [30]:
# DBSCAN with the library's default hyper-parameters. fit() returns the
# estimator itself, so construction and fitting are chained; the bare name on
# the last line displays the fitted estimator.
dbscan_cluster1 = DBSCAN().fit(impact_df_normalized)
dbscan_cluster1

In [31]:
# Distinct labels assigned by the default DBSCAN run.
np.unique(dbscan_cluster1.labels_)

In [32]:
# Number of clusters found by the default run; the label -1 is not counted
# as a cluster.
labels = dbscan_cluster1.labels_
cluster_ids = set(labels)
cluster_ids.discard(-1)
n_clus = len(cluster_ids)
print('number of clusters: ', n_clus)

# Number of points labelled -1 (noise).
n_noise = sum(1 for label in dbscan_cluster1.labels_ if label == -1)
print('noise point: ', n_noise)

In [33]:
# k-distance plot used to pick DBSCAN's eps: for every point, take the
# distance to its K-th returned neighbour, sort these distances and look for
# the bend in the curve.
K_NEIGHBORS = 10

nearest_neighbors = NearestNeighbors(n_neighbors=K_NEIGHBORS)
neighbors = nearest_neighbors.fit(impact_df_normalized)

# Each row of `distances` has K_NEIGHBORS columns, so the last column is the
# K-th neighbour; indexing with -1 keeps it in sync with K_NEIGHBORS instead
# of a separate hard-coded column number.
distances, indices = neighbors.kneighbors(impact_df_normalized)
distances = np.sort(distances[:, -1], axis=0)

fig, ax = plt.subplots(figsize=(5, 5))
ax.plot(distances)
ax.set_xlabel('Points')
ax.set_ylabel('Distance')
ax.set_title(f'Sorted distance to neighbour {K_NEIGHBORS}')
plt.show()

**Points**: the number of data points (observations), ordered by their sorted neighbour distance.

### Identifying Elbow Point with Kneed Package

In [34]:
# Locate the knee of the sorted k-distance curve; the distance at the knee is
# the candidate eps for DBSCAN.
i = np.arange(len(distances))
knee = KneeLocator(i, distances, S=1, curve='convex', direction='increasing', interp_method='polynomial')

# plot_knee() opens its own figure, so the size is passed to it directly; a
# separate plt.figure() call beforehand only leaves an empty figure behind.
knee.plot_knee(figsize=(10, 10))
plt.xlabel("Points")
plt.ylabel("Distance")

# knee.knee is None when no knee is detected; indexing with None would print
# the whole array (with an extra axis) instead of a single distance.
if knee.knee is None:
    print('No knee point detected')
else:
    print(distances[knee.knee])

### Applying DBSCAN with Optimal value of Epsilon = 0.00025

In [37]:
# Refit DBSCAN with the chosen eps and min_samples=10.
dbscan_cluster = DBSCAN(eps=0.00025, min_samples=10).fit(impact_df_normalized)

# Clusters are the distinct labels with the -1 label removed.
labels = dbscan_cluster.labels_
found_ids = set(labels) - {-1}
N_clus = len(found_ids)
print('Estimated no. of clusters: %d' % N_clus)

# Points labelled -1 are counted as noise.
n_noise = int(np.count_nonzero(dbscan_cluster.labels_ == -1))
print('Estimated no. of noise points: %d' % n_noise)