In [1]:
%reload_ext nb_black


<IPython.core.display.Javascript object>

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn import datasets, metrics
from sqlalchemy import create_engine
import warnings

<IPython.core.display.Javascript object>

In [3]:
postgres_user = "dsbc_student"
postgres_pw = "7*.8G9QH21"
postgres_host = "142.93.121.174"
postgres_port = "5432"
postgres_db = "heartdisease"

engine = create_engine(
    "postgresql://{}:{}@{}:{}/{}".format(
        postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db
    )
)
df = pd.read_sql_query("select * from heartdisease", con=engine)

# no need for an open connection, as we're only doing a single query
engine.dispose()

<IPython.core.display.Javascript object>

In [4]:
# Define the features and the outcome
X = df.iloc[:, :13]
y = df.iloc[:, 13]

# Replace missing values (marked by `?`) with a `0`
X = X.replace(to_replace="?", value=0)

# Binarize `y` so that `1` means heart disease diagnosis and `0` means no diagnosis
y = np.where(y > 0, 0, 1)

<IPython.core.display.Javascript object>

In [5]:
pd.Series(y).value_counts()

1    164
0    139
dtype: int64

<IPython.core.display.Javascript object>

In [6]:
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

<IPython.core.display.Javascript object>

In [7]:
dbs = DBSCAN(eps=3, min_samples=5)
clusters = dbs.fit_predict(X_std)

<IPython.core.display.Javascript object>

In [8]:
pd.Series(clusters).value_counts()

 0    234
-1     48
 1      9
 4      5
 2      4
 3      3
dtype: int64

<IPython.core.display.Javascript object>

In [9]:
for i in range(1, 20):
    dbs = DBSCAN(eps=2, min_samples=i)
    clusters = dbs.fit_predict(X_std)
    n_clust = pd.Series(clusters).nunique()
    print(f"For min_samples {i}, there are {n_clust} clusters")

For min_samples 1, there are 178 clusters
For min_samples 2, there are 31 clusters
For min_samples 3, there are 15 clusters
For min_samples 4, there are 9 clusters
For min_samples 5, there are 8 clusters
For min_samples 6, there are 6 clusters
For min_samples 7, there are 6 clusters
For min_samples 8, there are 4 clusters
For min_samples 9, there are 4 clusters
For min_samples 10, there are 3 clusters
For min_samples 11, there are 2 clusters
For min_samples 12, there are 2 clusters
For min_samples 13, there are 2 clusters
For min_samples 14, there are 2 clusters
For min_samples 15, there are 2 clusters
For min_samples 16, there are 2 clusters
For min_samples 17, there are 2 clusters
For min_samples 18, there are 2 clusters
For min_samples 19, there are 2 clusters


<IPython.core.display.Javascript object>

In [10]:
eps_list = np.arange(0.4, 5, 0.4)
for e in eps_list:
    dbs = DBSCAN(eps=e, min_samples=1)
    clusters = dbs.fit_predict(X_std)
    n_clust = pd.Series(clusters).nunique()
    print(f"For eps {e:.1f}, there are {n_clust} clusters")

For eps 0.4, there are 302 clusters
For eps 0.8, there are 300 clusters
For eps 1.2, there are 275 clusters
For eps 1.6, there are 239 clusters
For eps 2.0, there are 178 clusters
For eps 2.4, there are 97 clusters
For eps 2.8, there are 57 clusters
For eps 3.2, there are 22 clusters
For eps 3.6, there are 6 clusters
For eps 4.0, there are 3 clusters
For eps 4.4, there are 1 clusters
For eps 4.8, there are 1 clusters


<IPython.core.display.Javascript object>

In [11]:
dbs = DBSCAN(eps=2, min_samples=11)
clusters = dbs.fit_predict(X_std)
ari = metrics.adjusted_rand_score(y, clusters)
sil = metrics.silhouette_score(X_std, clusters, metric="euclidean")
print(f"ari: {ari}, Silhouette: {sil}")

ari: -0.003349647016278553, Silhouette: -0.03401479551370893


<IPython.core.display.Javascript object>

The higher the min_samples, the fewer the clusters. The larger the epsilon, the fewer the clusters.