In [1]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm

from scipy import stats

from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans, MeanShift, estimate_bandwidth
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import datasets, metrics

import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

from sqlalchemy import create_engine

%matplotlib inline

<IPython.core.display.Javascript object>

In [3]:
postgres_user = "dsbc_student"
postgres_pw = "7*.8G9QH21"
postgres_host = "142.93.121.174"
postgres_port = "5432"
postgres_db = "heartdisease"

engine = create_engine(
    "postgresql://{}:{}@{}:{}/{}".format(
        postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db
    )
)

heart_df = pd.read_sql_query("select * from heartdisease", con=engine)

engine.dispose()

<IPython.core.display.Javascript object>

In [4]:
heart_df = heart_df.drop(index=0, axis=0)
# Define the features and the outcome
X = heart_df.iloc[:, :13]
y = heart_df.iloc[:, 13]

# Replace missing values (marked by `?`) with a `0`
X = X.replace(to_replace="?", value=0)

# Binarize `y` so that `1` means heart disease diagnosis and `0` means no diagnosis
y = np.where(y > 0, 0, 1)

<IPython.core.display.Javascript object>

In [5]:
scaler = StandardScaler()
X_std = scaler.fit_transform(X)
X_std = pd.DataFrame(X_std)
X_std

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,1.394920,0.687874,0.876481,1.612596,0.758372,-0.412968,1.020113,-1.818840,1.431958,0.400906,0.661249,2.500487,-0.862438
1,1.394920,0.687874,0.876481,-0.662326,-0.342633,-0.412968,1.020113,-0.900807,1.431958,1.350164,0.661249,1.429357,1.168411
2,-1.929113,0.687874,-0.173905,-0.093596,0.063000,-0.412968,-0.993444,1.634713,-0.698344,2.126829,2.298111,-0.712905,-0.862438
3,-1.485908,-1.453754,-1.224290,-0.093596,-0.825530,-0.412968,1.020113,0.978975,-0.698344,0.314610,-0.975613,-0.712905,-0.862438
4,0.176108,0.687874,-1.224290,-0.662326,-0.207422,-0.412968,-0.993444,1.241270,-0.698344,-0.203167,-0.975613,-0.712905,-0.862438
...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,-1.042704,0.687874,-2.274676,-1.231057,0.333423,-0.412968,-0.993444,-0.769660,-0.698344,0.142017,0.661249,-0.712905,1.168411
298,1.505721,0.687874,0.876481,0.702627,-1.038004,2.421495,-0.993444,-0.376217,-0.698344,2.040533,0.661249,1.429357,1.168411
299,0.286909,0.687874,0.876481,-0.093596,-2.235588,-0.412968,-0.993444,-1.512829,1.431958,0.142017,0.661249,0.358226,1.168411
300,0.286909,-1.453754,-1.224290,-0.093596,-0.207422,-0.412968,1.020113,1.066406,-0.698344,-0.893537,0.661249,0.358226,-0.862438


<IPython.core.display.Javascript object>

1. Apply GMM to the heart disease dataset by setting n_components=2. Get ARI and silhouette scores for your solution and compare it with those of the k-means and hierarchical clustering solutions that you implemented in the previous checkpoint assignments. Which algorithm performs best?

In [6]:
gmm = GaussianMixture(n_components=2)
gmm.fit(X_std)

GaussianMixture(covariance_type='full', init_params='kmeans', max_iter=100,
                means_init=None, n_components=2, n_init=1, precisions_init=None,
                random_state=None, reg_covar=1e-06, tol=0.001, verbose=0,
                verbose_interval=10, warm_start=False, weights_init=None)

<IPython.core.display.Javascript object>

In [7]:
pd.DataFrame(scaler.inverse_transform(gmm.means_)).style.background_gradient()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,55.626263,0.777778,3.686869,133.323232,251.242424,0.161616,1.111111,137.212121,1.0,1.519192,1.828283,0.858586,5.59596
1,53.817734,0.630542,2.91133,130.827586,244.541872,0.137931,0.926108,155.650246,-0.0,0.799507,1.482759,0.571429,4.261084


<IPython.core.display.Javascript object>

In [8]:
gmm_probs = pd.DataFrame(gmm.predict_proba(X_std))

<IPython.core.display.Javascript object>

In [9]:
print(gmm_probs[1].value_counts())
print(y.sum())

1.0    203
0.0     99
Name: 1, dtype: int64
163


<IPython.core.display.Javascript object>

In [10]:
labels = gmm.predict(X_std)
X["labels"] = labels

<IPython.core.display.Javascript object>

In [11]:
print(f"The ARI is {metrics.adjusted_rand_score(y, labels)}")
print(
    f"The Silhouette Score is {metrics.silhouette_score(X_std, labels, metric='euclidean')}"
)

The ARI is 0.18230716541111341
The Silhouette Score is 0.13684540146178015


<IPython.core.display.Javascript object>

In [12]:
gmm_probs[0].value_counts()

0.0    203
1.0     99
Name: 0, dtype: int64

<IPython.core.display.Javascript object>

Strange that the the predictions are at 100% confidence. Is this data linearly separable?

2. GMM implementation of scikit-learn has a parameter called covariance_type. This parameter determines the type of covariance parameters to use. There are four types that you can specify:

    * full: This is the default. Each component has its own general covariance matrix.
    * tied: All components share the same general covariance matrix.
    * diag: Each component has its own diagonal covariance matrix.
    * spherical: Each component has its own single variance.

Try all of these. Which one performs best in terms of ARI and silhouette scores?

In [13]:
covars = ["full", "tied", "diag", "spherical"]
for covar in covars:

    gmm = GaussianMixture(n_components=2, covariance_type=covar)
    clusters = gmm.fit_predict(X_std)

    print("Covariance Type: " + covar)
    print(f"The ARI is {metrics.adjusted_rand_score(y, clusters)}")
    print(
        f"The Silhouette Score is {metrics.silhouette_score(X_std, clusters, metric='euclidean')}"
    )
    print("-----")



Covariance Type: full
The ARI is 0.41922773658819823
The Silhouette Score is 0.16213539658435835
-----
Covariance Type: tied
The ARI is 0.4543732599413319
The Silhouette Score is 0.16810471764901874
-----
Covariance Type: diag
The ARI is 0.39376847260392006
The Silhouette Score is 0.16134238525855113
-----
Covariance Type: spherical
The ARI is 0.21214350613662605
The Silhouette Score is 0.12499890390810753
-----


<IPython.core.display.Javascript object>