# 1) Apply GMM to the heart disease data by setting n_components=2. Get the ARI and silhouette scores for your solution and compare them with those of the k-means and hierarchical clustering solutions that you implemented in the assignments of the previous checkpoints. Which algorithm performs better?

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
from sklearn import metrics
from sqlalchemy import create_engine
import warnings
warnings.filterwarnings("ignore")

In [8]:
# Connection settings for the PostgreSQL instance that hosts the data set.
# NOTE(review): the credentials are hard-coded in this file; consider reading
# them from environment variables before sharing it more widely.
postgres_user = 'dsbc_student'
postgres_pw = '7*.8G9QH21'
postgres_host = '142.93.121.174'
postgres_port = '5432'
postgres_db = 'heartdisease'

engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))

# Load the whole `heartdisease` table into a DataFrame. The try/finally makes
# sure the engine is disposed even when the query raises.
try:
    heartdisease_df = pd.read_sql_query('select * from heartdisease', con=engine)
finally:
    engine.dispose()

In [9]:
# Split the table by position: the first 13 columns are the features and the
# 14th column is the reference label.
X = heartdisease_df.iloc[:, :13]
y = heartdisease_df.iloc[:, 13]

# Entries recorded as the string '?' are replaced with 0 in the features.
X = X.replace(to_replace='?', value=0)

# Binarize the label: rows with a value above 0 become 0, all others become 1.
# ARI does not depend on how the two classes are numbered.
y = np.where(y > 0, 0, 1)

# Standardize the features before clustering.
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

# Two-component Gaussian mixture; the fixed seed keeps the run reproducible.
gmm_cluster = GaussianMixture(n_components=2, random_state=123)
clusters = gmm_cluster.fit_predict(X_std)

# Agreement between the cluster assignments and the reference labels.
ari_score = metrics.adjusted_rand_score(y, clusters)
print(f"ARI score: {ari_score}")

# Cluster cohesion/separation measured on the standardized features.
silhouette = metrics.silhouette_score(X_std, clusters, metric='euclidean')
print(f"Silhouette score: {silhouette}")

ARI score: 0.18389186035089963
Silhouette score: 0.13628813153331445


In [10]:
# GMM scores lower than both the k-means and the hierarchical clustering solutions on both the silhouette score and the ARI.

# 2) GMM implementation of scikit-learn has a parameter called covariance_type. This parameter determines the type of covariance parameters to use. Specifically, there are four types you can specify:

# A. full: This is the default. Each component has its own general covariance matrix.
# B. tied: All components share the same general covariance matrix.
# C. diag: Each component has its own diagonal covariance matrix.
# D. spherical: Each component has its own single variance.
# Try all of these. Which one performs better in terms of ARI and silhouette scores?

In [11]:
# Fit one two-component GMM per covariance structure on the standardized
# features and print the ARI (against y) and the silhouette score for each.
# After the loop, `gmm_cluster` and `clusters` hold the "spherical" results.
for covariance_type in ("full", "tied", "diag", "spherical"):
    gmm_cluster = GaussianMixture(n_components=2, random_state=123,
                                  covariance_type=covariance_type)
    clusters = gmm_cluster.fit_predict(X_std)

    ari_score = metrics.adjusted_rand_score(y, clusters)
    print("ARI score with covariance_type={}: {}".format(covariance_type, ari_score))

    silhouette = metrics.silhouette_score(X_std, clusters, metric='euclidean')
    print("Silhouette score with covariance_type={}: {}".format(covariance_type, silhouette))
    print("------------------------------------------------------")

ARI score with covariance_type=full: 0.18389186035089963
Silhouette score with covariance_type=full: 0.13628813153331445
------------------------------------------------------
ARI score with covariance_type=tied: 0.18389186035089963
Silhouette score with covariance_type=tied: 0.13628813153331445
------------------------------------------------------
ARI score with covariance_type=diag: 0.18389186035089963
Silhouette score with covariance_type=diag: 0.13628813153331445
------------------------------------------------------
ARI score with covariance_type=spherical: 0.20765243525722465
Silhouette score with covariance_type=spherical: 0.12468753110276873
------------------------------------------------------


In [12]:
# The spherical covariance type gives the highest ARI score (about 0.208 versus 0.184) but the lowest
# silhouette score (about 0.125 versus 0.136). The full, tied, and diag covariance types all produce identical scores.