In [0]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn import metrics
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine
warnings.filterwarnings("ignore")

In [0]:
# Connection settings for the PostgreSQL server that hosts the data. Each value
# can be overridden through an environment variable so credentials do not have
# to live in the notebook; the fallbacks keep a plain Restart & Run All working.
# NOTE(review): the fallback password is still stored in this file -- drop the
# defaults once the environment variables are provisioned.
postgres_user = os.environ.get('HEARTDISEASE_PG_USER', 'dsbc_student')
postgres_pw = os.environ.get('HEARTDISEASE_PG_PW', '7*.8G9QH21')
postgres_host = os.environ.get('HEARTDISEASE_PG_HOST', '142.93.121.174')
postgres_port = os.environ.get('HEARTDISEASE_PG_PORT', '5432')
postgres_db = os.environ.get('HEARTDISEASE_PG_DB', 'heartdisease')

engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))
try:
    # Pull the whole heartdisease table into a DataFrame.
    heartdisease_df = pd.read_sql_query('select * from heartdisease', con=engine)
finally:
    # Dispose of the engine even when the query fails, so no connection is
    # left open for the rest of the kernel session.
    engine.dispose()

In [0]:
#preprocessing
# Define the features and the outcome: the first 13 columns are used as
# features and the 14th column as the outcome.
X = heartdisease_df.iloc[:, :13]
y = heartdisease_df.iloc[:, 13]

# Replace missing values (marked by ?) with a 0
X = X.replace(to_replace='?', value=0)

# Binarize y so that 1 means heart disease diagnosis and 0 means no diagnosis.
# Every positive outcome value is treated as a diagnosis, so it maps to 1;
# everything else maps to 0.
y = np.where(y > 0, 1, 0)

Task 1: Apply GMM to the heart disease data by setting n_components=2. Get the ARI and silhouette scores for your solution and compare them with those of the k-means and hierarchical clustering solutions that you implemented in the assignments of the previous checkpoints. Which algorithm performs better?

In [0]:
# Standardize: learn the per-feature scaling from X, then apply it to X
scaler = StandardScaler()
scaler.fit(X)
X_std = scaler.transform(X)

In [6]:
# Two-component Gaussian mixture; the fixed random_state keeps runs repeatable
gmm_cluster = GaussianMixture(n_components=2, random_state=123)

# Fit the mixture on the standardized features and get a label for every row
clusters = gmm_cluster.fit_predict(X_std)

# ARI: agreement between the cluster labels and the binarized outcome y
gmm_ari = metrics.adjusted_rand_score(y, clusters)
print("GMM ARI score with 2 clusters: {}".format(gmm_ari))

# Silhouette: computed on the standardized features with Euclidean distance
gmm_silhouette = metrics.silhouette_score(X_std, clusters, metric='euclidean')
print("GMM silhouette score with 2 clusters: {}".format(gmm_silhouette))

GMM ARI score with 2 clusters: 0.18389186035089963
GMM silhouette score with 2 clusters: 0.13628813153331445


Both the ARI and silhouette scores are quite low. K-means (.44) and hierarchical clustering (.29) both achieved higher ARI scores than GMM (.18), and their silhouette scores (.58 for K-means, .15 for hierarchical clustering) were also higher than GMM's (.14). With two clusters, K-means performed best.

Task 2: The scikit-learn implementation of GMM has a parameter called covariance_type, which determines the type of covariance parameters to use. There are four types you can specify:  

**full:** This is the default. Each component has its own general covariance matrix.  
**tied:** All components share the same general covariance matrix.  
**diag:** Each component has its own diagonal covariance matrix.  
**spherical:** Each component has its own single variance.  
Try all of these. Which one performs better in terms of ARI and silhouette scores?

In [8]:
# Fit a 2-component GMM once per covariance structure and report the same two
# scores for each, so the four settings can be compared side by side.
# The loop order is full -> tied -> diag -> spherical, so after the cell runs
# gmm_cluster and clusters hold the spherical fit.
for covariance_type in ("full", "tied", "diag", "spherical"):
    gmm_cluster = GaussianMixture(
        n_components=2, random_state=123, covariance_type=covariance_type)

    # Fit the model
    clusters = gmm_cluster.fit_predict(X_std)

    print("ARI score ({}): {}".format(
        covariance_type, metrics.adjusted_rand_score(y, clusters)))

    print("Silhouette score ({}): {}".format(
        covariance_type,
        metrics.silhouette_score(X_std, clusters, metric='euclidean')))
    print("**********************************")

ARI score (full): 0.18389186035089963
Silhouette score: full: 0.13628813153331445
**********************************
ARI score (tied): 0.18389186035089963
Silhouette score (tied): 0.13628813153331445
**********************************
ARI score (diag): 0.18389186035089963
Silhouette score (diag): 0.13628813153331445
**********************************
ARI score (spherical): 0.20765243525722465
Silhouette score (spherical): 0.12468753110276873
**********************************


When the covariance type is set to spherical, the ARI score is slightly higher (.21 versus .18 for the other three types). However, the full, tied, and diag covariance types all produced the same, marginally higher silhouette score (.136 versus .125), so no single covariance type is better on both metrics.