In [13]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import metrics

In [14]:
# Connection information
user = 'dsbc_student'
pw = '7*.8G9QH21'
host = '142.93.121.174'
port = '5432'
db = 'heartdisease'

In [15]:
# Establish connection
connection = create_engine('postgresql://{}:{}@{}:{}/{}'.format(user, pw, host, port, db))

# Create dataframe
df = pd.read_sql_query('select * from heartdisease', con=connection)

# Terminate connection
connection.dispose()

In [16]:
# Define features and outcome
X = df.iloc[:, :13]
y = df.iloc[:, 13]

# Replace missing values
X = X.replace(to_replace='?', value=0)

# Binarize
y = np.where(y > 0, 0, 1)

In [17]:
# Standardize
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

### Gaussian Mixture Models 

In [24]:
# Initialize GMM cluster
gmm_cluster = GaussianMixture(n_components=2)

# Fit model
clusters = gmm_cluster.fit_predict(X_std)

In [21]:
print('Adjusted Rand Index for GMM is {}'.format(metrics.adjusted_rand_score(y, clusters)))
print('Silhouette score for GMM is {}'.format(metrics.silhouette_score(X_std, clusters, metric='euclidean')))

Adjusted Rand Index for GMM is 0.4207322145049338
Silhouette score for GMM is 0.16118591340148433


In comparison to k-means, GMM Adjusted Rand Index (ARI) and Silhouette score is lower. 

In comparison to hierarchical:
- GMM ARI is higher than ward, but the Silhouette score is lower. 
- GMM ARI is lower than complete, but the Silhouette score is higher. 
- GMM ARI and Silhouette score is lower than average.

### Gaussian Mixture Models Covariance Type 

In [32]:
covariance = ['full', 'tied', 'diag', 'spherical']

for covar in covariance:
    
    # Initialize GMM cluster
    gmm_cluster = GaussianMixture(n_components=2, covariance_type=covar, random_state=123)
    
    # Fit model
    clusters = gmm_cluster.fit_predict(X_std)
    
    print('{} Covariance'.format(covar.upper()))
    print('Adjusted Rand Index for GMM is {}'.format(metrics.adjusted_rand_score(y, clusters)))
    print('Silhouette score for GMM is {}'.format(metrics.silhouette_score(X_std, clusters, metric='euclidean')))

FULL Covariance
Adjusted Rand Index for GMM is 0.18389186035089963
Silhouette score for GMM is 0.13628813153331445
TIED Covariance
Adjusted Rand Index for GMM is 0.18389186035089963
Silhouette score for GMM is 0.13628813153331445
DIAG Covariance
Adjusted Rand Index for GMM is 0.18389186035089963
Silhouette score for GMM is 0.13628813153331445
SPHERICAL Covariance
Adjusted Rand Index for GMM is 0.20765243525722465
Silhouette score for GMM is 0.12468753110276876
