In [269]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn import datasets
from scipy.cluster.hierarchy import dendrogram, linkage
from matplotlib.patches import Ellipse

# Algorithms
from sklearn.mixture import GaussianMixture
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA
from sklearn.random_projection import GaussianRandomProjection
from sklearn.manifold import MDS

# Tools
from sklearn.random_projection import johnson_lindenstrauss_min_dim
from sklearn.model_selection import train_test_split
np.random.seed(888)

In [132]:
'''
The two datasets that will be used to train the models in this project. Also used in project 1.
'''
breast_cancer = datasets.load_breast_cancer()
iris = datasets.load_iris()
breast_cancer_X, breast_cancer_Y = breast_cancer.data, breast_cancer.target
iris_X, iris_Y = iris.data, iris.target

In [282]:
# Clustering Algorithms
def gaussian_clustering(X, y, data, title):
    n_components = len(np.unique(y))
    
    gmm = GaussianMixture(n_components=n_components)
    gmm.fit(X)
    
    labels = gmm.predict(X)
    means = gmm.means_
    covariances = gmm.covariances_
    
    # Define colors for the true labels
    colors = ["navy", "tomato", "darkseagreen"]
    
    # Plot the data points with different colors for each cluster
    for i in range(n_components):
        cluster_data = X[labels == i]
        plt.scatter(cluster_data[:, 0], cluster_data[:, 1], c=colors[i], label=f'Cluster {i + 1}')
    
    # Plot ellipses to represent cluster covariances
    for i in range(n_components):
        covariance_matrix = covariances[i][:2, :2]
        eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)
        angle = np.degrees(np.arctan2(eigenvectors[1, 0], eigenvectors[0, 0]))
        width, height = 2 * np.sqrt(5.991 * eigenvalues)  # 5.991 corresponds to 95% confidence interval

        ellipse = Ellipse(xy=means[i], width=width, height=height, angle=angle, color=colors[i], alpha=0.2)
        plt.gca().add_patch(ellipse)
    
    # Plot cluster centers
    plt.scatter(means[:, 0], means[:, 1], c='k', marker='x', s=100)
    
    plt.xlabel(data.feature_names[0])
    plt.ylabel(data.feature_names[1])
    plt.title(f'Gaussian Mixture Model Clusters for {title}')
    plt.savefig(fname=f'Gaussian_Mixture_Model_{title}')
    plt.clf()
    return

def agglomerative_clustering(X, y, data, title):
    n_clusters = len(np.unique(y))
    
    # Define colors for the true labels
    colors = ["navy", "tomato", "darkseagreen"]
    # Perform Agglomerative Clustering
    linkage_matrix = linkage(X, method='ward')  # You can use different linkage methods

    # Plot the dendrogram
    plt.figure(figsize=(8, 6))
    dendrogram(linkage_matrix, p=5, truncate_mode='level')
    plt.title(f'Agglomerative Clustering Dendrogram for {title}')
    plt.savefig(fname=f'Agglomerative_Clustering_{title}')
    plt.clf()
    return

In [283]:
if __name__ == "__main__":
    #gaussian_clustering(breast_cancer_X, breast_cancer_Y, breast_cancer, "Breast_Cancer_Dataset")
    #gaussian_clustering(iris_X, iris_Y, iris, "Iris_Dataset")
    agglomerative_clustering(breast_cancer_X, breast_cancer_Y, breast_cancer, "Breast_Cancer_Dataset")
    agglomerative_clustering(iris_X, iris_Y, iris, "Iris_Dataset")

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>