In [1]:
from scipy.stats import multivariate_normal
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import csv
import numpy as np
from sklearn.mixture import BayesianGaussianMixture


In [2]:
## data generator
# Set the random seed for reproducibility
np.random.seed(42)

# Generate a 2D dataset with 4 clusters
n_samples = 500
centers = [(1, 1), (1, -1), (-1, 1), (-1, -1)]
X = []
for center in centers:
    x = np.random.randn(n_samples, 2) + center
    X.append(x)
X = np.concatenate(X, axis=0)

In [3]:
def autoclass(X):
    min_components = 2
    max_components = 10

    # Initialize empty lists to store the BIC values and the models
    bic_values = []
    models = []

    # Loop over the range of cluster numbers and fit a model for each
    for n_components in range(min_components, max_components+1):

        model = BayesianGaussianMixture(
            n_components=n_components, max_iter=1000, n_init=1, weight_concentration_prior=1e-3, random_state=42)
        model.fit(X)
        models.append(model)
        log_likelihood = model.score(X)
        n_parameters = n_components * (X.shape[1] + 1)
        n_samples = X.shape[0]
        bic = -2 * log_likelihood + n_parameters * np.log(n_samples)
        bic_values.append(bic)

    # Find the optimal number of clusters based on the BIC values
    K = np.argmin(bic_values) + min_components
    return K
n_components=autoclass(X)

In [4]:
gmm = GaussianMixture(n_components=4, random_state=42)
gmm.fit(X)

# Get the cluster centers
centers = gmm.means_

# Print the cluster centers
print("Cluster centers:", centers)


Cluster centers: [[-1.06742775  0.89393312]
 [ 1.11892045 -0.82668   ]
 [ 0.936537    1.14980969]
 [-1.00526465 -1.15720522]]


In [5]:
def GMM(X, K, max_iter=100, tol=1e-4):
    """
    Fits a Gaussian Mixture Model to the input data X.

    Parameters:
    -----------
    X : ndarray
        The input data, with shape (N, D), where N is the number of data points and D is the number of features.
    K : int
        The number of components in the mixture model.
    max_iter : int, optional
        The maximum number of iterations to run the EM algorithm, defaults to 100.
    tol : float, optional
        The tolerance level for convergence, defaults to 1e-4.

    Returns:
    --------
    pi : ndarray
        The mixing coefficients of the GMM, with shape (K,).
    mu : ndarray
        The means of the components of the GMM, with shape (K, D).
    sigma : ndarray
        The covariance matrices of the components of the GMM, with shape (K, D, D).
    """

    # Initialization
    N, D = X.shape
    pi = np.ones(K) / K
    mu = X[np.random.choice(N, K, replace=False)]
    sigma = np.tile(np.eye(D), (K, 1, 1))

    # EM algorithm
    for i in range(max_iter):
        # E-step
        gamma = np.zeros((N, K))
        for j in range(K):
            gamma[:, j] = pi[j] * \
                multivariate_normal.pdf(X, mean=mu[j], cov=sigma[j])

        gamma /= np.sum(gamma, axis=1, keepdims=True)

        # M-step
        Nk = np.sum(gamma, axis=0)
        pi = Nk / N
        mu = np.dot(gamma.T, X) / Nk[:, None]
        for j in range(K):
            X_centered = X - mu[j]
            sigma[j] = np.dot(
                (X_centered * gamma[:, j][:, None]).T, X_centered) / Nk[j]

        # Check for convergence
        if np.abs(np.sum(gamma, axis=0) - N / K).max() < tol:
            break

    return pi, mu, sigma

x=GMM(X,n_components)
x[1]

array([[-1.05114392,  0.00284482],
       [ 0.99046984,  0.05014578]])

In [6]:
n_clusters = 4
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(X)
labels = kmeans.labels_

# Print the cluster labels
print("Cluster labels:", labels)

# Get the cluster centers
centers = kmeans.cluster_centers_

# Print the cluster centers
print("Cluster centers:", centers)


Cluster labels: [1 1 1 ... 0 0 3]
Cluster centers: [[-1.17974893 -1.25743119]
 [ 1.14924952  1.23694434]
 [-1.1852623   1.11395896]
 [ 1.18046857 -1.06130664]]


In [7]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import make_blobs
from sklearn.metrics import accuracy_score

# Generate a toy dataset
X, y = make_blobs(n_samples=500, centers=5, random_state=42)

# Split the dataset into training and testing sets
split = int(len(X) * 0.7)
X_train, y_train = X[:split], y[:split]
X_test, y_test = X[split:], y[split:]

# K-Means clustering
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(X_train)
kmeans_train_preds = kmeans.predict(X_train)
kmeans_test_preds = kmeans.predict(X_test)

# GMM clustering
gmm = GaussianMixture(n_components=5, random_state=42)
gmm.fit(X_train)
gmm_train_preds = gmm.predict(X_train)
gmm_test_preds = gmm.predict(X_test)

# K-Nearest Neighbors classification
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
knn_train_preds = knn.predict(X_train)
knn_test_preds = knn.predict(X_test)

# Evaluate the performance of each model
kmeans_acc = accuracy_score(y_test, kmeans_test_preds)
gmm_acc = accuracy_score(y_test, gmm_test_preds)
knn_acc = accuracy_score(y_test, knn_test_preds)

print("K-Means accuracy:", kmeans_acc)
print("GMM accuracy:", gmm_acc)
print("KNN accuracy:", knn_acc)


K-Means accuracy: 0.5866666666666667
GMM accuracy: 0.58
KNN accuracy: 0.98
