# PRML Assignment - 9
# Gaussian Mixture Model and Logistic Regression

In [1]:
import tqdm
import numpy as np
# import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal
from sklearn.datasets import load_iris
# Load the Iris data as a (features, labels) pair; X and y are reused by both
# the GMM clustering section and the logistic-regression section below.
X, y = load_iris(return_X_y=True)

## Gaussian Mixture Model Clustering on IRIS Data

In [2]:
class CustomGMM:
    """Gaussian mixture model with full covariance matrices, fitted by EM.

    Parameters
    ----------
    n_clusters : int
        Number of mixture components (stored as ``self.k``).
    epochs : int, default=5
        Number of EM iterations performed by ``fit``.

    Attributes
    ----------
    Set by ``initialize`` and refined by ``fit``:

    class_probs : ndarray of shape (k,)
        Mixing proportion of each component.
    weights : ndarray of shape (n_samples, k)
        Responsibility of each component for each training sample.
    means : list of k ndarrays of shape (n_features,)
        Component means.
    variances : list of k ndarrays of shape (n_features, n_features)
        Component covariance matrices.
    """

    def __init__(self, n_clusters, epochs=5):
        self.epochs = epochs
        self.k = n_clusters


    def initialize(self, X):
        """Set the starting parameters from the 2-D sample array ``X``.

        Mixing proportions and responsibilities start uniform, every component
        gets the covariance of the whole data set, and each mean is a randomly
        chosen row of ``X`` drawn from NumPy's global RNG.
        """
        n_samples = X.shape[0]
        self.class_probs = np.full(shape=self.k, fill_value=1/self.k)
        # One responsibility per (sample, component) pair, i.e. the same shape
        # that predict_proba returns -- not the shape of X itself.
        self.weights = np.full(shape=(n_samples, self.k), fill_value=1/self.k)

        # Initialize with means taken as a random data point/row from the data for each component.
        # randint samples with replacement, so two components may start on the same row.
        random_row_idxs = np.random.randint(0, n_samples, size=self.k)
        # Copy the rows so the stored means never alias the caller's array.
        self.means = [X[row_idx, :].copy() for row_idx in random_row_idxs]
        self.variances = [np.cov(X.T) for _ in range(self.k)]


    def expectation(self, X):
        """Recompute responsibilities for ``X`` and the mixing proportions.

        The mixing proportions become the per-component mean responsibility.
        """
        self.weights = self.predict_proba(X)
        self.class_probs = self.weights.mean(axis=0)


    def maximization(self, X):
        """Re-estimate every component's mean and covariance from ``self.weights``."""
        for i in range(self.k):
            wt = self.weights[:, [i]]  # column vector so it broadcasts over the features
            sum_wt = wt.sum()
            if sum_wt <= 0:
                # No sample assigns any mass to this component: dividing by
                # sum_wt would fill its parameters with NaN, so keep the
                # previous estimate instead.
                continue
            self.means[i] = (X * wt).sum(axis=0) / sum_wt
            # bias=True with normalized aweights gives the responsibility-weighted
            # covariance with no degrees-of-freedom correction.
            self.variances[i] = np.cov(X.T, aweights=(wt / sum_wt).flatten(), bias=True)


    def predict_proba(self, X):
        """Return the responsibilities of each component for each row of ``X``.

        Returns
        -------
        ndarray of shape (n_samples, k)
            Each row is non-negative and sums to 1.

        The computation is done in log space: multiplying raw densities
        underflows to 0 for samples far from every component, which turns the
        normalization into 0/0.
        """
        log_joint = np.empty((X.shape[0], self.k))
        # log(0) = -inf is the intended value for a component whose mixing
        # proportion is exactly zero, so silence the divide warning.
        with np.errstate(divide="ignore"):
            log_priors = np.log(self.class_probs)
        for i in range(self.k):
            distn = multivariate_normal(mean=self.means[i], cov=self.variances[i])
            log_joint[:, i] = distn.logpdf(X) + log_priors[i]

        # Subtract the row maximum before exponentiating; the largest term of
        # every row becomes exp(0) = 1, so the denominator is never zero.
        shifted = log_joint - log_joint.max(axis=1, keepdims=True)
        numer = np.exp(shifted)
        denom = numer.sum(axis=1, keepdims=True)
        wts = numer / denom

        return wts


    def fit(self, X):
        """Initialize the parameters from ``X`` and run ``self.epochs`` EM iterations."""
        self.initialize(X)

        for i in tqdm.trange(self.epochs, desc="Epochs"):
            self.expectation(X)
            self.maximization(X)


    def predict(self, X):
        """Return, for each row of ``X``, the index of the most responsible component."""
        weights = self.predict_proba(X)
        return np.argmax(weights, axis=1)

In [3]:
# Seed NumPy's global RNG before fitting: CustomGMM.initialize picks its
# starting means with np.random.randint, so the seed makes the run repeatable.
np.random.seed(42)
# Three mixture components, 25 EM iterations (the class default is 5 epochs).
custom_gmm = CustomGMM(n_clusters=3, epochs=25)
custom_gmm.fit(X)

Epochs: 100%|██████████| 25/25 [00:00<00:00, 533.01it/s]


In [4]:
from sklearn.metrics.cluster import adjusted_rand_score

# Compare the cluster assignments with the true labels y. Cluster indices are
# arbitrary, so the adjusted Rand score (which ignores label permutations) is
# used rather than accuracy.
print("Adjusted Rand Score for GMM clustering on IRIS data:")
print(adjusted_rand_score(y, custom_gmm.predict(X)))

Adjusted Rand Score for GMM clustering on IRIS data:
0.9038742317748124


## Logistic Regression on IRIS Data using `sklearn`

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# Print small floats in fixed-point notation rather than scientific notation.
np.set_printoptions(suppress=True)

# Hold out 20% of the samples for testing; stratify=y keeps the label
# proportions the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Build the classifier, then train it on the training split only.
lr_classifier = LogisticRegression(random_state=42)
lr_classifier.fit(X_train, y_train)

# Report the predicted labels and the accuracy on the held-out split.
print("Predictions: ", lr_classifier.predict(X_test))
print("Accuracy: ", round(lr_classifier.score(X_test, y_test), 3))

Predictions:  [0 2 1 1 0 1 0 0 2 1 2 2 2 1 0 0 0 1 1 2 0 2 1 2 2 2 1 0 2 0]
Accuracy:  0.967
