# Problem 4.3 : 2D color dataset and GMMs

We are given a dataset of points in a 2D color space. Because a Gaussian mixture model (GMM) is a generative model, we fit one to the data to recover the model that generated these points.

In [27]:
# Importing libraries
import numpy as np
import pickle
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
import plotly.graph_objects as go

In [28]:
# Loading the dataset
# NOTE: pickle.load can execute arbitrary code while deserializing, so this
# file must come from a trusted source.
with open('SMAI-Dataset-problem-4.3/colors.pkl', 'rb') as f:
    data = pickle.load(f)
# Bare last expression: the notebook displays the whole loaded object
# (the stored output shows a two-column float array).
data

array([[-2.31638050e+00,  1.26969612e+02],
       [ 1.43003789e+00,  1.27063470e+02],
       [-5.39587093e-01,  1.25897475e+02],
       [ 8.82056170e-02,  1.25327145e+02],
       [-1.41889010e+00,  1.27049518e+02],
       [ 9.66290690e-01,  1.27235673e+02],
       [-1.35840746e-01,  1.27498257e+02],
       [ 1.69478101e-01,  1.29486548e+02],
       [ 1.00847567e+00,  1.26187844e+02],
       [-1.41722056e+00,  1.25884563e+02],
       [ 1.36222888e-01,  1.26846716e+02],
       [-1.97722313e+00,  1.26074635e+02],
       [ 7.10850944e-01,  1.27439980e+02],
       [ 2.87579889e+00,  1.27482732e+02],
       [ 1.87622882e-01,  1.28141428e+02],
       [ 1.35391312e+00,  1.26752882e+02],
       [ 5.71283901e-01,  1.26553203e+02],
       [ 4.93088229e-01,  2.54571943e+02],
       [ 8.85417745e-01,  2.54739842e+02],
       [ 1.21842399e+00,  2.56128363e+02],
       [-1.35907317e+00,  2.53083402e+02],
       [ 1.57709750e+00,  2.54950866e+02],
       [-1.43345420e+00,  2.54619629e+02],
       [-8.

# Part 1 : Finding components that best fit the data

We fit a GMM with 7 components to the data and color each point according to the component (cluster) it is assigned to.

In [29]:
# Fit a 7-component Gaussian mixture with a fixed seed, then label every
# point with its predicted component (fit() returns the fitted estimator).
gmm = GaussianMixture(n_components=7, random_state=0).fit(data)
colors = gmm.predict(data)

In [30]:
# Scatter plot of the dataset, one marker per point, colored by its GMM label
fig = go.Figure(
    data=[
        go.Scatter(
            x=data[:, 0],
            y=data[:, 1],
            mode='markers',
            marker={'color': colors, 'size': 5},
        )
    ]
)
fig.update_layout(
    title='GMM with 7 components',
    template='plotly_dark',
    width=800,
    height=800,
)
fig.show()

In [31]:
# Plotting the means of the components
# One marker per fitted component mean; marker colors are the values 1..7.
fig = go.Figure(
    data=[
        go.Scatter(
            x=gmm.means_[:, 0],
            y=gmm.means_[:, 1],
            mode='markers',
            marker={'color': list(range(1, 8)), 'size': 8},
        )
    ]
)
fig.update_layout(
    title='Means of the components',
    template='plotly_dark',
    width=800,
    height=800,
)
fig.show()

In [32]:
# Print each component's fitted covariance matrix, numbering components from 1
for component_number, covariance in enumerate(gmm.covariances_, start=1):
    print("Covariance matrix of component", component_number, ":")
    print(covariance)

Covariance matrix of component 1 :
[[ 1.19754533 -0.01923169]
 [-0.01923169  1.70463777]]
Covariance matrix of component 2 :
[[ 0.38531507 -0.21305776]
 [-0.21305776  0.64412607]]
Covariance matrix of component 3 :
[[1.63856537 0.49894624]
 [0.49894624 0.65573231]]
Covariance matrix of component 4 :
[[ 1.02962802 -0.03525602]
 [-0.03525602  0.57207645]]
Covariance matrix of component 5 :
[[0.72802356 0.03254854]
 [0.03254854 1.04653338]]
Covariance matrix of component 6 :
[[1.6740466  0.30152784]
 [0.30152784 0.89468541]]
Covariance matrix of component 7 :
[[1.09300903 0.05372962]
 [0.05372962 0.93233538]]


In [33]:
# Print the fitted mixing weights, heading on one line and the array on the next
print("Weights of the components :", gmm.weights_, sep="\n")

Weights of the components :
[0.2  0.15 0.09 0.17 0.14 0.17 0.08]


### Plotting the color components of the 2D color space

Each component mean gives the color of that component. We interpret the first coordinate (x-axis) as the red channel and the second coordinate (y-axis) as the green channel, and fix the blue channel to 0.

In [34]:
# Plotting the color components in a band
# Each component mean is drawn as two large markers (at x-2 and x+2 around the
# mean) filled with the color that the mean represents.
fig = go.Figure()
fig.update_layout(title='Color components', template='plotly_dark', width=800, height=800, xaxis_title='Red', yaxis_title='Green')
for component_mean in gmm.means_:
    # Interpret the mean as (red, green) with blue fixed at 0. Clamp to the
    # valid 0-255 channel range and convert to plain Python ints: str() of a
    # tuple of NumPy integers renders as "np.int64(...)" on NumPy >= 2, which
    # does not form a valid 'rgb(r, g, b)' color string.
    red, green = np.clip(component_mean, 0, 255).astype(int).tolist()
    fig.add_trace(go.Scatter(x=[component_mean[0]-2, component_mean[0]+2], y=[component_mean[1], component_mean[1]],
                             mode='markers',
                             marker=dict(size=20, color=f'rgb({red}, {green}, 0)')))
fig.show()

# Part 2 : Generating new dataset from the GMM

We utilise the generative property of GMMs to generate a sample dataset given the parameters of the GMM.

In [35]:
def generate_data(n_components, means, covariances, n_samples = 1000, weights=None, random_state=None):
    """Draw labelled samples from a 2D Gaussian mixture.

    Parameters
    ----------
    n_components : int
        Number of mixture components.
    means : array-like, shape (n_components, 2)
        Mean of each component.
    covariances : array-like, shape (n_components, 2, 2)
        Covariance of each component; each must be finite, symmetric and
        positive definite.
    n_samples : int, default 1000
        Number of points to draw.
    weights : array-like, shape (n_components,), optional
        Mixing weights (non-negative, summing to 1), e.g. a fitted model's
        ``weights_``. Defaults to equal weights for every component.
    random_state : None, int, numpy.random.RandomState or numpy.random.Generator
        Source of randomness. ``None`` uses NumPy's global random state;
        an int seeds a fresh ``RandomState`` for reproducible output.

    Returns
    -------
    (data, labels) : tuple of ndarray
        ``data`` has shape (n_samples, 2); ``labels`` has shape (n_samples,)
        and holds the (float-typed) index of the component each point came from.
        If any parameter is invalid, "Invalid parameters" is printed and
        ``None`` is returned instead.
    """
    means = np.asarray(means, dtype=float)
    covariances = np.asarray(covariances, dtype=float)

    # Comparing full shapes also rejects inputs of the wrong rank instead of
    # failing with an IndexError.
    if (n_components < 1
            or means.shape != (n_components, 2)
            or covariances.shape != (n_components, 2, 2)
            or not np.all(np.isfinite(means))):
        print("Invalid parameters")
        return

    for covariance in covariances:
        # A positive determinant alone is not enough: a negative-definite 2x2
        # matrix also has det > 0. Require symmetry and positive eigenvalues.
        if (not np.all(np.isfinite(covariance))
                or not np.allclose(covariance, covariance.T)
                or np.linalg.eigvalsh(covariance).min() <= 0.0):
            print("Invalid parameters")
            return

    if weights is None:
        weights = np.full(n_components, 1/n_components)
    else:
        weights = np.asarray(weights, dtype=float)
        if (weights.shape != (n_components,)
                or not np.all(np.isfinite(weights))
                or np.any(weights < 0)
                or not np.isclose(weights.sum(), 1.0)):
            print("Invalid parameters")
            return
        # Remove floating-point drift so the sampler's stricter sum-to-one
        # check on `p` cannot fail.
        weights = weights / weights.sum()

    if random_state is None:
        rng = np.random  # module-level functions use the global random state
    elif isinstance(random_state, (np.random.RandomState, np.random.Generator)):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Pick the component of every sample at once, then draw each component's
    # points in a single batch.
    component_ids = rng.choice(n_components, size=n_samples, p=weights)
    data = np.zeros((n_samples, 2))
    for component in range(n_components):
        selected = component_ids == component
        count = int(selected.sum())
        if count:
            data[selected] = rng.multivariate_normal(means[component], covariances[component], size=count)

    # Labels stay float-typed, as in the previous implementation.
    labels = component_ids.astype(float)
    return data, labels

In [36]:
# Generating the data
# Draw 100 samples from the fitted component means and covariances. No mixing
# weights are passed, so generate_data picks components with equal probability.
# Note: this rebinds `data`, which held the loaded dataset up to this point.
data, labels = generate_data(
    n_components=gmm.n_components,
    means=gmm.means_,
    covariances=gmm.covariances_,
    n_samples=100,
)

In [37]:
# Plotting the generated data
# One marker per generated point, colored by the component it was drawn from.
fig = go.Figure(
    data=[
        go.Scatter(
            x=data[:, 0],
            y=data[:, 1],
            mode='markers',
            marker={'color': labels, 'size': 5},
        )
    ]
)
fig.update_layout(
    title='Generated data',
    template='plotly_dark',
    width=800,
    height=800,
)
fig.show()