In [5]:
# imports
import numpy as np
import pandas as pd

# for plots
import plotly.express as px
import plotly.graph_objects as go

In [2]:
# load data
X = np.genfromtxt('J19_E1n_QuantifiedData.csv')

In [66]:
# plot X
fig = go.Figure(
    data=[
        go.Scatter(
            x=X[:, 0],
            y=X[:, 1],
            mode="markers"
        )
    ]
)

fig.update_layout(
    title="dataset X", 
    xaxis_title="x",
    yaxis_title="y",
)

fig.show()

# Question 1

In [8]:
# cluster X
from sklearn.cluster import KMeans

# built
km = KMeans(
    n_clusters=3,
    # for reproducibilty
    random_state=111
)

labels = km.fit_predict(X)

In [67]:
# plot X
fig = px.scatter(
    x=X[:, 0],
    y=X[:, 1],
    color=labels
)

fig.update_layout(
    title="Kmeas clusters",
    xaxis_title="x",
    yaxis_title="y",
)

fig.show()

# Question 2


In [19]:
from scipy.stats import multivariate_normal


def log_likelihood(X, theta, Z):
    n, _ = X.shape
    log_likelihood = 0

    tau_1,tau_2,mu_1_x,mu_1_y,mu_2_x,mu_2_y,mu_3_x,mu_3_y,sigma_1,sigma_2,sigma_3 = theta

    for i in range(n):
        if Z[i] == 0:
            pdf_x = multivariate_normal.pdf(
                x=X[i],
                mean=[mu_1_x, mu_1_y],
                cov=[[sigma_1, 0], [0, sigma_1]]
            )

            log_likelihood += np.log(tau_1 * pdf_x)

        if Z[i] == 1:
            pdf_x = multivariate_normal.pdf(
                x=X[i],
                mean=[mu_2_x, mu_2_y],
                cov=[[sigma_2, 0], [0, sigma_2]]
            )

            log_likelihood += np.log(tau_2 * pdf_x)

        if Z[i] == 2:
            pdf_x = multivariate_normal.pdf(
                x=X[i],
                mean=[mu_3_x, mu_3_y],
                cov=[[sigma_3, 0], [0, sigma_3]]
            )

            tau_3 = 1 - tau_1 - tau_2
            log_likelihood += np.log(tau_3 * pdf_x)


    return log_likelihood


# Question 3

La paramétrisation initiale qui nous parrait la plus judicieuse est de choisir les paramètres du Kmeans. En effet, nous avons testé un paramétrage aléatoire contre un paramétrage issu du Kmeans et il confirme notre hypothèse (une log vraisemblance inférieure). Nous tirons les paramètres du Kmeans par la méthode suivante :

- Nos groupons les classes par mean et std
- Nous prenons l'attribut x et y des mean et std
- Les "tau" represente la proportion de la classe dans la population toute entière.

In [23]:
# create df for X and its labels
X_df = pd.DataFrame(
    X, 
    columns=[
        "x", 
        "y"
    ]
)

X_df["z"] = labels

In [26]:
# mean of each cluster
X_df.groupby(by=["z"]).mean()

Unnamed: 0_level_0,x,y
z,Unnamed: 1_level_1,Unnamed: 2_level_1
0,-1.896134,1.950578
1,2.025616,1.81079
2,-0.029758,-0.373303


In [27]:
# std of each cluster
X_df.groupby(by=["z"]).std()

Unnamed: 0_level_0,x,y
z,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.437597,0.388692
1,0.528277,0.572065
2,0.778713,0.625238


In [35]:
# theta choosen
theta = (
    # tau
    len(X_df.query("z == 0")) / len(X_df), len(X_df.query("z == 1")) / len(X_df),
    # means 1
    -1.896134, 1.950578,
    # means 2
    2.025616, 1.810790,
    # means 3
    -0.029758, -0.373303,
    # std 1
    np.mean([0.437597,	0.388692]),
    # std 2
    np.mean([0.528277,	0.572065]),
    # std 3
    np.mean([0.778713,	0.625238])
)

log_likelihood(X, theta, labels)

-423.8999105939964

# Question 4

In [38]:
# function to compute proba of observation belonging to gaussian
def pdf(x, mu_x, mu_y, sigma):

    pdf_x = multivariate_normal.pdf(
            x=x,
            mean=[mu_x, mu_y],
            cov=[[sigma, 0], [0, sigma]]
        )

    return pdf_x

In [44]:
# proba classes
tau_1 = len(X_df.query("z == 0")) / len(X_df)
tau_2 = len(X_df.query("z == 1")) / len(X_df)
tau_3 = 1 - (tau_1 + tau_2)

In [45]:
def denominator(x):
    
    pdf_1 = pdf(
            x,
            # mean
            -1.896134, 1.950578,
            # std
            np.mean([0.437597,	0.388692])
        )

    pdf_2 = pdf(
            x,
            # mean
            2.025616, 1.810790,
            # std
            np.mean([0.528277,	0.572065]),
        )

    pdf_3 = pdf(
            x,
            # mean
            -0.029758, -0.373303,
            # std
            np.mean([0.778713,	0.625238])
        )
    
    return tau_1 * pdf_1 + tau_2 * pdf_2 + tau_3 * pdf_3

## groupe 1 probas

In [69]:
proba_groupe_1 = []

for x in X:
    pdf_1_ = pdf(
            x,
            # mean
            -1.896134, 1.950578,
            # std
            np.mean([0.437597,	0.388692])
        )

    proba_groupe_1.append(pdf_1_ * tau_1 / denominator(x))


# plot X
fig = px.scatter(
    x=X[:, 0],
    y=X[:, 1],
    color=proba_groupe_1
)

fig.update_layout(
    title="Group 1 probabilities",
    xaxis_title="x",
    yaxis_title="y",
)

fig.show()

In [70]:
proba_groupe_2 = []

for x in X:
    pdf_2_ = pdf(
            x,
            # mean
            2.025616, 1.810790,
            # std
            np.mean([0.528277,	0.572065]),
        )
    
    proba_groupe_2.append(pdf_2_ * tau_2 / denominator(x))


# plot X
fig = px.scatter(
    x=X[:, 0],
    y=X[:, 1],
    color=proba_groupe_2
)

fig.update_layout(
    title="Group 2 probabilities",
    xaxis_title="x",
    yaxis_title="y",
)

fig.show()

In [71]:
proba_groupe_3 = []

for x in X:
    pdf_3_ = pdf(
            x,
            # mean
            -0.029758, -0.373303,
            # std
            np.mean([0.778713,	0.625238])
        )

    proba_groupe_3.append(pdf_3_ * tau_3 / denominator(x))


# plot X
fig = px.scatter(
    x=X[:, 0],
    y=X[:, 1],
    color=proba_groupe_3
)

fig.update_layout(
    title="Group 3 probabilities",
    xaxis_title="x",
    yaxis_title="y",
)

fig.show()

# Question 5

In [50]:
def funct_Q(X, theta, probabilities):
    Q_score = 0
    n, _ = X.shape
    tau_1,tau_2,mu_1_x,mu_1_y,mu_2_x,mu_2_y,mu_3_x,mu_3_y,sigma_1,sigma_2,sigma_3 = theta

    # gather params by group
    dict_z1 = {
        "tau": tau_1,
        "mu_x": mu_1_x,
        "mu_y": mu_1_y,
        "sigma": sigma_1,
    }

    dict_z2 = {
        "tau": tau_2,
        "mu_x": mu_2_x,
        "mu_y": mu_2_y,
        "sigma": sigma_2,
    }

    dict_z3 = {
        "tau": 1 - (tau_1 + tau_2),
        "mu_x": mu_3_x,
        "mu_y": mu_3_y,
        "sigma": sigma_3,
    }

    groupes = [dict_z1, dict_z2, dict_z3]

    # compute the Q-score
    for i in range(n):
        for k in range(3):
            tau = groupes[k]["tau"]
            mu_x = groupes[k]["mu_x"]
            mu_y = groupes[k]["mu_y"]
            sigma = groupes[k]["sigma"]


            pdf_ = multivariate_normal.pdf(
                x=X[i],
                mean=[mu_x, mu_y],
                cov=[[sigma, 0], [0, sigma]]
            )

        Q_score += probabilities[i, k] * np.log(tau * pdf_)

    return Q_score


# Question 6

Test de la fonction `funct_Q`

In [72]:
probas = np.array(
    [
        proba_groupe_1,
        proba_groupe_2,
        proba_groupe_3
    ], 
    dtype=float
)

probas = probas.T

In [73]:
funct_Q(X, theta, probas)

-128.4038380913976