# Mixture Modeling

Finding the separate distributions that make up a combined data set. The algorithm used for this is called Expectation Maximization (EM).

In [1]:
using Plots
gr()  # select the GR backend for Plots

Plots.GRBackend()

In [21]:
# functions for probability density function
# calculating probabilities under the mixture model
# and estimating new parameters for the model
"""
    pdf(x, μ, σ²)

Evaluate the density of a normal distribution with mean `μ` and variance `σ²`
(the variance, not the standard deviation) at the point `x`.
"""
function pdf(x, μ, σ²)
    twoσ² = 2 * σ²
    normalizer = (twoσ² * π)^(-0.5)   # 1 / sqrt(2πσ²)
    return normalizer * exp(-((x - μ)^2) / twoσ²)
end

"""
    likelihood(x, a, μ, σ²)

Expectation step of EM. For every observation in `x` and every mixture component
`c` (weight `a[c]`, mean `μ[c]`, variance `σ²[c]`), compute the weighted density
`a[c] * pdf(x[i], μ[c], σ²[c])` and normalise each row so that it sums to one.

Returns a `length(x) × length(a)` matrix `r`, where `r[i, c]` is the normalised
probability that observation `i` belongs to component `c`. `x` may also be a single
number, in which case the result has one row.

If every weighted density in a row is zero, that row becomes `NaN` (0/0).
"""
function likelihood(x, a, μ, σ²)
    # Linear indexing via `length` so that a scalar `x` works as well as an array.
    weighted = [a[c] * pdf(x[i], μ[c], σ²[c]) for i in 1:length(x), c in 1:length(a)]
    # The reduction dimension is a keyword on Julia ≥ 1.0; positional `sum(r, 2)` throws.
    return weighted ./ sum(weighted, dims=2)
end

"""
    dist_estimate(x, r)

Maximisation step of EM. Given the data `x` (one observation per row of `r`) and
the membership matrix `r` (observations × components), re-estimate for each
component:

- `ac`:  mixture weight, the column mean of `r`
- `μc`:  mean, the `r`-weighted average of `x`
- `σ²c`: variance, the `r`-weighted average of the squared deviation from `μc`

Each result is a `1 × size(r, 2)` row matrix.
"""
function dist_estimate(x, r)
    # Total membership mass per component. The reduction dimension is a keyword on
    # Julia ≥ 1.0; the positional forms `sum(r, 1)` / `mean(r, 1)` throw there.
    mass = sum(r, dims=1)
    # Column mean written as sum / row count, so `Statistics` is not required.
    ac = mass ./ size(r, 1)
    μc = sum(r .* x, dims=1) ./ mass
    σ²c = sum(r .* abs2.(x .- μc), dims=1) ./ mass
    return ac, μc, σ²c
end

dist_estimate (generic function with 1 method)

In [28]:
# Create some data in a single vector that in reality comes from two different distributions
# These parameters are in practice unknown. We only have the data x.
true_μ = [-0.6 0.7]
true_σ² = [0.5^2 0.2^2]
# 1000 draws from each Gaussian, stacked into one 2000×1 column.
# Adding a scalar to an array needs an explicit broadcast (`.+`) on Julia ≥ 1.0.
x = [
    true_μ[1] .+ randn(1000, 1) * sqrt(true_σ²[1])
    true_μ[2] .+ randn(1000, 1) * sqrt(true_σ²[2])
];

In [29]:
# Initialize parameters for some distributions.
# These parameters will be adjusted to fit the true distribution parameters
# Starting guess: two nearly coincident means with equal variances and equal weights.
# Everything is kept as a 1×2 row matrix.
μ = [-0.01 0.01]
σ² = fill(0.2^2, 1, 2)
a = fill(0.5, 1, 2);

In [32]:
#adjust distribution parameters to fit the true (unknown) distribution parameters of data
"""
    EM(a, μ, σ², x, Nsteps)

Run `Nsteps` rounds of Expectation Maximization on the data `x`, starting from the
mixture weights `a`, means `μ` and variances `σ²`. Returns the final
`(a, μ, σ²)`; with `Nsteps == 0` the starting values are returned unchanged.
"""
function EM(a, μ, σ², x, Nsteps)
    weights, means, variances = a, μ, σ²
    for _ in 1:Nsteps
        memberships = likelihood(x, weights, means, variances)    # (E)xpectation
        weights, means, variances = dist_estimate(x, memberships) # (M)aximize
    end
    return weights, means, variances
end

# Fit the mixture with 500 EM iterations, then compare against the ground truth.
a, μ, σ² = EM(a, μ, σ², x, 500)

# On Julia ≥ 1.0 the number of decimals is the `digits` keyword of `round`;
# the positional form `round(v, 3)` throws there.
println("The algorihtm estimated these parameters")
println("μ = ", round.(μ, digits=3))
println("σ² = ", round.(σ², digits=3))

println("\nThe true (unknown to the algorihtm) parameters were")
println("true_μ = ", round.(true_μ, digits=3))
println("true_σ² = ", round.(true_σ², digits=3))

The algorihtm estimated these parameters
μ = [-0.569 0.712]
σ² = [0.246 0.036]

The true (unknown to the algorihtm) parameters were
true_μ = [-0.6 0.7]
true_σ² = [0.25 0.04]


In [33]:
#we can now estimate how likely a data point is to come from either distribution
"""
    print_likelihood(x, a, μ, σ²)

Print, as percentages rounded to one decimal, how strongly the single value `x`
is attributed to each component of the mixture described by weights `a`, means
`μ` and variances `σ²`. One line is printed per component, followed by a blank line.
"""
function print_likelihood(x, a, μ, σ²)
    r = likelihood(x, a, μ, σ²)
    println("likelihood that the value ",x," belongs to")
    # One line per component instead of a hard-coded pair; `r` has a single row
    # because `x` is a single value.
    for c in 1:size(r, 2)
        # `digits` is a keyword on Julia ≥ 1.0; positional `round(v, 1)` throws.
        println("cluster ", c, ": ", round(r[1, c]*100, digits=1), "%")
    end
    println()
end

# Query three sample values against the fitted mixture.
for value in (0.5, 0.2, -1.0)
    print_likelihood(value, a, μ, σ²)
end

likelihood that the value 0.5 belongs to
cluster 1: 6.6%
cluster 2: 93.4%

likelihood that the value 0.2 belongs to
cluster 1: 81.5%
cluster 2: 18.5%

likelihood that the value -1.0 belongs to
cluster 1: 100.0%
cluster 2: 0.0%



## What does EM do? (visualization)

In [35]:
#reset parameters for visualization
# Back to the initial guess: nearly coincident means, equal variances, equal weights.
μ = [-0.01 0.01]
σ² = fill(0.2^2, 1, 2)
a = fill(0.5, 1, 2)

"""
    visualize_EM(a, μ, σ², x, Nsteps)

Run `Nsteps` damped EM iterations on the data `x` and record one animation frame
per iteration with Plots' `@gif`. Each iteration moves the parameters only 5% of
the way toward the fresh EM estimate so that the fit evolves slowly enough to
watch. Every frame shows the two weighted component densities on a grid from
-2.5 to 2.5.

Returns the final `(a, μ, σ²)`.
"""
function visualize_EM(a, μ, σ², x, Nsteps)
    # Keep 95% of the current value and take 5% of the new estimate.
    blend(current, estimate) = 0.95*current + 0.05*estimate
    grid = collect(-2.5:0.01:2.5)

    @gif for iter in 1:Nsteps
        memberships = likelihood(x, a, μ, σ²) # (E)xpectation

        a_est, μ_est, σ²_est = dist_estimate(x, memberships)
        a = blend(a, a_est)
        μ = blend(μ, μ_est)
        σ² = blend(σ², σ²_est)

        # weighted density of each component along the grid
        curve1 = a[1] * pdf.(grid, μ[1], σ²[1])
        curve2 = a[2] * pdf.(grid, μ[2], σ²[2])
        plot(
            grid,
            [curve1 curve2],
            ylim=(0,1),
            xlim=(-3,3),
            title="mixture modeling of 2 gaussians",
            leg=false,
        )
    end every 1
    return a, μ, σ²
end

# Animate 200 damped EM steps from the reset parameters; the returned tuple is the final estimate.
a, μ, σ² = visualize_EM(a, μ, σ², x, 200)

[1m[36mINFO: [39m[22m[36mSaved animation to /home/andy/dev/githubnotebooks/mixture_modeling/tmp.gif
[39m

([0.517414 0.482586], [-0.537592 0.713361], [0.276737 0.036298])

<img src="tmp.gif">