<a href="https://colab.research.google.com/github/DorotaJanosz/machine-learning-bootcamp/blob/master/unsupervised/01_clustering/06_clustering_comparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import bibliotek

In [2]:
import numpy as np
import pandas as pd
import plotly.express as px

### Wygenerowanie danych i wizualizacja

In [3]:
from sklearn.datasets import make_blobs

# Isotropic Gaussian blobs; only the coordinates are kept, the generated
# labels are discarded. random_state is fixed, so the sample is reproducible.
blobs_data, _ = make_blobs(
    n_samples=1000,
    cluster_std=0.7,
    center_box=(-4.0, 4.0),
    random_state=24,
)
blobs = pd.DataFrame(blobs_data, columns=['x1', 'x2'])

# Last expression of the cell -> rendered as the cell output.
px.scatter(
    blobs,
    x='x1',
    y='x2',
    title='blobs data',
    template='plotly_dark',
    width=950,
    height=500,
)

In [4]:
from sklearn.datasets import make_circles

# Two concentric noisy rings; only the coordinates are kept, the generated
# labels are discarded. random_state is fixed so that the sample -- and every
# clustering result built on it below -- is identical after Restart & Run All.
circle_data, _ = make_circles(n_samples=1000, factor=0.5, noise=0.05, random_state=42)
circles = pd.DataFrame(circle_data, columns=['x1', 'x2'])

# Last expression of the cell -> rendered as the cell output.
px.scatter(circles, x='x1', y='x2', width=950, height=500, template='plotly_dark', title='circle data')

In [5]:
from sklearn.datasets import make_moons

# Two interleaving noisy half-circles; only the coordinates are kept, the
# generated labels are discarded. random_state is fixed so that the sample --
# and every clustering result built on it below -- is reproducible.
moons_data, _ = make_moons(n_samples=1000, noise=0.05, random_state=42)
moons = pd.DataFrame(moons_data, columns=['x1', 'x2'])

# Last expression of the cell -> rendered as the cell output.
px.scatter(moons, x='x1', y='x2', width=950, height=500, template='plotly_dark', title='moon data')

In [6]:
# Structureless baseline: 1500 points drawn uniformly from [0, 1) x [0, 1).
# A seeded Generator replaces the unseeded global np.random.rand so the sample
# is reproducible after Restart & Run All.
random_data = np.random.default_rng(42).random((1500, 2))

# NOTE: the name `random` is kept because later cells reference it; be aware it
# would shadow the stdlib `random` module if that were ever imported.
random = pd.DataFrame(random_data, columns=['x1', 'x2'])

# Last expression of the cell -> rendered as the cell output.
px.scatter(random, x='x1', y='x2', width=950, height=500, title='random data', template='plotly_dark')

### Porównanie algorytmów - blobs data - 3 klastry

In [7]:
from plotly.subplots import make_subplots
from sklearn.cluster import AgglomerativeClustering, DBSCAN, KMeans

# Side-by-side comparison of three clustering algorithms on the blobs data.
# KMeans is seeded and n_init is pinned so the result depends neither on the
# run nor on the scikit-learn version's default for n_init.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
agglo = AgglomerativeClustering(n_clusters=3)
# DBSCAN takes no cluster count; it derives clusters from eps / min_samples.
dbscan = DBSCAN(eps=0.5, min_samples=5)

# Panel title -> estimator, in left-to-right plotting order.
panels = {'KMeans': kmeans, 'Agglomerative Clustering': agglo, 'DBSCAN': dbscan}

fig = make_subplots(
    rows=1,
    cols=len(panels),
    horizontal_spacing=0.03,
    shared_yaxes=True,
    subplot_titles=list(panels),
)

for col, model in enumerate(panels.values(), start=1):
    labels = model.fit_predict(blobs_data)
    # Colour by label on a copy of the frame so `blobs` itself is not mutated.
    # Only the trace is reused; the px figure's own layout is discarded.
    trace = px.scatter(blobs.assign(cluster=labels), x='x1', y='x2', color='cluster')['data'][0]
    fig.add_trace(trace, row=1, col=col)

# All three traces reference the shared `coloraxis`, styled once here.
fig.update_layout(
    template='plotly_dark',
    title='KMeans vs. Agglomerative Clustering vs. DBSCAN - blobs data',
    coloraxis={'colorscale': 'viridis'},
)
fig.show()

### Porównanie algorytmów - circle data - 2 klastry

In [8]:
# Side-by-side comparison of three clustering algorithms on the circle data.
# make_subplots and the sklearn estimators are imported in the blobs
# comparison cell above.
# KMeans is seeded and n_init is pinned so the result depends neither on the
# run nor on the scikit-learn version's default for n_init.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
agglo = AgglomerativeClustering(n_clusters=2)
# DBSCAN takes no cluster count; it derives clusters from eps / min_samples.
dbscan = DBSCAN(eps=0.1, min_samples=5)

# Panel title -> estimator, in left-to-right plotting order.
panels = {'KMeans': kmeans, 'Agglomerative Clustering': agglo, 'DBSCAN': dbscan}

fig = make_subplots(
    rows=1,
    cols=len(panels),
    horizontal_spacing=0.03,
    shared_yaxes=True,
    subplot_titles=list(panels),
)

for col, model in enumerate(panels.values(), start=1):
    labels = model.fit_predict(circle_data)
    # Colour by label on a copy of the frame so `circles` itself is not mutated.
    # Only the trace is reused; the px figure's own layout is discarded.
    trace = px.scatter(circles.assign(cluster=labels), x='x1', y='x2', color='cluster')['data'][0]
    fig.add_trace(trace, row=1, col=col)

# All three traces reference the shared `coloraxis`, styled once here.
fig.update_layout(
    template='plotly_dark',
    title='KMeans vs. Agglomerative Clustering vs. DBSCAN - circle data',
    coloraxis={'colorscale': 'viridis'},
)
fig.show()

### Porównanie algorytmów - random data - 5 klastrów

In [10]:
# Side-by-side comparison of three clustering algorithms on the random data.
# make_subplots and the sklearn estimators are imported in the blobs
# comparison cell above.
# KMeans is seeded and n_init is pinned so the result depends neither on the
# run nor on the scikit-learn version's default for n_init.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
agglo = AgglomerativeClustering(n_clusters=5)
# DBSCAN takes no cluster count; it derives clusters from eps / min_samples.
dbscan = DBSCAN(eps=0.1, min_samples=5)

# Panel title -> estimator, in left-to-right plotting order.
panels = {'KMeans': kmeans, 'Agglomerative Clustering': agglo, 'DBSCAN': dbscan}

fig = make_subplots(
    rows=1,
    cols=len(panels),
    horizontal_spacing=0.03,
    shared_yaxes=True,
    subplot_titles=list(panels),
)

for col, model in enumerate(panels.values(), start=1):
    labels = model.fit_predict(random_data)
    # Colour by label on a copy of the frame so `random` itself is not mutated.
    # Only the trace is reused; the px figure's own layout is discarded.
    trace = px.scatter(random.assign(cluster=labels), x='x1', y='x2', color='cluster')['data'][0]
    fig.add_trace(trace, row=1, col=col)

# All three traces reference the shared `coloraxis`, styled once here.
fig.update_layout(
    template='plotly_dark',
    title='KMeans vs. Agglomerative Clustering vs. DBSCAN - random data',
    coloraxis={'colorscale': 'viridis'},
)
fig.show()

### Porównanie algorytmów - moons data - 2 klastry

In [13]:
# Side-by-side comparison of three clustering algorithms on the moons data.
# make_subplots and the sklearn estimators are imported in the blobs
# comparison cell above.
# KMeans is seeded and n_init is pinned so the result depends neither on the
# run nor on the scikit-learn version's default for n_init.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
agglo = AgglomerativeClustering(n_clusters=2)
# DBSCAN takes no cluster count; it derives clusters from eps / min_samples.
dbscan = DBSCAN(eps=0.1, min_samples=5)

# Panel title -> estimator, in left-to-right plotting order.
panels = {'KMeans': kmeans, 'Agglomerative Clustering': agglo, 'DBSCAN': dbscan}

fig = make_subplots(
    rows=1,
    cols=len(panels),
    horizontal_spacing=0.03,
    shared_yaxes=True,
    subplot_titles=list(panels),
)

for col, model in enumerate(panels.values(), start=1):
    labels = model.fit_predict(moons_data)
    # Colour by label on a copy of the frame so `moons` itself is not mutated.
    # Only the trace is reused; the px figure's own layout is discarded.
    trace = px.scatter(moons.assign(cluster=labels), x='x1', y='x2', color='cluster')['data'][0]
    fig.add_trace(trace, row=1, col=col)

# All three traces reference the shared `coloraxis`, styled once here.
fig.update_layout(
    template='plotly_dark',
    title='KMeans vs. Agglomerative Clustering vs. DBSCAN - moons data',
    coloraxis={'colorscale': 'viridis'},
)
fig.show()