<a href="https://colab.research.google.com/github/DorotaJanosz/machine-learning-bootcamp/blob/master/unsupervised/01_clustering/01_kmeans_implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import bibliotek

In [1]:
import pandas as pd
import numpy as np
from numpy.linalg import norm
import random
import plotly.express as px
import plotly.graph_objects as go

# Seed both global random number generators so that a fresh
# "Restart & Run All" yields the same random draws every time.
random.seed(41)      # stdlib generator, used below for the starting centroids
np.random.seed(42)   # NumPy's legacy global generator

# Print NumPy arrays with 6 digits of precision.
np.set_printoptions(precision=6)

### Wygenerowanie danych

In [2]:
from sklearn.datasets import make_blobs

# Generate 40 points grouped around 2 blob centres. Only the first element of
# the value returned by make_blobs (the point coordinates) is kept.
data = make_blobs(
    n_samples=40,
    centers=2,
    cluster_std=1.0,
    center_box=(-4.0, 4.0),
    random_state=42,
)[0]

# Wrap the coordinates in a DataFrame with one named column per feature.
df = pd.DataFrame(data=data, columns=['x1', 'x2'])

# Preview the first ten rows.
df.head(n=10)

Unnamed: 0,x1,x2
0,0.37743,0.069424
1,2.217347,2.327304
2,1.376777,0.603609
3,-1.467097,3.139985
4,-1.605386,5.457993
5,-0.181134,2.384871
6,-0.936151,2.180966
7,-0.794815,1.646044
8,-1.017176,2.548004
9,-2.331865,3.802576


### Wizualizacja danych

In [3]:
# Scatter plot of the raw points, before any cluster labels exist.
fig = px.scatter(
    df,
    x='x1',
    y='x2',
    title='Algorytm K-średnich',
    width=950,
    height=500,
)
# Enlarge the markers. update_traces returns the figure, so as the last
# expression of the cell it also renders the plot.
fig.update_traces(marker=dict(size=12))

### Implementacja algorytmu K-średnich

In [4]:
# Determine the boundary values of the data: the smallest and largest value
# of each feature. The starting centroids are later drawn inside this box.
x1_min, x1_max = df.x1.min(), df.x1.max()
x2_min, x2_max = df.x2.min(), df.x2.max()

# First line: range of x1; second line: range of x2.
print(x1_min, x1_max)
print(x2_min, x2_max)

-2.728596881734133 3.333845579232757
-1.1983010410246 5.457992635788267


In [5]:
# Randomly generate the coordinates of both centroids inside the bounding box
# of the data. random.uniform draws from a uniform distribution, so every
# position within the range is equally likely.
# The draw order is x1 then x2 for centroid_1, followed by x1 then x2 for
# centroid_2.
bounds = ((x1_min, x1_max), (x2_min, x2_max))
centroid_1 = np.array([random.uniform(low, high) for low, high in bounds])
centroid_2 = np.array([random.uniform(low, high) for low, high in bounds])

print(centroid_1, centroid_2, sep='\n')

[-0.418681  0.337434]
[-1.722009  4.884443]


In [6]:
# Visualise the so-called starting points of the centroids on top of the data.
fig = px.scatter(
    df,
    x='x1',
    y='x2',
    title='Algorytm K-średnich - inicjalizacja centoidów',
    width=950,
    height=500,
)
# Each centroid is drawn as its own single-point trace with a thick outline.
fig.add_trace(go.Scatter(
    name='centroid_1',
    mode='markers',
    x=[centroid_1[0]],
    y=[centroid_1[1]],
    marker=dict(line=dict(width=3)),
))
fig.add_trace(go.Scatter(
    name='centroid_2',
    mode='markers',
    x=[centroid_2[0]],
    y=[centroid_2[1]],
    marker=dict(line=dict(width=3)),
))
# Same marker size for every trace; the legend is hidden.
fig.update_traces(showlegend=False, marker=dict(size=12))


In [7]:
# Assign every point to its nearest centroid.
# With default arguments norm() computes the Euclidean norm, so each value is
# the straight-line distance between the point and a centroid.
# A point is labelled 2 only when centroid_2 is strictly closer; ties go to 1.
clusters = [
  2 if norm(centroid_1 - sample) > norm(centroid_2 - sample) else 1
  for sample in data
]

# Store the labels next to the coordinates and preview the result.
df['cluster'] = clusters
df.head()

Unnamed: 0,x1,x2,cluster
0,0.37743,0.069424,1
1,2.217347,2.327304,1
2,1.376777,0.603609,1
3,-1.467097,3.139985,2
4,-1.605386,5.457993,2


In [8]:
# Visualise the assignment: points are coloured by their cluster label and the
# starting centroids are drawn on top.
fig = px.scatter(
    df,
    x='x1',
    y='x2',
    color='cluster',
    title='Algorytm K-średnich - iteracja 1 - przpisanie puntów do najbliższego z centroidów',
    width=950,
    height=500,
)
fig.add_trace(go.Scatter(
    name='centroid_1',
    mode='markers',
    x=[centroid_1[0]],
    y=[centroid_1[1]],
    marker=dict(line=dict(width=3)),
))
fig.add_trace(go.Scatter(
    name='centroid_2',
    mode='markers',
    x=[centroid_2[0]],
    y=[centroid_2[1]],
    marker=dict(line=dict(width=3)),
))
fig.update_traces(showlegend=False, marker=dict(size=12))

In [9]:
# Compute the new centroid positions: each centroid moves to the mean x1/x2
# of the points currently assigned to its cluster.
members_1 = df[df['cluster'] == 1]
members_2 = df[df['cluster'] == 2]

new_centroid_1 = [members_1['x1'].mean(), members_1['x2'].mean()]
new_centroid_2 = [members_2['x1'].mean(), members_2['x2'].mean()]

print(new_centroid_1, new_centroid_2)

[1.1057883070617118, 1.1784295565923744] [-1.322466347597972, 3.7664746248027723]


In [10]:
# Visualise the updated centroids: the starting centroids (centroid_*) are
# drawn first, followed by their recomputed positions (new_centroid_*).
fig = px.scatter(
    df,
    x='x1',
    y='x2',
    color='cluster',
    title='Algorytm K-średnich - iteracja 1 - przypisanie puntów do najbliższego z centroidów',
    width=950,
    height=500,
)
fig.add_trace(go.Scatter(
    name='centroid_1',
    mode='markers',
    x=[centroid_1[0]],
    y=[centroid_1[1]],
    marker=dict(line=dict(width=3)),
))
fig.add_trace(go.Scatter(
    name='centroid_2',
    mode='markers',
    x=[centroid_2[0]],
    y=[centroid_2[1]],
    marker=dict(line=dict(width=3)),
))
fig.add_trace(go.Scatter(
    name='new_centroid_1',
    mode='markers',
    x=[new_centroid_1[0]],
    y=[new_centroid_1[1]],
    marker=dict(line=dict(width=3)),
))
fig.add_trace(go.Scatter(
    name='new_centroid_2',
    mode='markers',
    x=[new_centroid_2[0]],
    y=[new_centroid_2[1]],
    marker=dict(line=dict(width=3)),
))
fig.update_traces(showlegend=False, marker=dict(size=12))


In [11]:
# Same view as before, but showing only the recomputed centroids.
fig = px.scatter(
    df,
    x='x1',
    y='x2',
    color='cluster',
    title='Algorytm K-średnich - iteracja 1 - przypisanie puntów do najbliższego z centroidów',
    width=950,
    height=500,
)
fig.add_trace(go.Scatter(
    name='new_centroid_1',
    mode='markers',
    x=[new_centroid_1[0]],
    y=[new_centroid_1[1]],
    marker=dict(line=dict(width=3)),
))
fig.add_trace(go.Scatter(
    name='new_centroid_2',
    mode='markers',
    x=[new_centroid_2[0]],
    y=[new_centroid_2[1]],
    marker=dict(line=dict(width=3)),
))
fig.update_traces(showlegend=False, marker=dict(size=12))

In [12]:
# Re-assign every point to its nearest centroid, this time measuring the
# distance to the centroids recomputed after the first assignment.
# A point is labelled 2 only when new_centroid_2 is strictly closer; ties go to 1.
clusters = [
  2 if norm(new_centroid_1 - sample) > norm(new_centroid_2 - sample) else 1
  for sample in data
]

# Overwrite the previous labels and preview the result.
df['cluster'] = clusters
df.head()

Unnamed: 0,x1,x2,cluster
0,0.37743,0.069424,1
1,2.217347,2.327304,1
2,1.376777,0.603609,1
3,-1.467097,3.139985,2
4,-1.605386,5.457993,2


In [13]:
# Points coloured by their second-iteration labels, together with the
# centroids (new_centroid_*) that were used for this assignment.
fig = px.scatter(
    df,
    x='x1',
    y='x2',
    color='cluster',
    title='Algorytm K-średnich - iteracja 2 - przpisanie puntów do najbliższego z centroidów',
    width=950,
    height=500,
)
fig.add_trace(go.Scatter(
    name='centroid_1',
    mode='markers',
    x=[new_centroid_1[0]],
    y=[new_centroid_1[1]],
    marker=dict(line=dict(width=3)),
))
fig.add_trace(go.Scatter(
    name='centroid_2',
    mode='markers',
    x=[new_centroid_2[0]],
    y=[new_centroid_2[1]],
    marker=dict(line=dict(width=3)),
))
fig.update_traces(showlegend=False, marker=dict(size=12))

In [14]:
# Update the centroids: each one moves to the mean x1/x2 of the points that
# carry its label after the second assignment.
members_1 = df[df['cluster'] == 1]
members_2 = df[df['cluster'] == 2]

new_2_centroid_1 = [members_1['x1'].mean(), members_1['x2'].mean()]
new_2_centroid_2 = [members_2['x1'].mean(), members_2['x2'].mean()]

print(new_2_centroid_1, new_2_centroid_2)

[1.4180663576886534, 1.0311646927855822] [-1.3164446535062433, 3.5090014873277737]


In [15]:
# Show the centroid update of iteration 2: the centroids used for the latest
# assignment (new_centroid_*) next to the freshly recomputed ones
# (new_2_centroid_*).
fig = px.scatter(data_frame=df, x='x1', y='x2', width=950, height=500, color='cluster', title='Algorytm K-średnich - iteracja 2 - przypisanie puntów do najbliższego z centroidów')
# Previous centroids first, updated centroids second. This is the same trace
# order and naming as in the iteration-1 update plot, so the "new_centroid_*"
# traces really mark the newest positions (the labels used to be swapped).
fig.add_trace(go.Scatter(x=[new_centroid_1[0]], y=[new_centroid_1[1]], name='centroid_1', mode='markers', marker_line_width=3))
fig.add_trace(go.Scatter(x=[new_centroid_2[0]], y=[new_centroid_2[1]], name='centroid_2', mode='markers', marker_line_width=3))
fig.add_trace(go.Scatter(x=[new_2_centroid_1[0]], y=[new_2_centroid_1[1]], name='new_centroid_1', mode='markers', marker_line_width=3))
fig.add_trace(go.Scatter(x=[new_2_centroid_2[0]], y=[new_2_centroid_2[1]], name='new_centroid_2', mode='markers', marker_line_width=3))
# Same marker size for every trace; the legend is hidden.
fig.update_traces(marker_size=12, showlegend=False)

In [16]:
# Re-assign every point to its nearest centroid, now measuring the distance
# to the centroids produced by the second update (new_2_centroid_*).
# A point is labelled 2 only when new_2_centroid_2 is strictly closer; ties go to 1.
clusters = [
  2 if norm(new_2_centroid_1 - sample) > norm(new_2_centroid_2 - sample) else 1
  for sample in data
]

# Overwrite the previous labels and preview the result.
df['cluster'] = clusters
df.head()

Unnamed: 0,x1,x2,cluster
0,0.37743,0.069424,1
1,2.217347,2.327304,1
2,1.376777,0.603609,1
3,-1.467097,3.139985,2
4,-1.605386,5.457993,2


In [17]:
# Points coloured by the labels of the third assignment, together with the
# centroids (new_2_centroid_*) that the assignment was computed against.
# The title names iteration 3: this is the assignment that follows the second
# centroid update, not a repeat of iteration 2.
fig = px.scatter(data_frame=df, x='x1', y='x2', width=950, height=500, color='cluster', title='Algorytm K-średnich - iteracja 3 - przypisanie punktów do najbliższego z centroidów')
fig.add_trace(go.Scatter(x=[new_2_centroid_1[0]], y=[new_2_centroid_1[1]], name='centroid_1', mode='markers', marker_line_width=3))
fig.add_trace(go.Scatter(x=[new_2_centroid_2[0]], y=[new_2_centroid_2[1]], name='centroid_2', mode='markers', marker_line_width=3))
# Same marker size for every trace; the legend is hidden.
fig.update_traces(marker_size=12, showlegend=False)

### Implementacja algorytmu K-średnich - podsumowanie

In [23]:
from sklearn.datasets import make_blobs

# Complete K-means run in one cell: regenerate the data, draw random starting
# centroids, then alternate the assignment and update steps.
data = make_blobs(n_samples=40, centers=2, cluster_std=1.0, center_box=(-4.0, 4.0), random_state=42)[0]
df = pd.DataFrame(data, columns=['x1', 'x2'])

# Bounding box of the data; the starting centroids are drawn inside it.
x1_min = df['x1'].min()
x1_max = df['x1'].max()

x2_min = df['x2'].min()
x2_max = df['x2'].max()

# These draws continue the sequence of the global `random` generator.
centroid_1 = np.array([random.uniform(x1_min, x1_max), random.uniform(x2_min, x2_max)])
centroid_2 = np.array([random.uniform(x1_min, x1_max), random.uniform(x2_min, x2_max)])

max_iterations = 10  # upper bound; the loop stops earlier once it has converged
previous_clusters = None

for i in range(max_iterations):
  # Assignment step: label each point with its nearest centroid.
  # Label 2 is used only when centroid_2 is strictly closer; ties go to 1.
  clusters = []
  for point in data:
    centroid_1_dist = norm(centroid_1 - point)
    centroid_2_dist = norm(centroid_2 - point)
    cluster = 1
    if centroid_1_dist > centroid_2_dist:
      cluster = 2
    clusters.append(cluster)

  # Converged: identical labels mean the centroids computed from them would be
  # identical too, so every further pass would repeat this one.
  if clusters == previous_clusters:
    break
  previous_clusters = clusters

  df['cluster'] = clusters

  # Update step: move each centroid to the mean of its members. A cluster with
  # no members keeps its previous centroid, because the mean of an empty
  # selection is NaN and would make every later distance comparison False.
  members_1 = df[df['cluster'] == 1]
  members_2 = df[df['cluster'] == 2]
  if not members_1.empty:
    centroid_1 = [members_1.x1.mean(), members_1.x2.mean()]
  if not members_2.empty:
    centroid_2 = [members_2.x1.mean(), members_2.x2.mean()]

print(centroid_1, centroid_2)


[1.848262429759308, 0.8622246431993411] [-1.184810430866379, 3.18988309513586]


### Implementacja algorytmu K-średnich - wizualizacja

In [24]:
# Final result: points coloured by their cluster label, with the centroids
# produced by the summary loop drawn on top.
fig = px.scatter(
    df,
    x='x1',
    y='x2',
    color='cluster',
    title='Algorytm K-średnich -końcowy rezultat',
    width=950,
    height=500,
)
fig.add_trace(go.Scatter(
    name='centroid_1',
    mode='markers',
    x=[centroid_1[0]],
    y=[centroid_1[1]],
    marker=dict(line=dict(width=3)),
))
fig.add_trace(go.Scatter(
    name='centroid_2',
    mode='markers',
    x=[centroid_2[0]],
    y=[centroid_2[1]],
    marker=dict(line=dict(width=3)),
))
fig.update_traces(showlegend=False, marker=dict(size=12))