<a href="https://colab.research.google.com/github/BuczynskiRafal/ML/blob/main/unsupervised/01_kmeans_implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import bibliotek

In [1]:
import numpy as np
from numpy.linalg import norm
import pandas as pd
import random
import plotly.express as px
import plotly.graph_objects as go

np.random.seed(42)
np.set_printoptions(precision=6)
random.seed(41)

# Wygenerowanie danych

In [2]:
from sklearn.datasets import make_blobs

data = make_blobs(n_samples=40, centers=2, cluster_std=1.0, center_box=(-4.0, 4.0), random_state=42)[0]
df = pd.DataFrame(data, columns=['x1', 'x2'])
df.head()

Unnamed: 0,x1,x2
0,0.37743,0.069424
1,2.217347,2.327304
2,1.376777,0.603609
3,-1.467097,3.139985
4,-1.605386,5.457993


# Wizualizacja danych 

In [3]:
fig = px.scatter(df, 'x1', 'x2', width=950, height=500, title='Algorytm K-średnich')
fig.update_traces(marker_size=12)

Implementacja algorytmu K-średnich


In [4]:
# Wyznaczenie wartości brzegowych
x1_min = df.x1.min()
x1_max = df.x1.max()

x2_min = df.x2.min()
x2_max = df.x2.max()

print(x1_min, x1_max)
print(x2_min, x2_max)

-2.728596881734133 3.333845579232757
-1.1983010410246 5.457992635788267


In [5]:
# losowe wygenerowanie wartości centroidów
centroid1 = np.array([random.uniform(x1_min, x1_max), random.uniform(x2_min, x2_max)])
centroid2 = np.array([random.uniform(x1_min, x1_max), random.uniform(x2_min, x2_max)])
print(centroid1)
print(centroid2)

[-0.418681  0.337434]
[-1.722009  4.884443]


In [6]:
# wizualizacja punktów startowych
fig = px.scatter(df, 'x1', 'x2', width=750, height=400, title='Algorytm K-średnich')
fig.add_trace(go.Scatter(x=[centroid1[0]], y=[centroid1[1]], name='centroid 1', mode='markers', marker_line_width=3))
fig.add_trace(go.Scatter(x=[centroid2[0]], y=[centroid2[1]], name='centroid2', mode='markers', marker_line_width=3 ))
fig.update_traces(marker_size=12, showlegend=False)


In [7]:
# przypisanie punktow do najbliższego centroidu
cluseters = [ 2 if norm(centroid1 - point) > norm(centroid2 - point) else 1 for point in data ]
print(cluseters)

df['cluster'] = cluseters
df.head()

[1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 1, 1, 1, 2, 2, 1, 2, 1, 1, 2, 2, 1, 1, 2, 1, 2, 1]


Unnamed: 0,x1,x2,cluster
0,0.37743,0.069424,1
1,2.217347,2.327304,1
2,1.376777,0.603609,1
3,-1.467097,3.139985,2
4,-1.605386,5.457993,2


In [8]:
# wizualizacja przypisania
fig = px.scatter(df, 'x1', 'x2', color='cluster', width=950, height=500, 
                 title='Algorytm K-średnich - iteracja 1 - przypisanie punktów do najbliższego centroidu')
fig.add_trace(go.Scatter(x=[centroid1[0]], y=[centroid1[1]], name='centroid 1', mode='markers', marker_line_width=3))
fig.add_trace(go.Scatter(x=[centroid2[0]], y=[centroid2[1]], name='centroid 2', mode='markers', marker_line_width=3))
fig.update_traces(marker_size=12)
fig.update_layout(showlegend=False)

In [9]:
# Obliczenie nowych odległości centroidów
new_centroid_1 = [df[df.cluster == 1].x1.mean(), df[df.cluster == 1].x2.mean() ]
new_centroid_2 = [df[df.cluster == 2].x1.mean(), df[df.cluster == 2].x2.mean() ]

print(new_centroid_1)
print(new_centroid_2)


[1.1057883070617118, 1.1784295565923741]
[-1.322466347597972, 3.7664746248027723]


In [10]:
# wizualizacja danych z kolejnymi centroidami
fig = px.scatter(df, 'x1', 'x2', color='cluster', width=950, height=500, 
                 title='Algorytm K-średnich - iteracja 1 - przypisanie punktów do najbliższego centroidu')
fig.add_trace(go.Scatter(x=[centroid1[0]], y=[centroid1[1]], name='centroid 1', mode='markers', marker_line_width=3))
fig.add_trace(go.Scatter(x=[centroid2[0]], y=[centroid2[1]], name='centroid 2', mode='markers', marker_line_width=3))
fig.add_trace(go.Scatter(x=[new_centroid_1[0]], y=[new_centroid_1[1]], name='new_centroid_1', mode='markers', marker_line_width=3))
fig.add_trace(go.Scatter(x=[new_centroid_2[0]], y=[new_centroid_2[1]], name='new_centroid_2', mode='markers', marker_line_width=3))
fig.update_traces(marker_size=12)
fig.update_layout(showlegend=False)

In [11]:
# automatyzacja aktualizacji wykresów centroidów
def add_centroid(*centroids):
    fig = px.scatter(df, 'x1', 'x2', color='cluster', width=950, height=500, 
                 title='Algorytm K-średnich - iteracja 1 - przypisanie punktów do najbliższego centroidu')
    fig.add_trace(go.Scatter(x=[centroid1[0]], y=[centroid1[1]], name='centroid 1', mode='markers', marker_line_width=3))
    fig.add_trace(go.Scatter(x=[centroid2[0]], y=[centroid2[1]], name='centroid 2', mode='markers', marker_line_width=3))
    for idx, centroid in enumerate(centroids):
        fig.add_trace(go.Scatter(x=[centroid[0]], y=[centroid[1]], name=f'new_centroid_{idx + 1}', mode='markers', marker_line_width=3))
    fig.update_traces(marker_size=12)
    fig.update_layout(showlegend=False)

    return fig

add_centroid(new_centroid_1, new_centroid_2)

In [12]:
# automatyzacja aktualizacji centroidów 
def centroid_actualization(new_centroid_1, new_centroid_2):
    return [2 if norm(new_centroid_1 - point) > norm(new_centroid_2 - point) else 1 for point in data ]

print(centroid_actualization(new_centroid_1, new_centroid_2))

[1, 1, 1, 2, 2, 1, 2, 1, 2, 2, 1, 1, 2, 1, 2, 1, 1, 2, 2, 1, 1, 1, 2, 1, 1, 1, 2, 2, 1, 2, 1, 1, 2, 2, 1, 1, 2, 1, 2, 1]


In [13]:
# ponowne przypisanie do najbliższego centroidu
clusters = centroid_actualization(new_centroid_1, new_centroid_2)

df['cluster'] = cluseters
df.head()

Unnamed: 0,x1,x2,cluster
0,0.37743,0.069424,1
1,2.217347,2.327304,1
2,1.376777,0.603609,1
3,-1.467097,3.139985,2
4,-1.605386,5.457993,2


In [14]:
# aktualizacja centroidów
new_2_centroid_1 = [df[df.cluster == 1].x1.mean(), df[df.cluster == 1].x2.mean()]
new_2_centroid_2 = [df[df.cluster == 2].x1.mean(), df[df.cluster == 2].x2.mean()]

print(new_2_centroid_1, new_2_centroid_2)

[1.1057883070617118, 1.1784295565923741] [-1.322466347597972, 3.7664746248027723]


In [15]:
# wizualizacja danych z kolejnymi centroidami
add_centroid(new_centroid_1, new_centroid_2, new_2_centroid_1, new_2_centroid_2)

In [16]:
# przypisane punktów do najbliższego centroidu
clusters = centroid_actualization(new_centroid_1, new_centroid_2)

df['cluster'] = cluseters
df.head()

Unnamed: 0,x1,x2,cluster
0,0.37743,0.069424,1
1,2.217347,2.327304,1
2,1.376777,0.603609,1
3,-1.467097,3.139985,2
4,-1.605386,5.457993,2


In [17]:
# aktualizacja centroidów
new_3_centroid_1 = [df[df.cluster == 1].x1.mean(), df[df.cluster == 1].x2.mean()]
new_3_centroid_2 = [df[df.cluster == 2].x1.mean(), df[df.cluster == 2].x2.mean()]

print(new_3_centroid_1, new_3_centroid_2)

[1.1057883070617118, 1.1784295565923741] [-1.322466347597972, 3.7664746248027723]


In [18]:
# wizualizacja danych z kolejnymi centroidami
add_centroid(new_centroid_1, new_centroid_2, new_2_centroid_1, new_2_centroid_2, new_3_centroid_1, new_3_centroid_2)

# Implementacja algorytmu K-średnich - podsumowanie
automatyzacja powyższych kroków


In [19]:
# wygenerowanie surowych danych 
data = make_blobs(n_samples=40, centers=2, cluster_std=1.0, center_box=(-4.0, 4.0), random_state=42)[0]
df = pd.DataFrame(data, columns=['x1', 'x2'])
df.head()

# ustalenie brzegów
x1_min = df.x1.min()
x1_max = df.x1.max()

x2_min = df.x2.min()
x2_max = df.x2.max()

# utworzenie pierwszych centroidów
centroid_1 = np.array([random.uniform(x1_min, x1_max), random.uniform(x2_min, x2_max)])
centroid_2 = np.array([random.uniform(x1_min, x1_max), random.uniform(x2_min, x2_max)])

# klasteruzacja - przupisanie do centroidów - 10 iteracji
for i in range(10):
    clusters = []
    for point in data:
        centroid_1_dist = norm(centroid_1 - point)
        centroid_2_dist = norm(centroid_2 - point)
        cluster = 1
        if centroid_1_dist > centroid_2_dist:
            cluster = 2
        clusters.append(cluster)

    # przypisanie klastrów do dataframe
    df['cluster'] = clusters

    centroid_1 = [df[df.cluster == 1].x1.mean(), df[df.cluster == 1].x2.mean()]
    centroid_2 = [df[df.cluster == 2].x1.mean(), df[df.cluster == 2].x2.mean()]

print(new_centroid_1, new_centroid_2)

[1.1057883070617118, 1.1784295565923741] [-1.322466347597972, 3.7664746248027723]


# Implementacja algorytmu K-średnich - wizualizacja


In [20]:
fig = px.scatter(df, 'x1', 'x2', color='cluster', width=950, height=500, 
                 title='Algorytm K-średnich - końcowy rezultat')
fig.add_trace(go.Scatter(x=[new_2_centroid_1[0]], y=[new_2_centroid_1[1]], name='centroid 1', mode='markers', marker_line_width=3))
fig.add_trace(go.Scatter(x=[new_2_centroid_2[0]], y=[new_2_centroid_2[1]], name='centroid 2', mode='markers', marker_line_width=3))
fig.update_traces(marker_size=12)
fig.update_layout(showlegend=False)