In [None]:

from google.colab import drive
# Mount Google Drive at /content/drive so the dataset stored there can be read
# (this call comes from google.colab, so the cell only works inside Colab).
drive.mount('/content/drive')


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
# Load the dataset from the mounted Google Drive folder.
blobs = pd.read_csv(
    '/content/drive/MyDrive/STUDY/HCMUS-DM/Lab06/datacsv.csv'
)

# Names of every column except the first and the last one
# (presumably an id column and the 'cluster' label -- verify against the CSV).
colnames = blobs.columns[1:-1].tolist()
blobs


In [None]:
# Print a summary of the DataFrame (columns, dtypes, non-null counts).
blobs.info()

In [None]:
# NOTE(review): this cell repeats the preceding `blobs.info()` cell verbatim;
# one of the two can be removed.
blobs.info()

In [None]:
# Three-color palette used to tell the clusters apart
customcmap = ListedColormap(["crimson", "mediumblue", "darkmagenta"])

# Draw every observation, colored by the value of its 'cluster' column
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(
    blobs['x'],
    blobs['y'],
    c=blobs['cluster'].astype('category'),
    cmap=customcmap,
    s=150,
)

# Tick-label font sizes and axis labels
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
ax.set_xlabel(r"x", fontsize=14)
ax.set_ylabel(r"y", fontsize=14)

# Render the figure
plt.show()

In [None]:
def initiate_centroids(k, dset):
  """Pick `k` rows of `dset` at random to serve as the initial centroids.

  Args:
      k: number of rows (centroids) to draw.
      dset: pandas DataFrame with observations.

  Returns:
      A DataFrame holding the `k` sampled rows of `dset`.
  """
  return dset.sample(n=k)

# Seed NumPy's global RNG so the sampled initial centroids are reproducible.
np.random.seed(42)
k = 3
# Work on an independent copy of the two feature columns: later cells add
# 'centroid' and 'error' columns to `df`, and writing into a frame derived
# from `blobs` without copying triggers pandas' SettingWithCopyWarning.
df = blobs[['x','y']].copy()
centroids = initiate_centroids(k,df)
centroids


In [None]:
def rsserr(a, b):
  """Return the sum of the squared element-wise differences between `a` and `b`."""
  delta = a - b
  return np.sum(np.square(delta))

In [None]:
# Error between the observation at row position 36 and each current centroid.
reference_point = df.iloc[36, :]
for i in range(centroids.shape[0]):
  err = rsserr(centroids.iloc[i, :], reference_point)
  print('Error for centroid {0} : {1:.2f}'.format(i, err))

In [None]:
def centroid_assignation(dset, centroids):
    """
    Assign every observation in `dset` to its nearest centroid.

    Nearness is measured by the sum of squared differences over the columns
    that `dset` and `centroids` have in common. Columns present in only one
    of the two frames (for example a previously stored 'centroid' label) are
    ignored, and NaN differences contribute zero to the sum.

    Args:
        dset: A pandas dataframe with one observation per row.
        centroids: A pandas dataframe with one centroid per row.

    Returns:
        A tuple of two lists, each with one entry per row of `dset`:
            - assignation: A list of integers where each integer is the
                           positional index (row number in `centroids`) of the
                           nearest centroid; ties go to the lowest index.
            - assign_errors: A list of floats where each float is the squared
                           Euclidean distance (not its square root) between
                           the observation and its nearest centroid.

    Raises:
        ValueError: if `dset` has rows but `centroids` has none.
    """
    n = dset.shape[0]  # number of data points in the dataset
    k = centroids.shape[0]  # number of centroids

    if n == 0:
        return [], []
    if k == 0:
        raise ValueError("centroids must contain at least one row")

    # Compare only the columns both frames share, in the centroids' order.
    shared_cols = [col for col in centroids.columns if col in dset.columns]
    points = dset[shared_cols].to_numpy(dtype=float)
    centers = centroids[shared_cols].to_numpy(dtype=float)

    # Broadcast (n, 1, d) against (1, k, d) to get every point/centroid pair
    # at once instead of looping over rows in Python.
    diffs = points[:, np.newaxis, :] - centers[np.newaxis, :, :]
    all_errors = np.nansum(np.square(diffs), axis=2)  # shape (n, k)

    # argmin returns the first minimum, i.e. the lowest centroid index on ties.
    nearest_centroid = np.argmin(all_errors, axis=1)
    nearest_centroid_error = all_errors[np.arange(n), nearest_centroid]

    return nearest_centroid.tolist(), nearest_centroid_error.tolist()

In [None]:
# Store, for every observation, the index of its nearest centroid and the
# corresponding error as two new columns of `df`.
df['centroid'], df['error'] = centroid_assignation(dset=df, centroids=centroids)
df.head()

In [None]:
# Three-color palette used to tell the clusters apart
customcmap = ListedColormap(["crimson", "mediumblue", "darkmagenta"])

fig, ax = plt.subplots(figsize=(8, 6))

# Observations, colored by the centroid they are currently assigned to
ax.scatter(
    df.iloc[:, 0], df.iloc[:, 1], marker='o',
    c=df['centroid'].astype('category'),
    cmap=customcmap, s=80, alpha=0.5
)

# Centroids as squares. One color value per centroid row, so the call stays
# valid when `centroids` does not have exactly three rows.
ax.scatter(
    centroids.iloc[:, 0], centroids.iloc[:, 1],
    marker='s', s=200,
    c=np.arange(centroids.shape[0]), cmap=customcmap
)

# Axis labels and tick-label font size
ax.set_xlabel(r"x", fontsize=14)
ax.set_ylabel(r"y", fontsize=14)
ax.tick_params(axis='both', labelsize=12)

# Show the plot
plt.show()

In [None]:
# Sum of the per-observation errors stored in the 'error' column.
print("The total error is {0:.2f}".format(df.loc[:, 'error'].sum()))

In [None]:
# Move each centroid to the mean position of the observations assigned to it,
# keeping only the feature columns listed in `colnames`.
centroids = (
    df.groupby('centroid')[colnames]
    .mean()
    .reset_index(drop=True)
)
centroids

In [None]:

fig, ax = plt.subplots(figsize=(8, 6))

# Observations, colored by the centroid they are currently assigned to
ax.scatter(
    df.iloc[:, 0], df.iloc[:, 1], marker='o',
    c=df['centroid'].astype('category'),
    cmap=customcmap, s=80, alpha=0.5
)

# Centroids as squares. One color value per centroid row, so the call stays
# valid when `centroids` does not have exactly three rows.
ax.scatter(
    centroids.iloc[:, 0], centroids.iloc[:, 1],
    marker='s', s=200,
    c=np.arange(centroids.shape[0]), cmap=customcmap
)

# Axis labels and tick-label font size
ax.set_xlabel(r"x", fontsize=14)
ax.set_ylabel(r"y", fontsize=14)
ax.tick_params(axis='both', labelsize=12)

# Show the plot
plt.show()

In [None]:
def kmeans(dset, k=2, tol=1e-4):
  """
  K-means implementation

  Starting from `k` rows sampled from `dset` as initial centroids, the
  function alternates between assigning every observation to its nearest
  centroid and moving each centroid to the mean of its assigned
  observations. It stops once the total error decreases by no more than
  `tol` between two consecutive iterations.

  Args:
      dset: DataFrame with observations. Every column except one named
            'centroid' is treated as a feature. `dset` itself is not
            modified; the work happens on a copy.
      k: number of clusters, default k=2
      tol: tolerance on the decrease of the total error between two
           consecutive iterations

  Returns:
      A tuple (assignation, errors, centroids):
          - assignation: Series indexed like `dset` holding, for each
            observation, the positional index of its centroid
          - errors: list with each observation's error with respect to its
            assigned centroid, as returned by `centroid_assignation`
          - centroids: DataFrame with the final centroid coordinates.
            NOTE: a centroid that ends up with no observations is dropped
            by the group-by, so fewer than `k` rows may be returned.
  """

  # Only these columns take part in distance and mean computations; the
  # 'centroid' label column added below must never be used as a feature.
  feature_cols = [col for col in dset.columns if col != 'centroid']

  # Let us work in a copy, so we don't mess the original
  working_dset = dset.copy()

  # Step 2: Initiate clusters by defining centroids
  centroids = initiate_centroids(k, dset[feature_cols])

  previous_error = None
  while True:
    # Step 3 and 4 Assign centroids and calculate error
    working_dset['centroid'], j_err = centroid_assignation(
        working_dset[feature_cols], centroids
    )
    total_error = sum(j_err)

    # Step 5 Update centroid position
    centroids = (
        working_dset.groupby('centroid')[feature_cols]
        .mean()
        .reset_index(drop=True)
    )

    # Step 6 Stop once the error no longer improves by more than tol
    if previous_error is not None and previous_error - total_error <= tol:
      break
    previous_error = total_error

  # Final assignment and update
  working_dset['centroid'], j_err = centroid_assignation(
      working_dset[feature_cols], centroids
  )
  centroids = (
      working_dset.groupby('centroid')[feature_cols]
      .mean()
      .reset_index(drop=True)
  )

  return working_dset['centroid'], j_err, centroids

In [None]:
# Seed NumPy's global RNG before the run, then store the resulting
# assignment and per-observation error back on `df`.
np.random.seed(42)
df['centroid'], df['error'], centroids = kmeans(dset=df[['x','y']], k=3)
df.head()

In [None]:
# Display the centroid coordinates currently stored in `centroids`.
centroids

In [None]:

fig, ax = plt.subplots(figsize=(8, 6))

# Observations, colored by the centroid they are currently assigned to
ax.scatter(
    df.iloc[:, 0], df.iloc[:, 1], marker='o',
    c=df['centroid'].astype('category'),
    cmap=customcmap, s=80, alpha=0.5
)

# Centroids as squares. One color value per centroid row, so the call stays
# valid when `centroids` does not have exactly three rows.
ax.scatter(
    centroids.iloc[:, 0], centroids.iloc[:, 1],
    marker='s', s=200,
    c=np.arange(centroids.shape[0]), cmap=customcmap
)

# Axis labels and tick-label font size
ax.set_xlabel(r"x", fontsize=14)
ax.set_ylabel(r"y", fontsize=14)
ax.tick_params(axis='both', labelsize=12)

# Show the plot
plt.show()


In [None]:
# Elbow analysis: run k-means for 1..n clusters and record the total error
# of each run.
err_total = []
n = 10

# The frame was previously bound to a misspelled name ('df_eblow') while the
# loop read 'df_elbow', which raised a NameError.
df_elbow = blobs[['x','y']]

# Seed NumPy's global RNG, as the other k-means cells do, so the curve is
# reproducible across runs.
np.random.seed(42)
for n_clusters in range(1, n + 1):
    _labels, my_errs, _centroids = kmeans(df_elbow, n_clusters)
    err_total.append(sum(my_errs))

fig, ax = plt.subplots(figsize=(8, 6))

ax.plot(range(1, n+1), err_total, linewidth=3, marker='o')

ax.set_xlabel(r'Number of clusters', fontsize=14)
ax.set_ylabel(r'Total error', fontsize=14)
ax.tick_params(axis='both', labelsize=12)

plt.show()