In [12]:
# Install into the running kernel's environment; pinned to the version this notebook was run with
%pip install seaborn==0.13.2

Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting matplotlib!=3.6.1,>=3.4 (from seaborn)
  Downloading matplotlib-3.10.7-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting contourpy>=1.0.1 (from matplotlib!=3.6.1,>=3.4->seaborn)
  Using cached contourpy-1.3.3-cp313-cp313-win_amd64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib!=3.6.1,>=3.4->seaborn)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib!=3.6.1,>=3.4->seaborn)
  Downloading fonttools-4.60.1-cp313-cp313-win_amd64.whl.metadata (114 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib!=3.6.1,>=3.4->seaborn)
  Using cached kiwisolver-1.4.9-cp313-cp313-win_amd64.whl.metadata (6.4 kB)
Collecting pyparsing>=3 (from matplotlib!=3.6.1,>=3.4->seaborn)
  Using cached pyparsing-3.2.5-py3-none-any.whl.metadata (5.0 kB)
Using cached seaborn-0.13.2-py3-none-any.whl (294 kB)
Downloading matplotlib-3.10.7-cp313-cp313-win_a


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [23]:
import numpy as np
from sklearn.cluster import KMeans
import seaborn as sns
import pandas as pd
from time import time
import matplotlib.pyplot as plt

In [6]:
def kmeans(X, k):
    """Cluster the rows of X into k groups using Scikit-Learn's KMeans.

    Parameters
    ----------
    X : numerical NumPy array
        Data to cluster, one observation per row.
    k : int
        Number of clusters to form.

    Returns
    -------
    tuple
        (centroids, labels), where centroids is a 2D array holding the
        cluster centroids and labels is a 1D array holding the index of
        the assigned cluster for each row of X.
    """
    # The fixed random_state makes repeated runs give the same clustering
    fitted = KMeans(n_clusters=k, n_init='auto', random_state=42).fit(X)

    return fitted.cluster_centers_, fitted.labels_

In [3]:
# Toy dataset: 3 rows x 3 columns holding the integers 1..9 in row-major order
X = np.arange(1, 10).reshape(3, 3)

In [7]:
# Cluster the toy array; k=3 matches the number of rows in X
centroids, labels = kmeans(X, k=3)

In [8]:
# Show the cluster centroids and the cluster index assigned to each row of X
print("Centroids:\n", centroids)
print("Labels:", labels)

Centroids:
 [[4. 5. 6.]
 [7. 8. 9.]
 [1. 2. 3.]]
Labels: [2 0 1]


In [10]:
# Same call with k passed positionally; the returned (centroids, labels) tuple is the cell output
kmeans(X, 3)

(array([[4., 5., 6.],
        [7., 8., 9.],
        [1., 2., 3.]]),
 array([2, 0, 1], dtype=int32))

In [14]:
# Load seaborn's example "diamonds" dataset
diamonds = sns.load_dataset("diamonds")

In [15]:
# Display the loaded dataset as the cell output
diamonds

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [16]:
# Keep only the numerical columns (drops the non-numeric cut, color and clarity columns)
numeric_diamonds = diamonds.select_dtypes(include='number')

In [17]:
# Display the numeric-only subset as the cell output
numeric_diamonds

Unnamed: 0,carat,depth,table,price,x,y,z
0,0.23,61.5,55.0,326,3.95,3.98,2.43
1,0.21,59.8,61.0,326,3.89,3.84,2.31
2,0.23,56.9,65.0,327,4.05,4.07,2.31
3,0.29,62.4,58.0,334,4.20,4.23,2.63
4,0.31,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...
53935,0.72,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,61.0,58.0,2757,6.15,6.12,3.74


In [20]:
def kmeans_diamonds(n, k):
    """Run kmeans() on the first n rows of the numeric diamonds dataset.

    Loads seaborn's "diamonds" dataset, keeps only its numerical columns,
    takes the first n rows as a NumPy array and clusters them into k
    groups with the kmeans() function defined earlier in this notebook.

    Parameters
    ----------
    n : int
        Number of leading rows of the dataset to cluster.
    k : int
        Number of clusters to create.

    Returns
    -------
    tuple
        (centroids, labels) exactly as returned by kmeans().
    """
    # Load -> numeric columns only -> first n rows -> NumPy array
    first_rows = (
        sns.load_dataset("diamonds")
        .select_dtypes(include='number')
        .head(n)
        .to_numpy()
    )

    # Delegate the clustering to kmeans() and hand back its result
    return kmeans(first_rows, k)

In [21]:
# Cluster the first 1000 diamonds into 5 groups
# NOTE: this rebinds the centroids/labels names used for the toy example earlier
centroids, labels = kmeans_diamonds(n=1000, k=5)

# Indices are 0-based: [3] is the 4th cluster and [9] is the 10th diamond
print("Centroid for 4th cluster:\n", centroids[3])
print("Cluster assignment for 10th diamond:", labels[9])

Centroid for 4th cluster:
 [7.70586797e-01 6.17163814e+01 5.77694377e+01 2.86234474e+03
 5.87168704e+00 5.85951100e+00 3.62000000e+00]
Cluster assignment for 10th diamond: 2


In [None]:
def kmeans_timer(n, k, n_iter=5):
    """Time kmeans_diamonds(n, k) and return its average runtime in seconds.

    Runs kmeans_diamonds(n, k) exactly n_iter times, records the runtime
    of each run, and returns the average across the n_iter runs. Each
    timed run covers everything kmeans_diamonds does, including loading
    the dataset.

    Parameters
    ----------
    n : int
        Number of leading rows of the diamonds dataset to cluster.
    k : int
        Number of clusters to create.
    n_iter : int, default 5
        Number of timed runs; must be at least 1.

    Returns
    -------
    float
        Average runtime of one kmeans_diamonds(n, k) call, in seconds.

    Raises
    ------
    ValueError
        If n_iter is less than 1 (no runs to average).
    """
    # Imported locally because the notebook's import cell only provides
    # time.time; perf_counter is the monotonic clock meant for timing.
    from time import perf_counter

    if n_iter < 1:
        raise ValueError("n_iter must be at least 1")

    # Runtime of each individual run, in seconds
    times = []

    for _ in range(n_iter):
        start = perf_counter()
        kmeans_diamonds(n, k)  # result is discarded; only the duration matters
        times.append(perf_counter() - start)

    return sum(times) / len(times)
