# Exercise 4-5  

In [33]:
import numpy as np


def euclidian_dist(point_1: list[int | float], point_2: list[int | float]) -> float:
    """ 
    Returns the euclidian distance between two points
    """
    res = []
    for p_1, p_2 in zip(point_1, point_2):
        res.append(abs(p_1 - p_2)**2)
        
    return sum(res)**(1/2)
    

def manhattan_dist(point_1: list[int | float], point_2: list[int | float]) -> float:
    """ 
    Returns the manhattan distance between two points
    """
    res = []
    for p_1, p_2 in zip(point_1, point_2):
        res.append(abs(p_1 - p_2))
        
    return sum(res)


def max_norm_dist(point_1: list[int | float], point_2: list[int | float]) -> float:
    """
    Returns the max norm distance between two points
    """
    res = []
    for p_1, p_2 in zip(point_1, point_2):
        res.append(abs(p_1 - p_2))
        
    return max(res)


def weigthed_euclidian_dist(point_1: list[int | float],
                            point_2: list[int | float],
                            weights: list[int | float]) -> float:
    """
    Returns the weighted euclidian distance between two points and a list of weights
    The length of the array must be the same as the length of the points
    """
    res = []
    for p_1, p_2, w in zip(point_1, point_2, weights):
        res.append(w * (abs(p_1 - p_2)))
        
    return sum(res)**(1/2)


def quadratic_dist(point_1: list[int | float],
                   point_2: list[int | float],
                   w_matrix: list[list[int | float]]) -> float:
    """ 
    Returns the quadratic distance
    """
    point_1 = np.array(point_1)
    point_2 = np.array(point_2)
    w_matrix = np.array(w_matrix)
    return ((point_1 - point_2) @ w_matrix @ (np.transpose(point_1) - np.transpose(point_2)))**(1/2)


def custom_dist(point_1: list[int | float],
                point_2: list[int | float],
                p: int | float) -> float:
    """ 
    """
    
    res = []
    for p_1, p_2 in zip(point_1, point_2):
        res.append((abs(p_1 - p_2))**p)
        
    return sum(res)**(1/p) 

In [26]:
from statistics import mean
from statistics import mode, StatisticsError
from random import randint
from typing import Iterable 
from typing import Iterable 
#from collections import Iterable                            # < py38


def flatten(items):
    """Yield items from any nested iterable; see Reference."""
    for x in items:
        if isinstance(x, Iterable) and not isinstance(x, (str, bytes)):
            for sub_x in flatten(x):
                yield sub_x
        else:
            yield x
            

def get_min_index(my_list):
    """ 
    """
    # inbuilt function to find the position of minimum
    return my_list.index(min(my_list))


def flatten(items):
    """Yield items from any nested iterable; see Reference."""
    for x in items:
        if isinstance(x, Iterable) and not isinstance(x, (str, bytes)):
            for sub_x in flatten(x):
                yield sub_x
        else:
            yield x

            
def get_td_2(cluster, centroid):
    res = 0
    for p in cluster:
        res += abs(p[0] - centroid[0])**2
    return res

                    
def get_total_td_2(clusters, centroids):
    res = 0
    for i, cluster in enumerate(clusters):
        res += get_td_2(cluster, centroids[i])
    return res

def get_simplified_silhoutte(clusters, centroids):
    """ 
    
    """
    cluster_silhouttes = []
    for i, cluster in enumerate(clusters):

        silhuette_values = []
        for p in cluster:
            a_o = euclidian_dist(p, centroids[i])
            distances = [euclidian_dist(p, centroid) for j, centroid in enumerate(centroids) if j != i]
            min_index = get_min_index(distances)
            b_o = distances[min_index]
            silhuette = ( b_o - a_o ) / max([a_o, b_o])
            silhuette_values.append(silhuette)
        cluster_silhoutte = mean(silhuette_values)
        cluster_silhouttes.append(cluster_silhoutte)
    return cluster_silhouttes


def get_total_sihoutte(clusters, centriods):
    """ 
    
    """
    
    return mean(get_simplified_silhoutte(clusters, centriods))

               
def forgy_loyd(points, k, centroids = None):
    """ 
    
    """
    
    if centroids == None:
        centroids = []
        
        for i in range(k):
            match = False
            
            while not match:
                centroid = points[randint(0, len(points) - 1)]
                if centroid not in centroids:
                    centroids.append(centroid)
                    match = True
                
            
    prev_clusters = [[] for _ in range(k)]
    clusters = [[] for _ in range(k)]
    
    j = 1
    for p in points:  
        distances = [euclidian_dist(p, centroid) for centroid in centroids]
        min_index = get_min_index(distances)
        clusters[min_index].append(p)
    
    temp_clusters = []
    temp_centroids = []
    for l, cluster in enumerate(clusters):
        if len(cluster) > 0:
            temp_clusters.append(cluster)
            temp_centroids.append(centroids[l])
        else:
            k = k - 1
    clusters = temp_clusters 
    centroids = temp_centroids
    
    
    silhouttes = get_simplified_silhoutte(clusters, centroids)
    tot_silhoutte = get_total_sihoutte(clusters, centroids)        
    print("-----------------------------------")
    print(f"iteation: {j}")
    print(f"centroids: {centroids}")
    print(f"Total td**2: {get_total_td_2(clusters, centroids)}")
    print(f"Total silhoutte: {tot_silhoutte}")
    print("-----------------------------------")
    for i, cluster in enumerate(clusters):
        print(f"    - cluster {i + 1}: {cluster}")
        print(f"    - Td**2: {get_td_2(cluster, centroids[i])}")
        print(f"    - Silhoutte: {silhouttes[i]}")

    
    
    while prev_clusters != clusters:
        j += 1 
        prev_clusters = clusters
        for i, cluster in enumerate(clusters):
            try:
                centroids[i] = [mean(flatten(cluster))]
            except StatisticsError:
                del centroids[i]
                del clusters[i]
                k = k -1
                
        clusters = [[] for _ in range(k)]
        for p in points:  
            distances = [euclidian_dist(p, centroid) for centroid in centroids]
            min_index = get_min_index(distances)
            clusters[min_index].append(p)
        
        temp_clusters = []
        temp_centroids = []
        for l, cluster in enumerate(clusters):
            if len(cluster) > 0:
                temp_clusters.append(cluster)
                temp_centroids.append(centroids[l])
            else:
                k = k - 1
        clusters = temp_clusters 
        centroids = temp_centroids
        
        silhouttes = get_simplified_silhoutte(clusters, centroids)
        tot_silhoutte = get_total_sihoutte(clusters, centroids)        
        print("-----------------------------------")
        print(f"iteation: {j}")
        print(f"centroids: {centroids}")
        print(f"Total td**2: {get_total_td_2(clusters, centroids)}")
        print(f"Total silhoutte: {tot_silhoutte}")
        print("-----------------------------------")
        for i, cluster in enumerate(clusters):
            print(f"    - cluster {i + 1}: {cluster}")
            print(f"    - Td**2: {get_td_2(cluster, centroids[i])}")
            print(f"    - Silhoutte: {silhouttes[i]}")


       

    silhouttes = get_simplified_silhoutte(clusters, centroids)
    tot_silhoutte = get_total_sihoutte(clusters, centroids)        
    print("-----------------------------------")
    print(f"Final RESULTs")
    print(f"iteation: {j}")
    print(f"centroids: {centroids}")
    print(f"Total td**2: {get_total_td_2(clusters, centroids)}")
    print(f"Total silhoutte: {tot_silhoutte}")
    print("-----------------------------------")
    for i, cluster in enumerate(clusters):
        print(f"    - cluster {i + 1}: {cluster}")
        print(f"    - Td**2: {get_td_2(cluster, centroids[i])}")
        print(f"    - Silhoutte: {silhouttes[i]}")

    
    return (clusters, centroids)
    

![title](imgs/Screenshot_2.png)

In [3]:
cluster_1 = [[1, 5], [2, 3], [3, 4], [10, 1]]
cluster_2 = [[6, 8], [7,7], [7, 8], [7, 9]]

clusters = [cluster_1, cluster_2]
centriods = [[4, 3.25], [6.75, 8]]

print(get_simplified_silhoutte(clusters, centriods))
print(get_total_sihoutte(clusters, centriods))        

[0.5284865898623086, 0.8591012153970738]
0.6937939026296912


In [4]:
cluster_1 = [[10, 1]]
cluster_2 = [[2, 3], [3, 3], [1, 4], [6, 8], [7, 7], [7, 8], [7, 9]]

clusters = [cluster_1, cluster_2]
centriods = [[mean(list(flatten(cluster_1)))], [mean(list(flatten(cluster_2)))]]

print(get_simplified_silhoutte(clusters, centriods))
print(get_total_sihoutte(clusters, centriods))  

[0.030769230769230847, -0.0504837960028733]
-0.009857282616821228


In [5]:
cluster_1 = [[1,4], [2, 3], [3, 4]]
cluster_2 = [[6, 8], [7, 7], [7, 8], [7, 9], [10, 1]]

clusters = [cluster_1, cluster_2]
centriods = [[mean(list(flatten(cluster_1)))], [mean(list(flatten(cluster_2)))]]

print(get_simplified_silhoutte(clusters, centriods))
print(get_total_sihoutte(clusters, centriods))  

[0.8287037037037036, 0.8531211750305997]
0.8409124393671517


![title](imgs/Screenshot_16.png)

In [6]:
points = [[2], [3], [4], [10], [11], [12], [20], [25], [30]]
clusters, centroids = forgy_loyd(points, 3, centroids = [[2], [4], [6]])

-----------------------------------
iteation: 1
centroids: [[2], [4], [6]]
Total td**2: 1211
Total silhoutte: 0.5647893772893773
-----------------------------------
    - cluster 1: [[2], [3]]
    - Td**2: 1
    - Silhoutte: 0.5
    - cluster 2: [[4]]
    - Td**2: 0
    - Silhoutte: 1.0
    - cluster 3: [[10], [11], [12], [20], [25], [30]]
    - Td**2: 1210
    - Silhoutte: 0.19436813186813187
-----------------------------------
iteation: 2
centroids: [[2.5], [4], [18]]
Total td**2: 318.5
Total silhoutte: 0.5358440170940171
-----------------------------------
    - cluster 1: [[2], [3]]
    - Td**2: 0.5
    - Silhoutte: 0.625
    - cluster 2: [[4], [10], [11]]
    - Td**2: 85
    - Silhoutte: 0.4
    - cluster 3: [[12], [20], [25], [30]]
    - Td**2: 233
    - Silhoutte: 0.5825320512820513
-----------------------------------
iteation: 3
centroids: [[2.5], [8.333333333333334], [21.75]]
Total td**2: 107.77083333333331
Total silhoutte: 0.7592741033285408
----------------------------------

0.8525944490856772

![title](imgs/Screenshot_1.png)

In [7]:
points = [[1, 5],
          [2, 3],
          [6, 8],
          [7, 8],
          [7, 9]]

centroids = [[3, 4], [7, 7], [10, 1]]

 
clusters, centroids = forgy_loyd(points, 3, centroids)

-----------------------------------
iteation: 1
centroids: [[3, 4], [7, 7]]
Total td**2: 6
Total silhoutte: 0.7277347340561804
-----------------------------------
    - cluster 1: [[1, 5], [2, 3]]
    - Td**2: 5
    - Silhoutte: 0.7127917786285166
    - cluster 2: [[6, 8], [7, 8], [7, 9]]
    - Td**2: 1
    - Silhoutte: 0.742677689483844
-----------------------------------
iteation: 2
centroids: [[2.75], [7.5]]
Total td**2: 6.375
Total silhoutte: 0.7824626354038118
-----------------------------------
    - cluster 1: [[1, 5], [2, 3]]
    - Td**2: 3.625
    - Silhoutte: 0.7972027972027972
    - cluster 2: [[6, 8], [7, 8], [7, 9]]
    - Td**2: 2.75
    - Silhoutte: 0.7677224736048265
-----------------------------------
Final RESULTs
iteation: 2
centroids: [[2.75], [7.5]]
Total td**2: 6.375
Total silhoutte: 0.7824626354038118
-----------------------------------
    - cluster 1: [[1, 5], [2, 3]]
    - Td**2: 3.625
    - Silhoutte: 0.7972027972027972
    - cluster 2: [[6, 8], [7, 8], [7, 9]

0.7824626354038118

In [8]:
points = [[1, 5],
          [2, 3],
          [6, 8],
          [7, 8],
          [7, 9]]
#centroids = [[3, 4], [7, 7], [10, 1]]


clusters, centroids = forgy_loyd(points, 4)

-----------------------------------
iteation: 1
centroids: [[7, 9], [2, 3], [6, 8], [1, 5]]
Total td**2: 0
Total silhoutte: 0.875
-----------------------------------
    - cluster 1: [[7, 8], [7, 9]]
    - Td**2: 0
    - Silhoutte: 0.5
    - cluster 2: [[2, 3]]
    - Td**2: 0
    - Silhoutte: 1.0
    - cluster 3: [[6, 8]]
    - Td**2: 0
    - Silhoutte: 1.0
    - cluster 4: [[1, 5]]
    - Td**2: 0
    - Silhoutte: 1.0
-----------------------------------
iteation: 2
centroids: [[2.5], [7]]
Total td**2: 3.5
Total silhoutte: 0.8648809523809524
-----------------------------------
    - cluster 1: [[1, 5], [2, 3]]
    - Td**2: 2.5
    - Silhoutte: 0.825
    - cluster 2: [[6, 8], [7, 8], [7, 9]]
    - Td**2: 1
    - Silhoutte: 0.9047619047619048
-----------------------------------
iteation: 3
centroids: [[2.75], [7.5]]
Total td**2: 6.375
Total silhoutte: 0.7824626354038118
-----------------------------------
    - cluster 1: [[1, 5], [2, 3]]
    - Td**2: 3.625
    - Silhoutte: 0.797202797202

0.7824626354038118

In [9]:
points = [[1, 5],
          [2, 3],
          [6, 8],
          [7, 8],
          [7, 9]]
#centroids = [[3, 4], [7, 7], [10, 1]]


clusters, centroids = forgy_loyd(points, 5)

-----------------------------------
iteation: 1
centroids: [[7, 8], [1, 5], [6, 8], [2, 3], [7, 9]]
Total td**2: 0
Total silhoutte: 1.0
-----------------------------------
    - cluster 1: [[7, 8]]
    - Td**2: 0
    - Silhoutte: 1.0
    - cluster 2: [[1, 5]]
    - Td**2: 0
    - Silhoutte: 1.0
    - cluster 3: [[6, 8]]
    - Td**2: 0
    - Silhoutte: 1.0
    - cluster 4: [[2, 3]]
    - Td**2: 0
    - Silhoutte: 1.0
    - cluster 5: [[7, 9]]
    - Td**2: 0
    - Silhoutte: 1.0
-----------------------------------
iteation: 2
centroids: [[7], [2.5]]
Total td**2: 3.5
Total silhoutte: 0.8648809523809524
-----------------------------------
    - cluster 1: [[6, 8], [7, 8], [7, 9]]
    - Td**2: 1
    - Silhoutte: 0.9047619047619048
    - cluster 2: [[1, 5], [2, 3]]
    - Td**2: 2.5
    - Silhoutte: 0.825
-----------------------------------
iteation: 3
centroids: [[7.5], [2.75]]
Total td**2: 6.375
Total silhoutte: 0.7824626354038118
-----------------------------------
    - cluster 1: [[6, 8]

# MACQUEEN 

In [10]:
def macqueen(points, k, centroids = None):
    
    
    if centroids == None:
        centroids = []
        
        for i in range(k):
            match = False
            
            while not match:
                centroid = points[randint(0, len(points) - 1)]
                if centroid not in centroids:
                    centroids.append(centroid)
                    match = True
                
            
    prev_clusters = [[] for _ in range(k)]
    clusters = [[] for _ in range(k)]
    
    j = 1
    for p in points:  
        distances = [euclidian_dist(p, centroid) for centroid in centroids]
        min_index = get_min_index(distances)
        clusters[min_index].append(p)
    
    print("-----------------------------------")
    print(f"iteation: {j}")
    print(f"centroids: {centroids}")
    print(f"Total td**2: {get_total_td_2(clusters, centroids)}")
    print("-----------------------------------")
    for i, cluster in enumerate(clusters):
        print(f"    - cluster {i + 1}: {cluster}")
        print(f"    - Td**2: {get_td_2(cluster, centroids[i])}")
    
    
    while prev_clusters != clusters:
        j += 1 
        prev_clusters = clusters
        for i, cluster in enumerate(clusters):
            centroids[i] = [mean(flatten(cluster))]
        
        clusters = [[] for _ in range(k)]
        temp_prev = clusters
        for p in points:  
            distances = [euclidian_dist(p, centroid) for centroid in centroids]
            min_index = get_min_index(distances)
            clusters[min_index].append(p)
            if clusters[min_index] != prev_clusters[min_index]:
                distances = [euclidian_dist(p, centroid) for centroid in centroids]
                min_index = get_min_index(distances)
                clusters[min_index].append(p)        
        print()
        print("-----------------------------------")
        print(f"iteation: {j}")
        print(f"centroids: {centroids}")
        print(f"Total td**2: {get_total_td_2(clusters, centroids)}")
        print("-----------------------------------")
        for i, cluster in enumerate(clusters):
            print(f"    - cluster {i + 1}: {cluster}")
            print(f"    - Td**2: {get_td_2(cluster, centroids[i])}")
    
    print()
    print("-----------------------------------")
    print(f"Final result: ")
    print(f"centroids: {centroids}")
    print(f"Total td**2: {get_total_td_2(clusters, centroids)}")
    print("-----------------------------------")
    for i, cluster in enumerate(clusters):
        print(f"    - cluster {i + 1}: {cluster}")
        print(f"    - Td**2: {get_td_2(cluster, centroids[i])}")  
        

points = [[2], [3], [4], [10], [11], [12], [20], [25], [30]]
macqueen(points, 3, centroids = [[2], [4], [6]])

-----------------------------------
iteation: 1
centroids: [[2], [4], [6]]
Total td**2: 1211
-----------------------------------
    - cluster 1: [[2], [3]]
    - Td**2: 1
    - cluster 2: [[4]]
    - Td**2: 0
    - cluster 3: [[10], [11], [12], [20], [25], [30]]
    - Td**2: 1210

-----------------------------------
iteation: 2
centroids: [[2.5], [4], [18]]
Total td**2: 637.0
-----------------------------------
    - cluster 1: [[2], [2], [3], [3]]
    - Td**2: 1.0
    - cluster 2: [[4], [10], [10], [11], [11]]
    - Td**2: 170
    - cluster 3: [[12], [12], [20], [20], [25], [25], [30], [30]]
    - Td**2: 466

-----------------------------------
iteation: 3
centroids: [[2.5], [9.2], [21.75]]
Total td**2: 192.31500000000003
-----------------------------------
    - cluster 1: [[2], [2], [3], [3], [4], [4]]
    - Td**2: 5.5
    - cluster 2: [[10], [10], [11], [11], [12], [12]]
    - Td**2: 23.440000000000015
    - cluster 3: [[20], [20], [25], [25], [30], [30]]
    - Td**2: 163.375

---

# Exam exercises

![title](imgs/Screenshot_3.png)

In [17]:
def get_centroids(cu: list[list[int]]):
    cens = []
    for c in cu:
        cen = mean(list(flatten(c)))
        cens.append([cen])
    return cens



cluster_1 = [[1], [3], [5]]
cluster_2 = [[7], [10], [11], [12]]
s_1 = [cluster_1, cluster_2]
centroids = get_centroids(s_1)
get_total_td_2(s_1, centroids)

22

In [18]:
cluster_1 = [[1], [3]]
cluster_2 = [[5], [7]]
cluster_3 = [ [10], [11], [12]]
s_2 = [cluster_1, cluster_2, cluster_3]
centroids = get_centroids(s_2)
get_total_td_2(s_2, centroids)

6

In [19]:
cluster_1 = [[1], [3], [5], [7]]
cluster_2 = [[10], [11], [12]]
s_3 = [cluster_1, cluster_2]
centroids = get_centroids(s_3)
get_total_td_2(s_3, centroids)

22

![Totæe](imgs/Screenshot_4.png)

In [34]:
cluster_1 = [[1], [3], [5]]
cluster_2 = [[7], [10], [11], [12]]
s_1 = [cluster_1, cluster_2]
centroids = get_centroids(s_1)
get_total_sihoutte(s_1, centroids)

0.7591435185185185

In [35]:
cluster_1 = [[1], [3]]
cluster_2 = [[5], [7]]
cluster_3 = [ [10], [11], [12]]
s_2 = [cluster_1, cluster_2, cluster_3]
centroids = get_centroids(s_2)
round(get_total_sihoutte(s_2, centroids), 2)

0.77

In [36]:
cluster_1 = [[1], [3], [5], [7]]
cluster_2 = [[10], [11], [12]]
s_3 = [cluster_1, cluster_2]
centroids = get_centroids(s_3)
round(get_total_sihoutte(s_3, centroids), 2)

0.78