## Results of the 3-means experiment

In [10]:
import numpy as np 
import pandas as pd 
from sklearn.cluster import KMeans

In [11]:
# Helper functions for computing cost
def get_weighted_distance_between_two_vectors(vector1, vector2, weight):
    return (weight * np.linalg.norm(vector1 - vector2))

def get_distance_between_two_vectors(vector1, vector2):
    return np.linalg.norm(vector1 - vector2)

def get_k_means_cost(k, clusters, data):
    accumulativeCost = 0
    currentCosts = np.repeat(0,k)
    for vector in data:
        currentCosts = list(map(get_distance_between_two_vectors, clusters, np.repeat(vector, k, axis=0)))
        accumulativeCost = accumulativeCost + min(currentCosts)
                            
    return accumulativeCost

def get_weighted_k_means_cost(k, clusters, data, data_weights):
    accumulativeCost = 0
    currentCosts = np.repeat(0,k)
    for vector in data:
        currentCosts = list(map(get_weighted_distance_between_two_vectors, clusters, np.repeat(vector, k, axis=0), data_weights))
        accumulativeCost = accumulativeCost + min(currentCosts)
                            
    return accumulativeCost

In [12]:
# Load cluster centres and coreset data
cluster_df = pd.read_pickle('../data/results/3means/3means_cluster_centers_12_coreset.pkl')
coreset_df = pd.read_csv('../data/12_coreset.csv', encoding='ISO-8859-1')

### Compute both the non-weighted and weighted costs on the coreset

In [18]:
coreset_vectors = pd.DataFrame.to_numpy(coreset_df[['X', 'Y']])
weight_vectors = pd.Series.to_numpy(coreset_df['weights'])
cluster_vectors = pd.DataFrame.to_numpy(cluster_df)

non_weighted_cost = get_k_means_cost(3, cluster_vectors, coreset_vectors)
weighted_cost = get_weighted_k_means_cost(3, cluster_vectors, coreset_vectors, weight_vectors)

print("Non-weighted cost:", non_weighted_cost)
print("Weighted cost:", weighted_cost)

Non-weighted cost: 46.43718493113637
Weighted cost: 4824.608463984105


### Scikit learn kmeans implementation for reference

In [19]:
kmeans_unweighted = KMeans(init="k-means++", n_clusters=3, random_state=0).fit(coreset_vectors)
skl_unweighted_cost = get_k_means_cost(3, kmeans_unweighted.cluster_centers_, coreset_vectors)
print('Scikit learn unweighted cost:', skl_unweighted_cost)

kmeans_weighted = KMeans(init="k-means++", n_clusters=3, random_state=0).fit(coreset_vectors, sample_weight=weight_vectors)
skl_weighted_cost = get_weighted_k_means_cost(3, kmeans_weighted.cluster_centers_, coreset_vectors, weight_vectors)
print('Scikit learn weighted cost:', skl_weighted_cost)

Scikit learn unweighted cost: 41.266189742121156
Scikit learn weighted cost: 4200.602651251721


