<a href="https://colab.research.google.com/github/Axle-Bucamp/Geomarketing/blob/main/kmean_with_fixed_centroid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install geopandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import plotly.express as px
import geopandas as gpd

# Load the 'naturalearth_cities' sample dataset through geopandas' bundled dataset helper.
# NOTE(review): gpd.datasets is only available in geopandas < 1.0 — confirm the installed version.
geo_df = gpd.read_file(gpd.datasets.get_path('naturalearth_cities'))

# Plot every city on a world map: the geometry's y is used as latitude and
# its x as longitude; hovering a point shows the value of the "name" column.
fig = px.scatter_geo(geo_df,
                    lat=geo_df.geometry.y,
                    lon=geo_df.geometry.x,
                    hover_name="name")
fig.show()

In [None]:
# Preview the first rows of the cities table (name + point geometry).
geo_df.head()

Unnamed: 0,name,geometry
0,Vatican City,POINT (12.45339 41.90328)
1,San Marino,POINT (12.44177 43.93610)
2,Vaduz,POINT (9.51667 47.13372)
3,Luxembourg,POINT (6.13000 49.61166)
4,Palikir,POINT (158.14997 6.91664)


In [None]:
# Expose the point coordinates as plain columns: x from geometry.x, y from geometry.y.
geo_df["x"] = geo_df.geometry.x
geo_df["y"] = geo_df.geometry.y


In [None]:
# Check that the x / y coordinate columns are now present.
geo_df.head()

Unnamed: 0,name,geometry,x,y
0,Vatican City,POINT (12.45339 41.90328),12.453387,41.903282
1,San Marino,POINT (12.44177 43.93610),12.44177,43.936096
2,Vaduz,POINT (9.51667 47.13372),9.516669,47.133724
3,Luxembourg,POINT (6.13000 49.61166),6.130003,49.61166
4,Palikir,POINT (158.14997 6.91664),158.149974,6.916644


In [None]:
import pandas as pd
import numpy as np
import math


In [None]:
def initialize_centroids(k, data):
    '''
    Draw k random starting centroids inside the bounding box of the data.

    Each coordinate is sampled uniformly between the minimum and maximum of
    its own column, so a centroid never starts outside the range of values
    observed for that dimension (e.g. a latitude is never drawn from the
    longitude range).

    k: number of centroids to create (0 yields an empty DataFrame).
    data: a DataFrame whose columns are the numeric dimensions to sample.

    Returns a DataFrame with k rows and the same columns as data.
    Uses NumPy's global random state (np.random).
    '''
    n_dims = data.shape[1]

    # Per-column bounds; np.random.uniform broadcasts them across the k rows.
    lower = data.min().to_numpy(dtype=float)
    upper = data.max().to_numpy(dtype=float)
    samples = np.random.uniform(lower, upper, size=(k, n_dims))

    centroids = pd.DataFrame(samples, columns = data.columns)

    return centroids

In [None]:
def calculate_error(a,b):
    '''
    Given two NumPy arrays (or pandas Series), calculates the root of the sum
    of squared errors between them, i.e. their Euclidean distance.

    a, b: array-likes of equal length supporting element-wise subtraction.

    Returns a non-negative scalar.
    '''
    # np.sqrt, not np.square: squaring the sum would return the distance to
    # the fourth power instead of the distance the docstring describes.
    error = np.sqrt(np.sum((a-b)**2))

    return error

In [None]:
def assign_centroid(data, centroids):
    '''
    Label every observation with the position of its closest centroid.

    data: a DataFrame of observations; only its first two columns are compared.
    centroids: a DataFrame of centroids; only its first two columns are
               compared, and the row position (0 .. k-1) is used as the label.

    Returns a tuple (labels, errors) of two lists with one entry per row of
    data: the position of the closest centroid and the calculate_error value
    between the observation and that centroid.
    '''
    n_centroids = centroids.shape[0]
    labels = []
    min_errors = []

    for row in range(data.shape[0]):
        point = data.iloc[row, :2]

        # Error between this observation and each centroid, in centroid order.
        distances = np.array(
            [calculate_error(centroids.iloc[c, :2], point) for c in range(n_centroids)],
            dtype=float,
        )

        # The first centroid that reaches the minimum wins ties.
        smallest = np.amin(distances)
        nearest = int(np.flatnonzero(distances == smallest)[0])

        labels.append(nearest)
        min_errors.append(smallest)

    return (labels, min_errors)

In [None]:
import warnings

def knn(data, k, fixed_k, max_iter=300):
    '''
    K-means clustering in which some centroids are pinned to fixed positions.
    (Despite the name, this is k-means, not k-nearest-neighbours.)

    data: a DataFrame of observations. Every column except 'centroid' is
          treated as a coordinate (assign_centroid compares the first two).
          A 'centroid' column with the label of each row is written to it;
          an existing 'centroid' column is ignored as input and overwritten.
    k: number of free clusters to create; they get labels 0 .. k-1.
    fixed_k: a DataFrame with one row per fixed centroid and the same
             coordinate columns as data. Fixed centroids get labels
             k .. k + len(fixed_k) - 1 and are never moved.
    max_iter: upper bound on the number of iterations, so the loop always
              terminates even if the total error never stabilises.

    Returns a tuple (data['centroid'], total error of the last iteration,
    DataFrame of all k + len(fixed_k) centroids indexed by label).
    Raises ValueError if max_iter is smaller than 1.
    '''
    if max_iter < 1:
        raise ValueError("max_iter must be at least 1")

    # A 'centroid' column left over from an earlier call must not be used as
    # a coordinate when initialising or averaging.
    feature_cols = [col for col in data.columns if col != 'centroid']
    features = data[feature_cols]

    n_fixed = fixed_k.shape[0]
    n_total = k + n_fixed
    fixed_part = fixed_k.reindex(columns=feature_cols)
    fixed_values = fixed_part.to_numpy(dtype=float)

    # Free centroids first, fixed ones last: the row position is the label.
    centroids = pd.concat(
        [initialize_centroids(k, features), fixed_part]
    ).reset_index(drop=True)

    error = []
    for _ in range(max_iter):
        # Obtain assignments and error
        labels, iter_error = assign_centroid(features, centroids)
        data['centroid'] = labels
        error.append(sum(iter_error))

        # Recalculate centroids. Reindexing on every label keeps row position
        # equal to label even when a cluster received no observations (its
        # row is NaN for now).
        centroids = (
            features.groupby(np.asarray(labels)).mean().reindex(range(n_total))
        )

        # Pin the fixed centroids back to their given positions.
        if n_fixed:
            centroids.iloc[k:, :] = fixed_values

        # Re-initialize free centroids that lost all their observations.
        assigned = set(labels)
        empty = [label for label in range(k) if label not in assigned]
        if empty:
            warnings.warn("Cluster devanished! Consider reducing the number of k")
            centroids.loc[empty, :] = initialize_centroids(len(empty), features).to_numpy()

        # Stop once the total error no longer changes (to 3 decimals).
        if len(error) > 1 and round(error[-1], 3) == round(error[-2], 3):
            break

    return (data['centroid'], error[-1], centroids)

In [None]:
# Re-check the input table before clustering.
geo_df.head()

Unnamed: 0,name,geometry,x,y
0,Vatican City,POINT (12.45339 41.90328),12.453387,41.903282
1,San Marino,POINT (12.44177 43.93610),12.44177,43.936096
2,Vaduz,POINT (9.51667 47.13372),9.516669,47.133724
3,Luxembourg,POINT (6.13000 49.61166),6.130003,49.61166
4,Palikir,POINT (158.14997 6.91664),158.149974,6.916644


In [None]:
# Centroids that must stay in place during clustering, in the same x / y
# columns as the city coordinates.
fixed = pd.DataFrame(
    np.array([[42, 1], [15, 5], [30, 6]]),
    columns=["x", "y"],
)
# Keep only the coordinate columns as clustering input.
coord = geo_df.drop(columns=["geometry", "name"])
fixed.head()

Unnamed: 0,x,y
0,42,1
1,15,5
2,30,6


In [None]:
# Seed NumPy's global RNG: the free centroids are initialised with
# np.random.uniform, so without a seed every run gives different clusters.
np.random.seed(42)

z = 4  # number of free (non-fixed) clusters to fit
# Bind the error to a real name instead of `_`, which IPython uses for the last cell output.
geo_df['centroid'], final_error, centroids =  knn(coord,z, fixed)

In [None]:
# Recompute the x / y columns from the point geometry (same assignment as earlier in the notebook).
geo_df["x"] =geo_df.geometry.x
geo_df["y"] =geo_df.geometry.y

In [None]:
# Append one marker row per centroid so the cluster centres show up on the map.
# The rows are built as a single DataFrame from typed columns: packing them
# into one np.array of mixed str/number values would turn x, y and centroid
# into strings, and concatenating inside a loop copies geo_df on every pass.
n_centres = centroids.shape[0]
centre_rows = pd.DataFrame({
    "name": ["center_" + str(i) for i in range(n_centres)],
    "geometry": [None] * n_centres,  # centre markers carry no geometry object
    "x": centroids["x"].to_numpy(dtype=float),
    "y": centroids["y"].to_numpy(dtype=float),
    # One shared label (z + number of fixed centroids) for all centre markers.
    "centroid": z + fixed.shape[0],
})
geo_df = pd.concat([geo_df, centre_rows]).reset_index(drop=True)

In [None]:
# Row / column count after appending the centroid marker rows.
geo_df.shape

(207, 5)

In [None]:
# Draw the cities and the appended centroid marker rows on a world map,
# positioned by the x / y columns and coloured by their 'centroid' label.
# NOTE(review): plotly express presumably uses a continuous colour scale for
# numeric columns — if one discrete colour per cluster is wanted, confirm the
# dtype of 'centroid' (e.g. cast it to str).
fig = px.scatter_geo(geo_df,
                    lat="y",
                    lon="x",
                    hover_name="name",
                    color="centroid"
                    )
fig.show()