In [1]:
import pandas as pd
import random
import operator
import math

In [2]:
# Load the weather dataset from the local datasets folder and print a schema
# summary (column names, non-null counts and dtypes) as a quick sanity check.
df = pd.read_csv('./datasets/weather_forecast_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Temperature  2500 non-null   float64
 1   Humidity     2500 non-null   float64
 2   Wind_Speed   2500 non-null   float64
 3   Cloud_Cover  2500 non-null   float64
 4   Pressure     2500 non-null   float64
 5   Rain         2500 non-null   object 
dtypes: float64(5), object(1)
memory usage: 117.3+ KB


In [3]:
# Keep only the numeric feature columns: 'Rain' is the object-typed column and
# cannot take part in the Euclidean distance used for clustering.
# Rebinding instead of mutating in place, together with errors='ignore', keeps
# this cell safe to re-run (a second in-place drop would raise KeyError).
df = df.drop(columns=['Rain'], errors='ignore')

In [4]:
def choose_random_initial_centroids(k: int, dataset: pd.DataFrame):
    """Pick ``k`` distinct rows of ``dataset`` to use as initial centroids.

    Rows are drawn uniformly at random, without replacement, using the
    ``random`` module (seed it beforehand for reproducible results).

    Args:
        k: Number of centroids to pick.
        dataset: Frame to draw the rows from.

    Returns:
        A list of ``k`` ``pd.Series``, each one a row of ``dataset``.

    Raises:
        ValueError: If ``k`` is negative or larger than the number of rows.
    """
    row_count = len(dataset)
    if k < 0 or k > row_count:
        raise ValueError(
            f'cannot choose {k} distinct centroids from a dataset of {row_count} rows'
        )

    # Sample row *positions* rather than index labels so the selection works
    # for any index (non-default or with duplicated labels), and read the rows
    # from the `dataset` argument instead of a notebook-level global.
    chosen_positions = random.sample(range(row_count), k)
    return [dataset.iloc[position] for position in chosen_positions]

In [5]:
class Cluster:
    """A k-means cluster: its centroid, its member rows and a random id."""

    def __init__(self, centroid: pd.Series, members: pd.DataFrame):
        # Store the given objects as-is (no copies are made).
        self.centroid, self.members = centroid, members
        self.id = self.generate_id()

    def generate_id(self):
        """Return a random 20-character string over the letters 'a'..'h'."""
        alphabet = ('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h')
        # One random.choice call per character, 20 characters in total.
        return ''.join(random.choice(alphabet) for _ in range(20))

In [6]:
def calculate_euclidean_distance(obj: pd.Series, centroid: pd.Series):
    """Return the Euclidean distance between ``obj`` and ``centroid``.

    The attributes compared are the keys of ``obj``; ``centroid`` must provide
    a value for each of them (any extra keys it has are ignored).
    """
    squared_gaps = ((obj[name] - centroid[name]) ** 2 for name in obj.keys())
    return math.sqrt(sum(squared_gaps))

In [7]:
def get_mean_point(cluster: "Cluster"):
    """Return the attribute-wise mean of a cluster's members.

    Args:
        cluster: Object whose ``members`` frame holds the rows of the cluster.

    Returns:
        A float ``pd.Series`` indexed by the column names of
        ``cluster.members``, holding the mean of each column. Columns without
        any rows yield NaN, so callers should treat empty clusters separately.
    """
    members = cluster.members
    column_means = {
        attribute: float(members[attribute].mean())
        for attribute in members.keys()
    }
    # Build the Series in one step with an explicit dtype: a bare pd.Series()
    # has a version-dependent default dtype (and warns on pandas 1.x), and
    # growing a Series one label at a time reallocates on every assignment.
    return pd.Series(column_means, dtype=float)

In [8]:
def perform_k_means(k: int, dataset: pd.DataFrame, tolerance: float = .01):
    """Cluster the rows of ``dataset`` into ``k`` groups with k-means.

    Starting from ``k`` randomly chosen rows, every row is assigned to its
    nearest centroid (Euclidean distance) and each centroid is then moved to
    the mean of its members. This repeats until no centroid moves by more
    than ``tolerance`` in a pass.

    Args:
        k: Number of clusters.
        dataset: Numeric frame whose rows are the points to cluster.
        tolerance: Largest centroid movement still considered "stable".

    Returns:
        A list of ``k`` ``Cluster`` objects. Each ``members`` frame holds the
        rows assigned to that cluster (in dataset order, re-indexed from 0)
        and ``centroid`` is the mean of those rows. A cluster that ends up
        with no members keeps its previous centroid.
    """
    # Seed from the `dataset` argument, not from a notebook-level global.
    centroids = choose_random_initial_centroids(k, dataset)

    while True:
        # Assignment step: collect, per centroid, the positions of the rows
        # closest to it. Ties go to the earliest centroid in the list.
        assigned_positions = [[] for _ in centroids]
        for position, (_, row) in enumerate(dataset.iterrows()):
            distances = [
                calculate_euclidean_distance(row, centroid)
                for centroid in centroids
            ]
            nearest = min(range(len(centroids)), key=distances.__getitem__)
            assigned_positions[nearest].append(position)

        # Build each members frame in one slice instead of appending to a
        # DataFrame row by row.
        clusters = [
            Cluster(centroid, dataset.iloc[positions].reset_index(drop=True))
            for centroid, positions in zip(centroids, assigned_positions)
        ]

        # Update step: move every centroid to the mean of its members.
        stable = True
        for cluster in clusters:
            if cluster.members.empty:
                # The mean of no rows is NaN; a NaN centroid would compare as
                # "not moved" and could never attract a row again, so keep the
                # previous centroid for an empty cluster.
                continue
            new_centroid = get_mean_point(cluster)
            if calculate_euclidean_distance(cluster.centroid, new_centroid) > tolerance:
                stable = False  # a centroid still moved noticeably: run another pass
            cluster.centroid = new_centroid

        if stable:
            return clusters
        centroids = [cluster.centroid for cluster in clusters]

In [9]:
# Partition the feature frame into 5 clusters. The initial centroids are drawn
# with the `random` module and no seed is set in the visible cells, so the
# resulting clusters can differ from one run to the next.
clusters = perform_k_means(5, df)

In [10]:
# Print the final centroid of every cluster, followed by a separator line.
for cluster in clusters:
    print(f'Centroid:\n {cluster.centroid}')
    print('------------------------------')

# Each cluster's members (`cluster.members`) can be printed as well; they are
# omitted here to keep the output short.

Centroid:
 Temperature      22.589719
Humidity         64.741542
Wind_Speed       10.022640
Cloud_Cover      74.838801
Pressure       1032.942281
dtype: float64
------------------------------
Centroid:
 Temperature      22.186172
Humidity         84.135053
Wind_Speed        9.805439
Cloud_Cover      28.494542
Pressure       1001.393942
dtype: float64
------------------------------
Centroid:
 Temperature     23.266930
Humidity        46.298601
Wind_Speed      10.024293
Cloud_Cover     32.412432
Pressure       998.165711
dtype: float64
------------------------------
Centroid:
 Temperature      22.454077
Humidity         60.049611
Wind_Speed        9.753121
Cloud_Cover      21.806998
Pressure       1033.849150
dtype: float64
------------------------------
Centroid:
 Temperature     22.485126
Humidity        65.510887
Wind_Speed       9.910101
Cloud_Cover     79.397697
Pressure       997.623096
dtype: float64
------------------------------
