In [1]:
import random
import pandas as pd
import math
import statistics as stats
import operator

In [2]:
generated_ids = []
class Cluster:
    def __init__(self, members, children):
        if members is None:
            members = []
        self.members = members
        if children is None:
            children = []
        self.children = children
        self.id = self.generate_id()
    def generate_id(self):
        id = ''
        possible_chars = '1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
        while len(id) == 0 or id in generated_ids:
            id = ''
            for _ in range(0, 20):
                random_char = random.choice(possible_chars)
                id += random_char
        
        generated_ids.append(id)

        return id

In [7]:
df = pd.read_csv('./datasets/weather_forecast_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Temperature  2500 non-null   float64
 1   Humidity     2500 non-null   float64
 2   Wind_Speed   2500 non-null   float64
 3   Cloud_Cover  2500 non-null   float64
 4   Pressure     2500 non-null   float64
 5   Rain         2500 non-null   object 
dtypes: float64(5), object(1)
memory usage: 117.3+ KB


In [8]:
df.drop(columns=['Rain'], inplace=True)

In [9]:
def get_euclidean_distance(obj1: pd.Series, obj2: pd.Series):
    result = 0
    for attribute in obj1.keys():
        result += pow(float(obj1[attribute]) - float(obj2[attribute]), 2)

    return math.sqrt(result)

In [10]:
def get_clusters_distance(c1: Cluster, c2: Cluster, method: str):
    distances = []
    for item in c1.members:
        for _item in c2.members:
            distances.append(get_euclidean_distance(_item, item))
    
    if method == 'max':
        return max(distances)
    if method == 'min':
        return min(distances)
    if method == 'avg':
        return stats.mean(distances)
    
    raise Exception('invalid method passed')

In [11]:
def generate_distance_matrix(clusters: list, method: str):
    matrix = dict()
    for cluster in clusters:
        for other_cluster in [c for c in clusters if c is not cluster]:
            if f'{cluster.id}|{other_cluster.id}' not in matrix:
                distance = get_clusters_distance(cluster, other_cluster, method)
                matrix[f'{cluster.id}|{other_cluster.id}'] = distance
                matrix[f'{other_cluster.id}|{cluster.id}'] = distance

    return matrix

In [12]:
def perform_agglomerative_hierarchical_clustering(dataset: pd.DataFrame, method: str):
    clusters = [Cluster([row], []) for i, row in dataset.iterrows()]

    while len(clusters) > 1:
        distance_matrix = generate_distance_matrix(clusters, method)
        closest_clusters = min(distance_matrix.items(), key=operator.itemgetter(1))[0]
        c1_key, c2_key = closest_clusters.split('|')[0], closest_clusters.split('|')[1]
        c1, c2 = next((c for c in clusters if c.id == c1_key), None), next((c for c in clusters if c.id == c2_key), None)
        c1.children.append(c2)
        clusters.remove(c2)

    return clusters[0]

In [13]:
def series_as_string(serie: pd.Series):
    s = ''
    for attr in serie.keys():
        s += f'"{attr}":{serie[attr]} '

    return s

In [14]:
def print_hierarchy(node: Cluster, level: int):
    spaces = ''
    for _ in range(0, level):
        spaces += ' '
    print(spaces + f'cluster id: {node.id}; objects: [')
    for member in node.members:
        print(spaces + series_as_string(member))

    print(spaces + ']; internal clusters: ')
    for child in node.children:
        print_hierarchy(child, level + 1)
    print(spaces + ']')

In [15]:
hierarchy_root = perform_agglomerative_hierarchical_clustering(df[:200], 'max')

In [16]:
print_hierarchy(hierarchy_root, 0)

cluster id: RpznOEPNWkPB0Ypn8QvR; objects: [
"Temperature":23.72033759818312 "Humidity":89.59264065174611 "Wind_Speed":7.335604391040214 "Cloud_Cover":50.50169383291316 "Pressure":1032.378758690279 
]; internal clusters: 
 cluster id: GqBcRCYM4LrY0erOSnkb; objects: [
 "Temperature":33.355349948119844 "Humidity":94.26030814274029 "Wind_Speed":11.85795524802031 "Cloud_Cover":49.85858788649463 "Pressure":1030.176153553989 
 ]; internal clusters: 
 ]
 cluster id: hRowuffA3NkWE66Hc4PB; objects: [
 "Temperature":10.479829957733338 "Humidity":86.95071771557434 "Wind_Speed":16.775945567999454 "Cloud_Cover":49.26848752549474 "Pressure":1029.706512092824 
 ]; internal clusters: 
 ]
 cluster id: CBczgHzntcAuIySsdJLI; objects: [
 "Temperature":33.59370196286561 "Humidity":88.37542686178719 "Wind_Speed":2.474866568294325 "Cloud_Cover":62.64242471491316 "Pressure":1036.8236476569737 
 ]; internal clusters: 
  cluster id: 9EBhVX4cVNeQ3KSuNdEC; objects: [
  "Temperature":31.160216811778195 "Humidity":