In [5]:
import csv
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram

def load_data(filepath):
    """Read a CSV file into a list of per-row dictionaries.

    The first line of the file is used as the header; every following row
    becomes one plain ``dict`` mapping column name to the raw string value.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        list[dict]: One dictionary per data row, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # newline='' is what the csv module expects: it disables the text layer's
    # newline translation so line breaks embedded in quoted fields reach the
    # parser unchanged.
    with open(filepath, 'r', newline='') as file:
        # dict(...) normalizes the row type (older Pythons yield OrderedDict).
        return [dict(row) for row in csv.DictReader(file)]

def calc_features(row):
    """Build the 6-element numeric feature vector for one CSV row.

    The features are read, in this order, from the columns
    'Population', 'Net migration', 'GDP ($ per capita)', 'Literacy (%)',
    'Phones (per 1000)' and 'Infant mortality (per 1000 births)'.

    Args:
        row: Mapping from column name to raw value (normally a string).

    Returns:
        numpy.ndarray: Shape ``(6,)``, dtype ``float64``.

    Raises:
        KeyError: If one of the six columns is absent from ``row``.
        ValueError: If a non-missing value cannot be parsed as a float.
    """
    keys = ('Population', 'Net migration', 'GDP ($ per capita)',
            'Literacy (%)', 'Phones (per 1000)', 'Infant mortality (per 1000 births)')
    # Compared case-insensitively against the cleaned text.
    missing_markers = {'', 'n/a', 'unknown'}

    features = []
    for key in keys:
        raw = row[key]
        # csv.DictReader stores None for fields missing from a short row;
        # treat that exactly like an empty cell instead of crashing.
        text = '' if raw is None else str(raw)
        # Commas and percent signs are dropped before parsing.
        # NOTE(review): this treats ',' as a thousands separator - confirm the
        # CSV does not use a decimal comma.
        text = text.replace(',', '').replace('%', '').strip()
        if text.lower() in missing_markers:
            # Missing values are imputed as 0.0.
            value = 0.0
        else:
            value = float(text)
        features.append(value)

    return np.array(features, dtype=np.float64)


def hac(features):
    """Single-linkage hierarchical agglomerative clustering.

    Starts with every point in its own cluster (ids ``0 .. n-1``) and
    repeatedly merges the two closest clusters, where the distance between
    clusters is the minimum Euclidean distance between their members. The
    cluster created by merge number ``k`` (0-based) gets id ``n + k``.

    Ties on distance are resolved in favour of the pair with the smallest
    first id, then the smallest second id.

    Args:
        features: Sequence of ``n`` equal-length numeric feature vectors.

    Returns:
        numpy.ndarray: Array ``Z`` of shape ``(n - 1, 4)``. Row ``k`` holds
        ``[id_a, id_b, distance, size]`` for merge ``k`` with ``id_a < id_b``
        and ``size`` the number of original points in the new cluster.
        For ``n < 2`` an empty ``(0, 4)`` array is returned.

    Raises:
        ValueError: If no pair of clusters has a finite distance (for example
            when the features contain NaN or infinite values).
    """
    n = len(features)
    if n < 2:
        # Nothing to merge; also avoids the invalid shape (-1, 4) when n == 0.
        return np.zeros((0, 4))

    # Number of original points contained in each live cluster id.
    sizes = {i: 1 for i in range(n)}
    active = set(range(n))
    Z = np.zeros((n - 1, 4))

    # n original ids plus n - 1 merged ids -> ids 0 .. 2n - 2.
    total_ids = 2 * n - 1
    D = np.full((total_ids, total_ids), np.inf)

    # Pairwise Euclidean distances between the original points.
    for i in range(n):
        for j in range(i + 1, n):
            D[i, j] = D[j, i] = np.linalg.norm(np.subtract(features[i], features[j]))

    for iteration in range(n - 1):
        # Scan pairs (i, j), i < j, in ascending order. The strict '<' keeps
        # the first pair seen for a given distance, which implements the
        # tie-breaking rule without an extra branch.
        min_distance = np.inf
        min_i = min_j = -1
        ordered = sorted(active)
        for pos, i in enumerate(ordered):
            for j in ordered[pos + 1:]:
                if D[i, j] < min_distance:
                    min_distance = D[i, j]
                    min_i, min_j = i, j

        if min_i < 0:
            raise ValueError(
                "hac: no finite distance between remaining clusters; "
                "check the features for NaN or infinite values"
            )

        # The merged cluster takes the next unused id. Ids 0 .. n-1 belong
        # to the original points, so the first merge must create id n.
        new_cluster_index = n + iteration
        size = sizes.pop(min_i) + sizes.pop(min_j)
        sizes[new_cluster_index] = size

        # min_i < min_j holds by construction of the scan above.
        Z[iteration] = (min_i, min_j, min_distance, size)

        active.discard(min_i)
        active.discard(min_j)

        # Single linkage: distance to the merged cluster is the smaller of
        # the distances to its two parts.
        for m in active:
            D[new_cluster_index, m] = D[m, new_cluster_index] = min(D[min_i, m], D[min_j, m])
        active.add(new_cluster_index)

        # Retire the merged-away ids so they can never be selected again.
        D[min_i, :] = np.inf
        D[:, min_i] = np.inf
        D[min_j, :] = np.inf
        D[:, min_j] = np.inf

    return Z


In [11]:
def fig_hac(Z, names):
    """Draw the dendrogram for a linkage matrix and return the figure.

    Args:
        Z: Linkage matrix to plot, passed straight to
            ``scipy.cluster.hierarchy.dendrogram``.
        names: Leaf labels, passed as ``labels=`` to ``dendrogram``.

    Returns:
        matplotlib.figure.Figure: The 10x7 inch figure holding the dendrogram.
    """
    from scipy.cluster.hierarchy import dendrogram
    import matplotlib.pyplot as plt

    # Explicit figure/axes so the plot is tied to the figure we return
    # rather than to whatever pyplot considers "current".
    fig, ax = plt.subplots(figsize=(10, 7))

    dendrogram(Z, labels=names, leaf_rotation=90, ax=ax)

    ax.set_title('Hierarchical Clustering Dendrogram')
    ax.set_xlabel('Countries')
    ax.set_ylabel('Distance')

    # Run the layout pass only after the title and axis labels exist, so the
    # space reserved for them (and the rotated leaf labels) is accounted for.
    fig.tight_layout()

    plt.show()

    return fig


In [None]:
def normalize_features(features):
    """Min-max scale every feature column to the range [0, 1].

    Each column is transformed as ``(x - min) / (max - min)`` using the
    minimum and maximum over all rows. A column whose values are all equal
    (zero range) is mapped to 0.0.

    Args:
        features: Sequence of equal-length 1-D numeric feature vectors.

    Returns:
        list[numpy.ndarray]: One ``float64`` vector per input vector, in the
        same order. An empty input yields an empty list.
    """
    if len(features) == 0:
        # np.vstack rejects an empty sequence; there is nothing to scale.
        return []

    # Force float64 so integer inputs are not truncated when the scaled
    # (fractional) values are stored.
    matrix = np.vstack(features).astype(np.float64)

    col_min = matrix.min(axis=0)
    col_range = matrix.max(axis=0) - col_min

    # For constant columns (x - min) is already 0, so dividing by 1 yields
    # the required 0.0 and avoids a division by zero.
    safe_range = np.where(col_range == 0, 1.0, col_range)
    normalized = (matrix - col_min) / safe_range

    # Hand back one row vector per input vector.
    return [normalized[i, :] for i in range(normalized.shape[0])]
if __name__ == "__main__":
    # Read every row, keep the country labels, and turn each row into a
    # feature vector that is then min-max normalized across all countries.
    data = load_data('countries.csv')
    country_names = [record['Country'] for record in data]
    features = list(map(calc_features, data))
    features_normalized = normalize_features(features)

    # Only the first n countries are clustered in this run.
    n = 20
    selected_features, selected_names = features_normalized[:n], country_names[:n]

    # Cluster the subset, then render its dendrogram.
    Z = hac(selected_features)
    fig = fig_hac(Z, selected_names)
    plt.show()
