In [None]:
# ------------------------------------------------------------------------------
# Hierarchical Clustering and Dendrogram Visualization
# ------------------------------------------------------------------------------
# Author: Ali Moayedi
# Email: am636@st-andrews.ac.uk
# Date: October 2024
#
# Description:
# This script performs hierarchical clustering on navigational features of geese
# migration. The number of clusters is chosen as the smallest k for which no
# single cluster contains more than 25% of all samples. The script includes:
#   - Loading and preprocessing of the input data
#   - Calculation of the distance matrix and hierarchical linkage
#   - Determination of the optimal number of clusters
#   - Generation of a dendrogram to visualise the clustering structure
#   - Assignment of reordered cluster IDs based on dendrogram order
#
### Input:
#   - `Autumn_Day_CF.csv`: Preprocessed feature data for autumn daytime migration. 
#   - `Autumn_Night_CF.csv`: Preprocessed feature data for autumn nighttime migration. 
#   - `Spring_Day_CF.csv`: Preprocessed feature data for spring daytime migration.
#   - `Spring_Night_CF.csv`: Preprocessed feature data for spring nighttime migration.

## Note: These input files are generated by "7_Before_Clustering" scripts.

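# Output (one CSV and one PNG per season / time-of-day cell below):
#   - `Autumn_Day_with_ClusterID.csv`, `Autumn_Night_with_ClusterID.csv`,
#     `Spring_Day_With_ClusterID.csv`, `Spring_Night_with_ClusterID.csv`:
#     Data with assigned cluster IDs
#   - `Dendrogram_Autumn_Day.png`, `Dendrogram_Autumn_Night.png`,
#     `Dendrogram_Spring_Day.png`, `Dendrogram_Spring_Night.png`:
#     Dendrogram visualisations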


# ------------------------------------------------------------------------------


In [None]:
# ------------------------------------------------------------------------------
#   Autumn Day Migration
# ------------------------------------------------------------------------------

### Input:
#   - `Autumn_Day_CF.csv`: Preprocessed feature data for autumn daytime migration. 
# Output:
#   - `Autumn_Day_with_ClusterID.csv`: Data with assigned cluster IDs
#   - `Dendrogram_Autumn_Day.png`: Dendrogram visualisation

## Note: `Autumn_Day_with_ClusterID.csv` will be used in "8_After_Clustering" script within "3_Autumn" folder.
# ------------------------------------------------------------------------------

# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.spatial.distance import pdist

# ------------------------------------------------------------------------------
# Step 1: Load and Preprocess Data
# ------------------------------------------------------------------------------
# Load CSV data containing features for autumn daytime migration
filename = 'Autumn_Day_CF.csv'
data = pd.read_csv(filename)

# Extract all columns as a NumPy array; every column is used as a feature
X = data.values

# ------------------------------------------------------------------------------
# Step 2: Compute Distance Matrix and Hierarchical Clustering
# ------------------------------------------------------------------------------
# Condensed pairwise Euclidean distance matrix between all samples
dist_matrix = pdist(X, metric='euclidean')

# Ward linkage built from the precomputed distances. Passing the condensed
# matrix avoids computing the same Euclidean distances a second time inside
# linkage(); Ward's method requires Euclidean distances, which is what
# dist_matrix holds.
linked = linkage(dist_matrix, method='ward')

# ------------------------------------------------------------------------------
# Step 3: Determine Optimal Number of Clusters (k)
# ------------------------------------------------------------------------------
# Function to find the optimal k based on a maximum cluster size condition
def find_optimal_k(linked, data, max_ratio=0.25):
    """
    Find the smallest number of clusters (k) such that no cluster exceeds
    the specified max_ratio of the total sample size.

    Candidate values k = 2, 3, ..., n_samples are tried in increasing order by
    cutting the linkage tree with fcluster(criterion='maxclust'); the first k
    whose largest cluster is small enough is returned.

    Parameters:
        linked: Linkage matrix from hierarchical clustering
        data: Input dataset; only its number of rows (data.shape[0]) is used
        max_ratio: Maximum allowable ratio of the largest cluster size to the
            total number of samples

    Returns:
        k: Optimal number of clusters (the last k tried if no cut satisfies
            the size condition)
        clusters: Cluster assignments for each sample

    Raises:
        ValueError: If data contains fewer than two samples
    """
    n_samples = data.shape[0]
    if n_samples < 2:
        raise ValueError('At least two samples are required for clustering')

    # The upper bound is inclusive so that the loop always runs at least once
    # (n_samples == 2) and the all-singletons cut (k == n_samples) is tried
    # before falling back.
    for k in range(2, n_samples + 1):
        clusters = fcluster(linked, k, criterion='maxclust')
        # bincount is indexed by label value; unused label slots count as 0
        # and therefore never influence the maximum.
        max_cluster_size = np.bincount(clusters).max()
        if max_cluster_size / n_samples <= max_ratio:
            return k, clusters
    return k, clusters  # Return the last k if no optimal k is found

# Find optimal k and cluster assignments
optimal_k, clusters = find_optimal_k(linked, data)

# ------------------------------------------------------------------------------
# Step 4: Reorder Clusters Based on Dendrogram Order
# ------------------------------------------------------------------------------
# Build the dendrogram structure (without plotting) to extract the leaf order
dendro = dendrogram(linked, no_plot=True)
leaf_order = dendro['leaves']  # Order of the samples in the dendrogram

# Cluster labels in order of first appearance along the dendrogram leaves.
# pd.unique keeps encounter order; np.unique would sort the labels and
# thereby discard the dendrogram ordering this step is meant to capture.
unique_clusters_in_order = pd.unique(clusters[leaf_order])
ordered_cluster_ids = np.zeros_like(clusters)

# Assign new cluster IDs from 1 upwards based on dendrogram order
for new_id, old_id in enumerate(unique_clusters_in_order, start=1):
    ordered_cluster_ids[clusters == old_id] = new_id

# Add reordered cluster IDs to the DataFrame
data['cluster_id'] = ordered_cluster_ids

# Save the modified DataFrame with reordered cluster IDs
output_filename = 'Autumn_Day_with_ClusterID.csv'
data.to_csv(output_filename, index=False)

# Print the optimal number of clusters
print(f'Optimal number of clusters: {optimal_k}')

# ------------------------------------------------------------------------------
# Step 5: Visualize Dendrogram
# ------------------------------------------------------------------------------
# Dendrogram display settings for the autumn daytime data, gathered in one
# place so the manually chosen values are easy to find and adjust
dendrogram_options = dict(
    orientation='top',
    truncate_mode='lastp',
    p=70,
    color_threshold=92,  # Height for Optimal number of clusters (here 11 clusters)
    leaf_rotation=90.,
    leaf_font_size=10.,
    show_leaf_counts=True,
)

# Draw the dendrogram on a new 12 x 8 figure
plt.figure(figsize=(12, 8))
dendrogram(linked, **dendrogram_options)

# Title and axis labels
plt.title('Dendrogram (Autumn-Day)')
plt.xlabel('Data Points per Leaf')
plt.ylabel('Distance')

# Write the figure to disk, then display it
output_dendrogram = 'Dendrogram_Autumn_Day.png'
plt.savefig(output_dendrogram, bbox_inches='tight')
plt.show()

# ------------------------------------------------------------------------------
# End of Script
# ------------------------------------------------------------------------------


In [None]:
# ------------------------------------------------------------------------------
#    Autumn Night Migration
# ------------------------------------------------------------------------------

# Input:
#   - `Autumn_Night_CF.csv`: Preprocessed feature data for autumn nighttime migration
#
# Output:
#   - `Autumn_Night_with_ClusterID.csv`: Data with assigned cluster IDs
#   - `Dendrogram_Autumn_Night.png`: Dendrogram visualisation

## Note: `Autumn_Night_with_ClusterID.csv` will be used in "8_After_Clustering" script within "3_Autumn" folder.

# ------------------------------------------------------------------------------

# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.spatial.distance import pdist

# ------------------------------------------------------------------------------
# Step 1: Load and Preprocess Data
# ------------------------------------------------------------------------------
# Load CSV data containing features for autumn nighttime migration
filename = 'Autumn_Night_CF.csv'
data = pd.read_csv(filename)

# Extract all columns as a NumPy array; every column is used as a feature
X = data.values

# ------------------------------------------------------------------------------
# Step 2: Compute Distance Matrix and Hierarchical Clustering
# ------------------------------------------------------------------------------
# Condensed pairwise Euclidean distance matrix between all samples
dist_matrix = pdist(X, metric='euclidean')

# Ward linkage built from the precomputed distances. Passing the condensed
# matrix avoids computing the same Euclidean distances a second time inside
# linkage(); Ward's method requires Euclidean distances, which is what
# dist_matrix holds.
linked = linkage(dist_matrix, method='ward')

# ------------------------------------------------------------------------------
# Step 3: Determine Optimal Number of Clusters (k)
# ------------------------------------------------------------------------------
# Function to find the optimal k based on a maximum cluster size condition
def find_optimal_k(linked, data, max_ratio=0.25):
    """
    Find the smallest number of clusters (k) such that no cluster exceeds
    the specified max_ratio of the total sample size.

    Candidate values k = 2, 3, ..., n_samples are tried in increasing order by
    cutting the linkage tree with fcluster(criterion='maxclust'); the first k
    whose largest cluster is small enough is returned.

    Parameters:
        linked: Linkage matrix from hierarchical clustering
        data: Input dataset; only its number of rows (data.shape[0]) is used
        max_ratio: Maximum allowable ratio of the largest cluster size to the
            total number of samples

    Returns:
        k: Optimal number of clusters (the last k tried if no cut satisfies
            the size condition)
        clusters: Cluster assignments for each sample

    Raises:
        ValueError: If data contains fewer than two samples
    """
    n_samples = data.shape[0]
    if n_samples < 2:
        raise ValueError('At least two samples are required for clustering')

    # The upper bound is inclusive so that the loop always runs at least once
    # (n_samples == 2) and the all-singletons cut (k == n_samples) is tried
    # before falling back.
    for k in range(2, n_samples + 1):
        clusters = fcluster(linked, k, criterion='maxclust')
        # bincount is indexed by label value; unused label slots count as 0
        # and therefore never influence the maximum.
        max_cluster_size = np.bincount(clusters).max()
        if max_cluster_size / n_samples <= max_ratio:
            return k, clusters
    return k, clusters  # Return the last k if no optimal k is found

# Find optimal k and cluster assignments
optimal_k, clusters = find_optimal_k(linked, data)

# ------------------------------------------------------------------------------
# Step 4: Reorder Clusters Based on Dendrogram Order
# ------------------------------------------------------------------------------
# Build the dendrogram structure (without plotting) to extract the leaf order
dendro = dendrogram(linked, no_plot=True)
leaf_order = dendro['leaves']  # Order of the samples in the dendrogram

# Cluster labels in order of first appearance along the dendrogram leaves.
# pd.unique keeps encounter order; np.unique would sort the labels and
# thereby discard the dendrogram ordering this step is meant to capture.
unique_clusters_in_order = pd.unique(clusters[leaf_order])
ordered_cluster_ids = np.zeros_like(clusters)

# Assign new cluster IDs from 1 upwards based on dendrogram order
for new_id, old_id in enumerate(unique_clusters_in_order, start=1):
    ordered_cluster_ids[clusters == old_id] = new_id

# Add reordered cluster IDs to the DataFrame
data['cluster_id'] = ordered_cluster_ids

# Save the modified DataFrame with reordered cluster IDs
output_filename = 'Autumn_Night_with_ClusterID.csv'
data.to_csv(output_filename, index=False)

# Print the optimal number of clusters
print(f'Optimal number of clusters: {optimal_k}')

# ------------------------------------------------------------------------------
# Step 5: Visualize Dendrogram
# ------------------------------------------------------------------------------
# Dendrogram display settings for the autumn nighttime data, gathered in one
# place so the manually chosen values are easy to find and adjust
dendrogram_options = dict(
    orientation='top',
    truncate_mode='lastp',
    p=60,
    color_threshold=37,  # Height for Optimal number of clusters (here 11 clusters)
    leaf_rotation=90.,
    leaf_font_size=10.,
    show_leaf_counts=True,
)

# Draw the dendrogram on a new 12 x 8 figure
plt.figure(figsize=(12, 8))
dendrogram(linked, **dendrogram_options)

# Title and axis labels
plt.title('Dendrogram (Autumn-Night)')
plt.xlabel('Data Points per Leaf')
plt.ylabel('Distance')

# Write the figure to disk, then display it
output_dendrogram = 'Dendrogram_Autumn_Night.png'
plt.savefig(output_dendrogram, bbox_inches='tight')
plt.show()

# ------------------------------------------------------------------------------
# End of Script
# ------------------------------------------------------------------------------


In [None]:
# ------------------------------------------------------------------------------
#    Spring Day Migration
# ------------------------------------------------------------------------------

# Input:
#   - `Spring_Day_CF.csv`: Preprocessed feature data for spring daytime migration
#
# Output:
#   - `Spring_Day_With_ClusterID.csv`: Data with assigned cluster IDs
#   - `Dendrogram_Spring_Day.png`: Dendrogram visualisation

## Note: `Spring_Day_With_ClusterID.csv` will be used in "8_After_Clustering" script within "4_Spring" folder.

# ------------------------------------------------------------------------------

# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.spatial.distance import pdist

# ------------------------------------------------------------------------------
# Step 1: Load and Preprocess Data
# ------------------------------------------------------------------------------
# Load CSV data containing features for spring daytime migration
filename = 'Spring_Day_CF.csv'
data = pd.read_csv(filename)

# Extract all columns as a NumPy array; every column is used as a feature
X = data.values

# ------------------------------------------------------------------------------
# Step 2: Compute Distance Matrix and Hierarchical Clustering
# ------------------------------------------------------------------------------
# Condensed pairwise Euclidean distance matrix between all samples
dist_matrix = pdist(X, metric='euclidean')

# Ward linkage built from the precomputed distances. Passing the condensed
# matrix avoids computing the same Euclidean distances a second time inside
# linkage(); Ward's method requires Euclidean distances, which is what
# dist_matrix holds.
linked = linkage(dist_matrix, method='ward')

# ------------------------------------------------------------------------------
# Step 3: Determine Optimal Number of Clusters (k)
# ------------------------------------------------------------------------------
# Function to find the optimal k based on a maximum cluster size condition
def find_optimal_k(linked, data, max_ratio=0.25):
    """
    Find the smallest number of clusters (k) such that no cluster exceeds
    the specified max_ratio of the total sample size.

    Candidate values k = 2, 3, ..., n_samples are tried in increasing order by
    cutting the linkage tree with fcluster(criterion='maxclust'); the first k
    whose largest cluster is small enough is returned.

    Parameters:
        linked: Linkage matrix from hierarchical clustering
        data: Input dataset; only its number of rows (data.shape[0]) is used
        max_ratio: Maximum allowable ratio of the largest cluster size to the
            total number of samples

    Returns:
        k: Optimal number of clusters (the last k tried if no cut satisfies
            the size condition)
        clusters: Cluster assignments for each sample

    Raises:
        ValueError: If data contains fewer than two samples
    """
    n_samples = data.shape[0]
    if n_samples < 2:
        raise ValueError('At least two samples are required for clustering')

    # The upper bound is inclusive so that the loop always runs at least once
    # (n_samples == 2) and the all-singletons cut (k == n_samples) is tried
    # before falling back.
    for k in range(2, n_samples + 1):
        clusters = fcluster(linked, k, criterion='maxclust')
        # bincount is indexed by label value; unused label slots count as 0
        # and therefore never influence the maximum.
        max_cluster_size = np.bincount(clusters).max()
        if max_cluster_size / n_samples <= max_ratio:
            return k, clusters
    return k, clusters  # Return the last k if no optimal k is found

# Find optimal k and cluster assignments
optimal_k, clusters = find_optimal_k(linked, data)

# ------------------------------------------------------------------------------
# Step 4: Reorder Clusters Based on Dendrogram Order
# ------------------------------------------------------------------------------
# Build the dendrogram structure (without plotting) to extract the leaf order
dendro = dendrogram(linked, no_plot=True)
leaf_order = dendro['leaves']  # Order of the samples in the dendrogram

# Cluster labels in order of first appearance along the dendrogram leaves.
# pd.unique keeps encounter order; np.unique would sort the labels and
# thereby discard the dendrogram ordering this step is meant to capture.
unique_clusters_in_order = pd.unique(clusters[leaf_order])
ordered_cluster_ids = np.zeros_like(clusters)

# Assign new cluster IDs from 1 upwards based on dendrogram order
for new_id, old_id in enumerate(unique_clusters_in_order, start=1):
    ordered_cluster_ids[clusters == old_id] = new_id

# Add reordered cluster IDs to the DataFrame
data['cluster_id'] = ordered_cluster_ids

# Save the modified DataFrame with reordered cluster IDs
output_filename = 'Spring_Day_With_ClusterID.csv'
data.to_csv(output_filename, index=False)

# Print the optimal number of clusters
print(f'Optimal number of clusters: {optimal_k}')

# ------------------------------------------------------------------------------
# Step 5: Visualize Dendrogram
# ------------------------------------------------------------------------------
# Dendrogram display settings for the spring daytime data, gathered in one
# place so the manually chosen values are easy to find and adjust
dendrogram_options = dict(
    orientation='top',
    truncate_mode='lastp',
    p=50,
    color_threshold=240.5,  # Height for Optimal number of clusters (here 6 clusters)
    leaf_rotation=90.,
    leaf_font_size=10.,
    show_leaf_counts=True,
)

# Draw the dendrogram on a new 12 x 8 figure
plt.figure(figsize=(12, 8))
dendrogram(linked, **dendrogram_options)

# Title and axis labels
plt.title('Dendrogram (Spring-Day)')
plt.xlabel('Data Points per Leaf')
plt.ylabel('Distance')

# Write the figure to disk, then display it
output_dendrogram = 'Dendrogram_Spring_Day.png'
plt.savefig(output_dendrogram, bbox_inches='tight')
plt.show()

# ------------------------------------------------------------------------------
# End of Script
# ------------------------------------------------------------------------------


In [None]:
# ------------------------------------------------------------------------------
#     Spring Night Migration
# ------------------------------------------------------------------------------

# Input:
#   - `Spring_Night_CF.csv`: Preprocessed feature data for spring nighttime migration
#
# Output:
#   - `Spring_Night_with_ClusterID.csv`: Data with assigned cluster IDs
#   - `Dendrogram_Spring_Night.png`: Dendrogram visualisation

## Note: `Spring_Night_with_ClusterID.csv` will be used in "8_After_Clustering" script within "4_Spring" folder.

# ------------------------------------------------------------------------------

# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.spatial.distance import pdist

# ------------------------------------------------------------------------------
# Step 1: Load and Preprocess Data
# ------------------------------------------------------------------------------
# Load CSV data containing features for spring nighttime migration
filename = 'Spring_Night_CF.csv'
data = pd.read_csv(filename)

# Extract all columns as a NumPy array; every column is used as a feature
X = data.values

# ------------------------------------------------------------------------------
# Step 2: Compute Distance Matrix and Hierarchical Clustering
# ------------------------------------------------------------------------------
# Condensed pairwise Euclidean distance matrix between all samples
dist_matrix = pdist(X, metric='euclidean')

# Ward linkage built from the precomputed distances. Passing the condensed
# matrix avoids computing the same Euclidean distances a second time inside
# linkage(); Ward's method requires Euclidean distances, which is what
# dist_matrix holds.
linked = linkage(dist_matrix, method='ward')

# ------------------------------------------------------------------------------
# Step 3: Determine Optimal Number of Clusters (k)
# ------------------------------------------------------------------------------
# Function to find the optimal k based on a maximum cluster size condition
def find_optimal_k(linked, data, max_ratio=0.25):
    """
    Find the smallest number of clusters (k) such that no cluster exceeds
    the specified max_ratio of the total sample size.

    Candidate values k = 2, 3, ..., n_samples are tried in increasing order by
    cutting the linkage tree with fcluster(criterion='maxclust'); the first k
    whose largest cluster is small enough is returned.

    Parameters:
        linked: Linkage matrix from hierarchical clustering
        data: Input dataset; only its number of rows (data.shape[0]) is used
        max_ratio: Maximum allowable ratio of the largest cluster size to the
            total number of samples

    Returns:
        k: Optimal number of clusters (the last k tried if no cut satisfies
            the size condition)
        clusters: Cluster assignments for each sample

    Raises:
        ValueError: If data contains fewer than two samples
    """
    n_samples = data.shape[0]
    if n_samples < 2:
        raise ValueError('At least two samples are required for clustering')

    # The upper bound is inclusive so that the loop always runs at least once
    # (n_samples == 2) and the all-singletons cut (k == n_samples) is tried
    # before falling back.
    for k in range(2, n_samples + 1):
        clusters = fcluster(linked, k, criterion='maxclust')
        # bincount is indexed by label value; unused label slots count as 0
        # and therefore never influence the maximum.
        max_cluster_size = np.bincount(clusters).max()
        if max_cluster_size / n_samples <= max_ratio:
            return k, clusters
    return k, clusters  # Return the last k if no optimal k is found

# Find optimal k and cluster assignments
optimal_k, clusters = find_optimal_k(linked, data)

# ------------------------------------------------------------------------------
# Step 4: Reorder Clusters Based on Dendrogram Order
# ------------------------------------------------------------------------------
# Build the dendrogram structure (without plotting) to extract the leaf order
dendro = dendrogram(linked, no_plot=True)
leaf_order = dendro['leaves']  # Order of the samples in the dendrogram

# Cluster labels in order of first appearance along the dendrogram leaves.
# pd.unique keeps encounter order; np.unique would sort the labels and
# thereby discard the dendrogram ordering this step is meant to capture.
unique_clusters_in_order = pd.unique(clusters[leaf_order])
ordered_cluster_ids = np.zeros_like(clusters)

# Assign new cluster IDs from 1 upwards based on dendrogram order
for new_id, old_id in enumerate(unique_clusters_in_order, start=1):
    ordered_cluster_ids[clusters == old_id] = new_id

# Add reordered cluster IDs to the DataFrame
data['cluster_id'] = ordered_cluster_ids

# Save the modified DataFrame with reordered cluster IDs
output_filename = 'Spring_Night_with_ClusterID.csv'
data.to_csv(output_filename, index=False)

# Print the optimal number of clusters
print(f'Optimal number of clusters: {optimal_k}')

# ------------------------------------------------------------------------------
# Step 5: Visualize Dendrogram
# ------------------------------------------------------------------------------
# Dendrogram display settings for the spring nighttime data, gathered in one
# place so the manually chosen values are easy to find and adjust
dendrogram_options = dict(
    orientation='top',
    truncate_mode='lastp',
    p=50,
    color_threshold=43,  # Height for Optimal number of clusters (here 9 clusters)
    leaf_rotation=90.,
    leaf_font_size=10.,
    show_leaf_counts=True,
)

# Draw the dendrogram on a new 12 x 8 figure
plt.figure(figsize=(12, 8))
dendrogram(linked, **dendrogram_options)

# Title and axis labels
plt.title('Dendrogram (Spring-Night)')
plt.xlabel('Data Points per Leaf')
plt.ylabel('Distance')

# Write the figure to disk, then display it
output_dendrogram = 'Dendrogram_Spring_Night.png'
plt.savefig(output_dendrogram, bbox_inches='tight')
plt.show()

# ------------------------------------------------------------------------------
# End of Script
# ------------------------------------------------------------------------------
