In [1]:
# ------------------------------------------------------------------------------
# Silhouette and Calinski-Harabasz Validation Indices for Clustering
# ------------------------------------------------------------------------------
# Author: Ali Moayedi
# Email: am636@st-andrews.ac.uk
# Date: October 2024
#
# Description:
# This script calculates Silhouette and Calinski-Harabasz (CH) indices to evaluate
# the quality of clustering solutions. These indices provide a measure of cluster
# cohesion and separation, helping to identify the optimal number of clusters.

# Input:
#   - `Autumn_Day_CF.csv`: Preprocessed feature data for autumn daytime migration.
#   - `Autumn_Night_CF.csv`: Preprocessed feature data for autumn nighttime migration.
#   - `Spring_Day_CF.csv`: Preprocessed feature data for spring daytime migration.
#   - `Spring_Night_CF.csv`: Preprocessed feature data for spring nighttime migration.
#
# Note: These input files are generated by the "7_Before_Clustering" scripts.

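# Output:
#   - Printed Silhouette and Calinski-Harabasz index values for a range of cluster numbers
#   - One figure per dataset plotting both indices against the number of clusters,
#     saved as `Validation_Indices_<Season>_<Day|Night>.png`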
# ------------------------------------------------------------------------------


In [None]:
# ------------------------------------------------------------------------------
#   Autumn Day Migration
# ------------------------------------------------------------------------------
# Cuts a Ward hierarchical clustering of the autumn daytime features into
# k = 2..19 flat clusters, prints the Silhouette and Calinski-Harabasz indices
# for every k, and saves/shows a two-panel figure of both curves.

# Import necessary libraries (kept together so the cell's dependencies are
# visible before any computation starts)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score

# ------------------------------------------------------------------------------
# Cell parameters
# ------------------------------------------------------------------------------
filename = 'Autumn_Day_CF.csv'                     # input feature table
period_label = 'Autumn daytime'                    # suffix used in the plot titles
figure_path = 'Validation_Indices_Autumn_Day.png'  # where the figure is written
max_clusters = 19                                  # largest k to evaluate

# ------------------------------------------------------------------------------
# Step 1: Load Data
# ------------------------------------------------------------------------------
# Load CSV data containing features for clustering
data = pd.read_csv(filename)

# Extract all features as a NumPy array for clustering analysis
X = data.values
n_samples = X.shape[0]

# Both validation indices are only defined for 2 <= n_clusters <= n_samples - 1,
# so fail early with a readable message instead of an error from deep inside
# SciPy / scikit-learn.
if n_samples < 3:
    raise ValueError(
        f"{filename} has {n_samples} row(s); at least 3 observations are "
        "required to compute the Silhouette and Calinski-Harabasz indices."
    )

# ------------------------------------------------------------------------------
# Step 2: Perform Hierarchical Clustering
# ------------------------------------------------------------------------------
# Compute the hierarchical (Ward) linkage directly from the observations
linked = linkage(X, method='ward')

# ------------------------------------------------------------------------------
# Step 3: Calculate Validation Indices for Different Cluster Numbers
# ------------------------------------------------------------------------------
# Range of cluster numbers to evaluate, capped so that small datasets do not
# request more clusters than the indices accept (k <= n_samples - 1).
largest_k = min(max_clusters, n_samples - 1)
if largest_k < max_clusters:
    print(f"Only {n_samples} observations available: evaluating k = 2..{largest_k} "
          f"instead of k = 2..{max_clusters}")
cluster_range = range(2, largest_k + 1)

# Lists to store validation index results (one entry per k in cluster_range)
silhouette_scores = []
calinski_harabasz_scores = []

# Evaluate Silhouette and CH scores for each cluster number
for k in cluster_range:
    # Cut the dendrogram into flat clusters. NOTE: 'maxclust' forms *at most*
    # k clusters, so tied merge heights can yield fewer than k labels.
    cluster_labels = fcluster(linked, k, criterion='maxclust')

    # Calculate Silhouette Score
    silhouette_avg = silhouette_score(X, cluster_labels)
    silhouette_scores.append(silhouette_avg)

    # Calculate Calinski-Harabasz Index
    ch_score = calinski_harabasz_score(X, cluster_labels)
    calinski_harabasz_scores.append(ch_score)

    print(f"For k = {k}: Silhouette Score = {silhouette_avg:.3f}, Calinski-Harabasz Index = {ch_score:.3f}")

# ------------------------------------------------------------------------------
# Step 4: Plot Validation Indices
# ------------------------------------------------------------------------------
# Explicit figure/axes handles instead of the implicit pyplot "current axes"
fig, (ax_silhouette, ax_ch) = plt.subplots(1, 2, figsize=(12, 6))

# Plot Silhouette Scores
ax_silhouette.plot(cluster_range, silhouette_scores, marker='o', label='Silhouette Score')
ax_silhouette.set_title(f'Silhouette Score - {period_label}')
ax_silhouette.set_xlabel('Number of Clusters')
ax_silhouette.set_ylabel('Silhouette Score')
ax_silhouette.grid(True)
ax_silhouette.legend()

# Plot Calinski-Harabasz Scores
ax_ch.plot(cluster_range, calinski_harabasz_scores, marker='o', label='Calinski-Harabasz Index')
ax_ch.set_title(f'Calinski-Harabasz Index - {period_label}')
ax_ch.set_xlabel('Number of Clusters')
ax_ch.set_ylabel('Calinski-Harabasz Index')
ax_ch.grid(True)
ax_ch.legend()

fig.tight_layout()
fig.savefig(figure_path)  # save before show() so the written file is never blank
plt.show()

# ------------------------------------------------------------------------------
# End of Script
# ------------------------------------------------------------------------------


In [None]:
# ------------------------------------------------------------------------------
#    Autumn Night Migration
# ------------------------------------------------------------------------------
# Cuts a Ward hierarchical clustering of the autumn nighttime features into
# k = 2..19 flat clusters, prints the Silhouette and Calinski-Harabasz indices
# for every k, and saves/shows a two-panel figure of both curves.

# Import necessary libraries (kept together so the cell's dependencies are
# visible before any computation starts)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score

# ------------------------------------------------------------------------------
# Cell parameters
# ------------------------------------------------------------------------------
filename = 'Autumn_Night_CF.csv'                     # input feature table
period_label = 'Autumn nighttime'                    # suffix used in the plot titles
figure_path = 'Validation_Indices_Autumn_Night.png'  # where the figure is written
max_clusters = 19                                    # largest k to evaluate

# ------------------------------------------------------------------------------
# Step 1: Load Data
# ------------------------------------------------------------------------------
# Load CSV data containing features for clustering
data = pd.read_csv(filename)

# Extract all features as a NumPy array for clustering analysis
X = data.values
n_samples = X.shape[0]

# Both validation indices are only defined for 2 <= n_clusters <= n_samples - 1,
# so fail early with a readable message instead of an error from deep inside
# SciPy / scikit-learn.
if n_samples < 3:
    raise ValueError(
        f"{filename} has {n_samples} row(s); at least 3 observations are "
        "required to compute the Silhouette and Calinski-Harabasz indices."
    )

# ------------------------------------------------------------------------------
# Step 2: Perform Hierarchical Clustering
# ------------------------------------------------------------------------------
# Compute the hierarchical (Ward) linkage directly from the observations
linked = linkage(X, method='ward')

# ------------------------------------------------------------------------------
# Step 3: Calculate Validation Indices for Different Cluster Numbers
# ------------------------------------------------------------------------------
# Range of cluster numbers to evaluate, capped so that small datasets do not
# request more clusters than the indices accept (k <= n_samples - 1).
largest_k = min(max_clusters, n_samples - 1)
if largest_k < max_clusters:
    print(f"Only {n_samples} observations available: evaluating k = 2..{largest_k} "
          f"instead of k = 2..{max_clusters}")
cluster_range = range(2, largest_k + 1)

# Lists to store validation index results (one entry per k in cluster_range)
silhouette_scores = []
calinski_harabasz_scores = []

# Evaluate Silhouette and CH scores for each cluster number
for k in cluster_range:
    # Cut the dendrogram into flat clusters. NOTE: 'maxclust' forms *at most*
    # k clusters, so tied merge heights can yield fewer than k labels.
    cluster_labels = fcluster(linked, k, criterion='maxclust')

    # Calculate Silhouette Score
    silhouette_avg = silhouette_score(X, cluster_labels)
    silhouette_scores.append(silhouette_avg)

    # Calculate Calinski-Harabasz Index
    ch_score = calinski_harabasz_score(X, cluster_labels)
    calinski_harabasz_scores.append(ch_score)

    print(f"For k = {k}: Silhouette Score = {silhouette_avg:.3f}, Calinski-Harabasz Index = {ch_score:.3f}")

# ------------------------------------------------------------------------------
# Step 4: Plot Validation Indices
# ------------------------------------------------------------------------------
# Explicit figure/axes handles instead of the implicit pyplot "current axes"
fig, (ax_silhouette, ax_ch) = plt.subplots(1, 2, figsize=(12, 6))

# Plot Silhouette Scores
ax_silhouette.plot(cluster_range, silhouette_scores, marker='o', label='Silhouette Score')
ax_silhouette.set_title(f'Silhouette Score - {period_label}')
ax_silhouette.set_xlabel('Number of Clusters')
ax_silhouette.set_ylabel('Silhouette Score')
ax_silhouette.grid(True)
ax_silhouette.legend()

# Plot Calinski-Harabasz Scores
ax_ch.plot(cluster_range, calinski_harabasz_scores, marker='o', label='Calinski-Harabasz Index')
ax_ch.set_title(f'Calinski-Harabasz Index - {period_label}')
ax_ch.set_xlabel('Number of Clusters')
ax_ch.set_ylabel('Calinski-Harabasz Index')
ax_ch.grid(True)
ax_ch.legend()

fig.tight_layout()
fig.savefig(figure_path)  # save before show() so the written file is never blank
plt.show()

# ------------------------------------------------------------------------------
# End of Script
# ------------------------------------------------------------------------------


In [None]:
# ------------------------------------------------------------------------------
#    Spring Day Migration
# ------------------------------------------------------------------------------
# Cuts a Ward hierarchical clustering of the spring daytime features into
# k = 2..19 flat clusters, prints the Silhouette and Calinski-Harabasz indices
# for every k, and saves/shows a two-panel figure of both curves.

# Import necessary libraries (kept together so the cell's dependencies are
# visible before any computation starts)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score

# ------------------------------------------------------------------------------
# Cell parameters
# ------------------------------------------------------------------------------
filename = 'Spring_Day_CF.csv'                     # input feature table
period_label = 'Spring daytime'                    # suffix used in the plot titles
figure_path = 'Validation_Indices_Spring_Day.png'  # where the figure is written
max_clusters = 19                                  # largest k to evaluate

# ------------------------------------------------------------------------------
# Step 1: Load Data
# ------------------------------------------------------------------------------
# Load CSV data containing features for clustering
data = pd.read_csv(filename)

# Extract all features as a NumPy array for clustering analysis
X = data.values
n_samples = X.shape[0]

# Both validation indices are only defined for 2 <= n_clusters <= n_samples - 1,
# so fail early with a readable message instead of an error from deep inside
# SciPy / scikit-learn.
if n_samples < 3:
    raise ValueError(
        f"{filename} has {n_samples} row(s); at least 3 observations are "
        "required to compute the Silhouette and Calinski-Harabasz indices."
    )

# ------------------------------------------------------------------------------
# Step 2: Perform Hierarchical Clustering
# ------------------------------------------------------------------------------
# Compute the hierarchical (Ward) linkage directly from the observations
linked = linkage(X, method='ward')

# ------------------------------------------------------------------------------
# Step 3: Calculate Validation Indices for Different Cluster Numbers
# ------------------------------------------------------------------------------
# Range of cluster numbers to evaluate, capped so that small datasets do not
# request more clusters than the indices accept (k <= n_samples - 1).
largest_k = min(max_clusters, n_samples - 1)
if largest_k < max_clusters:
    print(f"Only {n_samples} observations available: evaluating k = 2..{largest_k} "
          f"instead of k = 2..{max_clusters}")
cluster_range = range(2, largest_k + 1)

# Lists to store validation index results (one entry per k in cluster_range)
silhouette_scores = []
calinski_harabasz_scores = []

# Evaluate Silhouette and CH scores for each cluster number
for k in cluster_range:
    # Cut the dendrogram into flat clusters. NOTE: 'maxclust' forms *at most*
    # k clusters, so tied merge heights can yield fewer than k labels.
    cluster_labels = fcluster(linked, k, criterion='maxclust')

    # Calculate Silhouette Score
    silhouette_avg = silhouette_score(X, cluster_labels)
    silhouette_scores.append(silhouette_avg)

    # Calculate Calinski-Harabasz Index
    ch_score = calinski_harabasz_score(X, cluster_labels)
    calinski_harabasz_scores.append(ch_score)

    print(f"For k = {k}: Silhouette Score = {silhouette_avg:.3f}, Calinski-Harabasz Index = {ch_score:.3f}")

# ------------------------------------------------------------------------------
# Step 4: Plot Validation Indices
# ------------------------------------------------------------------------------
# Explicit figure/axes handles instead of the implicit pyplot "current axes"
fig, (ax_silhouette, ax_ch) = plt.subplots(1, 2, figsize=(12, 6))

# Plot Silhouette Scores
ax_silhouette.plot(cluster_range, silhouette_scores, marker='o', label='Silhouette Score')
ax_silhouette.set_title(f'Silhouette Score - {period_label}')
ax_silhouette.set_xlabel('Number of Clusters')
ax_silhouette.set_ylabel('Silhouette Score')
ax_silhouette.grid(True)
ax_silhouette.legend()

# Plot Calinski-Harabasz Scores
ax_ch.plot(cluster_range, calinski_harabasz_scores, marker='o', label='Calinski-Harabasz Index')
ax_ch.set_title(f'Calinski-Harabasz Index - {period_label}')
ax_ch.set_xlabel('Number of Clusters')
ax_ch.set_ylabel('Calinski-Harabasz Index')
ax_ch.grid(True)
ax_ch.legend()

fig.tight_layout()
fig.savefig(figure_path)  # save before show() so the written file is never blank
plt.show()

# ------------------------------------------------------------------------------
# End of Script
# ------------------------------------------------------------------------------


In [None]:
# ------------------------------------------------------------------------------
#     Spring Night Migration
# ------------------------------------------------------------------------------
# Cuts a Ward hierarchical clustering of the spring nighttime features into
# k = 2..19 flat clusters, prints the Silhouette and Calinski-Harabasz indices
# for every k, and saves/shows a two-panel figure of both curves.

# Import necessary libraries (kept together so the cell's dependencies are
# visible before any computation starts)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score

# ------------------------------------------------------------------------------
# Cell parameters
# ------------------------------------------------------------------------------
filename = 'Spring_Night_CF.csv'                     # input feature table
period_label = 'Spring nighttime'                    # suffix used in the plot titles
figure_path = 'Validation_Indices_Spring_Night.png'  # where the figure is written
max_clusters = 19                                    # largest k to evaluate

# ------------------------------------------------------------------------------
# Step 1: Load Data
# ------------------------------------------------------------------------------
# Load CSV data containing features for clustering
data = pd.read_csv(filename)

# Extract all features as a NumPy array for clustering analysis
X = data.values
n_samples = X.shape[0]

# Both validation indices are only defined for 2 <= n_clusters <= n_samples - 1,
# so fail early with a readable message instead of an error from deep inside
# SciPy / scikit-learn.
if n_samples < 3:
    raise ValueError(
        f"{filename} has {n_samples} row(s); at least 3 observations are "
        "required to compute the Silhouette and Calinski-Harabasz indices."
    )

# ------------------------------------------------------------------------------
# Step 2: Perform Hierarchical Clustering
# ------------------------------------------------------------------------------
# Compute the hierarchical (Ward) linkage directly from the observations
linked = linkage(X, method='ward')

# ------------------------------------------------------------------------------
# Step 3: Calculate Validation Indices for Different Cluster Numbers
# ------------------------------------------------------------------------------
# Range of cluster numbers to evaluate, capped so that small datasets do not
# request more clusters than the indices accept (k <= n_samples - 1).
largest_k = min(max_clusters, n_samples - 1)
if largest_k < max_clusters:
    print(f"Only {n_samples} observations available: evaluating k = 2..{largest_k} "
          f"instead of k = 2..{max_clusters}")
cluster_range = range(2, largest_k + 1)

# Lists to store validation index results (one entry per k in cluster_range)
silhouette_scores = []
calinski_harabasz_scores = []

# Evaluate Silhouette and CH scores for each cluster number
for k in cluster_range:
    # Cut the dendrogram into flat clusters. NOTE: 'maxclust' forms *at most*
    # k clusters, so tied merge heights can yield fewer than k labels.
    cluster_labels = fcluster(linked, k, criterion='maxclust')

    # Calculate Silhouette Score
    silhouette_avg = silhouette_score(X, cluster_labels)
    silhouette_scores.append(silhouette_avg)

    # Calculate Calinski-Harabasz Index
    ch_score = calinski_harabasz_score(X, cluster_labels)
    calinski_harabasz_scores.append(ch_score)

    print(f"For k = {k}: Silhouette Score = {silhouette_avg:.3f}, Calinski-Harabasz Index = {ch_score:.3f}")

# ------------------------------------------------------------------------------
# Step 4: Plot Validation Indices
# ------------------------------------------------------------------------------
# Explicit figure/axes handles instead of the implicit pyplot "current axes"
fig, (ax_silhouette, ax_ch) = plt.subplots(1, 2, figsize=(12, 6))

# Plot Silhouette Scores
ax_silhouette.plot(cluster_range, silhouette_scores, marker='o', label='Silhouette Score')
ax_silhouette.set_title(f'Silhouette Score - {period_label}')
ax_silhouette.set_xlabel('Number of Clusters')
ax_silhouette.set_ylabel('Silhouette Score')
ax_silhouette.grid(True)
ax_silhouette.legend()

# Plot Calinski-Harabasz Scores
ax_ch.plot(cluster_range, calinski_harabasz_scores, marker='o', label='Calinski-Harabasz Index')
ax_ch.set_title(f'Calinski-Harabasz Index - {period_label}')
ax_ch.set_xlabel('Number of Clusters')
ax_ch.set_ylabel('Calinski-Harabasz Index')
ax_ch.grid(True)
ax_ch.legend()

fig.tight_layout()
fig.savefig(figure_path)  # save before show() so the written file is never blank
plt.show()

# ------------------------------------------------------------------------------
# End of Script
# ------------------------------------------------------------------------------
