# Agglomerative Clustering Analysis of Cybersecurity Threats

This notebook implements Agglomerative Clustering to analyze patterns in cybersecurity threats from 2015 to 2024.

## 1. Data Loading and Initial Exploration

First, let's load and explore our dataset.

In [None]:
# Import required libraries
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
# NOTE(review): this silences every warning for the whole notebook, including
# deprecation notices from the plotting and ML libraries.
warnings.filterwarnings('ignore')

# Set style for better visualizations
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*'
# and 3.8 removed the old names, so pick whichever this installation provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
sns.set_palette("husl")

# Load the data
df = pd.read_csv('../Sheets/Global_Cybersecurity_Threats_2015-2024.csv')

# Display basic information about the dataset
print("Dataset Info:")
print("-" * 40)
# DataFrame.info() writes its report to stdout and returns None; wrapping it in
# print() would emit a stray "None" line after the report.
df.info()
print("\nSample of the data:")
print("-" * 40)
print(df.head())

# Check for missing values
print("\nMissing Values:")
print("-" * 40)
print(df.isnull().sum())

## 2. Data Preprocessing

Let's prepare our data for clustering by encoding categorical variables and scaling numerical features.

In [None]:
# Preprocess the data
# Keep the original attack-type labels in their own column. Encoding always
# reads from that copy, so re-running this cell does not re-fit the encoder on
# already-encoded integers (which would discard the original class names that
# are later saved alongside the model).
if 'Attack_Type_Label' not in df.columns:
    df['Attack_Type_Label'] = df['Attack_Type']

# Create LabelEncoder for categorical columns
# NOTE(review): the integer codes impose an arbitrary order on a nominal
# category, and distance-based clustering will treat that order as meaningful.
# Consider one-hot encoding if this distorts the clusters.
le = LabelEncoder()
df['Attack_Type'] = le.fit_transform(df['Attack_Type_Label'])

# Select features for clustering
feature_columns = ['Attack_Type', 'Financial_Loss', 'User_Impact', 'Year']
X = df[feature_columns].to_numpy()

# Scale the features so each one contributes on a comparable scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Store feature names for later use (a copy, so later edits to one list do not
# silently change the other)
feature_names = list(feature_columns)

## 3. Linkage Analysis

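We'll compare dendrograms for several linkage methods (Ward, complete, average, and single) to see how each one groups the data. Note that the clustering in the following sections uses scikit-learn's default Ward linkage.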

In [None]:
# Compare hierarchical linkage strategies side by side in a 2x2 grid
linkage_methods = ['ward', 'complete', 'average', 'single']
fig, axes = plt.subplots(2, 2, figsize=(20, 15))

# axes.ravel() walks the grid row by row, matching the order of linkage_methods
for ax, method in zip(axes.ravel(), linkage_methods):
    # Linkage matrix for this merge strategy
    Z = linkage(X_scaled, method=method)

    # Draw the tree onto this panel
    dendrogram(Z, ax=ax)
    ax.set_title(f'Dendrogram using {method} linkage')
    ax.set_xlabel('Sample Index')
    ax.set_ylabel('Distance')

fig.tight_layout()
plt.show()

## 4. Determine Optimal Number of Clusters

Let's use the silhouette score to find the optimal number of clusters, testing every value from 2 to 10 with the default Ward linkage.

In [None]:
# Sweep candidate cluster counts and score each resulting partition
n_clusters_range = range(2, 11)
silhouette_scores = []

for n_clusters in n_clusters_range:
    # Fit and label in one step; the fitted estimator itself is not needed here
    cluster_labels = AgglomerativeClustering(n_clusters=n_clusters).fit_predict(X_scaled)
    silhouette_avg = silhouette_score(X_scaled, cluster_labels)
    silhouette_scores.append(silhouette_avg)
    print(f"For n_clusters = {n_clusters}, silhouette score = {silhouette_avg:.3f}")

# Plot silhouette scores
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(n_clusters_range, silhouette_scores, 'bo-')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Silhouette Score')
ax.set_title('Silhouette Score vs Number of Clusters')
ax.grid(True)
plt.show()

# The candidate with the highest silhouette score wins (first one on ties)
best_position = int(np.argmax(silhouette_scores))
optimal_clusters = n_clusters_range[best_position]
print(f"\nOptimal number of clusters: {optimal_clusters}")

## 5. Perform Agglomerative Clustering

Now we'll perform the clustering with the optimal number of clusters.

In [None]:
# Fit the final model using the cluster count chosen by the silhouette sweep
agg_clustering = AgglomerativeClustering(n_clusters=optimal_clusters)
cluster_labels = agg_clustering.fit_predict(X_scaled)

# Attach each row's cluster assignment to the dataframe
df['Cluster'] = cluster_labels

# Report how many rows landed in each cluster, ordered by cluster id
print("Cluster Sizes:")
print("-" * 40)
cluster_sizes = df['Cluster'].value_counts().sort_index()
print(cluster_sizes)

# Cluster centers: per-cluster mean of the scaled features, one row per cluster
cluster_centers = np.array([
    X_scaled[cluster_labels == cluster_id].mean(axis=0)
    for cluster_id in range(optimal_clusters)
])

## 6. Cluster Visualization and Analysis

In [None]:
# Create cluster visualization using PCA

# Project the scaled features onto their first two principal components so the
# clusters can be drawn in 2-D
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Plot clusters
fig, ax = plt.subplots(figsize=(12, 8))
scatter = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=cluster_labels, cmap='viridis')
ax.set_title('Agglomerative Clustering Results (PCA Visualization)')
ax.set_xlabel('First Principal Component')
ax.set_ylabel('Second Principal Component')
# Cluster ids are categories, not a continuous quantity, so show one legend
# entry per cluster instead of a colorbar. num=None forces an entry for every
# unique label rather than a sampled subset.
ax.legend(*scatter.legend_elements(num=None), title='Cluster')
plt.show()

# Calculate and display explained variance ratio
explained_var_ratio = pca.explained_variance_ratio_
print(f"Explained variance ratio: {explained_var_ratio}")
print(f"Total explained variance: {sum(explained_var_ratio):.2%}")

## 7. Cluster Characteristics Analysis

In [None]:
# Summarise each cluster: which statistics to compute for which feature
# NOTE(review): 'Attack_Type' holds label-encoded codes at this point, so its
# mean is a mean of codes rather than a meaningful quantity — confirm intended.
aggregations = {
    'Attack_Type': ['count', 'mean'],
    'Financial_Loss': ['mean', 'std'],
    'User_Impact': ['mean', 'std'],
    'Year': ['mean', 'std']
}
clusters_grouped = df.groupby('Cluster')
cluster_stats = clusters_grouped.agg(aggregations).round(2)

print("Cluster Statistics:")
print("-" * 80)
print(cluster_stats)

# Heatmap of per-cluster feature means, standardised column-wise across
# clusters so features on different scales share one colour range
cluster_means = df.groupby('Cluster')[feature_columns].mean()
cluster_means_scaled = cluster_means.sub(cluster_means.mean()).div(cluster_means.std())

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(cluster_means_scaled, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Cluster Characteristics Heatmap')
fig.tight_layout()
plt.show()

## 8. Temporal Analysis of Clusters

In [None]:
def _plot_stacked_by_year(table, title, ylabel):
    """Draw a stacked bar chart of a Year x Cluster table on its own figure.

    Parameters
    ----------
    table : pandas.DataFrame
        Rows are years, columns are cluster ids; each column is one bar segment.
    title : str
        Title of the chart.
    ylabel : str
        Label of the y axis.
    """
    # Create the axes explicitly and hand them to pandas. Calling
    # DataFrame.plot() without ax= opens a new default-sized figure, which
    # would leave a figure made by plt.figure(figsize=...) empty.
    fig, ax = plt.subplots(figsize=(15, 8))
    table.plot(kind='bar', stacked=True, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Year')
    ax.set_ylabel(ylabel)
    # Anchor the legend outside the axes so it does not cover the bars
    ax.legend(title='Cluster', bbox_to_anchor=(1.05, 1))
    fig.tight_layout()
    plt.show()


# Analyze temporal distribution of clusters: incident counts per year and cluster
temporal_dist = pd.crosstab(df['Year'], df['Cluster'])

# Plot temporal distribution
_plot_stacked_by_year(
    temporal_dist,
    'Distribution of Clusters Over Time',
    'Number of Incidents',
)

# Plot relative proportions: divide each year's row by its total so bars sum to 1
temporal_dist_pct = temporal_dist.div(temporal_dist.sum(axis=1), axis=0)
_plot_stacked_by_year(
    temporal_dist_pct,
    'Relative Proportions of Clusters Over Time',
    'Proportion of Incidents',
)

## 9. Save Results

In [None]:
# Destination file for the persisted results (relative to the notebook's
# working directory)
MODEL_PATH = 'agglomerative_model.joblib'

# Save clustering results: the fitted model plus everything needed to
# interpret its labels (scaled inputs, feature order, scaler, label encoder)
# NOTE(review): AgglomerativeClustering exposes no predict(); the saved model
# describes this dataset's partition and cannot label new samples directly.
clustering_results = {
    'model': agg_clustering,
    'labels': cluster_labels,
    'scaled_data': X_scaled,
    'feature_names': feature_names,
    'optimal_clusters': optimal_clusters,
    'scaler': scaler,
    'label_encoder': le
}

# Save the model and results
joblib.dump(clustering_results, MODEL_PATH)

## 10. Conclusions

The Agglomerative Clustering analysis has revealed distinct patterns in cybersecurity threats:

1. We identified the optimal number of clusters through silhouette analysis
2. Each cluster represents a unique pattern of threat characteristics
3. The temporal analysis shows how these patterns have evolved over time
4. The cluster characteristics provide insights into different types of cybersecurity threats

These insights can be used to:
- Better understand the relationships between different types of cybersecurity threats
- Identify common patterns in attack characteristics
- Track the evolution of threat patterns over time
- Inform security strategies based on cluster characteristics