# K-means Clustering Analysis: Global Cybersecurity Threats

This notebook performs K-means clustering analysis on the Global Cybersecurity Threats dataset (2015-2024) to identify patterns and groupings in cybersecurity incidents.

## 1. Import Required Libraries

First, let's import all the necessary libraries for our analysis.

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import joblib

# Configure plotting
%matplotlib inline
plt.style.use('seaborn')
sns.set_theme(style="whitegrid")

# Set Pandas display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 10)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

## 2. Load and Inspect Data

Let's load our Global Cybersecurity Threats dataset and perform initial data inspection.

In [None]:
# Load the dataset (path is relative to the notebook's working directory)
data_path = '../Sheets/Global_Cybersecurity_Threats_2015-2024.csv'
df = pd.read_csv(data_path)

# Display basic information about the dataset
print("Dataset Shape:", df.shape)
print("\nFeature Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()
print("\nFirst few rows:")
display(df.head())

# Check for missing values (count of nulls per column)
print("\nMissing Values:")
print(df.isnull().sum())

In [None]:
# Exploratory figures: attack-type frequencies, financial-loss spread per
# attack type, and the correlation between the numeric columns.

# Bar chart of how often each attack type occurs
fig, ax = plt.subplots(figsize=(12, 6))
attack_counts = df['Attack Type'].value_counts()
attack_counts.plot(kind='bar', ax=ax)
ax.set_title('Distribution of Attack Types')
ax.set_xlabel('Attack Type')
ax.set_ylabel('Count')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

# Box plot of financial loss, one box per attack type
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x='Attack Type', y='Financial Loss (in Million $)', ax=ax)
ax.set_title('Financial Loss Distribution by Attack Type')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

# Pairwise correlation of the numeric columns, drawn as an annotated heatmap
numerical_cols = [
    'Financial Loss (in Million $)',
    'Number of Affected Users',
    'Incident Resolution Time (in Hours)',
]
correlation = df[numerical_cols].corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Matrix of Numerical Features')
fig.tight_layout()
plt.show()

## 3. Preprocess Features

We need to prepare our data for K-means clustering by:
1. Encoding categorical variables
2. Scaling all selected features (numerical and encoded categorical) with `StandardScaler`
3. Handling any missing values (if present)

In [None]:
# Select features for clustering
features_for_clustering = [
    'Year',
    'Country',
    'Attack Type',
    'Target Industry',
    'Financial Loss (in Million $)',
    'Number of Affected Users',
    'Attack Source',
    'Security Vulnerability Type',
    'Defense Mechanism Used',
    'Incident Resolution Time (in Hours)'
]

# Categorical columns that are converted to integer codes before scaling
categorical_columns = ['Country', 'Attack Type', 'Target Industry',
                      'Attack Source', 'Security Vulnerability Type',
                      'Defense Mechanism Used']

# Create a working copy of the dataframe and handle missing values.
# K-means cannot be fitted on NaN, so rows lacking any clustering feature are
# dropped from the copy; the raw `df` is left untouched. With a complete
# dataset this removes nothing.
df_processed = df.dropna(subset=features_for_clustering).copy()
n_dropped = len(df) - len(df_processed)
print(f"Rows dropped because of missing feature values: {n_dropped}")

# Encode categorical variables, keeping one fitted encoder per column so the
# integer codes can be mapped back to the original labels later.
# NOTE(review): LabelEncoder yields arbitrary integer codes that K-means then
# treats as numeric magnitudes; one-hot encoding may suit these nominal
# columns better -- confirm before relying on the cluster geometry.
label_encoders = {}
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    df_processed[column] = label_encoders[column].fit_transform(df_processed[column])

# Create feature matrix X (rows = incidents, columns = features_for_clustering)
X = df_processed[features_for_clustering].values

# Scale the features to zero mean / unit variance so no column dominates the
# Euclidean distances used by K-means
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("Scaled feature matrix shape:", X_scaled.shape)
print("\nFeature names:", features_for_clustering)

## 4. Determine Optimal Number of Clusters

We'll use the elbow method and silhouette analysis to determine the optimal number of clusters.

In [None]:
# Calculate inertia and silhouette scores for different values of k
k_values = range(2, 11)
inertias = []
silhouette_scores = []

for k in k_values:
    # A loop-local name is used on purpose: re-running this cell must never
    # overwrite the final `kmeans` model that later cells analyse and save.
    # n_init is set explicitly because scikit-learn's default changed from 10
    # to 'auto' in version 1.4; pinning it keeps scores comparable across versions.
    candidate_model = KMeans(n_clusters=k, n_init=10, random_state=42)
    candidate_model.fit(X_scaled)

    # Inertia: within-cluster sum of squared distances (input to the elbow plot)
    inertias.append(candidate_model.inertia_)

    # Silhouette score for this k
    silhouette_scores.append(silhouette_score(X_scaled, candidate_model.labels_))

# Plot elbow curve and silhouette scores side by side
plt.figure(figsize=(12, 5))

# Inertia plot
plt.subplot(1, 2, 1)
plt.plot(k_values, inertias, 'bx-')
plt.xlabel('k')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal k')

# Silhouette score plot
plt.subplot(1, 2, 2)
plt.plot(k_values, silhouette_scores, 'rx-')
plt.xlabel('k')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Analysis')

plt.tight_layout()
plt.show()

# Print the silhouette scores
for k, score in zip(k_values, silhouette_scores):
    print(f"k={k}: Silhouette Score = {score:.3f}")

## 5. Perform K-means Clustering

Based on the elbow method and silhouette analysis, let's perform K-means clustering with the optimal number of clusters.

In [None]:
# Perform k-means clustering with the chosen number of clusters
optimal_k = 4  # Set manually from the elbow / silhouette results; revisit if those change
# n_init is pinned because scikit-learn's default changed from 10 to 'auto' in
# version 1.4; an explicit value keeps the fitted model stable across versions.
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(X_scaled)

# Add cluster labels to the processed dataframe (the raw `df` is not modified)
df_processed['Cluster'] = cluster_labels

# Project the scaled features onto two principal components for visualization
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Create a scatter plot of the clusters in PCA space
plt.figure(figsize=(10, 6))
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=cluster_labels, cmap='viridis')
plt.title('K-means Clustering Results (PCA Visualization)')
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
# Cluster ids are integers, so the colorbar is ticked at the ids only
plt.colorbar(scatter, label='Cluster', ticks=list(range(optimal_k)))
plt.show()

# Report how much of the total variance the 2-D projection retains
explained_variance = pca.explained_variance_ratio_
print(f"Explained variance ratio: {explained_variance}")
print(f"Total variance explained: {sum(explained_variance):.2%}")

In [None]:
# Analyze cluster characteristics


def _most_common_label(frame, column):
    """Return the decoded label of the most frequent encoded value in `column`."""
    top_code = frame[column].mode()[0]
    return label_encoders[column].inverse_transform([top_code])[0]


def _summarise_cluster(cluster_id):
    """Build one summary row (size, numeric means, dominant categories) for a cluster."""
    members = df_processed[df_processed['Cluster'] == cluster_id]
    return {
        'Cluster': cluster_id,
        'Size': len(members),
        # Means of the numerical features
        'Avg Financial Loss': members['Financial Loss (in Million $)'].mean(),
        'Avg Affected Users': members['Number of Affected Users'].mean(),
        'Avg Resolution Time': members['Incident Resolution Time (in Hours)'].mean(),
        # Most frequent categories, mapped back to their original labels
        'Most Common Attack': _most_common_label(members, 'Attack Type'),
        'Most Common Industry': _most_common_label(members, 'Target Industry'),
        'Most Common Source': _most_common_label(members, 'Attack Source'),
    }


# One record per cluster id, in ascending order
cluster_stats = [_summarise_cluster(cluster_id) for cluster_id in range(optimal_k)]

# Create a DataFrame with cluster statistics
cluster_summary = pd.DataFrame(cluster_stats)
display(cluster_summary)

## 6. Save the Model

Let's save the trained K-means model and the preprocessing objects for future use.

In [None]:
# Bundle the fitted model together with every preprocessing object needed to
# reproduce the pipeline (scaler, per-column label encoders, PCA projection,
# and the ordered feature list).
model_data = dict(
    kmeans_model=kmeans,
    scaler=scaler,
    label_encoders=label_encoders,
    pca=pca,
    feature_names=features_for_clustering,
)

# Persist the bundle as a single joblib file in the current working directory
joblib.dump(model_data, 'cybersecurity_kmeans_model.joblib')
print("Model and preprocessing objects saved successfully!")