In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap import UMAP
from sklearn.preprocessing import StandardScaler, RobustScaler

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, silhouette_score

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
#! pip install scikit-learn
#! pip install umap-learn
!pip install seaborn

Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2


In [3]:
data_path= "../data/data_sensors.csv"

In [4]:
df= pd.read_csv(data_path)

In [5]:
df.columns.tolist()

['Sensor 0',
 'Sensor 1',
 'Sensor 2',
 'Sensor 3',
 'Sensor 4',
 'Sensor 5',
 'Sensor 6',
 'Sensor 7',
 'Sensor 8',
 'Sensor 9',
 'Sensor 10',
 'Sensor 11',
 'Sensor 12',
 'Sensor 13',
 'Sensor 14',
 'Sensor 15',
 'Sensor 16',
 'Sensor 17',
 'Sensor 18',
 'Sensor 19',
 'Label']

In [6]:
print("Dataset Shape:", df.shape)
print("\nData Info:")
df.info()
print("\nMissing Values per Column:")
print(df.isnull().sum())

# Separate features and labels

features =[x for x in df.columns.tolist() if x!="Label"]
X = X = df.drop(columns=["Label"])
y = df['Label']

# Analyze the labeled vs. unlabeled data
labeled_mask = y.notna()
unlabeled_mask = y.isna()

num_labeled = labeled_mask.sum()
num_unlabeled = unlabeled_mask.sum()
total_samples = len(df)

print(f"\nTotal samples: {total_samples}")
print(f"Labeled samples: {num_labeled} ({num_labeled/total_samples:.1%})")
print(f"Unlabeled samples: {num_unlabeled} ({num_unlabeled/total_samples:.1%})")

# Determine the number of clusters (k) from the unique labels
unique_labels = y.dropna().unique()
k = len(unique_labels)
print(f"\nNumber of unique breakdown types (clusters): k = {k}")
print("Unique Labels:", sorted(unique_labels))

Dataset Shape: (1600, 21)

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600 entries, 0 to 1599
Data columns (total 21 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Sensor 0   1600 non-null   float64
 1   Sensor 1   1600 non-null   float64
 2   Sensor 2   1600 non-null   float64
 3   Sensor 3   1600 non-null   float64
 4   Sensor 4   1600 non-null   float64
 5   Sensor 5   1600 non-null   float64
 6   Sensor 6   1600 non-null   float64
 7   Sensor 7   1600 non-null   float64
 8   Sensor 8   1600 non-null   float64
 9   Sensor 9   1600 non-null   float64
 10  Sensor 10  1600 non-null   float64
 11  Sensor 11  1600 non-null   float64
 12  Sensor 12  1600 non-null   float64
 13  Sensor 13  1600 non-null   float64
 14  Sensor 14  1600 non-null   float64
 15  Sensor 15  1600 non-null   float64
 16  Sensor 16  1600 non-null   float64
 17  Sensor 17  1600 non-null   float64
 18  Sensor 18  1600 non-null   float64
 19  Sensor 19 

The dataset contains 1600 rows (breakdown events) and 21 columns (20 sensors + 1 label column).   
There are no missing values in the sensor feature columns.
We have a small subset of labeled data: 40 samples are labeled, representing 2.5% of the dataset. The remaining 1560 samples are unlabeled.
The labeled data reveals 4 distinct breakdown categories, which sets our number of clusters, k, to 4.



In [7]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [8]:

def cluster_vis(X, Y, method="PCA"):
    """
    Visualize high-dimensional data with dimensionality reduction and overlay Y values.
    
    Parameters:
    X : DataFrame, shape (n_samples, n_features)
        Input features
    Y : DataFrame or Series, shape (n_samples, 1) or (n_samples,)
        Target values (mostly NaN with some 1,2,3)
    method : str, optional (default="PCA")
        Dimensionality reduction method: "PCA", "TSNE", or "UMAP"
    """
    # Validate inputs
    if method not in ["PCA", "TSNE", "UMAP"]:
        raise ValueError(f"Method '{method}' not implemented. Available: PCA, TSNE, UMAP")
    
    # Perform dimensionality reduction
    if method == "PCA":
        reducer = PCA(n_components=2, random_state=42)
        X_transformed = reducer.fit_transform(X)
    elif method == "TSNE":
        reducer = TSNE(n_components=2, random_state=42, perplexity=min(30, X.shape[0]-1))
        X_transformed = reducer.fit_transform(X)
    elif method == "UMAP":
        reducer = UMAP(n_components=2, random_state=42)
        X_transformed = reducer.fit_transform(X)
    
    # Create DataFrame with transformed data
    df = pd.DataFrame(X_transformed, columns=[f'{method}_1', f'{method}_2'])
    
    y_values = Y.values 
    
    non_nan_mask = ~np.isnan(y_values)
    non_nan_indices = np.where(non_nan_mask)[0]
    unique_values = np.unique(y_values[non_nan_mask])
    
    # Create visualization
    plt.figure(figsize=(12, 8))
    
    # Plot all points in gray (background)
    plt.scatter(df[f'{method}_1'], df[f'{method}_2'], c='black', alpha=0.7, label='All data')
    
    # Overlay points with Y values
    colors = ['red', 'green', 'blue']
    markers = ['o', 's', '^']  # Different markers for each class
    
    for i, val in enumerate(unique_values):
        val_indices = non_nan_indices[y_values[non_nan_indices] == val]
        plt.scatter(
            df.iloc[val_indices][f'{method}_1'], 
            df.iloc[val_indices][f'{method}_2'],
            c=colors[i % len(colors)],
            marker=markers[i % len(markers)],
            s=100,
            label=f'Class {val}',
            edgecolors='black'
        )
    
    # Add plot elements
    plt.title(f'{method} Visualization with Y Values Overlay')
    plt.xlabel(f'{method} Component 1')
    plt.ylabel(f'{method} Component 2')
    
    # Add explained variance for PCA only
    if method == "PCA":
        plt.xlabel(f'{method} Component 1 (Explains {reducer.explained_variance_ratio_[0]*100:.1f}%)')
        plt.ylabel(f'{method} Component 2 (Explains {reducer.explained_variance_ratio_[1]*100:.1f}%)')
    
    plt.legend()
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.savefig("/home/asad/sensor.png")
    plt.show()
    
    # Print summary statistics
    if method == "PCA":
        print("\nPCA Explained Variance Ratio:", reducer.explained_variance_ratio_)
        print("Total Variance Explained by 2 PCs: {:.1f}%".format(sum(reducer.explained_variance_ratio_)*100))
    else:
        print(f"\n{method} does not provide explained variance metrics")
    
    print("\nY Value Distribution:")
    print(pd.Series(y_values[non_nan_mask]).value_counts().sort_index())

In [9]:
cluster_vis(X, Y, method="UMAP")

NameError: name 'Y' is not defined

In [20]:
## Seeded K-Means algorithm. The core of this semi-supervised approach is to use the expert-labeled data to create intelligent initial starting points (centroids)
# for the K-Means algorithm, guiding it toward a more meaningful solution.

In [10]:

def cluster_vis(X, Y, method="PCA"):
    """
    Visualize high-dimensional data with dimensionality reduction and overlay Y values.
    
    Parameters:
    X : DataFrame, shape (n_samples, n_features)
        Input features
    Y : DataFrame or Series, shape (n_samples, 1) or (n_samples,)
        Target values (mostly NaN with some 1,2,3)
    method : str, optional (default="PCA")
        Dimensionality reduction method: "PCA", "TSNE", or "UMAP"
    """
    # Validate inputs
    if method not in ["PCA", "TSNE", "UMAP"]:
        raise ValueError(f"Method '{method}' not implemented. Available: PCA, TSNE, UMAP")
    
    # Perform dimensionality reduction
    if method == "PCA":
        reducer = PCA(n_components=2, random_state=42)
        X_transformed = reducer.fit_transform(X)
    elif method == "TSNE":
        reducer = TSNE(n_components=2, random_state=42, perplexity=min(30, X.shape[0]-1))
        X_transformed = reducer.fit_transform(X)
    elif method == "UMAP":
        reducer = UMAP(n_components=2, random_state=42)
        X_transformed = reducer.fit_transform(X)
    
    # Create DataFrame with transformed data
    df = pd.DataFrame(X_transformed, columns=[f'{method}_1', f'{method}_2'])
    
    y_values = Y.values 
    
    non_nan_mask = ~np.isnan(y_values)
    non_nan_indices = np.where(non_nan_mask)[0]
    unique_values = np.unique(y_values[non_nan_mask])
    
    # Create visualization
    plt.figure(figsize=(12, 8))
    
    # Plot all points in gray (background)
    plt.scatter(df[f'{method}_1'], df[f'{method}_2'], c='black', alpha=0.7, label='All data')
    
    # Overlay points with Y values
    colors = ['red', 'green', 'blue']
    markers = ['o', 's', '^']  # Different markers for each class
    
    for i, val in enumerate(unique_values):
        val_indices = non_nan_indices[y_values[non_nan_indices] == val]
        plt.scatter(
            df.iloc[val_indices][f'{method}_1'], 
            df.iloc[val_indices][f'{method}_2'],
            c=colors[i % len(colors)],
            marker=markers[i % len(markers)],
            s=100,
            label=f'Class {val}',
            edgecolors='black'
        )
    
    # Add plot elements
    plt.title(f'{method} Visualization with Y Values Overlay')
    plt.xlabel(f'{method} Component 1')
    plt.ylabel(f'{method} Component 2')
    
    # Add explained variance for PCA only
    if method == "PCA":
        plt.xlabel(f'{method} Component 1 (Explains {reducer.explained_variance_ratio_[0]*100:.1f}%)')
        plt.ylabel(f'{method} Component 2 (Explains {reducer.explained_variance_ratio_[1]*100:.1f}%)')
    
    plt.legend()
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.savefig("/home/asad/sensor.png")
    plt.show()
    
    if method == "PCA":
        print("\nPCA Explained Variance Ratio:", reducer.explained_variance_ratio_)
        print("Total Variance Explained by 2 PCs: {:.1f}%".format(sum(reducer.explained_variance_ratio_)*100))
    else:
        print(f"\n{method} does not provide explained variance metrics")

In [14]:
# Combine scaled features and original labels for centroid calculation
df_scaled = pd.DataFrame(X_scaled, columns=features)
df_scaled['Label'] = y

# Isolate the labeled data
labeled_data_scaled = df_scaled[labeled_mask]

# Calculate initial centroids from the labeled data
initial_centroids = labeled_data_scaled.groupby('Label')[features].mean().values
print("Shape of initial centroids:", initial_centroids.shape)

# Configure and train the K-Means model
kmeans = KMeans(
    n_clusters=k,
    init=initial_centroids,
    n_init=1,  # We provide the seeds, so only one initialization is needed
    random_state=42
)

# Fit the model on the entire scaled dataset
kmeans.fit(X_scaled)

# Get the predicted cluster labels for all data points
predicted_labels = kmeans.labels_

Shape of initial centroids: (3, 20)


In [15]:
# Extract true and predicted labels for the labeled subset
true_labels_subset = labeled_data_scaled['Label'].astype(int)
predicted_labels_subset = predicted_labels[labeled_mask]

# Calculate supervised metrics
ari_score = adjusted_rand_score(true_labels_subset, predicted_labels_subset)
nmi_score = normalized_mutual_info_score(true_labels_subset, predicted_labels_subset)

# Calculate unsupervised metric on the full dataset
silhouette = silhouette_score(X_scaled, predicted_labels)

print(f"Adjusted Rand Index (ARI) on labeled data: {ari_score:.4f}")
print(f"Normalized Mutual Information (NMI) on labeled data: {nmi_score:.4f}")
print(f"Silhouette Score on all data: {silhouette:.4f}")

Adjusted Rand Index (ARI) on labeled data: 0.0978
Normalized Mutual Information (NMI) on labeled data: 0.1865
Silhouette Score on all data: 0.0363


In [None]:
 #results strongly suggest that the underlying clusters in the sensor data are not spherical. They are likely elongated, serpentine, or have other complex, non-convex shapes.