# NEU Steel Surface Defect Clustering
**K-Means Clustering and Principal Component Analysis (PCA) on Image Data**

In [None]:
# Imports
import os, shutil, random
import numpy as np
import cv2
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D

plt.style.use('seaborn-v0_8-darkgrid')

## Step 1: Sampling a Balanced Dataset

In [None]:
def sample_images(src_root, dst_root, n_samples, seed=None):
    """Copy a random subset of images from each class folder into ``dst_root``.

    Every sub-directory of ``src_root`` is treated as one class. Up to
    ``n_samples`` image files (``.png``, ``.jpg``, ``.jpeg``, matched
    case-insensitively) are drawn without replacement from each class and
    copied into a sub-directory of the same name under ``dst_root``.

    Args:
        src_root: Directory that contains one sub-directory per class.
        dst_root: Output directory; created if it does not exist.
        n_samples: Maximum number of images to copy per class. A class with
            fewer images is copied in full.
        seed: Value passed to ``random.seed``. ``None`` gives a
            non-reproducible selection.

    Side effects:
        Reseeds the global ``random`` generator, writes files below
        ``dst_root`` and prints one summary line per class.
    """
    image_exts = ('.png', '.jpg', '.jpeg')
    random.seed(seed)
    os.makedirs(dst_root, exist_ok=True)
    for cls in sorted(os.listdir(src_root)):
        src_cls_dir = os.path.join(src_root, cls)
        if not os.path.isdir(src_cls_dir):
            continue
        dst_cls_dir = os.path.join(dst_root, cls)
        os.makedirs(dst_cls_dir, exist_ok=True)
        # os.listdir returns entries in arbitrary order; sorting makes the
        # selection depend only on the seed, not on the filesystem.
        # Non-files are dropped so copy2 never receives a directory.
        images = sorted(
            name for name in os.listdir(src_cls_dir)
            if name.lower().endswith(image_exts)
            and os.path.isfile(os.path.join(src_cls_dir, name))
        )
        sampled = random.sample(images, min(n_samples, len(images)))
        for img in sampled:
            shutil.copy2(os.path.join(src_cls_dir, img), os.path.join(dst_cls_dir, img))
        print(f"{cls}: Sampled {len(sampled)} images")

## Step 2: Preprocess Images - Grayscale, Resize, Normalize, Flatten

In [None]:
def process_images(image_folder, num_images_per_class, image_size=(64, 64)):
    """Load a random sample of images per class as flattened grayscale vectors.

    Each known class sub-directory of ``image_folder`` is scanned for
    ``.png``/``.jpg``/``.jpeg`` files. Up to ``num_images_per_class`` of them
    are drawn with the global ``random`` generator, read with OpenCV,
    converted to grayscale, resized, divided by 255 and flattened.

    Args:
        image_folder: Directory containing one sub-directory per class.
        num_images_per_class: Maximum number of images to load per class.
        image_size: ``(width, height)`` passed to ``cv2.resize``. The default
            ``(64, 64)`` yields 4096-element vectors.

    Returns:
        ``(X, y)``: ``X`` has one row per loaded image and
        ``width * height`` columns; ``y`` holds the integer class label of
        each row (0-5, per ``class_labels`` below).

    Raises:
        ValueError: If no image could be loaded at all.
    """
    import warnings

    class_labels = {
        'crazing': 0,
        'inclusion': 1,
        'patches': 2,
        'pitted_surface': 3,
        'rolled-in_scale': 4,
        'scratches': 5
    }
    image_exts = ('.png', '.jpg', '.jpeg')
    flattened_images = []
    labels = []
    # Sorted listings keep the sampling and the row order independent of the
    # filesystem's directory ordering.
    for class_folder in sorted(os.listdir(image_folder)):
        class_path = os.path.join(image_folder, class_folder)
        if not os.path.isdir(class_path):
            continue
        label = class_labels.get(class_folder)
        if label is None:
            # Stray directories (e.g. notebook checkpoints) are skipped
            # rather than aborting the whole load with a KeyError.
            warnings.warn(f"Skipping unrecognised class folder: {class_folder!r}")
            continue
        image_files = sorted(
            f for f in os.listdir(class_path) if f.lower().endswith(image_exts)
        )
        sampled_images = random.sample(
            image_files, min(num_images_per_class, len(image_files))
        )
        for image in sampled_images:
            img = cv2.imread(os.path.join(class_path, image))
            if img is None:
                # cv2.imread returns None for unreadable files; skip them.
                continue
            img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            img_resized = cv2.resize(img_gray, image_size)
            # Scale by 255 (maps 8-bit pixel values into [0, 1]) and flatten.
            flattened_images.append((img_resized / 255.0).reshape(-1))
            labels.append(label)
    if not flattened_images:
        # np.vstack([]) would fail with an unhelpful message.
        raise ValueError(f"No readable images found under {image_folder!r}")
    return np.vstack(flattened_images), np.array(labels)

## Step 3: KMeans Clustering on Raw 4096-Dimensional Data

In [None]:
# Fit KMeans on the raw (un-projected) pixel vectors.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4, so leaving it unset gives version-dependent results.
kmeans = KMeans(n_clusters=6, init='k-means++', n_init=10, random_state=42)
kmeans.fit(X_train)
y_hat = kmeans.labels_

# Cluster to class mapping: each cluster is assigned the most frequent true
# label among its members; an empty cluster maps to -1.
cluster_to_label = {}
for c in range(6):
    members = y_train[y_hat == c]
    cluster_to_label[c] = Counter(members).most_common(1)[0][0] if len(members) else -1

# Translate every cluster id into its mapped class label.
y_train_mapped = np.array([cluster_to_label[c] for c in y_hat])
# NOTE: the mapping is derived from y_train itself, so this figure measures
# how well clusters align with classes rather than held-out performance.
print("Train Accuracy:", accuracy_score(y_train, y_train_mapped))

## Step 4: PCA + KMeans (Dimensionality Tuning)

In [None]:
# Centre the features (mean removal only, no variance scaling); the test set
# reuses the mean learned on the training set.
scaler = StandardScaler(with_std=False)
X_train_centered = scaler.fit_transform(X_train)
X_test_centered = scaler.transform(X_test)

component_list = [5, 10, 20, 30, 40, 50, 64]
results = []       # (n_components, train accuracy) pairs
test_results = []  # (n_components, test accuracy) pairs

for l in component_list:
    # random_state is fixed because PCA's default solver selection may use a
    # randomized SVD on large inputs, which would make the sweep irreproducible.
    pca = PCA(n_components=l, random_state=42)
    X_train_l = pca.fit_transform(X_train_centered)
    # n_init is pinned: its default changed from 10 to 'auto' in scikit-learn 1.4.
    kmeans_l = KMeans(n_clusters=6, init='k-means++', n_init=10, random_state=42)
    y_hat_l = kmeans_l.fit_predict(X_train_l)

    # Map each cluster to the majority true label of its training members
    # (-1 for an empty cluster).
    mapping_l = {}
    for c in range(6):
        members = y_train[y_hat_l == c]
        mapping_l[c] = Counter(members).most_common(1)[0][0] if len(members) > 0 else -1

    y_mapped_l = np.array([mapping_l[c] for c in y_hat_l])
    acc_train = accuracy_score(y_train, y_mapped_l)
    results.append((l, acc_train))

    # Evaluate on the test set with the projection, centroids and
    # cluster-to-label mapping learned from the training set only.
    X_test_l = pca.transform(X_test_centered)
    y_hat_test_l_raw = kmeans_l.predict(X_test_l)
    y_hat_test_l = np.array([mapping_l[c] for c in y_hat_test_l_raw])
    acc_test = accuracy_score(y_test, y_hat_test_l)
    test_results.append((l, acc_test))

## Step 5: Plot Test Error vs Number of PCA Components

In [None]:
# test_results holds (n_components, test_accuracy) pairs; split them into two
# parallel tuples and turn each accuracy into an error rate (1 - accuracy).
l_vals, test_accs = zip(*test_results)
test_errors = [1.0 - acc for acc in test_accs]

# One line plot: test error as a function of the number of PCA components.
plt.figure(figsize=(8, 5))
plt.plot(l_vals, test_errors, marker='o')
plt.gca().set(
    xlabel='Number of PCA Components',
    ylabel='Test Classification Error',
    title='PCA Dimensionality vs Test Error',
)
plt.grid(True)
plt.show()

## Step 6: 3D PCA Visualization of Clusters

In [None]:
# Project the centred training data onto its first three principal components.
# random_state is fixed so the projection is reproducible if PCA's default
# solver selection uses a randomized SVD.
pca_3d = PCA(n_components=3, random_state=42)
X_3d = pca_3d.fit_transform(X_train_centered)

fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(111, projection='3d')
# Points are coloured by y_hat, the cluster ids from KMeans on the raw data.
ax.scatter(X_3d[:, 0], X_3d[:, 1], X_3d[:, 2], c=y_hat, cmap='rainbow', s=20)
ax.set_xlabel('PC 1')
ax.set_ylabel('PC 2')
ax.set_zlabel('PC 3')
ax.set_title('3D PCA Scatter Plot Colored by Cluster')
plt.show()

## Step 7: Conclusion
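- KMeans applied directly to the raw 4096-dimensional pixel vectors performs poorly because of the high dimensionality.
- Reducing the data with PCA before clustering improves both clustering performance and computational efficiency.
- Among the tested values (5 to 64 components), the best result is obtained at around 40 PCA components.
- Overall, the PCA + KMeans pipeline generalizes best to the test set.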