# Cluster Fix Notebook
## Step 1: Setup & Configuration

In [1]:
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.cluster import MiniBatchKMeans

# Configuration
# Tunables first: the cluster count of the original run and the number of
# sub-clusters to try when splitting the oversized cluster.
N_SUBCLUSTERS = 20  # Start with 20, adjust based on validation
MAX_CLUSTERS = 100  # Original number of clusters

# Paths: the project root is taken to be the parent of the current working
# directory, with the processed artefacts in a fixed sub-folder below it.
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
PROCESSED_DIR = os.path.join(BASE_DIR, "ProcessedImages")

print("Working directory: {}".format(PROCESSED_DIR))
# Stop here if the notebook was started from an unexpected location.
assert os.path.exists(PROCESSED_DIR), "Processed directory not found!"

Working directory: /Users/aayus/Desktop/workstuff/Projects/Vestis/ProcessedImages


## Step 2: Load Data & Identify Problem Cluster

In [2]:
# Read the feature matrix and the cluster assignment of every item from disk
features = np.load(os.path.join(PROCESSED_DIR, "all_features_reduced.npy"))
labels = np.load(os.path.join(PROCESSED_DIR, "cluster_labels.npy"))

# The most populated cluster is the one that gets split further.
# Labels are flattened before counting / masking so they are handled as 1-D.
cluster_counts = np.bincount(labels.flatten())
problem_cluster_id = cluster_counts.argmax()
problem_mask = np.equal(labels.flatten(), problem_cluster_id)
problem_features = features[problem_mask]

# Report the five largest cluster sizes and the chosen target
print("Original cluster sizes:\n{}".format(np.sort(cluster_counts)[-5:]))
print("Target cluster {} has {} items".format(problem_cluster_id, problem_features.shape[0]))

Original cluster sizes:
[302 344 388 451 509]
Target cluster 21 has 509 items


## Step 3: Sub-Clustering

In [3]:
def recluster_features(features, n_clusters):
    """Split a set of feature vectors into sub-clusters with MiniBatchKMeans.

    Args:
        features: 2-D array-like with one row per item to cluster.
        n_clusters: Requested number of sub-clusters; must be >= 1. If there
            are fewer rows than requested clusters, the count is reduced to
            ``max(1, n_rows // 5)`` and a warning is printed.

    Returns:
        1-D integer array holding one sub-cluster id per row of ``features``.
        An empty ``features`` yields an empty integer array.

    Raises:
        ValueError: If ``n_clusters`` is smaller than 1.
    """
    if n_clusters < 1:
        raise ValueError(f"n_clusters must be >= 1, got {n_clusters}")

    n_items = len(features)
    if n_items == 0:
        # Nothing to cluster: the estimator rejects an empty matrix, so hand
        # back an empty label vector instead of failing deep inside fit().
        return np.empty(0, dtype=np.int32)

    if n_items < n_clusters:
        # Too few samples for the requested split; aim for ~5 items per cluster.
        n_clusters = max(1, n_items // 5)
        print(f"Warning: Reducing subclusters to {n_clusters} due to small sample size")

    kmeans = MiniBatchKMeans(
        n_clusters=n_clusters,
        init="k-means++",
        batch_size=256,
        random_state=42,  # fixed seed so repeated runs give the same split
        n_init=3
    )
    return kmeans.fit_predict(features)

# Split the oversized cluster and report how many distinct sub-clusters came out
sub_labels = recluster_features(problem_features, N_SUBCLUSTERS)
print("Created {} sub-clusters".format(np.unique(sub_labels).size))

Created 20 sub-clusters


## Step 4: Visualization & Validation

In [18]:
def display_samples(features, labels, n_samples=5, *, similarity_fn=None,
                    image_loader=None, style_threshold=0.5, show=True,
                    random_state=None):
    """Sample items from each sub-cluster, score their similarity and plot them.

    For every distinct label, ``n_samples`` items are drawn without
    replacement. Each sampled item is scored against the *other* sampled items
    of the same sub-cluster; scores below ``style_threshold`` count as
    outliers. Sub-clusters with fewer than ``n_samples`` items are reported
    and skipped.

    Args:
        features: Indexable collection with one entry per item (row ``i``
            belongs to ``labels[i]``).
        labels: Array-like of sub-cluster ids; flattened before use.
        n_samples: Number of items to sample per sub-cluster (>= 1).
        similarity_fn: Callable ``(item_features, other_features_list) ->
            score``. Defaults to the module-level
            ``calculate_style_similarity``, which must already be defined when
            this function is called.
        image_loader: Callable ``(item_features) -> image`` accepted by
            ``imshow``. Defaults to the module-level
            ``load_image_from_features``; only needed when ``show`` is true.
        style_threshold: Scores strictly below this value are outliers.
        show: When false, skip all plotting and only run the validation.
        random_state: Optional seed for reproducible sampling. ``None`` keeps
            using NumPy's global random state.

    Returns:
        dict with the keys ``"cluster_quality"`` and ``"recommend_action"``
        (placeholders to be filled in by the reviewer) and
        ``"outliers_in_clusters"``: the labels of the sub-clusters in which at
        least one sampled item scored below the threshold.

    Raises:
        ValueError: If ``n_samples`` < 1 or ``features`` and ``labels``
            disagree in length.
    """
    if n_samples < 1:
        raise ValueError(f"n_samples must be >= 1, got {n_samples}")

    # Flatten so that (N, 1) label arrays select rows like 1-D ones do.
    labels = np.asarray(labels).ravel()
    if len(features) != labels.size:
        raise ValueError(
            f"features has {len(features)} rows but labels has {labels.size} entries"
        )

    # Fall back to the module-level helpers only when nothing was injected,
    # so the names are looked up lazily and can be swapped out by the caller.
    if similarity_fn is None:
        similarity_fn = calculate_style_similarity
    if show and image_loader is None:
        image_loader = load_image_from_features

    # Global RNG by default (honours np.random.seed); private seeded RNG otherwise.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    unique_labels = np.unique(labels)
    display_index = 1  # ordinal of the next sub-cluster that gets displayed
    total_outliers = 0
    clusters_with_outliers = []

    for label in unique_labels:
        print(f"\n--- Sub-cluster {label} ({display_index}) ---")

        # Collect all indices for this label
        indices = np.where(labels == label)[0]

        if len(indices) < n_samples:
            print(f"Only {len(indices)} items, skipping display (requires at least {n_samples})")
            continue

        # Get sample indices without replacement
        samples = rng.choice(indices, n_samples, replace=False)

        # Score every sampled item against the remaining samples. np.delete
        # drops position i; adding two array slices would sum them instead.
        style_scores = []
        for i, idx in enumerate(samples):
            others = [features[j] for j in np.delete(samples, i)]
            style_scores.append(similarity_fn(features[idx], others))

        outlier_count = sum(1 for score in style_scores if score < style_threshold)
        total_outliers += outlier_count
        if outlier_count:
            clusters_with_outliers.append(label)

        print("Cluster quality check:")
        print(f"Outlier count: {outlier_count} / {n_samples}")
        print(f"Style similarity scores: {style_scores}")
        print()

        if show:
            # squeeze=False keeps `axes` 2-D even when n_samples == 1.
            fig, axes = plt.subplots(1, n_samples, figsize=(15, 3), squeeze=False)
            # Pair each image with its own score rather than the last one computed.
            for ax, idx, score in zip(axes[0], samples, style_scores):
                ax.imshow(image_loader(features[idx]))
                verdict = "Outlier" if score < style_threshold else "Normal"
                ax.set_title(f"{verdict}: Style similarity {score:.2f}")
                ax.axis('off')
            # Render now so the figure appears right below this cluster's text.
            plt.show()

        display_index += 1

    print("\nStyle Similarity Validation Checklist:")
    print("1. Do items in the same sub-cluster share similar styles?")
    print(f"2. Outliers detected: {total_outliers}")
    print("3. Are there sub-clusters that should be merged based on style similarity?")

    # Manual validation record: the quality flag and the recommended action
    # are placeholders for the reviewer; only the outlier list is computed.
    manual_validation = {
        "cluster_quality": True,
        "outliers_in_clusters": clusters_with_outliers,
        "recommend_action": ""
    }

    return manual_validation

## Step 5: Save Results

In [17]:
# Create new labels array (the loaded `labels` stay untouched)
new_labels = labels.copy()

# Sub-cluster ids are offset by MAX_CLUSTERS. That only keeps them distinct
# from the original ids if every original id is below MAX_CLUSTERS, so fail
# loudly instead of silently merging unrelated clusters.
if labels.max() >= MAX_CLUSTERS:
    raise ValueError(
        f"Existing label {labels.max()} is >= MAX_CLUSTERS ({MAX_CLUSTERS}); "
        "offset sub-cluster ids would collide with original cluster ids"
    )

# Widen the dtype when the largest new id would not fit (e.g. uint8 labels),
# otherwise the assignment below would wrap around.
largest_new_id = MAX_CLUSTERS + int(np.max(sub_labels))
if (np.issubdtype(new_labels.dtype, np.integer)
        and largest_new_id > np.iinfo(new_labels.dtype).max):
    new_labels = new_labels.astype(np.int64)

# problem_mask was built from labels.flatten(), so apply it to a flat view of
# the copy; this also works when the label array is not 1-D.
new_labels.reshape(-1)[problem_mask] = MAX_CLUSTERS + sub_labels

# Save with versioning: take the first version number that is not on disk yet
version = 1
save_path = os.path.join(PROCESSED_DIR, f"cluster_labels_v{version}.npy")
while os.path.exists(save_path):
    version += 1
    save_path = os.path.join(PROCESSED_DIR, f"cluster_labels_v{version}.npy")
np.save(save_path, new_labels)
print(f"Saved new labels to {save_path}")

# Verification
new_cluster_counts = np.bincount(new_labels.flatten())
# Every item of the split cluster must have moved to an id >= MAX_CLUSTERS.
assert new_cluster_counts[problem_cluster_id] == 0, "Problem cluster was not fully reassigned!"
print("\nPost-fix cluster sizes:")
print(f"Largest original cluster: {np.max(cluster_counts)}")
print(f"Largest new sub-cluster: {np.max(new_cluster_counts[MAX_CLUSTERS:])}")

Saved new labels to /Users/aayus/Desktop/workstuff/Projects/Vestis/ProcessedImages/cluster_labels_v1.npy

Post-fix cluster sizes:
Largest original cluster: 509
Largest new sub-cluster: 49


## Next Steps
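1. Update your recommendation code to load the newest `cluster_labels_vX.npy` file written in Step 5 (for example `cluster_labels_v1.npy`).
2. Run through sample recommendations to verify that the results actually improve.
3. If the sub-clusters are still too mixed or too fragmented, change `N_SUBCLUSTERS` in Step 1 and re-run Steps 3–5.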