## Import Libraries

In [1]:
import os
import pickle
import numpy as np
import cv2
import h5py
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupShuffleSplit
import pandas as pd

## Config Paths 

In [2]:
# --- Directory layout: everything is resolved relative to the current working directory ---
ROOT_DIR = os.getcwd()
DATASET_DIR = os.path.join(ROOT_DIR, "dataset")
PROCESSED_DIR = os.path.join(DATASET_DIR, "processed")
FIGURES_DIR = os.path.join(ROOT_DIR, "figures")

# Create the folders up front so later cells can read/write without checking.
for _required_dir in (DATASET_DIR, PROCESSED_DIR, FIGURES_DIR):
    os.makedirs(_required_dir, exist_ok=True)

# --- Input pickle and output HDF5 file ---
PKL_PATH = os.path.join(DATASET_DIR, "LSWMD.pkl")
H5_PATH = os.path.join(PROCESSED_DIR, "wm811k_processed_224.h5")

# --- Preprocessing size and reproducibility ---
IMG_SIZE = 224  # side length (pixels) of the square processed images
SEED = 42
np.random.seed(SEED)  # seeds NumPy's legacy global RNG

# Failure-type class names, in the listing order used for plots and examples
CLASSES = [
    'none', 'Loc', 'Edge-Loc', 'Center', 'Edge-Ring',
    'Scratch', 'Random', 'Donut', 'Near-full',
]

## Load pickle

In [10]:
import pandas as pd

# NOTE(security): unpickling can execute arbitrary code embedded in the file,
# so only load an LSWMD.pkl obtained from a trusted source.
print("Loading LSWMD.pkl...")
data = pd.read_pickle(PKL_PATH)

print(f"Total wafers in raw pickle: {len(data)}")
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" after the report).
data.info()

Loading LSWMD.pkl...
Total wafers in raw pickle: 811457
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 811457 entries, 0 to 811456
Data columns (total 6 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   waferMap        811457 non-null  object 
 1   dieSize         811457 non-null  float64
 2   lotName         811457 non-null  object 
 3   waferIndex      811457 non-null  float64
 4   trianTestLabel  811457 non-null  object 
 5   failureType     811457 non-null  object 
dtypes: float64(2), object(4)
memory usage: 37.1+ MB
None


## Extract failure-type labels and keep only the labeled wafers

In [12]:
import numpy as np

# Element-wise label extraction (used with Series.apply)
def extract_label(ft):
    """Return the failure-type label held in one raw `failureType` entry.

    Parameters
    ----------
    ft : object
        Either a non-empty 2-D array-like whose element ``[0][0]`` is the
        label, or an already-extracted ``str``/``bytes`` label (so that
        applying this function twice gives the same result as applying it once).

    Returns
    -------
    str or float
        The label as a Python ``str`` (bytes are decoded as UTF-8), or
        ``np.nan`` when the entry is empty, blank, or of any other kind.
    """
    if isinstance(ft, (str, bytes)):
        # Already a scalar label, e.g. when the extraction cell is re-run.
        val = ft
    elif hasattr(ft, 'ndim') and ft.ndim == 2 and ft.size > 0:
        val = ft[0][0]
    else:
        return np.nan

    if isinstance(val, bytes):
        # str(b'Loc') would give "b'Loc'", so bytes have to be decoded instead.
        val = val.decode('utf-8', errors='replace')
    else:
        val = str(val)

    # A blank label means "unlabeled".
    return val if val != '' else np.nan

# Replace every raw failureType entry with its extracted label (NaN when unlabeled)
extracted_labels = data['failureType'].apply(extract_label)
data['failureType'] = extracted_labels

# Work on an independent copy that contains only the labeled wafers
has_label = data['failureType'].notna()
df = data[has_label].copy()

# Record each wafer map's original (rows, cols) shape
df['shape'] = df['waferMap'].apply(lambda wafer: wafer.shape)

# Keep the original row number from the pickle as an explicit 'idx' column
df = df.reset_index()
df = df.rename(columns={'index': 'idx'})

print(f"Fully labeled samples: {len(df)}")
print("\nLabel distribution:")
print(df['failureType'].value_counts())

Fully labeled samples: 172950

Label distribution:
failureType
none         147431
Edge-Ring      9680
Edge-Loc       5189
Center         4294
Loc            3593
Scratch        1193
Random          866
Donut           555
Near-full       149
Name: count, dtype: int64


## Label encoding

In [13]:
# NOTE: LabelEncoder keeps its classes sorted, so the integer codes follow
# le.classes_ and not the order in which CLASSES is written. Decode codes with
# le.classes_ / le.inverse_transform rather than by indexing into CLASSES.
le = LabelEncoder().fit(CLASSES)
df['label_int'] = le.transform(df['failureType'])

## Lot-based train/val/test split (80/10/10) - NO LEAKAGE

In [14]:
# Grouping by lotName keeps every wafer of a lot on the same side of the split.
# train_size=0.8 is the share of lots (groups) assigned to training.
lot_groups = df['lotName']
gss = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=SEED)
train_idx, temp_idx = next(gss.split(df, groups=lot_groups))

# Positional row indices -> independent frames for train and the held-out remainder
df_train, df_temp = (df.iloc[rows].copy() for rows in (train_idx, temp_idx))

## Second split on remaining 20% → val/test 10/10

In [16]:
# Halve the held-out lots into validation and test, again grouped by lotName.
# A different seed (SEED+1) is used than for the first split.
gss2 = GroupShuffleSplit(n_splits=1, train_size=0.5, random_state=SEED+1)
val_idx, test_idx = next(gss2.split(df_temp, groups=df_temp['lotName']))

df_val, df_test = (df_temp.iloc[rows].copy() for rows in (val_idx, test_idx))

# Report sample counts and distinct-lot counts per split
named_splits = (("Train", df_train), ("Val", df_val), ("Test", df_test))
print(" | ".join(f"{name}: {len(part)}" for name, part in named_splits))
print("Unique lots - " + ", ".join(f"{name}: {part['lotName'].nunique()}" for name, part in named_splits))

Train: 138503 | Val: 16834 | Test: 17613
Unique lots - Train: 8609, Val: 1076, Test: 1077


## Preprocessing function

In [17]:
def preprocess_wafer(wafer_map):
    """Turn a raw wafer map into a 3-channel uint8 image of side IMG_SIZE.

    Input:  2-D array `wafer_map`; entries equal to 2 are treated as defective dies.
    Output: (IMG_SIZE, IMG_SIZE, 3) uint8 array whose three channels are identical.
    """
    # Binary defect mask: 255 where the die value is 2, 0 everywhere else
    defect_mask = (wafer_map == 2).astype(np.uint8) * 255

    # Bicubic resampling to the target square size (values are no longer strictly 0/255)
    gray = cv2.resize(defect_mask, dsize=(IMG_SIZE, IMG_SIZE), interpolation=cv2.INTER_CUBIC)

    # Replicate the single channel three times along a new last axis
    return np.stack((gray, gray, gray), axis=-1)

## Process and save to HDF5 (uint8 to keep size ~25-30GB total)

In [18]:
def save_split_to_h5(df_split, split_name, h5_file, batch_size=512):
    """Preprocess one split and write it to `h5_file` under the group `split_name`.

    Images are streamed into a pre-allocated HDF5 dataset in batches, so peak
    memory is one batch of images instead of the whole split (the train split
    alone is ~20 GB as uint8).

    Parameters
    ----------
    df_split : pandas.DataFrame
        Rows to save; must contain the columns 'waferMap', 'label_int',
        'lotName' and 'idx'.
    split_name : str
        HDF5 group name, e.g. 'train', 'val' or 'test'.
    h5_file : h5py.File or h5py.Group
        Open, writable HDF5 handle.
    batch_size : int, optional
        Number of wafers preprocessed and written at a time (default 512).

    Datasets written (all gzip-compressed)
    --------------------------------------
    {split_name}/images            (N, IMG_SIZE, IMG_SIZE, 3) uint8
    {split_name}/labels            (N,) uint8
    {split_name}/lotnames          (N,) byte strings
    {split_name}/original_indices  (N,) int32, the 'idx' column

    Raises
    ------
    ValueError
        If `df_split` is empty or `batch_size` is smaller than 1.
    """
    n_rows = len(df_split)
    if n_rows == 0:
        raise ValueError(f"Split '{split_name}' is empty; nothing to save")
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    image_shape = (IMG_SIZE, IMG_SIZE, 3)

    # One image per chunk: each batch write touches only its own chunks, and a
    # single image can later be read without decompressing its neighbours.
    images = h5_file.create_dataset(
        f"{split_name}/images",
        shape=(n_rows,) + image_shape,
        dtype=np.uint8,
        chunks=(1,) + image_shape,
        compression="gzip",
        compression_opts=4,
    )

    wafer_maps = df_split['waferMap']
    for start in range(0, n_rows, batch_size):
        stop = min(start + batch_size, n_rows)
        batch = np.stack([preprocess_wafer(wafer) for wafer in wafer_maps.iloc[start:stop]])
        images[start:stop] = batch

    # The per-row metadata is small, so it is converted and written in one go.
    labels = df_split['label_int'].to_numpy(dtype=np.uint8)
    lotnames = np.array(df_split['lotName'].tolist(), dtype='S')
    original_indices = df_split['idx'].to_numpy(dtype=np.int32)  # to trace back if needed

    h5_file.create_dataset(f"{split_name}/labels", data=labels, compression="gzip")
    h5_file.create_dataset(f"{split_name}/lotnames", data=lotnames, compression="gzip")
    h5_file.create_dataset(f"{split_name}/original_indices", data=original_indices, compression="gzip")

    # uint8 => one byte per element
    uncompressed_bytes = n_rows * IMG_SIZE * IMG_SIZE * 3
    print(f"{split_name.upper()} saved: {images.shape} images, {uncompressed_bytes / 1e9:.1f} GB")

print("Processing and saving to HDF5 (this will take 30-60 minutes)...")
with h5py.File(H5_PATH, 'w') as h5f:
    save_split_to_h5(df_train, 'train', h5f)
    save_split_to_h5(df_val, 'val', h5f)
    save_split_to_h5(df_test, 'test', h5f)

    # The stored integer labels were produced by `le`, which keeps its classes
    # sorted. Write the names in the encoder's own order so that
    # class_names[label] decodes to the right class; the order in which CLASSES
    # is written differs from that.
    encoder_classes = le.classes_
    h5f.create_dataset('class_names', data=np.array(encoder_classes, dtype='S'))
    h5f.attrs['num_classes'] = len(encoder_classes)

print(f"All done! HDF5 saved to: {H5_PATH}")

Processing and saving to HDF5 (this will take 30-60 minutes)...
TRAIN saved: (138503, 224, 224, 3) images, 20.8 GB
VAL saved: (16834, 224, 224, 3) images, 2.5 GB
TEST saved: (17613, 224, 224, 3) images, 2.7 GB
All done! HDF5 saved to: C:\Users\user\Desktop\project 465\dataset\processed\wm811k_processed_224.h5


## EDA Plots (saved to figures/)

In [20]:
# Bar chart of labeled-sample counts per failure type, bars ordered as in CLASSES
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='failureType', order=CLASSES, ax=ax)
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(45)
ax.set_title('Class Distribution (172k labeled samples)')
fig.tight_layout()
fig.savefig(os.path.join(FIGURES_DIR, "class_distribution.png"), dpi=300)
plt.close(fig)  # the figure is only written to disk, not displayed

## One randomly sampled example per class (9 panels, fixed seed)

In [22]:
# 3x3 grid: one seeded random wafer per class, shown after preprocessing
fig, axes = plt.subplots(3, 3, figsize=(12, 12))
axes = axes.ravel()
for panel_idx, cls in enumerate(CLASSES):
    panel = axes[panel_idx]

    # Draw one wafer of this class (reproducible through SEED)
    class_rows = df.loc[df['failureType'] == cls]
    raw_map = class_rows.sample(n=1, random_state=SEED)['waferMap'].iloc[0]

    # All three output channels are identical, so channel 0 is enough for display
    processed = preprocess_wafer(raw_map)[..., 0]

    panel.imshow(processed, cmap='gray')
    panel.set_title(f"{cls} (original shape: {raw_map.shape})")
    panel.axis('off')

fig.suptitle("One Example Per Class (224×224 processed)", fontsize=16)
fig.tight_layout()
fig.savefig(os.path.join(FIGURES_DIR, "examples_per_class.png"), dpi=300)
plt.close(fig)

# Distribution of original wafer-map sizes: the ten most frequent "rows×cols" values
sizes = df['shape'].apply(lambda hw: f"{hw[0]}×{hw[1]}")
size_counts = Counter(sizes)
common_sizes = size_counts.most_common(10)

bar_positions = range(len(common_sizes))
bar_labels = [size_label for size_label, _ in common_sizes]
bar_heights = [count for _, count in common_sizes]

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(bar_positions, bar_heights)
ax.set_xticks(bar_positions)
ax.set_xticklabels(bar_labels, rotation=45)
ax.set_title('Top 10 Most Common Original Wafer Sizes')
ax.set_ylabel('Count')
fig.tight_layout()
fig.savefig(os.path.join(FIGURES_DIR, "wafer_sizes.png"), dpi=300)
plt.close(fig)

# Closing summary: where the figures went and how to read the HDF5 file back
print(f"EDA figures saved to {FIGURES_DIR}/")
print("\n=== DATA PIPELINE COMPLETE ===", "We can now instantly load data with:", sep="\n")

load_snippet = """
with h5py.File('dataset/processed/wm811k_processed_224.h5', 'r') as f:
    X_train = f['train/images'][:]
    y_train = f['train/labels'][:]
"""
print(load_snippet)

EDA figures saved to C:\Users\user\Desktop\project 465\figures/

=== DATA PIPELINE COMPLETE ===
We can now instantly load data with:

with h5py.File('dataset/processed/wm811k_processed_224.h5', 'r') as f:
    X_train = f['train/images'][:]
    y_train = f['train/labels'][:]

