In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
from PIL import Image

class WaferMapDataset(Dataset):
    """PyTorch ``Dataset`` built from a pickled pandas DataFrame of wafer maps.

    The DataFrame must provide the columns ``waferMap``, ``failureType`` and
    ``trainTestLabel``.  Construction performs, in order:

    1. ``[0, 0]`` placeholders in ``failureType`` / ``trainTestLabel`` are
       replaced by the string ``'Unknown'``.
    2. Rows whose ``failureType`` is ``'none'`` or ``'Unknown'`` are dropped.
    3. Rows are separated into train (``trainTestLabel == 'Training'``) and
       test (``trainTestLabel == 'Test'``).
    4. A ``LabelEncoder`` is fitted on the training labels.  When
       ``task_classes`` is given, only rows whose encoded label is in
       ``task_classes`` are kept in both splits and a fresh encoder is fitted
       on the remaining training labels (so labels are renumbered).
    5. The wafer maps of the *selected* split are resized to ``target_dim``
       and flattened into one float32 matrix.
    6. If requested, the training split is randomly oversampled.

    Args:
        file_path: Path of the pickle file passed to ``pd.read_pickle``.
        split: ``'train'`` (case-insensitive) selects the training split; any
            other value selects the test split.
        oversample: If true and the training split is selected, balance the
            classes with ``RandomOverSampler(random_state=42)``.
        target_dim: ``(height, width)`` of the resized wafer maps.
        task_classes: Optional collection of encoded class indices (with
            respect to the encoder fitted on the full training split) to keep.

    Attributes:
        encoder: The ``LabelEncoder`` used to produce ``y``.
        X: float32 array of shape ``(num_samples, height * width)``.
        y: Encoded labels aligned with ``X``.
        num_samples: Number of rows in ``X``.

    Raises:
        KeyError: If a required column is missing.
        ValueError: If there are no training rows at all, or the selected
            split is empty after filtering.
    """

    def __init__(self,
                 file_path,
                 split='train',       # 'train' or 'test'
                 oversample=False,
                 target_dim=(224, 224),
                 task_classes=None
                ):
        self.file_path = file_path
        self.split = split.lower()
        self.oversample = oversample
        self.target_dim = target_dim
        self.task_classes = task_classes

        # 1) Load DataFrame.
        # NOTE(security): unpickling can execute arbitrary code; only load
        # files from a trusted source.
        df = pd.read_pickle(self.file_path)

        # Fail early, and uniformly, on any missing column.
        for column in ('failureType', 'trainTestLabel', 'waferMap'):
            if column not in df.columns:
                raise KeyError(f"Missing '{column}' column.")

        # 2) Replace [0,0] placeholders with 'Unknown'.
        df['failureType'] = df['failureType'].apply(self._replace_zero_zero)
        df['trainTestLabel'] = df['trainTestLabel'].apply(self._replace_zero_zero)

        # 3) Filter out 'none' or 'Unknown' failure types.
        labelled = ~df['failureType'].isin(['none', 'Unknown'])
        df = df[labelled].reset_index(drop=True)

        # 4) Separate train & test.
        df_train = df[df['trainTestLabel'] == 'Training'].reset_index(drop=True)
        df_test = df[df['trainTestLabel'] == 'Test'].reset_index(drop=True)
        if len(df_train) == 0:
            raise ValueError("No training samples with 'trainTestLabel' == 'Training'!")

        # 5) Fit LabelEncoder on the training set.
        self.encoder = LabelEncoder().fit(df_train['failureType'].values)

        # 6) If task_classes is provided, filter both splits on the encoded
        #    label and refit the encoder on what is left of the training set.
        if self.task_classes is not None:
            df_train = df_train.assign(
                encoded=self.encoder.transform(df_train['failureType'].values))
            df_test = df_test.assign(
                encoded=self.encoder.transform(df_test['failureType'].values))
            df_train = df_train[df_train['encoded'].isin(self.task_classes)].reset_index(drop=True)
            df_test = df_test[df_test['encoded'].isin(self.task_classes)].reset_index(drop=True)
            self.encoder = LabelEncoder().fit(df_train['failureType'].values)

        # 7) Choose the final split.
        use_train = self.split == 'train'
        df_split = df_train if use_train else df_test
        if len(df_split) == 0:
            raise ValueError(
                f"No samples left in the '{self.split}' split after filtering.")

        # 8) Resize + flatten only the rows that are actually used, writing
        #    straight into one float32 matrix instead of keeping resized and
        #    flattened copies of every row in DataFrame columns.
        height, width = self.target_dim[0], self.target_dim[1]
        X_data = np.empty((len(df_split), height * width), dtype='float32')
        for row, wmap in enumerate(df_split['waferMap']):
            X_data[row] = self._resize_wafer_map(wmap).ravel()
        y_data = df_split['failureType'].values

        # 9) Oversample if in training split and oversample is True.
        if self.oversample and use_train:
            ros = RandomOverSampler(random_state=42)
            X_data, y_data = ros.fit_resample(X_data, y_data)
            X_data = np.asarray(X_data, dtype='float32')

        # 10) Encode labels.
        self.X = X_data
        self.y = self.encoder.transform(y_data)
        self.num_samples = len(self.X)

    @staticmethod
    def _replace_zero_zero(x):
        """Return 'Unknown' for a list/array equal to ``[0, 0]``; otherwise return ``x``."""
        if isinstance(x, (list, np.ndarray)) and np.array_equal(x, [0, 0]):
            return 'Unknown'
        return x

    def _resize_wafer_map(self, wmap):
        """Resize one wafer map to ``target_dim`` and return it as an array.

        The returned array has shape ``(target_dim[0], target_dim[1])`` for a
        2-D input, which is the layout ``__getitem__`` assumes.
        """
        img = Image.fromarray(np.asarray(wmap).astype('uint8'))
        # PIL's resize() takes (width, height) while target_dim is
        # (height, width); swap so non-square targets are not scrambled.
        pil_size = (self.target_dim[1], self.target_dim[0])
        return np.asarray(img.resize(pil_size, Image.Resampling.LANCZOS))

    def __len__(self):
        """Return the number of samples in the selected split."""
        return self.num_samples

    def __getitem__(self, idx):
        """Return ``(image, label)`` for sample ``idx``.

        ``image`` is a float tensor of shape ``(3, height, width)`` made by
        repeating the single-channel wafer map three times; ``label`` is a
        0-dim ``torch.long`` tensor.

        Raises:
            ValueError: If the stored row does not hold ``height * width`` values.
        """
        height, width = self.target_dim[0], self.target_dim[1]
        expected = height * width
        wafer_map_flat = self.X[idx]
        if wafer_map_flat.size != expected:
            raise ValueError(f"Sample {idx} has size {wafer_map_flat.size}, expected {expected}. Check your data!")
        wafer_map_tensor = torch.from_numpy(wafer_map_flat).float()
        wafer_map_tensor = wafer_map_tensor.view(1, height, width).repeat(3, 1, 1)
        label_tensor = torch.tensor(self.y[idx], dtype=torch.long)
        return wafer_map_tensor, label_tensor



In [4]:
# Smoke test: build the training split for one task, inspect a few samples,
# then pull a single batch through a DataLoader.

# NOTE(review): this import rebinds WaferMapDataset to the version in
# Wafer_data_dataset_resize.py, shadowing any class of the same name defined
# earlier in the notebook -- confirm that the module version is the intended one.
from Wafer_data_dataset_resize import WaferMapDataset
from torch.utils.data import DataLoader

# Encoded class indices that make up the task under test (here classes 2 and 3).
# Adjust task_classes if needed.
task_classes = [2, 3]
try:
    dataset = WaferMapDataset(
        file_path="D:/Waffer Data/WM811K.pkl",
        split='train',
        oversample=False,
        target_dim=(224, 224),
        task_classes=task_classes
    )
    print("Dataset loaded successfully!")
    print("Number of samples:", len(dataset))

    # Check a few individual samples
    for i in range(min(5, len(dataset))):
        sample, label = dataset[i]
        print(f"Sample {i}: shape = {sample.shape}, label = {label}")
        assert tuple(sample.shape) == (3, 224, 224), "unexpected sample shape"

    # Create a DataLoader and inspect a batch
    dataloader = DataLoader(dataset, batch_size=4, shuffle=True)
    batch = next(iter(dataloader))
    data, target = batch
    print(f"\nBatch data shape: {data.shape}")   # Expecting (batch_size, 3, 224, 224)
    print(f"Batch target shape: {target.shape}")   # Expecting (batch_size,)
    assert tuple(data.shape[1:]) == (3, 224, 224), "unexpected batch data shape"
    assert tuple(target.shape) == (data.shape[0],), "unexpected batch target shape"
except Exception as e:
    print("Error while testing dataset:", e)
    # Re-raise so a failing smoke test is not mistaken for a passing cell and
    # Restart & Run All stops here with the full traceback.
    raise


Dataset loaded successfully!
Number of samples: 10971
Sample 0: shape = torch.Size([3, 224, 224]), label = 0
Sample 1: shape = torch.Size([3, 224, 224]), label = 0
Sample 2: shape = torch.Size([3, 224, 224]), label = 0
Sample 3: shape = torch.Size([3, 224, 224]), label = 0
Sample 4: shape = torch.Size([3, 224, 224]), label = 0

Batch data shape: torch.Size([4, 3, 224, 224])
Batch target shape: torch.Size([4])


In [8]:
from collections import Counter
from Wafer_data_dataset_resize import WaferMapDataset

# Encoded class indices for the task being inspected; pass None instead to
# load every class.
task_classes = [2, 3]


def load_wafer_split(split, classes):
    """Build a WaferMapDataset for ``split`` restricted to ``classes``.

    Uses the same file, target size and oversample=False for every split so
    the train and test datasets below are constructed identically.
    """
    return WaferMapDataset(
        file_path="D:/Waffer Data/WM811K.pkl",
        split=split,
        oversample=False,
        target_dim=(224, 224),
        task_classes=classes
    )


def report_label_counts(header, labels):
    """Print ``header`` followed by one line per label; return the Counter."""
    counts = Counter(labels)
    print(header)
    for label, count in counts.items():
        print(f"Label {label}: {count} samples")
    return counts


# Training split, restricted to task_classes.
train_dataset = load_wafer_split("train", task_classes)
train_labels = train_dataset.y
train_counts = report_label_counts("Training set label distribution:", train_labels)

# Test split, restricted to the same task_classes.
test_dataset = load_wafer_split("test", task_classes)
test_labels = test_dataset.y
test_counts = report_label_counts("\nTest set label distribution:", test_labels)


Training set label distribution:
Label 0: 2417 samples
Label 1: 8554 samples

Test set label distribution:
Label 0: 2772 samples
Label 1: 1126 samples
