In [None]:
import glob
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tifffile
from PIL import Image
from tqdm import tqdm


#### Flexible Load

In [None]:
################ Load Data
MODE = "folder"  # "folder" or "csv"
DATA_DIR = "data/train"
CSV_PATH = None  # only needed when MODE == "csv"

# Lower-case file extensions (without the dot) accepted as images in folder mode.
# NOTE(review): "tif" is not listed, so *.tif files are skipped — confirm this is intended.
IMG_EXT = ['jpg', 'jpeg', 'png', 'bmp', 'tiff']


def load_dataset(mode="folder", data_dir=None, csv_path=None, random_state=None):
    """Build a shuffled DataFrame with one row per image.

    Args:
        mode: "folder" to scan ``data_dir`` (one sub-directory per class, the
            sub-directory name becomes the label), or "csv" to read ``csv_path``.
        data_dir: root directory to scan; required when ``mode == "folder"``.
        csv_path: CSV file that contains at least the columns 'image_path' and
            'label'; required when ``mode == "csv"``.
        random_state: forwarded to ``DataFrame.sample``. ``None`` (the default)
            gives a different shuffle on every call; pass an int to make the
            row order reproducible.

    Returns:
        DataFrame with columns 'image_path' and 'label' (plus any extra CSV
        columns), rows shuffled and the index reset to 0..n-1.

    Raises:
        ValueError: if ``mode`` is unknown or the path argument required by the
            chosen mode is missing.
        AssertionError: if the CSV lacks the 'image_path' or 'label' column.
    """
    if mode == "folder":
        if data_dir is None:
            # os.listdir(None) would silently scan the current working directory.
            raise ValueError("data_dir is required when mode='folder'")
        image_paths = []
        labels = []
        # Sorted listings make the pre-shuffle order independent of the file system.
        for class_name in sorted(os.listdir(data_dir)):
            class_dir = os.path.join(data_dir, class_name)
            if not os.path.isdir(class_dir):
                continue
            # Escape the directory part so names containing [, ] , * or ? are taken literally.
            pattern = os.path.join(glob.escape(class_dir), "*")
            for f in sorted(glob.glob(pattern)):
                ext = os.path.splitext(f)[1].lstrip(".").lower()
                # isfile() rejects sub-directories whose names merely look like images.
                if ext in IMG_EXT and os.path.isfile(f):
                    image_paths.append(f)
                    labels.append(class_name)
        df = pd.DataFrame({"image_path": image_paths, "label": labels})
    elif mode == "csv":
        if csv_path is None:
            raise ValueError("csv_path is required when mode='csv'")
        df = pd.read_csv(csv_path)
        assert all(c in df.columns for c in ["image_path", "label"]), "CSV must contain 'image_path' and 'label'"
    else:
        raise ValueError("mode must be 'folder' or 'csv'")

    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    print(f"✅ Loaded {len(df)} images across {df['label'].nunique()} classes.")
    return df

In [None]:
########## Tabulate Data
# Walk data_dir (one sub-directory per class) and collect one row per image file.

data_dir = "intel_dataset"

# Suffixes include the dot so that a name such as "notajpg" is not mistaken for an image.
image_suffixes = ('.jpg', '.jpeg', '.png')

rows = []
# Sorted listings keep the row order independent of the file system.
for label in sorted(os.listdir(data_dir)):
    class_path = os.path.join(data_dir, label)
    if not os.path.isdir(class_path):
        continue
    for img_name in sorted(os.listdir(class_path)):
        if img_name.lower().endswith(image_suffixes):
            rows.append({
                'image_path': os.path.join(class_path, img_name),
                'label': label
            })

# Naming the columns explicitly keeps df['label'] valid even when no image was found.
df = pd.DataFrame(rows, columns=['image_path', 'label'])
print(df.head())
print("Total images:", len(df))
print(df['label'].value_counts())


#### Data Understanding




In [None]:
############# Show Sample

def show_samples(df, n_per_class=3, random_state=None):
    """Plot one row of randomly sampled images for every distinct label in ``df``.

    Args:
        df: DataFrame with an 'image_path' column (files that PIL can open)
            and a 'label' column.
        n_per_class: maximum number of images drawn per label; it is also the
            number of subplot columns in each figure.
        random_state: forwarded to ``DataFrame.sample``; ``None`` (the default)
            draws a different sample on every call.

    Returns:
        None. One figure per label is shown with ``plt.show()``.
    """
    for cls in df['label'].unique():
        class_rows = df[df['label'] == cls]
        subset = class_rows.sample(min(n_per_class, len(class_rows)), random_state=random_state)
        plt.figure(figsize=(n_per_class * 2, 2))
        for i, (_, row) in enumerate(subset.iterrows()):
            plt.subplot(1, n_per_class, i + 1)
            # imshow() copies the pixel data while the file is still open,
            # so the handle can be released right after drawing.
            with Image.open(row['image_path']) as img:
                plt.imshow(img)
            plt.axis('off')
            plt.title(cls)
        plt.tight_layout()
        plt.show()

# Preview a few random images per class from the tabulated dataset.
show_samples(df)

In [None]:
############# Analyze Image Sizes
# Measure every image listed in df and attach width / height / aspect ratio columns.

widths, heights, ratios = [], [], []
failed = []  # (path, exception) for every image that could not be measured
for path in tqdm(df['image_path'], desc="Analyzing image sizes"):
    try:
        with Image.open(path) as img:
            w, h = img.size
        ratio = w / h
    except Exception as exc:
        # Best effort: keep scanning, but remember the failure instead of hiding it.
        # Catching Exception (not a bare except) lets Ctrl-C still interrupt the loop.
        w = h = ratio = None
        failed.append((path, exc))
    # Append exactly once per image so the three lists always stay aligned with df.
    widths.append(w)
    heights.append(h)
    ratios.append(ratio)

if failed:
    print(f"Could not measure {len(failed)} of {len(df)} images; first error: {failed[0][1]!r}")

df['width'] = widths
df['height'] = heights
df['aspect_ratio'] = ratios

print(df[['width','height','aspect_ratio']].describe())

# Plot image size distribution
fig, axes = plt.subplots(1, 2, figsize=(12,4))
sns.histplot(df['width'], ax=axes[0], kde=True, bins=30)
sns.histplot(df['height'], ax=axes[1], kde=True, bins=30)
axes[0].set_title("Image Width Distribution")
axes[1].set_title("Image Height Distribution")
plt.show()

# Plot aspect ratio distribution on its own figure
fig, ax = plt.subplots()
sns.histplot(df['aspect_ratio'], ax=ax, kde=True, bins=30)
ax.set_title("Aspect Ratio Distribution (W/H)")
plt.show()

#### Tiff File

In [None]:
# Optional setup (run once, outside this cell):
#   pip install --force-reinstall "tifffile==2024.8.30"
#   pip install --force-reinstall "imagecodecs"
def _read_image_array(img_path):
    """Return the image at ``img_path`` as an array: tifffile first, PIL (forced RGB) as fallback."""
    try:
        return tifffile.imread(img_path)
    except Exception:
        # tifffile could not decode the file; let PIL try and close the handle afterwards.
        with Image.open(img_path) as img:
            return np.array(img.convert("RGB"))


def _to_display_rgb(img_array):
    """Expand 2-D input to 3 channels, keep at most 3 channels, and scale by the array maximum."""
    if img_array.ndim == 2:
        img_array = np.stack([img_array]*3, axis=-1)
    elif img_array.ndim == 3 and img_array.shape[2] > 3:
        img_array = img_array[:, :, :3]
    # NOTE(review): 3-D input is treated as channel-last (H, W, C); channel-first
    # or multi-page TIFF arrays are not rearranged here — confirm the data layout.
    img_array = img_array[..., :3]
    peak = np.max(img_array)
    if peak == 0:
        # An all-zero image would otherwise divide by zero and produce NaN/inf pixels.
        return np.zeros(img_array.shape, dtype=float)
    return img_array / peak


def show_samples(df, data_dir, n_per_class=5, image_paths="filename", labels="genus"):
    """Plot a single row of randomly sampled images from ``df``.

    This definition shares its name with the earlier
    ``show_samples(df, n_per_class=3)`` in this notebook and replaces it once
    this cell has run.
    NOTE(review): consider giving the two functions distinct names.

    Args:
        df: DataFrame with one row per image.
        data_dir: directory that the file names in ``df[image_paths]`` are joined to.
        n_per_class: number of rows sampled from the whole frame (the sample is
            not grouped by class despite the name); also the number of subplot
            columns.
        image_paths: name of the column holding the image file names.
        labels: name of the column used as subplot title; ignored when the
            column is not present in ``df``.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    subset = df.sample(min(n_per_class, len(df)))
    plt.figure(figsize=(n_per_class * 2, 2))

    for i, (_, row) in enumerate(subset.iterrows()):
        img_path = os.path.join(data_dir, row[image_paths])
        img_display = _to_display_rgb(_read_image_array(img_path))
        plt.subplot(1, n_per_class, i + 1)
        plt.imshow(img_display)
        plt.axis('off')
        if labels in row.index:
            plt.title(str(row[labels]), fontsize=8)
    plt.tight_layout()
    plt.show()
    
# Report which TIFF decoder versions this kernel is using, then preview samples.
import imagecodecs
import tifffile

print(f"tifffile: {tifffile.__version__}")
print(f"imagecodecs: {imagecodecs.__version__}")

# NOTE(review): df_train and image_folder are not defined in the visible part of
# this notebook — confirm they are created before this cell runs.
show_samples(
    df_train,
    data_dir=image_folder,
    image_paths="name",
)