In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

import os
import pandas as pd
from PIL import Image
import torch
from torchvision import transforms

from matplotlib import pyplot as plt

In [2]:


# Data folder (path is relative to CNN_simple.ipynb)
DATA_PATH = "../cartoonset10k"

# Preprocessing applied to every image: resize to 64x64, then convert to a tensor.
transform = transforms.Compose([
    transforms.Resize((64, 64)),
    transforms.ToTensor()
])

data_pairs = []   # (image_tensor, hair_color_index)

# Sort the listing: os.listdir() returns entries in arbitrary order, which would
# make the sample order (and every seeded split derived from it) differ
# from one machine / run to the next.
for file in sorted(os.listdir(DATA_PATH)):
    if not file.endswith(".csv"):
        continue

    csv_path = os.path.join(DATA_PATH, file)
    # Swap only the extension; str.replace(".csv", ".png") would also rewrite
    # any ".csv" that happens to appear earlier in the path.
    img_path = os.path.splitext(csv_path)[0] + ".png"

    if not os.path.exists(img_path):
        continue

    # 1) Load the attribute CSV (it has no header row). This is done before
    #    decoding the image so samples without a label cost no image work.
    df = pd.read_csv(csv_path, header=None)

    # 2) Strip stray double quotes from the attribute-name column
    df[0] = df[0].str.replace('"', '', regex=False)

    # 3) Pick the row describing hair_color
    hair_row = df[df[0] == "hair_color"]
    if len(hair_row) == 0:
        continue  # no hair_color attribute in this file: skip the sample

    hair_color_value = int(hair_row.iloc[0, 1])   # 2nd column = label value

    # 4) Load the image; the context manager closes the underlying file
    #    handle right away instead of leaving it to the garbage collector.
    with Image.open(img_path) as raw_img:
        img = transform(raw_img.convert("RGB"))

    data_pairs.append((img, hair_color_value))

print("Nombre de (image, hair_color) :", len(data_pairs))
# Only show an example when something was actually loaded; indexing an empty
# list here would hide the real problem (wrong DATA_PATH) behind an IndexError.
if data_pairs:
    print("Exemple y :", data_pairs[0][1])
    print("Shape image x :", data_pairs[0][0].shape)

Nombre de (image, hair_color) : 10000
Exemple y : 2
Shape image x : torch.Size([3, 64, 64])


In [3]:
# =========================
# 1) Dataset & DataLoader
# =========================

class HairColorDataset(Dataset):
    """In-memory dataset of (image, hair-colour label) samples.

    Args:
        data_pairs: iterable of ``(image, label)`` 2-tuples. Each label is
            converted with ``int()``. The iterable is consumed in a single
            pass, so one-shot iterables (generators, iterators) are supported.

    Attributes:
        images: list of the images, in input order.
        labels: list of the integer labels, aligned with ``images``.
    """

    def __init__(self, data_pairs):
        self.images = []
        self.labels = []
        # Single pass: iterating twice (once per list) would leave `labels`
        # empty whenever `data_pairs` is an exhausted-after-one-use iterator.
        for image, label in data_pairs:
            self.images.append(image)
            self.labels.append(int(label))

    def __len__(self):
        """Return the number of samples."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for sample ``idx``.

        The label is returned as a 0-dim ``torch.long`` tensor.
        """
        x = self.images[idx]
        y = torch.tensor(self.labels[idx], dtype=torch.long)
        return x, y

full_dataset = HairColorDataset(data_pairs)

# Train / validation split (80% / 20%)
train_size = int(0.8 * len(full_dataset))
val_size   = len(full_dataset) - train_size

# Seed the split so the train/val partition is identical on every run.
# 42 matches the random_state used for the sklearn split in this notebook.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    full_dataset, [train_size, val_size], generator=split_generator
)

BATCH_SIZE = 64
# Training batches are reshuffled every epoch; validation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(val_dataset,   batch_size=BATCH_SIZE, shuffle=False)

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Build NumPy arrays for scikit-learn: each image tensor is flattened to a
# 1-D feature vector, and the labels are gathered into a single array.
X = torch.stack([image.view(-1) for image, _ in data_pairs]).numpy()
y = torch.tensor([label for _, label in data_pairs]).numpy()

# Hold out 20% of the samples for validation (seeded split).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ============================
# 1) RANDOM FOREST
# ============================

# n_estimators can be tuned; max_depth=None puts no limit on tree depth.
rf = RandomForestClassifier(n_estimators=200, max_depth=None, random_state=42)
rf.fit(X_train, y_train)
print(f"Random Forest Acc: {rf.score(X_val, y_val):.4f}")


Random Forest Acc: 0.9850


In [5]:
# ============================
# 2) KNN (several values of k)
# ============================

from sklearn.neighbors import KNeighborsClassifier

# Neighbour counts to compare on the validation split.
K_VALUES = (3, 5, 7, 9, 11)

for k in K_VALUES:
    # fit() returns the estimator itself, so construction and fitting chain.
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    acc = knn.score(X_val, y_val)
    print(f"KNN (k={k}) Accuracy: {acc:.4f}")

KNN (k=3) Accuracy: 0.7705
KNN (k=5) Accuracy: 0.7680
KNN (k=7) Accuracy: 0.7680
KNN (k=9) Accuracy: 0.7545
KNN (k=11) Accuracy: 0.7415
