In [None]:
# from scipy.io import loadmat
#
# misc_root = "archive/misc"
# mat_data = loadmat(os.path.join(misc_root, "make_model_name.mat"))
# print(mat_data.keys())
#
# car_type_mat = loadmat(os.path.join(misc_root, "car_type.mat"))
# print(car_type_mat.keys())


In [None]:
# import os
# from collections import defaultdict
#
# # Analyser la structure des dossiers d'images
# print("=== ANALYSE DE LA STRUCTURE DES DONNÉES ===")
#
# # Compter le nombre de modèles par marque
# make_model_count = defaultdict(set)
#
# for make_id in range(1, 20):  # Analyser les 20 premières marques
#     image_dir = os.path.join("archive/image", str(make_id))
#     if not os.path.exists(image_dir):
#         continue
#
#     print(f"\nMake ID {make_id}:")
#     for subdir in os.listdir(image_dir):
#         subdir_path = os.path.join(image_dir, subdir)
#         if os.path.isdir(subdir_path):
#             print(f"  {subdir}/")
#             # Essayer d'extraire le modèle du nom du sous-dossier
#             make_model_count[make_id].add(subdir)
#
# print(f"\nModèles trouvés par marque:")
# for make_id, models in make_model_count.items():
#     print(f"Make {make_id}: {len(models)} modèles - {sorted(models)}")

In [1]:
import os
import re
import shutil
from scipy.io import loadmat
import numpy as np
from tqdm import tqdm
import pandas as pd

# --- chemins ---
compcars_root = "archive"
images_root = os.path.join(compcars_root, "image")
labels_root = os.path.join(compcars_root, "label")
misc_root = os.path.join(compcars_root, "misc")
output_root = "dataset_corrected"
os.makedirs(output_root, exist_ok=True)

# --- utilitaires ---
def sanitize(name: str) -> str:
    name = name.strip().replace("/", "_").replace("\\", "_").replace(" ", "_")
    name = re.sub(r"[^A-Za-z0-9_.\-]+", "_", name)
    return name or "Unknown"

def extract_matlab_string(cell):
    if isinstance(cell, np.ndarray):
        if cell.size == 0:
            return None
        item = cell.flat[0]
        if isinstance(item, str):
            return item
        elif isinstance(item, np.ndarray):
            return extract_matlab_string(item)
    elif isinstance(cell, str):
        return cell
    return None

# --- charge les noms de marques et modèles ---
mat_path = os.path.join(misc_root, "make_model_name.mat")
mat = loadmat(mat_path)
model_names_mat = mat['model_names']
make_names_mat = mat['make_names']

make_names_list = []
for i in range(make_names_mat.shape[0]):
    make_name = extract_matlab_string(make_names_mat[i, 0])
    if make_name and make_name.startswith("['") and make_name.endswith("']"):
        make_name = make_name[2:-2]
    make_names_list.append(make_name or f"Make_{i+1}")

model_names_list = []
for i in range(model_names_mat.shape[0]):
    model_name = extract_matlab_string(model_names_mat[i, 0])
    if model_name is None or model_name == '[]' or model_name == '':
        model_names_list.append(f"Unknown_Model_{i+1}")
    else:
        if model_name.startswith("['") and model_name.endswith("']"):
            model_name = model_name[2:-2]
        model_names_list.append(model_name)

# --- Lire le fichier attributes.txt pour le mapping ---
print("Lecture du fichier attributes.txt...")
attributes_path = os.path.join(misc_root, "attributes.txt")

# Créer un mapping model_id -> make_id basé sur les attributs
model_to_make = {}
try:
    with open(attributes_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        # Saute la première ligne (en-tête)
        for line in lines[1:]:
            parts = line.strip().split()
            if len(parts) >= 6:
                model_id = int(parts[0])
                # Le make_id peut être déduit de la structure ou d'autres fichiers
                # Pour l'instant, nous allons utiliser une approche différente
                model_to_make[model_id] = None  # À compléter plus tard
except:
    print("Impossible de lire attributes.txt")

# --- APPROCHE ALTERNATIVE: Utiliser la structure des dossiers ---
print("Création du mapping à partir de la structure...")

# Analyser la structure pour trouver la correspondance
make_model_mapping = {}

for make_id in tqdm(range(1, len(make_names_list) + 1), desc="Analyse structure"):
    make_dir = os.path.join(images_root, str(make_id))
    if not os.path.exists(make_dir):
        continue

    # Compter les sous-dossiers (qui représentent les modèles)
    subfolders = []
    for item in os.listdir(make_dir):
        item_path = os.path.join(make_dir, item)
        if os.path.isdir(item_path):
            subfolders.append(item)

    if subfolders:
        make_model_mapping[make_id] = {
            'make_name': make_names_list[make_id - 1],
            'subfolders': subfolders,
            'model_count': len(subfolders)
        }

# Afficher la structure analysée
print("\nStructure analysée:")
for make_id, info in list(make_model_mapping.items())[:20]:
    print(f"Make {make_id} ({info['make_name']}): {info['model_count']} modèles - {info['subfolders']}")

# --- Indexation détaillée des images ---
print("\nIndexation détaillée des images...")
image_details = {}

for make_id in tqdm(range(1, len(make_names_list) + 1), desc="Indexation détaillée"):
    make_dir = os.path.join(images_root, str(make_id))
    if not os.path.exists(make_dir):
        continue

    for root, dirs, files in os.walk(make_dir):
        for file in files:
            if file.lower().endswith((".jpg", ".jpeg", ".png", ".bmp", ".webp")):
                # Extraire le chemin relatif pour trouver le modèle
                rel_path = os.path.relpath(root, make_dir)
                subfolder = rel_path.split(os.sep)[0] if os.sep in rel_path else rel_path

                base_name = os.path.splitext(file)[0]
                key = (make_id, base_name)

                image_details[key] = {
                    'path': os.path.join(root, file),
                    'subfolder': subfolder,
                    'make_id': make_id
                }

print(f"Images indexées: {len(image_details)}")

# --- Copie avec mapping basé sur la structure ---
total_copied = 0
per_class_counts = {}
unknown_mappings = 0

print("\nCopie avec mapping structurel...")
for make_id in tqdm(range(1, len(make_names_list) + 1), desc="Traitement final"):
    label_dir = os.path.join(labels_root, str(make_id))
    if not os.path.exists(label_dir):
        continue

    make_name = make_names_list[make_id - 1]

    for root, _, files in os.walk(label_dir):
        for file in files:
            if not file.endswith('.txt'):
                continue

            txt_path = os.path.join(root, file)
            base_name = os.path.splitext(file)[0]

            image_key = (make_id, base_name)
            if image_key not in image_details:
                continue

            img_info = image_details[image_key]
            img_path = img_info['path']
            subfolder = img_info['subfolder']

            try:
                with open(txt_path, 'r') as f:
                    local_model_id = int(f.readline().strip())
            except:
                continue

            if local_model_id <= 0:
                continue

            # UTILISER le sous-dossier comme nom de modèle
            # C'est la clé : le sous-dossier EST le nom du modèle
            model_name = subfolder

            # Nettoyer le nom du modèle si nécessaire
            if model_name.isdigit():
                # Si c'est un nombre, essayons de trouver un nom plus descriptif
                if make_id in make_model_mapping:
                    subfolders = make_model_mapping[make_id]['subfolders']
                    if int(model_name) <= len(subfolders):
                        model_name = subfolders[int(model_name) - 1]
                    else:
                        model_name = f"Model_{model_name}"
                else:
                    model_name = f"Model_{model_name}"

            class_name = f"{make_name} {model_name}"
            sanitized_name = sanitize(class_name)

            dst_dir = os.path.join(output_root, sanitized_name)
            os.makedirs(dst_dir, exist_ok=True)

            dst_filename = f"{make_id}_{model_name}_{os.path.basename(img_path)}"
            dst_path = os.path.join(dst_dir, dst_filename)

            counter = 1
            while os.path.exists(dst_path):
                name, ext = os.path.splitext(dst_filename)
                dst_path = os.path.join(dst_dir, f"{name}_{counter}{ext}")
                counter += 1

            try:
                shutil.copy2(img_path, dst_path)
                total_copied += 1
                per_class_counts[sanitized_name] = per_class_counts.get(sanitized_name, 0) + 1

                if total_copied <= 20:
                    print(f"  {make_name} {model_name}")

            except Exception as e:
                print(f"Erreur: {e}")
                continue

print(f"\n✅ COPIE TERMINÉE!")
print(f"Images copiées: {total_copied}")
print(f"Classes créées: {len(per_class_counts)}")
print(f"Mappings inconnus: {unknown_mappings}")

print("\nTop 20 classes:")
sorted_classes = sorted(per_class_counts.items(), key=lambda x: x[1], reverse=True)
for class_name, count in sorted_classes[:20]:
    print(f"  {class_name}: {count} images")

# Vérification
print(f"\nVérification des dossiers:")
output_dirs = [d for d in os.listdir(output_root) if os.path.isdir(os.path.join(output_root, d))]
print(f"Dossiers créés: {len(output_dirs)}")
print("Exemples:")
for d in output_dirs[:10]:
    print(f"  {d}")

Lecture du fichier attributes.txt...
Création du mapping à partir de la structure...


Analyse structure: 100%|██████████| 163/163 [00:00<00:00, 12781.54it/s]



Structure analysée:
Make 1 (ABT): 12 modèles - ['1101', '1102', '1103', '1104', '1105', '1106', '1107', '1108', '1109', '1110', '1112', '1113']
Make 2 (BAC): 1 modèles - ['1308']
Make 3 (Conquest): 1 modèles - ['1758']
Make 4 (DS): 7 modèles - ['509', '510', '511', '512', '514', '515', '516']
Make 5 (Dacia): 5 modèles - ['1039', '1040', '1041', '1042', '1043']
Make 6 (Fisker): 5 modèles - ['1773', '1774', '1775', '1776', '1777']
Make 7 (GMC): 7 modèles - ['681', '682', '683', '684', '685', '686', '687']
Make 8 (Gumpert): 3 modèles - ['1045', '1046', '1047']
Make 9 (Hennessey): 3 modèles - ['1114', '1115', '1116']
Make 10 (Icona): 1 modèles - ['1309']
Make 11 (Jeep): 9 modèles - ['734', '735', '736', '737', '738', '739', '741', '742', '745']
Make 12 (KTM): 1 modèles - ['1318']
Make 13 (MELKUS): 1 modèles - ['1980']
Make 14 (MG): 9 modèles - ['1827', '1828', '1829', '1830', '1831', '1832', '1833', '1835', '1836']
Make 15 (MINI): 16 modèles - ['827', '828', '829', '830', '831', '832', '8

Indexation détaillée: 100%|██████████| 163/163 [00:01<00:00, 100.47it/s]


Images indexées: 136642

Copie avec mapping structurel...


Traitement final:   0%|          | 0/163 [00:00<?, ?it/s]

  ABT Model_1101
  ABT Model_1101
  ABT Model_1101
  ABT Model_1101
  ABT Model_1101
  ABT Model_1101
  ABT Model_1101
  ABT Model_1101
  ABT Model_1101
  ABT Model_1101
  ABT Model_1101
  ABT Model_1101
  ABT Model_1101
  ABT Model_1101
  ABT Model_1101
  ABT Model_1101
  ABT Model_1101
  ABT Model_1101
  ABT Model_1102
  ABT Model_1102


Traitement final: 100%|██████████| 163/163 [00:33<00:00,  4.86it/s]


✅ COPIE TERMINÉE!
Images copiées: 135946
Classes créées: 1716
Mappings inconnus: 0

Top 20 classes:
  Buck_Model_196: 303 images
  Chevy_Model_1915: 287 images
  BWM_Model_68: 283 images
  Benz_Model_127: 272 images
  Chevy_Model_1917: 237 images
  BWM_Model_70: 233 images
  KIA_Model_1181: 229 images
  Audi_34: 228 images
  Ford_Model_590: 214 images
  Volvo_Model_1706: 205 images
  Benz_Model_154: 200 images
  Toyota_Model_1386: 196 images
  Audi_33: 196 images
  Peugeot_Model_259: 195 images
  Benz_Model_132: 183 images
  Buck_Model_192: 182 images
  Audi_38: 179 images
  BAW_Model_342: 178 images
  Volkswagen_Model_501: 175 images
  Citroen_Model_1259: 171 images

Vérification des dossiers:
Dossiers créés: 1716
Exemples:
  ABT_Model_1101
  ABT_Model_1102
  ABT_Model_1103
  ABT_Model_1104
  ABT_Model_1105
  ABT_Model_1106
  ABT_Model_1107
  ABT_Model_1108
  ABT_Model_1109
  ABT_Model_1110



