# Dépendances

In [1]:
import zipfile
from tqdm import tqdm
import os

import matplotlib.pyplot as plt
import pandas as pd
from PIL import Image
import numpy as np

# Dataset (Chargement + Dezip)

### Chargement

In [2]:
# Colab-only: mount Google Drive at /content/drive so the project files are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Work from the project folder on Drive so the relative `data/...` paths used below resolve.
project_root = '/content/drive/MyDrive/Projet/ADD0'
os.chdir(project_root)
print("Répertoire courant :", os.getcwd())

Répertoire courant : /content/drive/MyDrive/Projet/ADD0


In [4]:
# List the contents of the data directory (CSV files and the image folder).
!ls data

pokemon_images	      pokemon_train_12k.csv  pokemon_val.csv
pokemon_test_12k.csv  pokemon_train.csv
pokemon_test.csv      pokemon_val_12k.csv


### Dezip

In [None]:
# Extract the 12k image archive next to it, showing a progress bar.
# Re-running the cell is cheap: members already present on disk with the size
# recorded in the archive are skipped instead of being rewritten.
zip_path = "data/pokemon_images/images12k.zip"
extract_path = "data/pokemon_images/images12k"

os.makedirs(extract_path, exist_ok=True)

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    files = zip_ref.namelist()
    print(f"Nombre total de fichiers : {len(files)}")

    for member in tqdm(zip_ref.infolist(), desc="Extraction", unit="file"):
        # Size comparison is a heuristic for "already extracted"; a truncated
        # file from an interrupted run has a different size and is re-extracted.
        target = os.path.join(extract_path, member.filename)
        already_extracted = (
            not member.is_dir()
            and os.path.isfile(target)
            and os.path.getsize(target) == member.file_size
        )
        if already_extracted:
            continue
        zip_ref.extract(member, extract_path)

In [13]:
# List the data directory again to check which CSV files are present.
!ls data

pokemon_images		     pokemon_train_12k.csv	pokemon_val.csv
pokemon_test.csv	     pokemon_train_cleaned.csv
pokemon_train_12k_clean.csv  pokemon_train.csv


# Preprocessing

# Dataset 1k

In [None]:
# List the project folder to locate the preprocessing scripts.
!ls

 ADD.pdf		    output_add		       simple_train.py
 data			    output_teacher	       teacher.ipynb
 debug_plots		    prepare_local_dataset.py   test_quality.py
 download_pokemon.py	    preprocessing12k.py       'train_2 (1).py'
 finetune_teacher.py	    preprocessing.ipynb        train_2.py
 generate.py		    preprocessing.py	       train_debug.py
 generate_with_teacher.py   prompts.txt		       trainGPT2.py
 main.ipynb		    quality_test	       trainGPT3.py
 modelGPT2.py		    README.md		       trainGPT.py
 modelGPT.py		    requirements.txt	       train_pokemon_add.sh
 model.py		    run.sh		       train.py
 output			    simple_model.py


In [None]:
# Run the preprocessing script for the base dataset (809 images in the logged run);
# per the log below, the final CSV is written to data/pokemon_train.csv.
!python preprocessing.py --platform linux --base_dir data --output_csv pokemon_train.csv

Chargement des annotations depuis : data/pokemon_images/annotations.xlsx
Metadata sauvegardée dans data/pokemon_images/processed/pokemon_metadata.csv
Prétraitement des images:   0% 0/809 [00:00<?, ?it/s]bulbasaur → data/pokemon_images/processed/pokemon_0000.png
ivysaur → data/pokemon_images/processed/pokemon_0001.png
Prétraitement des images:   0% 2/809 [00:00<00:50, 15.94it/s]venusaur → data/pokemon_images/processed/pokemon_0002.png
Prétraitement des images: 100% 809/809 [00:43<00:00, 18.76it/s]

Résumé : 809 images traitées, 0 manquantes
Dataset final sauvegardé : data/pokemon_train.csv


# Dataset 12k

In [9]:
# Run the preprocessing script for the 12k dataset. The logged run below is slow
# (tens of minutes) and reports a few source images as missing.
!python preprocessing12k.py --platform linux --base_dir data --output_csv pokemon_train_12k.csv

Chargement des annotations depuis : data/pokemon_images/annotations12k.xlsx
Métadonnées sauvegardées dans : data/pokemon_images/processed_12k/pokemon_metadata_12k.csv
Prétraitement des images:   0% 0/11007 [00:00<?, ?it/s]bulbasaur → data/pokemon_images/processed_12k/pokemon12k_00000.png
Prétraitement des images:   0% 1/11007 [00:03<9:16:17,  3.03s/it]bulbasaur → data/pokemon_images/processed_12k/pokemon12k_00001.png
Prétraitement des images:   0% 2/11007 [00:03<4:42:10,  1.54s/it]bulbasaur → data/pokemon_images/processed_12k/pokemon12k_00002.png
Prétraitement des images:  20% 2213/11007 [13:31<52:56,  2.77it/s]Image introuvable : data/pokemon_images/images12k/porygon/porygon-z.jpg
Prétraitement des images:  29% 3154/11007 [18:58<43:47,  2.99it/s]Image introuvable : data/pokemon_images/images12k/unown/unown.jpg
Prétraitement des images:  57% 6299/11007 [37:44<26:55,  2.91it/s]Image introuvable : data/pokemon_images/images12k/burmy/burmy.jpg
Prétraitement des images:  59% 6459/11007 [38

# Nettoyage (si besoin seulement)

In [7]:
# Load the existing 12k training CSV; `df` and `csv_path` are reused by the path-fix cell below.
csv_path = "data/pokemon_train_12k.csv"
df = pd.read_csv(csv_path)

In [8]:
# Preview the first rows to check the columns and the current `image_path` format.
df.head()

Unnamed: 0,name,image_path,type,caption
0,umbreon,pokemon_images/processed_12k/pokemon12k_03108.png,Dark,"umbreon, a Dark type Moonlight Pokémon. Umbreo..."
1,meloetta-aria,pokemon_images/processed_12k/pokemon12k_09760.png,Normal/Psychic,"meloetta-aria, a Normal/Psychic type Unknown. ..."
2,blastoise,pokemon_images/processed_12k/pokemon12k_00157.png,Water,"blastoise, a Water type Shellfish Pokémon. Bla..."
3,toxicroak,pokemon_images/processed_12k/pokemon12k_06973.png,Poison/Fighting,"toxicroak, a Poison/Fighting type Toxic Mouth ..."
4,burmy,pokemon_images/processed_12k/pokemon12k_06311.png,Bug,"burmy, a Bug type Bagworm Pokémon. Burmy is a ..."


### Correction de la colonne 'image_path' : remplacement du préfixe "data/processed_12k" par "data/pokemon_images/processed_12k"

In [None]:
# Register `progress_apply` on pandas objects with a labelled progress bar.
tqdm.pandas(desc="Correction des chemins")


def _fix_image_path(path):
    """Rewrite the `data/processed_12k` prefix to `data/pokemon_images/processed_12k`."""
    return path.replace("data/processed_12k", "data/pokemon_images/processed_12k")


# Uses `df` and `csv_path` from the CSV-loading cell above.
df["image_path"] = df["image_path"].progress_apply(_fix_image_path)

# Write the corrected paths back over the same CSV file.
df.to_csv(csv_path, index=False)
print(f"Chemins corrigés et sauvegardés dans {csv_path}")

Correction des chemins: 100%|██████████| 10918/10918 [00:00<00:00, 413805.85it/s]


Chemins corrigés et sauvegardés dans data/pokemon_train_12k.csv


### Modification de la colonne 'image_path' en ajoutant "data/" au début de chaque chemin

In [7]:
import pandas as pd


def _with_data_prefix(path):
    """Return `path` unchanged if it already starts with "data/", else prepend "data/"."""
    if path.startswith("data/"):
        return path
    return "data/" + path


# Read, fix and rewrite the same CSV; the guard in the helper makes re-runs a no-op.
train_12k_csv = "data/pokemon_train_12k.csv"
df = pd.read_csv(train_12k_csv)
df['image_path'] = df['image_path'].apply(_with_data_prefix)
print(df['image_path'].head())
df.to_csv(train_12k_csv, index=False)

0    data/pokemon_images/processed_12k/pokemon12k_0...
1    data/pokemon_images/processed_12k/pokemon12k_0...
2    data/pokemon_images/processed_12k/pokemon12k_0...
3    data/pokemon_images/processed_12k/pokemon12k_0...
4    data/pokemon_images/processed_12k/pokemon12k_0...
Name: image_path, dtype: object


### Suppression des échantillons renvoyant vers une image inexistante

In [14]:
def image_exists(path):
    """Return True only if `path` names an existing regular file.

    Args:
        path: Value taken from the `image_path` column. Usually a string, but
            pandas yields NaN (a float) for empty CSV cells.

    Returns:
        bool: True when `path` is a path-like value pointing to a regular file.
        False for missing files, directories, and non-path values (NaN, None,
        numbers), which would otherwise raise TypeError or be read as a file
        descriptor.
    """
    if not isinstance(path, (str, bytes, os.PathLike)):
        return False
    # isfile rather than exists: a directory is not a usable image.
    return os.path.isfile(path)

# Reload the 12k CSV from disk so the filter starts from the saved state.
csv_file = "data/pokemon_train_12k.csv"
df = pd.read_csv(csv_file)

initial_count = len(df)
print("Nombre initial d'échantillons :", initial_count)

# Boolean mask: True for rows whose image path passes `image_exists`.
has_image = df['image_path'].apply(image_exists)
df_filtered = df[has_image]

removed_count = initial_count - len(df_filtered)
print("Nombre d'échantillons supprimés (images manquantes) :", removed_count)
print("Nombre d'échantillons restants :", len(df_filtered))

# Save the cleaned DataFrame (this overwrites the original CSV).
df_filtered.to_csv(csv_file, index=False)
print(f"Le fichier CSV a été mis à jour et sauvegardé dans : {csv_file}")

Nombre initial d'échantillons : 8637
Nombre d'échantillons supprimés (images manquantes) : 0
Nombre d'échantillons restants : 8637
Le fichier CSV a été mis à jour et sauvegardé dans : data/pokemon_train_12k.csv


# Split

In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

def create_splits(full_csv, train_csv="data/pokemon_train.csv", val_csv="data/pokemon_val.csv", test_csv="data/pokemon_test.csv",
                  holdout_size=0.2, random_state=42):
    """Split a dataset CSV into train / validation / test CSV files.

    The split is only performed when `val_csv` or `test_csv` is missing. The
    held-out fraction is divided evenly between validation and test.

    Note that `train_csv` may be the same file as `full_csv` (this is the case
    with the default arguments when called on "data/pokemon_train.csv"): the
    full dataset is then overwritten in place by its training portion.

    Args:
        full_csv: Path of the CSV containing every sample.
        train_csv: Output path for the training split.
        val_csv: Output path for the validation split.
        test_csv: Output path for the test split.
        holdout_size: Fraction of the samples held out of training, shared
            equally between validation and test (default 0.2 -> 80/10/10).
        random_state: Seed passed to both `train_test_split` calls so the
            split is reproducible.

    Returns:
        None. The splits are written to disk and a summary is printed.
    """
    # Guard clause: nothing to do when both held-out splits already exist.
    if os.path.exists(val_csv) and os.path.exists(test_csv):
        print("Les fichiers split train/val/test existent déjà.")
        return

    # If exactly one held-out file exists and the train file overwrites the
    # source, an earlier run most likely already reduced `full_csv` to its
    # training portion; re-splitting would shrink it again and move training
    # rows into val/test. Keep the original behaviour but make it visible.
    splits_in_place = os.path.abspath(full_csv) == os.path.abspath(train_csv)
    leftover_splits = [path for path in (val_csv, test_csv) if os.path.exists(path)]
    if splits_in_place and leftover_splits:
        print("Attention : {} existe déjà et {} est réécrit sur place ; "
              "le fichier source a peut-être déjà été réduit par un split précédent.".format(
                  ", ".join(leftover_splits), train_csv))

    print("Création des splits train / val / test...")
    df_full = pd.read_csv(full_csv)
    train_df, temp_df = train_test_split(df_full, test_size=holdout_size, random_state=random_state)
    val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=random_state)

    # With the default paths, the original "pokemon_train.csv" is rewritten
    # with the training portion only.
    train_df.to_csv(train_csv, index=False)
    val_df.to_csv(val_csv, index=False)
    test_df.to_csv(test_csv, index=False)
    print("Splits créés : {} samples train, {} samples val, {} samples test".format(
        len(train_df), len(val_df), len(test_df)))

In [None]:
# NOTE: this path is also the default `train_csv` of create_splits, so when the
# splits are created this file is overwritten with the training portion only.
full_csv_path = "data/pokemon_train.csv"
create_splits(full_csv_path)