In [3]:
import pandas as pd
import os

# Source directory holding the full CSV splits, and the directory that
# receives their down-sampled copies (created if it does not exist yet).
data_dir = "data2/text"  # adjust if needed
output_dir = "data2/retraining_text"
os.makedirs(output_dir, exist_ok=True)

# CSV splits to down-sample
files = ["train.csv", "val.csv", "test.csv"]

for filename in files:
    # Build the output name by inserting "_1pct" before the extension,
    # e.g. "train.csv" -> "train_1pct.csv".
    stem, ext = os.path.splitext(filename)
    input_path = os.path.join(data_dir, filename)
    output_path = os.path.join(output_dir, f"{stem}_1pct{ext}")

    # Draw a 1% random sample with a fixed seed so reruns pick the same rows,
    # then write it out without the DataFrame index column.
    df_sample = pd.read_csv(input_path).sample(frac=0.01, random_state=42)
    df_sample.to_csv(output_path, index=False)

    print(f"{filename}: {len(df_sample)} lignes exportées vers {output_path}")

train.csv: 679 lignes exportées vers data2/retraining_text/train_1pct.csv
val.csv: 85 lignes exportées vers data2/retraining_text/val_1pct.csv
test.csv: 85 lignes exportées vers data2/retraining_text/test_1pct.csv


In [4]:
import shutil

# Directory containing all source images (one sub-directory per split)
image_src_dir = "data2/images"
# Destination root for the images that belong to the sampled rows
image_dest_base = "data2/retraining_images"
os.makedirs(image_dest_base, exist_ok=True)

for split in ["train", "val", "test"]:
    # Sampled CSV for this split; its 'imageid' and 'productid' columns
    # determine which image files are copied.
    csv_path = f"data2/retraining_text/{split}_1pct.csv"
    image_dest_dir = os.path.join(image_dest_base, split)
    os.makedirs(image_dest_dir, exist_ok=True)

    df = pd.read_csv(csv_path)

    copied = 0
    # Iterate over the two ID columns directly instead of df.iterrows():
    # iterrows() builds a Series per row and does not preserve column dtypes,
    # so integer IDs could be upcast to float before the int() conversion.
    # It also avoids rebinding `_`, which IPython uses for the last cell output.
    for image_id, product_id in zip(df["imageid"], df["productid"]):
        image_name = f"image_{int(image_id)}_product_{int(product_id)}.jpg"
        src_path = os.path.join(image_src_dir, split, image_name)
        dest_path = os.path.join(image_dest_dir, image_name)

        if os.path.exists(src_path):
            shutil.copy(src_path, dest_path)
            copied += 1
        else:
            # Best effort: report the missing file and continue with the rest.
            print(f"[⚠️] Image manquante : {src_path}")

    print(f"{split}: {copied} images copiées vers {image_dest_dir}")

train: 679 images copiées vers data2/retraining_images/train
val: 85 images copiées vers data2/retraining_images/val
test: 85 images copiées vers data2/retraining_images/test
