# Résultats - Mode MINI (300 images)

**Date d'exécution**: 21 novembre 2025  
**Mode**: MINI (300 images de pommes)  
**Pipeline**: Feature Extraction (MobileNetV2) + PCA (50 composantes)

---

## 📋 Table des matières

1. [Configuration et chargement des donn√©es](#1-configuration-et-chargement)
2. [Métadonnées](#2-métadonnées)
3. [Features brutes (1280D)](#3-features-brutes)
4. [Features PCA (50D)](#4-features-pca)
5. [Informations du modèle PCA](#5-modèle-pca)
6. [Visualisations](#6-visualisations)
7. [Conclusions](#7-conclusions)

## 1. Configuration et chargement

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Configuration des graphiques
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
%matplotlib inline

# Chemins des donn√©es
BASE_DIR = Path('.')
METADATA_DIR = BASE_DIR / 'metadata'
FEATURES_DIR = BASE_DIR / 'features' / 'csv'
PCA_DIR = BASE_DIR / 'pca' / 'csv'
MODEL_INFO_DIR = BASE_DIR / 'model_info'

print("‚úÖ Imports r√©ussis")
print(f"üìÇ R√©pertoire de travail: {BASE_DIR.absolute()}")

## 2. Métadonnées

In [None]:
# Load every metadata CSV part file found (recursively) under METADATA_DIR.
# The paths are sorted so the concatenation order -- and therefore the row
# order of df_metadata and any head() preview -- is the same on every run;
# glob itself yields paths in whatever order the filesystem returns them.
metadata_files = sorted(METADATA_DIR.glob('**/part-*.csv'))
print(f"üìÑ Fichiers de m√©tadonn√©es trouv√©s: {len(metadata_files)}")

if metadata_files:
    df_metadata = pd.concat([pd.read_csv(f) for f in metadata_files], ignore_index=True)
    print(f"‚úÖ M√©tadonn√©es charg√©es: {len(df_metadata)} images")
else:
    # None is the "nothing loaded" marker the later cells test for.
    print("‚ùå Aucun fichier de m√©tadonn√©es trouv√©")
    df_metadata = None

In [None]:
if df_metadata is not None:
    # Text summary of the metadata table: size, label cardinality, columns,
    # a 10-row preview and the per-label row counts.
    rule = "=" * 60
    print("\n" + rule)
    print("üìä STATISTIQUES DES M√âTADONN√âES")
    print(rule)

    total_rows = len(df_metadata)
    print(f"\nüñºÔ∏è  Nombre total d'images: {total_rows:,}")
    print(f"üè∑Ô∏è  Nombre de classes: {df_metadata['label'].nunique()}")

    print("\nüì¶ Colonnes disponibles:")
    print(list(df_metadata.columns))

    print("\nüëÅÔ∏è  Aper√ßu des donn√©es:")
    display(df_metadata.head(n=10))

    print("\nüìä Distribution des classes:")
    class_counts = df_metadata['label'].value_counts()
    display(class_counts)

In [None]:
if df_metadata is not None:
    # Number of rows per label, most frequent first (value_counts default order).
    class_counts = df_metadata['label'].value_counts()

    # Bar chart of the per-label counts.
    fig, ax = plt.subplots(figsize=(12, 6))
    class_counts.plot(ax=ax, kind='bar', color='skyblue', edgecolor='black')
    ax.set_title('Distribution des images par classe', fontsize=16, fontweight='bold')
    ax.set_xlabel('Classe', fontsize=12)
    ax.set_ylabel("Nombre d'images", fontsize=12)
    ax.grid(axis='y', alpha=0.3)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

    # Extremes and average of the per-label counts.
    largest_label, largest_count = class_counts.idxmax(), class_counts.max()
    smallest_label, smallest_count = class_counts.idxmin(), class_counts.min()
    print(f"\nüìä Statistiques:")
    print(f"   ‚Ä¢ Classe la plus repr√©sent√©e: {largest_label} ({largest_count} images)")
    print(f"   ‚Ä¢ Classe la moins repr√©sent√©e: {smallest_label} ({smallest_count} images)")
    print(f"   ‚Ä¢ Moyenne par classe: {class_counts.mean():.1f} images")

## 3. Features brutes (1280D)

In [None]:
# Load every raw-feature CSV part file found (recursively) under FEATURES_DIR.
# The paths are sorted so the row order of df_features is reproducible;
# glob yields paths in whatever order the filesystem returns them.
features_files = sorted(FEATURES_DIR.glob('**/part-*.csv'))
print(f"üìÑ Fichiers de features trouv√©s: {len(features_files)}")

if features_files:
    df_features = pd.concat([pd.read_csv(f) for f in features_files], ignore_index=True)
    print(f"‚úÖ Features charg√©es: {len(df_features)} images")

    # Parse the comma-separated features_string column into one float array
    # per row. Rows with a missing string get None instead of an array, so
    # consumers of 'features_array' must be prepared to skip None entries.
    df_features['features_array'] = df_features['features_string'].apply(
        lambda x: np.array([float(v) for v in x.split(',')]) if pd.notna(x) else None
    )
    print(f"‚úÖ Conversion en arrays numpy r√©ussie")
else:
    # None is the "nothing loaded" marker the later cells test for.
    print("‚ùå Aucun fichier de features trouv√©")
    df_features = None

In [None]:
if df_features is not None and 'features_array' in df_features.columns:
    print("\n" + "="*60)
    print("üé® ANALYSE DES FEATURES BRUTES (MobileNetV2 - 1280D)")
    print("="*60)

    # Preview of the identifying columns.
    print("\nüëÅÔ∏è  Aper√ßu des donn√©es (5 premi√®res lignes):")
    display(df_features[['path', 'label']].head())

    # 'features_array' holds None for rows whose features_string was missing.
    # np.vstack cannot stack None, so only the parsed vectors are kept and
    # the number of skipped rows is reported.
    valid_feature_arrays = df_features['features_array'].dropna()
    skipped_rows = len(df_features) - len(valid_feature_arrays)
    if skipped_rows:
        print(f"   {skipped_rows} ligne(s) sans features ignoree(s)")

    if valid_feature_arrays.empty:
        # np.vstack raises on an empty sequence; report instead of crashing.
        print("Aucun vecteur de features valide")
    else:
        # One row per image, one column per feature dimension.
        features_matrix = np.vstack(valid_feature_arrays.tolist())
        print(f"\nüìê Shape de la matrice: {features_matrix.shape}")
        print(f"   ‚Ä¢ Nombre d'images: {features_matrix.shape[0]}")
        print(f"   ‚Ä¢ Dimensions par image: {features_matrix.shape[1]}")

        # Descriptive statistics over every value of the matrix.
        print(f"\nüìä Statistiques des features:")
        print(f"   ‚Ä¢ Min: {features_matrix.min():.6f}")
        print(f"   ‚Ä¢ Max: {features_matrix.max():.6f}")
        print(f"   ‚Ä¢ Mean: {features_matrix.mean():.6f}")
        print(f"   ‚Ä¢ Std: {features_matrix.std():.6f}")
        print(f"   ‚Ä¢ M√©diane: {np.median(features_matrix):.6f}")

In [None]:
if df_features is not None and 'features_array' in df_features.columns:
    # Two side-by-side views of features_matrix, which must already exist
    # (this cell reads it but does not create it).
    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))

    # Left panel: heatmap restricted to the first 20 rows and first 50 columns.
    n_rows_shown, n_cols_shown = 20, 50
    sample_features = features_matrix[:n_rows_shown, :n_cols_shown]
    colorbar_options = {'label': 'Valeur'}
    sns.heatmap(sample_features, cmap='viridis', ax=ax1, cbar_kws=colorbar_options)
    heatmap_title = 'Heatmap des features (20 images √ó 50 premi√®res dimensions)'
    ax1.set_title(heatmap_title, fontsize=14, fontweight='bold')
    ax1.set_xlabel('Dimension de feature')
    ax1.set_ylabel('Image')

    # Right panel: histogram of every value in the matrix.
    all_values = features_matrix.flatten()
    ax2.hist(all_values, bins=100, color='steelblue', alpha=0.7, edgecolor='black')
    ax2.set_title('Distribution des valeurs de features', fontsize=14, fontweight='bold')
    ax2.set_xlabel('Valeur')
    ax2.set_ylabel('Fr√©quence')
    ax2.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()

## 4. Features PCA (50D)

In [None]:
# Load every PCA-feature CSV part file found (recursively) under PCA_DIR.
# The paths are sorted so the row order of df_pca is reproducible;
# glob yields paths in whatever order the filesystem returns them.
pca_files = sorted(PCA_DIR.glob('**/part-*.csv'))
print(f"üìÑ Fichiers PCA trouv√©s: {len(pca_files)}")

if pca_files:
    df_pca = pd.concat([pd.read_csv(f) for f in pca_files], ignore_index=True)
    print(f"‚úÖ Features PCA charg√©es: {len(df_pca)} images")

    # Parse the comma-separated pca_features_string column into one float
    # array per row. Rows with a missing string get None instead of an array,
    # so consumers of 'pca_array' must be prepared to skip None entries.
    df_pca['pca_array'] = df_pca['pca_features_string'].apply(
        lambda x: np.array([float(v) for v in x.split(',')]) if pd.notna(x) else None
    )
    print(f"‚úÖ Conversion en arrays numpy r√©ussie")
else:
    # None is the "nothing loaded" marker the later cells test for.
    print("‚ùå Aucun fichier PCA trouv√©")
    df_pca = None

In [None]:
if df_pca is not None and 'pca_array' in df_pca.columns:
    print("\n" + "="*60)
    print("üìä ANALYSE DES FEATURES PCA (50D)")
    print("="*60)

    # Preview of the identifying columns.
    print("\nüëÅÔ∏è  Aper√ßu des donn√©es (5 premi√®res lignes):")
    display(df_pca[['path', 'label']].head())

    # 'pca_array' holds None for rows whose pca_features_string was missing.
    # np.vstack cannot stack None, so only the parsed vectors are kept and
    # the number of skipped rows is reported.
    valid_pca_arrays = df_pca['pca_array'].dropna()
    skipped_rows = len(df_pca) - len(valid_pca_arrays)
    if skipped_rows:
        print(f"   {skipped_rows} ligne(s) sans features PCA ignoree(s)")

    if valid_pca_arrays.empty:
        # np.vstack raises on an empty sequence; report instead of crashing.
        print("Aucun vecteur PCA valide")
    else:
        # One row per image, one column per principal component.
        pca_matrix = np.vstack(valid_pca_arrays.tolist())
        print(f"\nüìê Shape de la matrice PCA: {pca_matrix.shape}")
        print(f"   ‚Ä¢ Nombre d'images: {pca_matrix.shape[0]}")
        print(f"   ‚Ä¢ Dimensions par image: {pca_matrix.shape[1]}")
        # Compression is reported against a fixed 1280-dimensional input.
        print(f"   ‚Ä¢ R√©duction: 1280 ‚Üí {pca_matrix.shape[1]} ({(1 - pca_matrix.shape[1]/1280)*100:.1f}% compression)")

        # Descriptive statistics over every value of the matrix.
        print(f"\nüìä Statistiques des features PCA:")
        print(f"   ‚Ä¢ Min: {pca_matrix.min():.6f}")
        print(f"   ‚Ä¢ Max: {pca_matrix.max():.6f}")
        print(f"   ‚Ä¢ Mean: {pca_matrix.mean():.6f}")
        print(f"   ‚Ä¢ Std: {pca_matrix.std():.6f}")
        print(f"   ‚Ä¢ M√©diane: {np.median(pca_matrix):.6f}")

In [None]:
if df_pca is not None and 'pca_array' in df_pca.columns:
    # Column-wise summary of pca_matrix (read here, created elsewhere):
    # one output row per principal component, first ten displayed.
    print("\nüìà Statistiques par composante principale:")
    component_count = pca_matrix.shape[1]
    component_names = [f'PC{number}' for number in range(1, component_count + 1)]
    pca_df_stats = pd.DataFrame({
        'Composante': component_names,
        'Mean': pca_matrix.mean(axis=0),
        'Std': pca_matrix.std(axis=0),
        'Min': pca_matrix.min(axis=0),
        'Max': pca_matrix.max(axis=0),
    })
    display(pca_df_stats.head(10))

## 5. Informations du modèle PCA

In [None]:
# Load the PCA model description, stored as JSON text in part files.
# The non-empty part files are concatenated and parsed as a single JSON
# document, so the paths are sorted: the concatenation order must not depend
# on the order in which the filesystem lists the files.
# NOTE(review): the glob also spans every 'model_info_*' directory; if several
# runs are present the concatenation is not one JSON document -- confirm that
# only one run directory is expected here.
model_info_files = sorted(MODEL_INFO_DIR.glob('model_info_*/part-*.txt'))
print(f"üìÑ Fichiers model_info trouv√©s: {len(model_info_files)}")

# None is the "nothing loaded" marker the later cells test for.
model_info = None

if model_info_files:
    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding; empty files contribute an empty fragment.
    fragments = [f.read_text(encoding='utf-8').strip() for f in model_info_files]
    model_info_json = ''.join(fragments)

    if model_info_json:
        try:
            model_info = json.loads(model_info_json)
        except json.JSONDecodeError as err:
            # Report the problem and leave model_info as None so the cells
            # guarded by `if model_info:` are skipped instead of failing.
            print(f"JSON model_info invalide: {err}")
        else:
            print(f"‚úÖ Informations du mod√®le charg√©es")
    else:
        print("‚ö†Ô∏è  Tous les fichiers sont vides")
else:
    print("‚ùå Aucun fichier model_info trouv√©")

In [None]:
if model_info:
    # Print the fields of the loaded model description, then the variance
    # figures of the first (at most) ten components.
    rule = "=" * 60
    print("\n" + rule)
    print("ü§ñ INFORMATIONS DU MOD√àLE PCA")
    print(rule)

    print(f"\n‚è∞ Timestamp: {model_info['timestamp']}")
    print(f"üìä Composantes PCA: {model_info['pca_components']}")
    print(f"üìê Dimensions originales: {model_info['original_dimensions']}")
    print(f"üìâ Dimensions r√©duites: {model_info['reduced_dimensions']}")
    print(f"üñºÔ∏è  Images trait√©es: {model_info['num_images_processed']:,}")
    total_share = model_info['total_variance_explained']
    print(f"\nüìà Variance totale expliqu√©e: {total_share:.4f} ({total_share*100:.2f}%)")

    print(f"\nüìä Top 10 composantes principales:")
    leading_variances = model_info['variance_by_component'][:10]
    for rank, var in enumerate(leading_variances, start=1):
        # The cumulative value is looked up at the same position as `var`.
        cum_var = model_info['cumulative_variance'][rank - 1]
        print(f"   PC{rank:2d}: {var:.6f} ({var*100:5.2f}%) | Cumul√©e: {cum_var:.6f} ({cum_var*100:5.2f}%)")

In [None]:
# Load the per-component variance table from the CSV part files and order
# its rows by the 'component' column.
variance_files = list(MODEL_INFO_DIR.glob('variance_*/part-*.csv'))
print(f"üìÑ Fichiers variance trouv√©s: {len(variance_files)}")

if not variance_files:
    # None is the "nothing loaded" marker the later cells test for.
    print("‚ùå Aucun fichier de variance trouv√©")
    df_variance = None
else:
    variance_frames = [pd.read_csv(csv_path) for csv_path in variance_files]
    df_variance = (
        pd.concat(variance_frames, ignore_index=True)
        .sort_values('component')
        .reset_index(drop=True)
    )
    print(f"‚úÖ Donn√©es de variance charg√©es: {len(df_variance)} composantes")
    display(df_variance.head(10))

## 6. Visualisations

### 6.1 Variance expliquée par composante

In [None]:
if df_variance is not None:
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

    # Left panel: explained variance of the first ten rows of df_variance.
    top10 = df_variance.head(10)
    ax1.bar(top10['component'], top10['variance_explained'], color='steelblue', edgecolor='black')
    ax1.set_title('Variance expliqu√©e - Top 10 composantes', fontsize=14, fontweight='bold')
    ax1.set_xlabel('Composante principale', fontsize=12)
    ax1.set_ylabel('Variance expliqu√©e', fontsize=12)
    ax1.grid(axis='y', alpha=0.3)
    ax1.set_xticks(top10['component'])

    # Right panel: cumulative variance over all components, with reference
    # lines at 90% and 95%.
    ax2.plot(df_variance['component'], df_variance['cumulative_variance'],
             marker='o', linewidth=2, markersize=4, color='darkgreen')
    ax2.axhline(y=0.90, color='red', linestyle='--', label='90% variance')
    ax2.axhline(y=0.95, color='orange', linestyle='--', label='95% variance')
    ax2.set_title('Variance cumul√©e - Toutes les composantes', fontsize=14, fontweight='bold')
    ax2.set_xlabel('Nombre de composantes', fontsize=12)
    ax2.set_ylabel('Variance cumul√©e', fontsize=12)
    ax2.legend()
    ax2.grid(alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Smallest component value whose cumulative variance reaches each
    # threshold. When no row reaches a threshold the filtered selection is
    # empty and .min() yields NaN, so every count is shown as 'N/A' in that
    # case (previously only the 99% line was protected).
    n_90 = df_variance[df_variance['cumulative_variance'] >= 0.90]['component'].min()
    n_95 = df_variance[df_variance['cumulative_variance'] >= 0.95]['component'].min()
    n_99 = df_variance[df_variance['cumulative_variance'] >= 0.99]['component'].min()

    print(f"\nüìä Analyse de variance:")
    print(f"   ‚Ä¢ {n_90 if not pd.isna(n_90) else 'N/A'} composantes pour capturer 90% de variance")
    print(f"   ‚Ä¢ {n_95 if not pd.isna(n_95) else 'N/A'} composantes pour capturer 95% de variance")
    print(f"   ‚Ä¢ {n_99 if not pd.isna(n_99) else 'N/A'} composantes pour capturer 99% de variance")

### 6.2 Projection 2D (PC1 vs PC2)

In [None]:
if df_pca is not None and 'pca_array' in df_pca.columns:
    def _pc_label_2d(position):
        """Return the axis label for the component at 0-based `position`.

        The explained-variance percentage is appended only when df_variance
        was loaded and has a row at that position; otherwise the label is
        just 'PCn'. This cell is guarded on df_pca only, so df_variance may
        be None here and must not be dereferenced unconditionally.
        """
        name = f'PC{position + 1}'
        if df_variance is None or len(df_variance) <= position:
            return name
        share = df_variance.iloc[position]["variance_explained"]
        return f'{name} ({share*100:.2f}% variance)'

    # First two coordinates of every PCA vector, next to the row's label.
    pca_2d = pd.DataFrame({
        'PC1': [arr[0] for arr in df_pca['pca_array']],
        'PC2': [arr[1] for arr in df_pca['pca_array']],
        'label': df_pca['label']
    })

    fig, ax = plt.subplots(figsize=(14, 10))

    # One scatter call per label so each label gets its own legend entry.
    for label in pca_2d['label'].unique():
        mask = pca_2d['label'] == label
        ax.scatter(pca_2d.loc[mask, 'PC1'],
                   pca_2d.loc[mask, 'PC2'],
                   label=label,
                   alpha=0.7,
                   s=100,
                   edgecolors='black',
                   linewidths=0.5)

    ax.set_title('Projection PCA 2D (PC1 vs PC2) - Color√© par classe',
                 fontsize=16, fontweight='bold')
    ax.set_xlabel(_pc_label_2d(0), fontsize=12)
    ax.set_ylabel(_pc_label_2d(1), fontsize=12)
    # Legend is anchored outside the axes, to the right.
    ax.legend(title='Classe', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(alpha=0.3)

    plt.tight_layout()
    plt.show()

    print(f"\nüìä Statistiques de projection 2D:")
    print(f"   ‚Ä¢ PC1 range: [{pca_2d['PC1'].min():.2f}, {pca_2d['PC1'].max():.2f}]")
    print(f"   ‚Ä¢ PC2 range: [{pca_2d['PC2'].min():.2f}, {pca_2d['PC2'].max():.2f}]")

### 6.3 Projection 3D (PC1, PC2, PC3)

In [None]:
if df_pca is not None and 'pca_array' in df_pca.columns:
    # Not referenced by name below.
    # NOTE(review): presumably kept so that projection='3d' is registered on
    # older matplotlib versions -- confirm before removing.
    from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

    def _pc_label_3d(position):
        """Return the axis label for the component at 0-based `position`.

        The explained-variance percentage is appended only when df_variance
        was loaded and has a row at that position; otherwise the label is
        just 'PCn'. This cell is guarded on df_pca only, so df_variance may
        be None here and must not be dereferenced unconditionally.
        """
        name = f'PC{position + 1}'
        if df_variance is None or len(df_variance) <= position:
            return name
        share = df_variance.iloc[position]["variance_explained"]
        return f'{name} ({share*100:.2f}%)'

    # First three coordinates of every PCA vector, next to the row's label.
    pca_3d = pd.DataFrame({
        'PC1': [arr[0] for arr in df_pca['pca_array']],
        'PC2': [arr[1] for arr in df_pca['pca_array']],
        'PC3': [arr[2] for arr in df_pca['pca_array']],
        'label': df_pca['label']
    })

    fig = plt.figure(figsize=(14, 10))
    ax = fig.add_subplot(111, projection='3d')

    # One colour per label, sampled evenly from the tab10 colormap, and one
    # scatter call per label so each label gets its own legend entry.
    colors = plt.cm.tab10(np.linspace(0, 1, pca_3d['label'].nunique()))
    for i, label in enumerate(pca_3d['label'].unique()):
        mask = pca_3d['label'] == label
        ax.scatter(pca_3d.loc[mask, 'PC1'],
                   pca_3d.loc[mask, 'PC2'],
                   pca_3d.loc[mask, 'PC3'],
                   label=label,
                   alpha=0.7,
                   s=100,
                   c=[colors[i]],
                   edgecolors='black',
                   linewidths=0.5)

    ax.set_title('Projection PCA 3D (PC1, PC2, PC3) - Color√© par classe',
                 fontsize=16, fontweight='bold')
    ax.set_xlabel(_pc_label_3d(0), fontsize=10)
    ax.set_ylabel(_pc_label_3d(1), fontsize=10)
    ax.set_zlabel(_pc_label_3d(2), fontsize=10)
    # Legend is anchored outside the axes, to the right.
    ax.legend(title='Classe', bbox_to_anchor=(1.15, 1), loc='upper left')

    plt.tight_layout()
    plt.show()

    print(f"\nüìä Statistiques de projection 3D:")
    print(f"   ‚Ä¢ PC1 range: [{pca_3d['PC1'].min():.2f}, {pca_3d['PC1'].max():.2f}]")
    print(f"   ‚Ä¢ PC2 range: [{pca_3d['PC2'].min():.2f}, {pca_3d['PC2'].max():.2f}]")
    print(f"   ‚Ä¢ PC3 range: [{pca_3d['PC3'].min():.2f}, {pca_3d['PC3'].max():.2f}]")

### 6.4 Distribution des valeurs PCA

In [None]:
if df_pca is not None and 'pca_array' in df_pca.columns:
    # Table holding the first three coordinates of every PCA vector.
    pc_names = ['PC1', 'PC2', 'PC3']
    pca_components = pd.DataFrame({
        pc_name: [arr[position] for arr in df_pca['pca_array']]
        for position, pc_name in enumerate(pc_names)
    })

    # One histogram per component, laid out on a single row.
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))

    for i, col in enumerate(pc_names):
        ax = axes[i]
        values = pca_components[col]
        ax.hist(values, bins=30, color=f'C{i}', alpha=0.7, edgecolor='black')
        ax.set_title(f'Distribution de {col}', fontsize=14, fontweight='bold')
        ax.set_xlabel('Valeur', fontsize=12)
        ax.set_ylabel('Fr√©quence', fontsize=12)
        ax.grid(axis='y', alpha=0.3)

        # Vertical markers at the mean and at one standard deviation on
        # either side; only the upper marker carries a legend entry.
        mean_val = values.mean()
        std_val = values.std()
        ax.axvline(mean_val, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_val:.2f}')
        ax.axvline(mean_val + std_val, color='orange', linestyle=':', linewidth=2, label=f'¬±1 Std: {std_val:.2f}')
        ax.axvline(mean_val - std_val, color='orange', linestyle=':', linewidth=2)
        ax.legend()

    plt.tight_layout()
    plt.show()

### 6.5 Boxplot des composantes principales par classe

In [None]:
if df_pca is not None and 'pca_array' in df_pca.columns:
    # First three coordinates of every PCA vector, next to the row's label.
    pc_names = ['PC1', 'PC2', 'PC3']
    boxplot_columns = {
        pc_name: [arr[position] for arr in df_pca['pca_array']]
        for position, pc_name in enumerate(pc_names)
    }
    boxplot_columns['label'] = df_pca['label']
    pca_boxplot = pd.DataFrame(boxplot_columns)

    # One panel per component, stacked vertically, each grouped by label.
    # NOTE(review): DataFrame.boxplot(by=...) is believed to add its own
    # figure-level "Boxplot grouped by ..." title -- check the rendered figure
    # and clear it if it overlaps the panel titles.
    fig, axes = plt.subplots(3, 1, figsize=(14, 12))

    for col, ax in zip(pc_names, axes):
        pca_boxplot.boxplot(column=col, by='label', ax=ax, grid=False)
        # Set after the boxplot call so these replace what the call put there.
        ax.set_title(f'Distribution de {col} par classe', fontsize=14, fontweight='bold')
        ax.set_xlabel('Classe', fontsize=12)
        ax.set_ylabel(f'{col} valeur', fontsize=12)
        # plt.xticks acts on the current axes, so make this panel current first.
        plt.sca(ax)
        plt.xticks(rotation=45, ha='right')

    plt.tight_layout()
    plt.show()

## 7. Conclusions

In [None]:
# Final textual summary. Each section is printed only when the object it
# reads from was loaded (non-empty model_info, non-None data frames).
print("="*70)
print("üìù CONCLUSIONS - MODE MINI (300 images)")
print("="*70)

if model_info:
    # Dimensions come from the loaded model description instead of being
    # hard-coded, so the reduction line stays correct for other settings.
    original_dim = model_info['original_dimensions']
    reduced_dim = model_info['reduced_dimensions']
    compression_pct = (1 - reduced_dim / original_dim) * 100

    print(f"\n‚úÖ Pipeline ex√©cut√© avec succ√®s:")
    print(f"   ‚Ä¢ {model_info['num_images_processed']} images trait√©es")
    print(f"   ‚Ä¢ Extraction de features: MobileNetV2 (1280D)")
    print(f"   ‚Ä¢ R√©duction PCA: {original_dim}D ‚Üí {reduced_dim}D ({compression_pct:.1f}% compression)")
    print(f"   ‚Ä¢ Variance totale pr√©serv√©e: {model_info['total_variance_explained']*100:.2f}%")

if df_metadata is not None:
    print(f"\nüìä Donn√©es analys√©es:")
    print(f"   ‚Ä¢ Nombre de classes: {df_metadata['label'].nunique()}")
    # str() each label: str.join raises TypeError on non-string values.
    print(f"   ‚Ä¢ Classes: {', '.join(map(str, df_metadata['label'].unique()))}")

if df_variance is not None:
    # Smallest component value whose cumulative variance reaches each
    # threshold; .min() of an empty selection is NaN, shown as 'N/A'.
    n_90 = df_variance[df_variance['cumulative_variance'] >= 0.90]['component'].min()
    n_95 = df_variance[df_variance['cumulative_variance'] >= 0.95]['component'].min()

    print(f"\nüí° Insights cl√©s:")
    print(f"   ‚Ä¢ Les 2 premi√®res composantes (PC1, PC2) capturent "
          f"{df_variance.iloc[:2]['cumulative_variance'].max()*100:.2f}% de variance")
    print(f"   ‚Ä¢ Seulement {n_90 if not pd.isna(n_90) else 'N/A'} composantes suffisent pour 90% de variance")
    print(f"   ‚Ä¢ {n_95 if not pd.isna(n_95) else 'N/A'} composantes pour 95% de variance")
    print(f"   ‚Ä¢ Les features PCA permettent une s√©paration visuelle des classes")

print(f"\nüöÄ Prochaines √©tapes:")
print(f"   1. Ex√©cuter le pipeline en mode 'apples' (~6,400 images)")
print(f"   2. Ex√©cuter le pipeline en mode 'full' (~67,000 images)")
print(f"   3. Comparer les r√©sultats des 3 modes")
print(f"   4. Entra√Æner un mod√®le de classification sur les features PCA")
print(f"   5. √âvaluer la performance: PCA 50D vs features brutes 1280D")

print("\n" + "="*70)