In [6]:

import os
import numpy as np
import pandas as pd
from PIL import Image
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')
from pathlib import Path

In [7]:
PROJECT_ROOT = (Path.cwd() / ".." / "..").resolve()
DATA_PATH = PROJECT_ROOT / "datasets"
ORIGINAL_DATA = DATA_PATH / "raw" / "images"
PROCESSED_DATA = DATA_PATH / "processed" / "images"

In [8]:
def analyze_images_basic(folder_path):
    """
    Analiza las características básicas de todas las imágenes en una carpeta
    """
    extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp', '.gif')
    image_paths = []
    
    # Buscar todas las imágenes
    for ext in extensions:
        image_paths.extend(Path(folder_path).rglob(f'*{ext}'))
        image_paths.extend(Path(folder_path).rglob(f'*{ext.upper()}'))
    
    print(f"Encontradas {len(image_paths)} imágenes en {folder_path}")
    
    # Lista para almacenar datos
    image_data = []
    
    for img_path in image_paths:
        try:
            with Image.open(img_path) as img:
                # Información básica
                width, height = img.size
                mode = img.mode
                format_img = img.format
                
                # Calcular aspectos adicionales
                aspect_ratio = width / height
                total_pixels = width * height
                
                # Detectar si es cuadrada, horizontal o vertical
                orientation = 'Cuadrada' if width == height else ('Horizontal' if width > height else 'Vertical')
                
                image_data.append({
                    'archivo': img_path.name,
                    'ancho': width,
                    'alto': height,
                    'aspect_ratio': aspect_ratio,
                    'total_pixels': total_pixels,
                    'orientacion': orientation,
                    'modo_color': mode,
                    'formato': format_img,
                    'ruta': str(img_path)
                })
                
        except Exception as e:
            print(f"Error procesando {img_path}: {e}")
    
    return pd.DataFrame(image_data)

In [9]:
df = analyze_images_basic(PROCESSED_DATA / "train" / "images")


# 
# Mostrar estadísticas básicas
print("=== RESUMEN DEL DATASET ===")
print(f"Total de imágenes: {len(df)}")
print(f"\nFormatos encontrados:")
print(df['formato'].value_counts())
print(f"\nModos de color:")
print(df['modo_color'].value_counts())
print(f"\nOrientaciones:")
print(df['orientacion'].value_counts())

# %% Estadísticas de dimensiones
print(f"\n=== ESTADÍSTICAS DE DIMENSIONES ===")
print(f"Ancho - Min: {df['ancho'].min()}, Max: {df['ancho'].max()}, Promedio: {df['ancho'].mean():.1f}")
print(f"Alto - Min: {df['alto'].min()}, Max: {df['alto'].max()}, Promedio: {df['alto'].mean():.1f}")
print(f"Aspect Ratio - Min: {df['aspect_ratio'].min():.2f}, Max: {df['aspect_ratio'].max():.2f}, Promedio: {df['aspect_ratio'].mean():.2f}")

# %% Dimensiones más comunes
print(f"\n=== DIMENSIONES MÁS COMUNES ===")
df['dimension'] = df['ancho'].astype(str) + 'x' + df['alto'].astype(str)
top_dimensions = df['dimension'].value_counts().head(10)
print(top_dimensions)


Encontradas 2936 imágenes en C:\Users\Sebastian\Desktop\MachineLearningPC2\TF_Machine_Learning_1888\datasets\processed\images\train\images
=== RESUMEN DEL DATASET ===
Total de imágenes: 2936

Formatos encontrados:
formato
JPEG    2936
Name: count, dtype: int64

Modos de color:
modo_color
L    2936
Name: count, dtype: int64

Orientaciones:
orientacion
Cuadrada    2936
Name: count, dtype: int64

=== ESTADÍSTICAS DE DIMENSIONES ===
Ancho - Min: 1024, Max: 1024, Promedio: 1024.0
Alto - Min: 1024, Max: 1024, Promedio: 1024.0
Aspect Ratio - Min: 1.00, Max: 1.00, Promedio: 1.00

=== DIMENSIONES MÁS COMUNES ===
dimension
1024x1024    2936
Name: count, dtype: int64
