# Explore here

In [4]:
# Your code here
import os
import shutil
import cv2
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
from sklearn.model_selection import train_test_split
from src.utils import create_directory_structure
from src.utils import plot_image_samples, plot_class_distribution
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPool2D, Flatten, Dense
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# 1. Set up the directory structure
# RAW_PATH is the folder the raw images are read from; INTERIM_PATH is the
# root under which create_directory_structure() builds the working folders.
RAW_PATH = "../data/raw/train/"
INTERIM_PATH = "../data/interim/"
create_directory_structure(INTERIM_PATH)

# 2. Process images (resize to 200x200)
def process_images(source_path, dest_path, target_size=(200, 200)):
    """Resize every readable image in ``source_path`` and file it by class.

    An image is written to ``dest_path/cats`` when its filename contains
    "cat" (case-insensitive) and to ``dest_path/dogs`` otherwise, keeping its
    original filename. Entries that OpenCV cannot decode are skipped.

    Args:
        source_path: Directory holding the raw images.
        dest_path: Directory that receives the ``cats`` and ``dogs`` folders;
            both are created when missing.
        target_size: Size tuple handed to ``cv2.resize``.

    Raises:
        OSError: If a resized image cannot be written to disk.
    """
    # cv2.imwrite reports failure by returning False rather than raising, so
    # a missing class folder would silently drop every image. Create both
    # folders up front.
    class_dirs = {
        label: os.path.join(dest_path, label) for label in ('cats', 'dogs')
    }
    for class_dir in class_dirs.values():
        os.makedirs(class_dir, exist_ok=True)

    for filename in os.listdir(source_path):
        img = cv2.imread(os.path.join(source_path, filename))
        if img is None:
            # Not a decodable image (or not a file at all): nothing to do.
            continue

        img = cv2.resize(img, target_size)
        # Case-insensitive so that e.g. "Cat.1.jpg" is not filed under dogs.
        class_folder = 'cats' if 'cat' in filename.lower() else 'dogs'
        target = os.path.join(class_dirs[class_folder], filename)
        if not cv2.imwrite(target, img):
            raise OSError(f"Could not write resized image to {target}")

# 3. Split into training/validation (80/20)
def split_dataset(base_path):
    """Move each class's images into 80/20 train and validation folders.

    For every class folder (``cats``, ``dogs``) directly under ``base_path``
    the files are split with ``train_test_split`` (``test_size=0.2``,
    ``random_state=42``) and moved to ``base_path/train/<class>`` or
    ``base_path/validation/<class>``. A class folder holding fewer than two
    files cannot be split and is left untouched, which also makes re-running
    the function after a completed split a no-op.

    Args:
        base_path: Directory containing the ``cats`` and ``dogs`` folders.
    """
    for animal in ['cats', 'dogs']:
        class_dir = os.path.join(base_path, animal)
        destinations = {
            subset: os.path.join(base_path, subset, animal)
            for subset in ('train', 'validation')
        }
        for folder in destinations.values():
            os.makedirs(folder, exist_ok=True)

        # os.listdir returns entries in arbitrary order; sorting is what
        # makes the seeded split reproducible across runs and machines.
        files = sorted(
            name for name in os.listdir(class_dir)
            if os.path.isfile(os.path.join(class_dir, name))
        )

        # train_test_split raises ValueError when either side would end up
        # empty (e.g. the files were already moved by a previous run).
        if len(files) < 2:
            continue

        train, val = train_test_split(files, test_size=0.2, random_state=42)

        for subset, names in (('train', train), ('validation', val)):
            for name in names:
                shutil.move(
                    os.path.join(class_dir, name),
                    os.path.join(destinations[subset], name),
                )

# Run the processing: resize the raw images into the "full" folder, then
# split that folder's class folders into train/validation subsets.
process_images(RAW_PATH, os.path.join(INTERIM_PATH, "full"))
split_dataset(os.path.join(INTERIM_PATH, "full"))

ModuleNotFoundError: No module named 'src'

In [None]:
# 1. Class distribution
# Counts the files per class in the train and validation folders and plots
# them as a bar chart.
plot_class_distribution(
    train_path="../data/interim/full/train",
    val_path="../data/interim/full/validation"
)

# 2. Visual samples
# 2x5 grid: the first row shows cat samples, the second row dog samples.
fig, ax = plt.subplots(2, 5, figsize=(20, 8))
plot_image_samples("../data/interim/full/train/cats", ax[0], 'Cats (Train)')
plot_image_samples("../data/interim/full/train/dogs", ax[1], 'Dogs (Train)')
plt.tight_layout()
# NOTE(review): assumes the ../reports folder already exists - confirm.
plt.savefig("../reports/sample_images.png")
plt.show()

# 3. Size statistics (original images)
def check_original_sizes(path):
    """Scatter-plot the width and height of every readable image in ``path``.

    Entries that OpenCV cannot decode are ignored. The figure is saved to
    ``../reports/image_dimensions.png``.

    Args:
        path: Directory containing the images to measure.
    """
    heights = []
    widths = []
    for entry in os.listdir(path):
        image = cv2.imread(os.path.join(path, entry))
        if image is None:
            continue
        # The first two entries of the shape are (height, width).
        rows, cols = image.shape[:2]
        heights.append(rows)
        widths.append(cols)

    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=widths, y=heights)
    plt.title("Original Image Dimensions")
    plt.xlabel("Width")
    plt.ylabel("Height")
    plt.savefig("../reports/image_dimensions.png")

# Measure the raw (not yet resized) images and save the dimension scatter plot.
check_original_sizes(RAW_PATH)

In [None]:
def create_directory_structure(base_path):
    """Create the ``full`` working tree under ``base_path``.

    The tree holds a ``cats`` and a ``dogs`` folder directly under ``full``
    as well as under ``full/train`` and ``full/validation``. Folders that
    already exist are left as they are.

    Args:
        base_path: Directory under which the ``full`` tree is created.
    """
    root = os.path.join(base_path, "full")
    # An empty tuple stands for "directly under full".
    parents = ((), ("train",), ("validation",))
    for parent in parents:
        for label in ("cats", "dogs"):
            os.makedirs(os.path.join(root, *parent, label), exist_ok=True)

def plot_class_distribution(train_path, val_path):
    """Plot how many files each class has in the train and validation sets.

    The bar chart is saved to ``../reports/class_distribution.png`` and then
    shown.

    Args:
        train_path: Directory containing the ``cats`` and ``dogs`` training
            folders.
        val_path: Directory containing the ``cats`` and ``dogs`` validation
            folders.
    """
    # Insertion order of this mapping is the left-to-right order of the bars.
    folders = {
        'Train Cats': os.path.join(train_path, "cats"),
        'Train Dogs': os.path.join(train_path, "dogs"),
        'Val Cats': os.path.join(val_path, "cats"),
        'Val Dogs': os.path.join(val_path, "dogs"),
    }
    counts = [len(os.listdir(folder)) for folder in folders.values()]

    plt.figure(figsize=(10, 6))
    sns.barplot(x=list(folders), y=counts)
    plt.title("Class Distribution")
    plt.ylabel("Count")
    plt.savefig("../reports/class_distribution.png")
    plt.show()

def plot_image_samples(path, axes, title):
    """Show sample ``.jpg`` images from ``path``, one per entry of ``axes``.

    Images are taken in sorted filename order, as many as there are axes.
    Every axis has its ticks and frame hidden, including axes left without an
    image when the folder holds fewer images than there are axes. The title
    is placed on the first axis.

    Args:
        path: Directory to take the sample images from.
        axes: Sequence of matplotlib axes (e.g. one row of ``plt.subplots``).
        title: Title set on the first axis.
    """
    # Sorted so the same samples are shown on every run.
    files = sorted(f for f in os.listdir(path) if f.endswith('.jpg'))

    # Driving the loop by the axes (not a fixed count) avoids an IndexError
    # when fewer axes than images are supplied.
    for index, axis in enumerate(axes):
        if index < len(files):
            axis.imshow(mpimg.imread(os.path.join(path, files[index])))
        axis.axis('off')

    # len() rather than truthiness: axes may be a NumPy array.
    if len(axes) > 0:
        axes[0].set_title(title, fontsize=12)

In [None]:
# 1. Data generators
# Training images are rescaled to [0, 1] and augmented (rotation, zoom,
# horizontal flip); validation images are only rescaled.
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    zoom_range=0.2,
    horizontal_flip=True
)

val_datagen = ImageDataGenerator(rescale=1./255)

# class_mode='categorical' pairs with the 2-unit softmax output and the
# categorical_crossentropy loss used below.
train_generator = train_datagen.flow_from_directory(
    "../data/interim/full/train",
    target_size=(200, 200),
    batch_size=32,
    class_mode='categorical'
)

validation_generator = val_datagen.flow_from_directory(
    "../data/interim/full/validation",
    target_size=(200, 200),
    batch_size=32,
    class_mode='categorical'
)

# 2. Model construction (adjusted to 200x200 inputs)
# NOTE(review): with only the two pooling stages written here, the feature map
# entering Flatten would be 50x50x128 (320,000 values), which makes the first
# Dense(1024) layer hold roughly 328M weights - confirm this is intended or
# add the remaining convolution/pooling stages mentioned in the placeholder.
model = Sequential([
    Conv2D(64, (3,3), activation='relu', padding='same', input_shape=(200,200,3)),
    Conv2D(64, (3,3), activation='relu', padding='same'),
    MaxPool2D(2,2),
    
    Conv2D(128, (3,3), activation='relu', padding='same'),
    Conv2D(128, (3,3), activation='relu', padding='same'),
    MaxPool2D(2,2),
    
    # ... continue with a similar architecture, adjusting sizes ...
    
    Flatten(),
    Dense(1024, activation='relu'),
    Dense(512, activation='relu'),
    Dense(2, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# 3. Callbacks and training
# The checkpoint keeps the model with the highest validation accuracy, while
# early stopping watches validation loss; the two monitor different metrics.
checkpoint = ModelCheckpoint(
    '../models/best_model.h5',
    monitor='val_accuracy',
    save_best_only=True,
    mode='max'
)

early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

history = model.fit(
    train_generator,
    epochs=30,
    validation_data=validation_generator,
    callbacks=[checkpoint, early_stop]
)

# Save the final model
model.save('../models/final_model.h5')

In [None]:
# 1. Accuracy/loss curves
# Left panel: train vs. validation accuracy; right panel: train vs.
# validation loss, both read from the History object returned by model.fit.
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Training Accuracy')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Training Loss')
plt.legend()

plt.savefig('../reports/training_history.png')
plt.show()

# 2. Final evaluation
# NOTE(review): these "test" figures are computed on validation_generator,
# i.e. the same data used for checkpointing and early stopping, not on a
# held-out test set.
test_loss, test_acc = model.evaluate(validation_generator)
print(f'\nTest accuracy: {test_acc:.4f}, Test loss: {test_loss:.4f}')

# 3. Confusion matrix
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

y_true = []
y_pred = []

# Batches are fetched by index so that labels and predictions come from the
# same batch. NOTE(review): indexing (rather than iterating the generator
# directly) is presumably needed because the generator loops indefinitely -
# confirm against the Keras documentation.
for i in range(len(validation_generator)):
    X, y = validation_generator[i]
    # Labels and predictions are per-class vectors; argmax gives the class index.
    y_true.extend(np.argmax(y, axis=1))
    y_pred.extend(np.argmax(model.predict(X), axis=1))

print(classification_report(y_true, y_pred))
cm = confusion_matrix(y_true, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.savefig('../reports/confusion_matrix.png') 