# Preparación de Dataset para Segmentación de Tumores Cerebrales

## Datasets Recomendados
1. **BraTS 2023**: Estándar de oro para segmentación
2. **LGG Segmentation**: Dataset de Kaggle para gliomas
3. **Brain Tumor Classification**: Mú

In [None]:
import os
import numpy as np
import pandas as pd
import nibabel as nib
import cv2
import matplotlib.pyplot as plt
from pathlib import Path
import zipfile
import requests
from sklearn.model_selection import train_test_split
import pydicom
from PIL import Image

# Configuration: raw datasets are read from DATASET_DIR and preprocessed
# outputs are written under PROCESSED_DIR (both relative to the working dir).
DATASET_DIR = Path("datasets")
PROCESSED_DIR = Path("processed_data")
# Create both folders up front; exist_ok=True keeps cell re-runs from failing.
DATASET_DIR.mkdir(exist_ok=True)
PROCESSED_DIR.mkdir(exist_ok=True)

In [None]:
def download_lgg_dataset():
    """Print manual download steps for the LGG dataset and check for it.

    Nothing is actually downloaded: the function prints the instructions
    and then reports whether ``DATASET_DIR / "lgg"`` exists.

    Returns:
        True when the path exists, False otherwise.
    """
    # Manual-download instructions, printed one per line
    steps = (
        "Para descargar el LGG Dataset:",
        "1. Ve a: https://www.kaggle.com/mateuszbuda/lgg-mri-segmentation",
        "2. Descarga el archivo lgg-mri-segmentation.zip",
        "3. Extrae en la carpeta 'datasets/lgg/'",
    )
    for step in steps:
        print(step)

    target = DATASET_DIR / "lgg"
    if not target.exists():
        print(f"✗ Dataset LGG no encontrado en: {target}")
        return False

    print(f"✓ Dataset LGG encontrado en: {target}")
    return True

# Check that the dataset is present. The call parentheses are required: a bare
# name only evaluates to the function object and never runs the check.
download_lgg_dataset()

In [None]:
def analyze_lgg_dataset():
    """Summarize the LGG dataset, one row per patient directory.

    Every sub-directory of ``DATASET_DIR / "lgg"`` is treated as a patient.
    Files matching ``*_mask.tif`` are counted as masks; the remaining
    ``*.tif`` files are counted as images. Totals are printed.

    Returns:
        A DataFrame with columns ``patient_id``, ``num_images``,
        ``num_masks`` and ``path`` (empty when no patient directory is
        found), or None when the dataset directory does not exist.
    """
    lgg_path = DATASET_DIR / "lgg"

    if not lgg_path.exists():
        print("Dataset LGG no encontrado")
        return None

    patients = []
    # Sorted so the summary order is reproducible across runs and platforms
    for patient_dir in sorted(lgg_path.iterdir()):
        if not patient_dir.is_dir():
            continue

        # "*.tif" also matches the mask files, so subtract them below
        num_tifs = sum(1 for _ in patient_dir.glob("*.tif"))
        num_masks = sum(1 for _ in patient_dir.glob("*_mask.tif"))

        patients.append({
            'patient_id': patient_dir.name,
            'num_images': num_tifs - num_masks,
            'num_masks': num_masks,
            'path': str(patient_dir)
        })

    # Explicit columns keep the frame well-formed even with zero patients;
    # without them an empty list yields no columns and the lookups below fail.
    df = pd.DataFrame(
        patients,
        columns=['patient_id', 'num_images', 'num_masks', 'path'],
    )
    print(f"Total de pacientes: {len(df)}")

    if df.empty:
        # No statistics to report; skip the mean/sum lines
        print(f"No se encontraron carpetas de pacientes en: {lgg_path}")
        return df

    print(f"Promedio de imágenes por paciente: {df['num_images'].mean():.1f}")
    print(f"Total de imágenes: {df['num_images'].sum()}")
    print(f"Total de máscaras: {df['num_masks'].sum()}")

    return df

# Analyze the dataset and preview the first rows of the per-patient summary
df_patients = analyze_lgg_dataset()
# None means the dataset directory was missing, so there is nothing to show
if df_patients is not None:
    # NOTE(review): `display` is not defined in this file; presumably the
    # notebook-provided (IPython) helper -- confirm when running as a script.
    display(df_patients.head())

In [None]:
def visualize_samples(num_samples=3):
    """Plot image, mask and overlay for up to ``num_samples`` patients.

    One figure row is drawn per patient directory found under
    ``DATASET_DIR / "lgg"``: the grayscale image, its ground-truth mask,
    and the image with mask pixels painted white.

    Args:
        num_samples: Number of figure rows (patients) to show.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    lgg_path = DATASET_DIR / "lgg"

    if not lgg_path.exists():
        print("Dataset no encontrado")
        return

    # Sorted so the same patients are shown on every run
    patient_dirs = sorted(d for d in lgg_path.iterdir() if d.is_dir())

    # squeeze=False keeps `axes` two-dimensional even for a single row
    fig, axes = plt.subplots(
        num_samples, 3, figsize=(15, 5 * num_samples), squeeze=False
    )

    # Hide every panel up front so rows that end up without data stay blank
    for ax in axes.ravel():
        ax.axis('off')

    # zip stops at the shorter of (rows, patients)
    for row_axes, patient_dir in zip(axes, patient_dirs):
        # Pair each image with its own mask via the file stem. Picking from
        # two independent glob lists can pair an image with another slice's
        # mask, because glob order is arbitrary.
        pairs = []
        for img_path in sorted(patient_dir.glob("*.tif")):
            if "_mask" in img_path.name:
                continue
            mask_path = patient_dir / f"{img_path.stem}_mask.tif"
            if mask_path.exists():
                pairs.append((img_path, mask_path))

        if not pairs:
            continue

        # Middle entry of the (lexicographically) sorted pairs
        img_path, mask_path = pairs[len(pairs) // 2]

        image = cv2.imread(str(img_path), cv2.IMREAD_GRAYSCALE)
        mask = cv2.imread(str(mask_path), cv2.IMREAD_GRAYSCALE)

        # cv2.imread signals an unreadable file by returning None
        if image is None or mask is None:
            print(f"No se pudo leer la imagen o la máscara: {img_path}")
            continue

        row_axes[0].imshow(image, cmap='gray')
        row_axes[0].set_title(f'Imagen Original\n{patient_dir.name}')

        row_axes[1].imshow(mask, cmap='gray')
        row_axes[1].set_title('Máscara Ground Truth')

        # Overlay: paint every non-zero mask pixel white on a copy of the image
        overlay = image.copy()
        overlay[mask > 0] = 255
        row_axes[2].imshow(overlay, cmap='gray')
        row_axes[2].set_title('Overlay')

    plt.tight_layout()
    plt.show()

# Visualize samples: request a 3-row figure (one row per patient)
visualize_samples(3)

In [None]:
def preprocess_lgg_dataset(target_size=(256, 256)):
    """Preprocess the LGG dataset into per-slice ``.npy`` files for training.

    For every patient directory under ``DATASET_DIR / "lgg"``, each image
    that has a sibling ``<stem>_mask.tif`` is loaded as grayscale, resized to
    ``target_size``, scaled to ``[0, 1]`` and saved together with its
    binarized mask under ``PROCESSED_DIR / "lgg_processed" / <patient>``.
    An index of all saved pairs is written to ``dataset_index.csv``.

    Args:
        target_size: Output size passed to ``cv2.resize`` as ``dsize``
            (width, height). Defaults to ``(256, 256)``.

    Returns:
        A DataFrame with columns ``patient_id``, ``image_path``,
        ``mask_path`` and ``slice_idx`` (one row per saved pair), or None
        when the dataset directory does not exist.
    """
    lgg_path = DATASET_DIR / "lgg"

    # Check the input first so a missing dataset leaves no empty output folder
    if not lgg_path.exists():
        print("Dataset LGG no encontrado")
        return None

    output_path = PROCESSED_DIR / "lgg_processed"
    output_path.mkdir(parents=True, exist_ok=True)

    all_data = []

    # Sorted so the index has a reproducible order across runs and platforms
    for patient_dir in sorted(lgg_path.iterdir()):
        if not patient_dir.is_dir():
            continue

        print(f"Procesando paciente: {patient_dir.name}")

        images = sorted(
            f for f in patient_dir.glob("*.tif") if "_mask" not in f.name
        )
        patient_output = output_path / patient_dir.name

        for i, img_path in enumerate(images):
            # The mask shares the image's stem with a "_mask" suffix
            mask_path = patient_dir / f"{img_path.stem}_mask.tif"
            if not mask_path.exists():
                continue

            image = cv2.imread(str(img_path), cv2.IMREAD_GRAYSCALE)
            mask = cv2.imread(str(mask_path), cv2.IMREAD_GRAYSCALE)

            # cv2.imread returns None for unreadable files; skip the pair
            # instead of letting cv2.resize fail on it.
            if image is None or mask is None:
                print(f"  Omitiendo {img_path.name}: no se pudo leer la imagen o la máscara")
                continue

            # Resize to the common size
            image_resized = cv2.resize(image, target_size)
            mask_resized = cv2.resize(mask, target_size)

            # Scale the image to [0, 1]; threshold the mask into {0, 1}
            image_normalized = image_resized / 255.0
            mask_binary = (mask_resized > 127).astype(np.uint8)

            # Create the patient folder only once there is something to save
            patient_output.mkdir(exist_ok=True)

            # `i` is the image's position in the sorted list, so skipped
            # slices leave gaps in the numbering
            img_output = patient_output / f"image_{i:03d}.npy"
            mask_output = patient_output / f"mask_{i:03d}.npy"

            np.save(img_output, image_normalized)
            np.save(mask_output, mask_binary)

            all_data.append({
                'patient_id': patient_dir.name,
                'image_path': str(img_output),
                'mask_path': str(mask_output),
                'slice_idx': i
            })

    # Explicit columns so the CSV keeps its header even when nothing was saved
    df_data = pd.DataFrame(
        all_data,
        columns=['patient_id', 'image_path', 'mask_path', 'slice_idx'],
    )
    df_data.to_csv(output_path / "dataset_index.csv", index=False)

    print(f"Dataset procesado guardado en: {output_path}")
    print(f"Total de muestras: {len(df_data)}")

    return df_data

# Procesar dataset
df_processed = preprocess_lgg_dataset()
if df_processed is not None:
    display(