<a href="https://colab.research.google.com/github/Eggochi/Proyecto_Galaxias/blob/main/Co_training_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [50]:
#from google.colab import drive
#drive.mount('/content/drive')
#!unzip /content/drive/MyDrive/Proyecto_Galaxias/smallGZ1.zip -d /content/Imagenes

In [51]:
import torch
import pandas as pd
import os
import numpy as np

from sklearn.model_selection import train_test_split

from torch.utils.data import Dataset, Subset , DataLoader, SubsetRandomSampler

from torchvision.datasets.folder import default_loader, IMG_EXTENSIONS, has_file_allowed_extension
from torchvision.ops import sigmoid_focal_loss
from sklearn.metrics import f1_score
import torch.nn as nn
from torchvision import datasets, models, transforms
from torch import optim
from tqdm import tqdm
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torch.nn.functional as F
from copy import deepcopy

In [52]:
# ======================
# CONFIGURACIÓN
# ======================
#Datos
data_dir = '/content/Imagenes/smallGZ1'  # Cambia esta ruta a donde tengas tus imagenes
cvs_file = '/content/drive/MyDrive/Proyecto_Galaxias/Copia de smallCSV_class.csv'  # Ruta al archivo CSV con rutas y etiquetas
df=pd.read_csv(cvs_file)
num_classes= 6

partition_size = 0.2  # Proporción para validación y test

#Entrenamiento
batch_size = 16     # Ajusta si te da "out of memory" (usa 8 o 12)
max_epochs = 25
learning_rate = 0.0005
k_folds = 5
ft_parameters=['features.5','features.6','features.7','features.8']

#Semi Suprervisado
thresholds = {
    0: 0.92,  # clase dominante
    3: 0.90,
    2: 0.85,
    1: 0.85,
    5: 0.80,
    4: 0.75   # clase más escasa
}

#scheduler
scheduler_patience = 1  # Número de épocas sin mejora antes de reducir LR
scheduler_factor = 0.5   # Factor de reducción del LR
min_lr = 1e-6           # LR mínimo

#Early Stopping
patience = 5        # Número de épocas sin mejora antes de parar
min_delta = 0.001   # Mejora mínima para considerar que hay progreso

#dispositivo
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Usando dispositivo: {device}")

Usando dispositivo: cuda


In [53]:
def Dividir_datos(data, test_size: float = 0.2, validation: bool = True, random_state: int = 42
                 ):
    """
    Lee CSV y divide en train / (val, test).
    Si validation=True: primero separa train / temp (test_size),
    luego divide temp en validation y test a partes iguales.
    Devuelve (train_df, val_df, test_df).
    """
    df = data
    if df.shape[0] == 0:
        raise ValueError(f"CSV vacío: {ruta_csv}")

    y=df.iloc[:,-1].values

    datos_entrenamiento, datos_prueba = train_test_split(df, test_size=test_size, random_state=random_state, shuffle=True,stratify=y)
    if validation:
        # dividir el 'prueba' en validación y prueba (mitades)
        datos_validacion, datos_prueba = train_test_split(datos_prueba, test_size=0.5, random_state=random_state, shuffle=True)
        print(f"Datos de entrenamiento: {len(datos_entrenamiento)}, Datos de validacion {len(datos_validacion)}, Datos de prueba {len(datos_prueba)}")
        return datos_entrenamiento.reset_index(drop=True), datos_validacion.reset_index(drop=True), datos_prueba.reset_index(drop=True)

    print(f"Datos de entrenamiento: {len(datos_entrenamiento)}, Datos de prueba: {len(datos_prueba)}")
    return datos_entrenamiento.reset_index(drop=True), datos_prueba.reset_index(drop=True)

In [54]:
class CSV_Dataset(Dataset):
    def __init__(self, root, dataframe,filename_col=None, label_col=None, transform=None,
                 loader=default_loader, extensions=IMG_EXTENSIONS):
        """
        Dataset a partir de un DataFrame de pandas.
        - root: ruta base de las imágenes
        - dataframe: objeto pd.DataFrame con columnas de ruta y etiqueta
        - filename_col / label_col: nombres de columnas (si no se especifican, usa primera y última)
        """
        self.root = root
        self.loader = loader
        self.transform = transform

        df = dataframe
        if df.shape[1] < 2:
            raise ValueError("El DataFrame debe tener al menos 2 columnas (filename y clase).")

        # Columnas a usar
        filename_col = filename_col or df.columns[0]
        label_col = label_col or df.columns[-1]

        # Clases y mapeo
        self.classes = df[label_col].value_counts().index.tolist()
        self.class_to_idx = {cls: i for i, cls in enumerate(self.classes)}

        # Construir lista de muestras válidas
        self.samples = []
        for fname, label in zip(df[filename_col], df[label_col]):
            if pd.isna(fname) or pd.isna(label):
                continue
            path = os.path.join(root, f"{fname}.jpeg")
            if os.path.exists(path) and has_file_allowed_extension(path, extensions):
                self.samples.append((path, self.class_to_idx[label]))

        self.targets = [s[1] for s in self.samples]

    def add_samples(self, paths, labels):
        """
        Agrega nuevas muestras (por ejemplo, pseudolabels) al dataset.
        - paths: lista de rutas de imágenes
        - labels: lista de etiquetas numéricas
        """
        if len(paths) != len(labels):
            raise ValueError("El número de paths y labels debe coincidir.")
        for path, label in zip(paths, labels):
            if os.path.exists(path):
                self.samples.append((path, label))
                self.targets.append(label)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        path, target = self.samples[idx]
        image = self.loader(path)
        if self.transform:
            image = self.transform(image)
        return image, target


class SubsetView(Dataset):
    def __init__(self, base_dataset, indices, labels=None):
        self.base = base_dataset
        self.indices = list(indices)
        self.labels = list(labels) if labels is not None else [-1]*len(indices)

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, i):
        idx = self.indices[i]
        image, _ = self.base[idx]
        label = self.labels[i]
        return image, label

    def add_samples(self, new_indices, new_labels):
        self.indices.extend(new_indices)
        self.labels.extend(new_labels)

    def remove_indices(self, remove_indices):
        keep = [i for i, idx in enumerate(self.indices) if idx not in remove_indices]
        self.indices = [self.indices[i] for i in keep]
        self.labels = [self.labels[i] for i in keep]

In [55]:
import torchvision.models as models
import torch.nn as nn # Added import statement

# ======================
# MODELO: ConvNeXt-Tiny
# ======================

# Load a pre-trained ConvNeXt-Tiny model
convnext_model = models.convnext_tiny(weights=models.ConvNeXt_Tiny_Weights.IMAGENET1K_V1)

# Get the number of input features for the classifier
num_ftrs = convnext_model.classifier[2].in_features

# Replace the classifier with a new one
convnext_model.classifier[2] = nn.Sequential(
    nn.Dropout(0.5), # Added dropout layer
    nn.Linear(num_ftrs, num_classes)
)

# Move the model to the device
convnext_model = convnext_model.to(device)

print("ConvNeXt-Tiny model with modified final layer:")
print(convnext_model.classifier)

# Freeze all parameters in the network
for param in convnext_model.parameters():
    param.requires_grad = False

# Unfreeze the parameters in the classifier and the last few blocks
for name, param in convnext_model.named_parameters():
    if 'classifier' in name or 'features.7' in name or 'features.6' in name: # Adjust layer names based on ConvNeXt architecture
        param.requires_grad = True

ConvNeXt-Tiny model with modified final layer:
Sequential(
  (0): LayerNorm2d((768,), eps=1e-06, elementwise_affine=True)
  (1): Flatten(start_dim=1, end_dim=-1)
  (2): Linear(in_features=768, out_features=6, bias=True)
)


In [56]:
# ======================
# MODELO: EfficientNetNet-V2-S
# ======================

efficientnetV2_model = models.efficientnet_v2_s(weights=models.EfficientNet_V2_S_Weights.DEFAULT)

# Reemplazar la capa final (classifier) and add dropout
num_features = efficientnetV2_model.classifier[1].in_features
efficientnetV2_model.classifier = nn.Sequential(
    nn.Dropout(0.5), # Added dropout layer
    nn.Linear(num_features, num_classes)
)
efficientnetV2_model = efficientnetV2_model.to(device)

# Congelar capas base, descongelando las últimas capas y el clasificador
# Note: Layer names may vary between EfficientNet versions.
# You might need to inspect model.named_parameters() to fine-tune unfreezing.
ft_parameters = ['features.5','features.6','features.7','features.8'] # These may need adjustment for V2-S

for name, param in efficientnetV2_model.named_parameters():
    if 'classifier' in name:
        param.requires_grad = True
    elif any(ft_param in name for ft_param in ft_parameters):
        param.requires_grad = True
    else:
        param.requires_grad = False


# ======================
# MODELO: EfficientNetB0
# ======================

# efficientnetB0_model= models.efficientnet_b0(weights=models.EfficientNet_B0_Weights.DEFAULT)

# # Reemplazar la capa final (classifier) and add dropout
# num_features = efficientnetB0_model.classifier[1].in_features
# efficientnetB0_model.classifier = nn.Sequential(
#     nn.Dropout(0.5), # Added dropout layer
#     nn.Linear(num_features, num_classes)
# )
# efficientnetB0_model = efficientnetB0_model.to(device)

# # Congelar capas base, descongelando las últimas capas y el clasificador
# # Note: Layer names may vary between EfficientNet versions.
# # You might need to inspect model.named_parameters() to fine-tune unfreezing.
# ft_parameters = ['features.5','features.6','features.7','features.8']

# for name, param in efficientnetB0_model.named_parameters():
#     if 'classifier' in name:
#         param.requires_grad = True
#     elif any(ft_param in name for ft_param in ft_parameters):
#         param.requires_grad = True
#     else:
#         param.requires_grad = False

In [57]:
# ========================================
# FUNCIÓN DE PÉRDIDA, OPTIMIZADOR Y SCHEDULER
# ========================================

def build_training_components(models, learning_rate, scheduler_factor,
                              scheduler_patience, min_lr, device):
    """
    Crea automáticamente criterion, optimizer y scheduler para cada modelo.
    """
    optimizers, schedulers = {}, {}

    for name, model in models.items():
        optimizers[name] = optim.Adam(
            filter(lambda p: p.requires_grad, model.parameters()),
            lr=learning_rate
        )

        schedulers[name] = ReduceLROnPlateau(
            optimizers[name],
            mode='max',
            factor=scheduler_factor,
            patience=scheduler_patience,
            min_lr=min_lr
        )

    return optimizers, schedulers

#criterion cross entropy




In [58]:
def train_one_epoch(model, optimizer, criterion, dataloader, device, scaler):
    model.train() # Keep dropout active for training
    running_loss = 0.0
    loop = tqdm(dataloader, desc="Entrenando", leave=False)

    for inputs, labels in loop:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()

        with torch.amp.autocast(device_type=device.type):
            outputs = model(inputs)
            loss = criterion(outputs, labels)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        running_loss += loss.item()
        loop.set_postfix(loss=loss.item())

    return running_loss / len(dataloader)


@torch.no_grad()
def validate(models, dataloader, device, n_passes=10): # Added n_passes for MC Dropout
    """Evalúa varios modelos y calcula el F1-score de cada uno."""
    f1_scores = {}
    all_labels = []
    all_preds = {name: [] for name in models.keys()}

    for inputs, labels in tqdm(dataloader, desc="Validación", leave=False):
        inputs, labels = inputs.to(device), labels.to(device)
        all_labels.extend(labels.cpu().numpy())

        for name, model in models.items():
            model.train() # Keep dropout active for MC Dropout
            batch_preds = []
            for _ in range(n_passes): # Perform multiple passes
                outputs = model(inputs)
                preds = torch.argmax(outputs, dim=1)
                batch_preds.append(preds.cpu().numpy())

            # Simple majority vote for prediction for validation
            batch_preds = np.array(batch_preds)
            final_preds = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=batch_preds)
            all_preds[name].extend(final_preds)


    for name, preds in all_preds.items():
        f1_scores[name] = f1_score(all_labels, preds, average='weighted')

    return f1_scores

In [64]:
@torch.no_grad()
def pseudolabeling(models, dataloader, inv_frequencies, device, thresholds, n_passes=10): # Added n_passes
    """Genera pseudolabels en el conjunto no etiquetado."""
    all_indices, all_probs, all_pseudolabels = [], [], []

    # Assuming the dataloader is built from a SubsetView of the unlabeled data
    # We need the original indices from the base dataset
    base_dataset_indices = dataloader.dataset.indices

    for idx, (inputs, _) in enumerate(tqdm(dataloader, desc="Pseudolabeling", leave=False)):
        inputs = inputs.to(device)
        probs_sum = 0

        for name, model in models.items():
            model.train() # Keep dropout active for MC Dropout
            avg_probs = 0
            for _ in range(n_passes): # Perform multiple passes
                outputs = model(inputs)
                avg_probs += torch.softmax(outputs, dim=1)
            probs_sum += avg_probs / n_passes


        probs_mean = probs_sum / len(models)
        confs, preds = torch.max(probs_mean, dim=1)

        # Get the corresponding base dataset indices for this batch
        batch_base_indices = base_dataset_indices[idx * dataloader.batch_size : (idx + 1) * dataloader.batch_size]

        all_probs.extend(confs.cpu().numpy())
        all_pseudolabels.extend(preds.cpu().numpy())
        all_indices.extend(batch_base_indices)


    df = pd.DataFrame({
        'index': all_indices,
        'pseudolabel': all_pseudolabels,
        'conf': all_probs
    })

    #estratificación y filtrado
    selected_indices, selected_labels = [], []
    for label, group in df.groupby("pseudolabel"):
        th = thresholds.get(label, 0.9)
        group = group[group["conf"] > th]
        # Use inverse frequencies for sampling
        n = int(len(group) * inv_frequencies[label])
        n = max(10, n) # Ensure at least 10 samples if available
        group = group.sort_values("conf", ascending=False).head(n)
        selected_indices.extend(group["index"].tolist())
        selected_labels.extend(group["pseudolabel"].tolist())

    return selected_indices, selected_labels




def coTraining(models, datasets, criterion, device, inv_frequencies, thresholds, batch_size,
               max_epochs=10, patience=3):

    scaler = torch.amp.GradScaler()
    best_f1 = {name: 0.0 for name in models.keys()}
    best_model_wts={name: None for name in models.keys()}
    while True:
        #re/iniciar optimizadores, schedulers y lista de labels agragadas
        new_labels_added = {name: 0 for name in models.keys()}
        optimizers, schedulers = build_training_components(models, learning_rate, scheduler_factor,
                              scheduler_patience, min_lr, device)

        for name, model in models.items():
            other_name = [n for n in models.keys() if n != name][0]

            print(f"\n=== Entrenando {name} ===")
            loaders = {
                "label": DataLoader(datasets[name]["label"], batch_size=batch_size, shuffle=True),
                "val": DataLoader(datasets["val"], batch_size=batch_size, shuffle=False)
            }

            # --- Entrenamiento
            epoch=0
            counter = 0
            while epoch <= max_epochs and counter < patience:
                train_one_epoch(model, optimizers[name], criterion, loaders["label"], device, scaler)
                f1_scores = validate({name: model}, loaders["val"], device, n_passes=1) # validate with MC Dropout
                improved = False
                # Step the scheduler based on the validation F1 score
                schedulers[name].step(f1_scores[name])
                if f1_scores[name] > best_f1[name]:
                      best_f1[name] = f1_scores[name]
                      improved = True
                      best_model_wts[name] = deepcopy(models[name].state_dict()) # Use deepcopy


                if improved:
                    counter = 0
                else:
                    counter += 1
                epoch += 1


            print("\nEntrenamiento finalizado.")
            print(f"Mejor F1 {name}: {best_f1[name]:.4f}")
            # Load the best weights
            models[name].load_state_dict(best_model_wts[name])



            # --- Pseudolabeling del modelo A para el modelo B ---
            print(f"Generando pseudolabels de {name} → {other_name}")
            # Create a DataLoader for the *unlabeled* data of the current model
            unlabeled_dataloader = DataLoader(datasets[name]["unlabel"], batch_size=batch_size, shuffle=False)
            pseudo_indices, pseudo_labels = pseudolabeling(
                {name: model},
                unlabeled_dataloader,
                inv_frequencies,device, thresholds # pseudolabeling with MC Dropout
            )

            if len(pseudo_indices) == 0:
                print(f"No se generaron pseudolabels para {other_name} en esta iteración.")
                continue

            # Add pseudolabels to the *labeled* set of the other model
            datasets[other_name]["label"].add_samples(pseudo_indices, pseudo_labels)
            # Remove the pseudolabeled samples from the *unlabeled* set of the current model
            datasets[name]["unlabel"].remove_indices(pseudo_indices)

            new_labels_added[name] = len(pseudo_indices)
            print(f"{name} generó {len(pseudo_indices)} pseudolabels para {other_name}")

        # Detener si ningún modelo generó nuevos pseudolabels
        if all(v == 0 for v in new_labels_added.values()):
            print("\n✅ No se generaron nuevos pseudolabels. Co-training finalizado.")
            break
    return best_model_wts

In [68]:
# ======================
# TRANSFORMACIONES
# ======================
train_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],
                     [0.229, 0.224, 0.225])
                     ])

val_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],
                     [0.229, 0.224, 0.225])
                     ])

# ======================
# CARGA DE DATOS
# ======================

training_data, validation_data, test_data = Dividir_datos(data=df, test_size=partition_size, validation=True)

base_dataset    = CSV_Dataset(root=data_dir,dataframe=training_data ,filename_col='OBJID', label_col='CLASS', transform=train_transform)
val_dataset     = CSV_Dataset(root=data_dir,dataframe=validation_data,filename_col='OBJID', label_col='CLASS', transform=val_transform)
test_dataset    = CSV_Dataset(root=data_dir,dataframe=test_data, transform=val_transform)

# Crear listas de índices y etiquetas desde base_dataset
all_indices = np.arange(len(base_dataset))
all_labels = [label for _, label in base_dataset.samples]

# Tomamos un porcentaje (20%) como datos etiquetados
labeled_indices, unlabeled_indices, labels, _ = train_test_split(
    all_indices, all_labels,
    test_size=0.8,              # 80% no etiquetados
    stratify=all_labels,        # mantener proporción por clase
    random_state=42
)

Datos de entrenamiento: 5331, Datos de validacion 666, Datos de prueba 667


In [61]:
# Get the frequency of the classes
class_frequency = df['CLASS'].value_counts()

# Display the normalized frequency
print("Class Frequency:")
print(class_frequency)


# Calculate the inverse of the normalized frequencies
inverse_frequency = 1 / class_frequency

# Normalize
normalized_inverse_frequency = inverse_frequency / inverse_frequency.sum()

# Convert to a PyTorch tensor and sort by class index
# Assuming class indices correspond to the index of the sorted normalized_class_frequency
class_weights = torch.tensor(normalized_inverse_frequency.sort_index().values, dtype=torch.float32)

print("Inverse Normalized Frequency (Class Weights):")
print(class_weights)


Class Frequency:
CLASS
0    4645
3     998
2     436
1     414
5     115
4      56
Name: count, dtype: int64
Inverse Normalized Frequency (Class Weights):
tensor([0.0066, 0.0744, 0.0706, 0.0309, 0.5498, 0.2677])


In [None]:
import numpy as np
# ========================================
# USO
# ========================================

frequencies = {0: 4645, 3: 998, 2: 436, 1: 414, 5: 115, 4: 56}
inv_freq = {cls: 1 / np.log1p(freq) for cls, freq in frequencies.items()}

# normalizamos a fracciones
total = sum(inv_freq.values())
fractions = {cls: val / total for cls, val in inv_freq.items()}

models = {
    #"ResNet50": resnet50_model,
    #"EfficientNetB0": efficientnetB0_model
    "ConvNeXt-Tiny": convnext_model,
    "EfficientNetV2-S": efficientnetV2_model
}

criterion=nn.CrossEntropyLoss(weight=class_weights.to(device))

datasets = {
    "val": val_dataset,
    "test": test_dataset
}

for name, model in models.items():
    datasets[name] = {
        "label": SubsetView(base_dataset, labeled_indices.copy(), labels.copy()),
        "unlabel": SubsetView(base_dataset, unlabeled_indices.copy()) }

models_wgts=coTraining(models, datasets, criterion,
               device, fractions, thresholds, batch_size, max_epochs=20, patience=5)

#========================================
# TEST
#========================================
dataloaderTest=DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

for name, model in models.items():
    model.load_state_dict(models_wgts[name])
f1_scores = validate(models, dataloaderTest, device)
print("\nResultados en el conjunto de prueba:")
for name, score in f1_scores.items():
    print(f"{name:15s} | F1: {score:.4f}")


=== Entrenando ConvNeXt-Tiny ===





Entrenamiento finalizado.
Mejor F1 ConvNeXt-Tiny: 0.7801
Generando pseudolabels de ConvNeXt-Tiny → EfficientNetV2-S




ConvNeXt-Tiny generó 355 pseudolabels para EfficientNetV2-S

=== Entrenando EfficientNetV2-S ===





Entrenamiento finalizado.
Mejor F1 EfficientNetV2-S: 0.7757
Generando pseudolabels de EfficientNetV2-S → ConvNeXt-Tiny




EfficientNetV2-S generó 349 pseudolabels para ConvNeXt-Tiny

=== Entrenando ConvNeXt-Tiny ===





Entrenamiento finalizado.
Mejor F1 ConvNeXt-Tiny: 0.7801
Generando pseudolabels de ConvNeXt-Tiny → EfficientNetV2-S




ConvNeXt-Tiny generó 307 pseudolabels para EfficientNetV2-S

=== Entrenando EfficientNetV2-S ===





Entrenamiento finalizado.
Mejor F1 EfficientNetV2-S: 0.7757
Generando pseudolabels de EfficientNetV2-S → ConvNeXt-Tiny


Pseudolabeling:   0%|          | 1/245 [00:00<02:25,  1.67it/s]