In [1]:
import os
import sys 
import torch
from torchvision import datasets, transforms
from collections import defaultdict
import numpy as np
import torch.nn as nn
import torch.optim as optim
sys.path.append(os.path.abspath('..'))
from base_trainer import Trainer
from model import SimpleCNN
from torch.utils.data import DataLoader, Subset, Dataset

In [2]:
class ActiveTrainer(Trainer):
    def select_samples(self, dataloader, num_samples_to_select):
        """
        Выбираем образцы на основе наименьшей уверенности модели.
        
        :param dataloader: Загрузчик данных для выборки
        :param num_samples_to_select: Количество образцов для выбора
        :return: Список индексов выбранных образцов
        """
        self.model.eval()  # Устанавливаем модель в режим оценки
        all_confidences = []
        
        with torch.no_grad():
            for inputs, _ in dataloader:
                inputs = inputs.to(self.device)  # Переносим данные на устройство модели
                outputs = self.model(inputs)
                probabilities = torch.softmax(outputs, dim=1)
                confidence_scores, _ = torch.max(probabilities, dim=1)  # Максимальная вероятность для каждого примера
                all_confidences.append(confidence_scores)

        # Конкатенируем все уверенности в один тензор
        confidence_scores = torch.cat(all_confidences)
        
        # Находим индексы образцов с наименьшей уверенностью
        least_confident_indices = torch.argsort(1 - confidence_scores)[:num_samples_to_select]
        self.update_dataloader(least_confident_indices.cpu().numpy())
        print("watch = ", least_confident_indices.cpu().numpy())
        return least_confident_indices.cpu().numpy()
    
    def fit(self, num_epochs):
        """
        Полный цикл обучения.
        :param num_epochs: Количество эпох
        """
        for epoch in range(num_epochs):
            train_loss = self.train_step()
            val_loss, accuracy, f1 = self.val_step()
            self.select_samples(dataloader = self.pool_loader, num_samples_to_select = num_epochs)
            if self.scheduler is not None:
                if isinstance(self.scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
                    self.scheduler.step(val_loss)
                else:
                    self.scheduler.step()
            print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Acc: {accuracy:.4f}, F1: {f1:.4f}")

In [3]:
NUM_CLASSES = 10           # CIFAR-10
NUM_EPOCH = 10
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])
train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
percentages = [0.01, 0.10, 0.20]
initial_datasets = {}
pool_data_indices = []

# Создаем словарь для хранения индексов по классам
class_indices = defaultdict(list)

# Заполняем словарь индексами изображений по классам
for index, (_, label) in enumerate(train_dataset):
    class_indices[label].append(index)

# Формируем начальные наборы данных
for percentage in percentages:
    initial_indices = []
    num_samples_per_class = {label: int(len(indices) * percentage) for label, indices in class_indices.items()}
    
    for label, indices in class_indices.items():
        # Случайным образом выбираем индексы для каждого класса
        selected_indices = np.random.choice(indices, num_samples_per_class[label], replace=False)
        initial_indices.extend(selected_indices)
    
    # Сортируем индексы для создания подмножества
    initial_datasets[percentage] = sorted(initial_indices)

# Создаем pool_data с оставшимися данными
all_initial_indices = set(initial_datasets[0.01] + initial_datasets[0.10] + initial_datasets[0.20])
pool_data_indices = [i for i in range(len(train_dataset)) if i not in all_initial_indices]

# Создаем подмножества для начальных данных и оставшихся данных
initial_dataset_1_percent = torch.utils.data.Subset(train_dataset, initial_datasets[0.01])
initial_dataset_10_percent = torch.utils.data.Subset(train_dataset, initial_datasets[0.10])
initial_dataset_20_percent = torch.utils.data.Subset(train_dataset, initial_datasets[0.20])
pool_data = torch.utils.data.Subset(train_dataset, pool_data_indices)

# Проверяем размеры подмножеств
print(f"Initial dataset (1%): {len(initial_dataset_1_percent)}")
print(f"Initial dataset (10%): {len(initial_dataset_10_percent)}")
print(f"Initial dataset (20%): {len(initial_dataset_20_percent)}")
print(f"Pool data size: {len(pool_data)}")

train_dataloader = DataLoader(initial_dataset_1_percent, batch_size=64, shuffle=True)
pool_dataloader = DataLoader(pool_data, batch_size=64, shuffle=True)
val_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=True)
model = SimpleCNN(10)

Files already downloaded and verified
Files already downloaded and verified
Initial dataset (1%): 500
Initial dataset (10%): 5000
Initial dataset (20%): 10000
Pool data size: 35698


In [4]:
optimizer = optim.AdamW(model.parameters(), lr=5e-4)
criterion = torch.nn.CrossEntropyLoss()
train = ActiveTrainer(model=model.to(DEVICE), optimizer=optimizer,criterion=criterion, train_loader=train_dataloader, val_loader=val_dataloader,pool_loader = pool_dataloader, device=DEVICE)

In [5]:
train.fit(2)


Training: 100%|██████████| 8/8 [00:00<00:00, 18.43it/s]
Validating: 100%|██████████| 157/157 [00:01<00:00, 81.70it/s]


Validation Loss: 2.2381, Accuracy: 0.1399, F1 Score: 0.0741
35698
35696
watch =  [21219  1797]
Epoch 1/2 - Train Loss: 2.3160, Val Loss: 2.2381, Acc: 0.1399, F1: 0.0741


Training: 100%|██████████| 8/8 [00:00<00:00, 59.70it/s]
Validating: 100%|██████████| 157/157 [00:02<00:00, 74.90it/s]


Validation Loss: 2.1598, Accuracy: 0.2267, F1 Score: 0.1657
35696
35694
watch =  [14922 13583]
Epoch 2/2 - Train Loss: 2.1588, Val Loss: 2.1598, Acc: 0.2267, F1: 0.1657
