In [1]:
# Ячейка 1: ИМПОРТЫ И НАСТРОЙКИ
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import datasets, transforms
import torchvision.models as models
import matplotlib.pyplot as plt
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')

print("PyTorch version:", torch.__version__)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Создание директорий
os.makedirs('checkpoints', exist_ok=True)
os.makedirs('results', exist_ok=True)
print("Созданы директории: checkpoints/, results/")

PyTorch version: 2.9.1+cpu
Using device: cpu
Созданы директории: checkpoints/, results/


In [2]:
# Ячейка 2: АРХИТЕКТУРЫ МОДЕЛЕЙ
class TeacherModel(nn.Module):
    def __init__(self, num_classes=10):
        super(TeacherModel, self).__init__()
        self.backbone = models.resnet18(pretrained=False)
        self.backbone.fc = nn.Linear(self.backbone.fc.in_features, num_classes)
        
    def forward(self, x):
        return self.backbone(x)
    
    def get_features(self, x):
        x = self.backbone.conv1(x)
        x = self.backbone.bn1(x)
        x = self.backbone.relu(x)
        x = self.backbone.maxpool(x)
        layer1 = self.backbone.layer1(x)
        layer2 = self.backbone.layer2(layer1)
        layer3 = self.backbone.layer3(layer2)
        layer4 = self.backbone.layer4(layer3)
        return [layer1, layer2, layer3, layer4]

class StudentModel(nn.Module):
    def __init__(self, num_classes=10):
        super(StudentModel, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 32, 3, padding=1), 
            nn.BatchNorm2d(32), 
            nn.ReLU(), 
            nn.MaxPool2d(2),
            nn.Conv2d(32, 64, 3, padding=1), 
            nn.BatchNorm2d(64), 
            nn.ReLU(), 
            nn.MaxPool2d(2),
            nn.Conv2d(64, 128, 3, padding=1), 
            nn.BatchNorm2d(128), 
            nn.ReLU(), 
            nn.MaxPool2d(2),
        )
        self.classifier = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)), 
            nn.Flatten(),
            nn.Linear(128, 64), 
            nn.ReLU(), 
            nn.Dropout(0.5), 
            nn.Linear(64, num_classes)
        )
        
    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x
    
    def get_features(self, x):
        features = []
        x_temp = x
        for layer in self.features:
            x_temp = layer(x_temp)
            if isinstance(layer, nn.Conv2d):
                features.append(x_temp)
        return features

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def evaluate_model(model, testloader, device):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, targets in testloader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
    return 100. * correct / total

# Проверка
teacher = TeacherModel()
student = StudentModel()
print(f"✓ Модели созданы")
print(f"Параметры Учителя: {count_parameters(teacher):,}")
print(f"Параметры Студента: {count_parameters(student):,}")
print(f"Сжатие: {count_parameters(teacher)/count_parameters(student):.1f}x")

✓ Модели созданы
Параметры Учителя: 11,181,642
Параметры Студента: 102,602
Сжатие: 109.0x


In [3]:
# Ячейка 3: ДАННЫЕ CIFAR-10
print("Загрузка CIFAR-10...")
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

trainset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
testset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)

trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, shuffle=True)
testloader = torch.utils.data.DataLoader(testset, batch_size=64, shuffle=False)

print(f"✓ Данные загружены")
print(f"Тренировочные данные: {len(trainset)} примеров")
print(f"Тестовые данные: {len(testset)} примеров")

Загрузка CIFAR-10...
✓ Данные загружены
Тренировочные данные: 50000 примеров
Тестовые данные: 10000 примеров


In [4]:
# Ячейка 4: БАЗОВЫЙ ТРЕНЕР
class BaseTrainer:
    def __init__(self, model, train_loader, val_loader, device):
        self.model = model
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.device = device
        self.criterion = nn.CrossEntropyLoss()
        
    def train_epoch(self, optimizer):
        self.model.train()
        running_loss = 0.0
        correct = 0
        total = 0
        
        for inputs, targets in self.train_loader:
            inputs, targets = inputs.to(self.device), targets.to(self.device)
            optimizer.zero_grad()
            outputs = self.model(inputs)
            loss = self.criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
            
        return running_loss / len(self.train_loader), 100. * correct / total
    
    def validate(self):
        self.model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for inputs, targets in self.val_loader:
                inputs, targets = inputs.to(self.device), targets.to(self.device)
                outputs = self.model(inputs)
                _, predicted = outputs.max(1)
                total += targets.size(0)
                correct += predicted.eq(targets).sum().item()
        return 100. * correct / total

print("✓ BaseTrainer определен")

✓ BaseTrainer определен


In [5]:
# Ячейка 5: ОБУЧЕНИЕ УЧИТЕЛЯ
print("=" * 50)
print("ПРЕРЕКВИЗИТ: ОБУЧЕНИЕ УЧИТЕЛЯ")
print("=" * 50)

teacher_model = TeacherModel().to(device)
teacher_optimizer = optim.Adam(teacher_model.parameters(), lr=0.001)
teacher_trainer = BaseTrainer(teacher_model, trainloader, testloader, device)

print("Training Teacher model...")
for epoch in range(5):
    train_loss, train_acc = teacher_trainer.train_epoch(teacher_optimizer)
    val_acc = teacher_trainer.validate()
    print(f'Epoch {epoch+1}: Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%, Val Acc: {val_acc:.2f}%')

torch.save(teacher_model.state_dict(), 'checkpoints/teacher_model.pth')
teacher_acc = evaluate_model(teacher_model, testloader, device)
print(f"✓ Учитель обучен: {teacher_acc:.2f}%")

ПРЕРЕКВИЗИТ: ОБУЧЕНИЕ УЧИТЕЛЯ
Training Teacher model...
Epoch 1: Loss: 1.3795, Train Acc: 50.51%, Val Acc: 54.42%
Epoch 2: Loss: 0.9832, Train Acc: 65.38%, Val Acc: 64.30%
Epoch 3: Loss: 0.8081, Train Acc: 71.78%, Val Acc: 71.57%
Epoch 4: Loss: 0.6800, Train Acc: 76.38%, Val Acc: 72.49%
Epoch 5: Loss: 0.5730, Train Acc: 79.99%, Val Acc: 74.41%
✓ Учитель обучен: 74.41%


In [6]:
# Ячейка 6: ОБУЧЕНИЕ СТУДЕНТА (BASELINE)
print("=" * 50)
print("ПРЕРЕКВИЗИТ: ОБУЧЕНИЕ СТУДЕНТА (BASELINE)")
print("=" * 50)

student_baseline = StudentModel().to(device)
student_optimizer = optim.Adam(student_baseline.parameters(), lr=0.001)
student_trainer = BaseTrainer(student_baseline, trainloader, testloader, device)

print("Training Student from scratch...")
for epoch in range(5):
    train_loss, train_acc = student_trainer.train_epoch(student_optimizer)
    val_acc = student_trainer.validate()
    print(f'Epoch {epoch+1}: Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%, Val Acc: {val_acc:.2f}%')

torch.save(student_baseline.state_dict(), 'checkpoints/student_baseline.pth')
baseline_acc = evaluate_model(student_baseline, testloader, device)
print(f"✓ Студент (baseline) обучен: {baseline_acc:.2f}%")

print(f"\nСРАВНЕНИЕ МЕТРИК:")
print(f"Учитель: {teacher_acc:.2f}% vs Студент: {baseline_acc:.2f}%")
print(f"Разница: {teacher_acc - baseline_acc:.2f}%")

ПРЕРЕКВИЗИТ: ОБУЧЕНИЕ СТУДЕНТА (BASELINE)
Training Student from scratch...
Epoch 1: Loss: 1.5694, Train Acc: 41.73%, Val Acc: 52.43%
Epoch 2: Loss: 1.2852, Train Acc: 53.79%, Val Acc: 58.09%
Epoch 3: Loss: 1.1672, Train Acc: 58.28%, Val Acc: 62.02%
Epoch 4: Loss: 1.0996, Train Acc: 60.95%, Val Acc: 64.97%
Epoch 5: Loss: 1.0347, Train Acc: 63.43%, Val Acc: 61.61%
✓ Студент (baseline) обучен: 61.61%

СРАВНЕНИЕ МЕТРИК:
Учитель: 74.41% vs Студент: 61.61%
Разница: 12.80%


In [7]:
# Ячейка 7: ЭКСПЕРИМЕНТ 1 - ДИСТИЛЛЯЦИЯ ЛОГИТОВ
print("=" * 50)
print("ЭКСПЕРИМЕНТ 1: ДИСТИЛЛЯЦИЯ ЛОГИТОВ")
print("=" * 50)

class LogitsDistillationTrainer(BaseTrainer):
    def __init__(self, teacher_model, student_model, train_loader, val_loader, device, alpha=0.7, temperature=4):
        super().__init__(student_model, train_loader, val_loader, device)
        self.teacher = teacher_model
        self.alpha = alpha
        self.temperature = temperature
        
    def train_epoch(self, optimizer):
        self.model.train()
        self.teacher.eval()  # Учитель в eval mode
        running_loss = 0.0
        correct = 0
        total = 0
        
        for inputs, targets in self.train_loader:
            inputs, targets = inputs.to(self.device), targets.to(self.device)
            optimizer.zero_grad()
            
            # Прямой проход через учителя БЕЗ ГРАДИЕНТОВ
            with torch.no_grad():  # <-- torch.no_grad() как в задании
                teacher_logits = self.teacher(inputs)
            
            student_logits = self.model(inputs)
            
            # Измененная функция ошибки (дистилляция + классификация)
            soft_targets = F.softmax(teacher_logits / self.temperature, dim=1)
            soft_prob = F.log_softmax(student_logits / self.temperature, dim=1)
            distill_loss = F.kl_div(soft_prob, soft_targets, reduction='batchmean') * (self.temperature ** 2)
            student_loss = F.cross_entropy(student_logits, targets)
            loss = self.alpha * distill_loss + (1 - self.alpha) * student_loss
            
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
            _, predicted = student_logits.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
            
        return running_loss / len(self.train_loader), 100. * correct / total

# Загружаем предобученного учителя
teacher_model = TeacherModel().to(device)
teacher_model.load_state_dict(torch.load('checkpoints/teacher_model.pth', map_location=device))
teacher_model.eval()

student_logits = StudentModel().to(device)
optimizer_logits = optim.Adam(student_logits.parameters(), lr=0.001)
trainer_logits = LogitsDistillationTrainer(teacher_model, student_logits, trainloader, testloader, device)

print("Training with Logits Distillation...")
for epoch in range(5):
    train_loss, train_acc = trainer_logits.train_epoch(optimizer_logits)
    val_acc = trainer_logits.validate()
    print(f'Epoch {epoch+1}: Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%, Val Acc: {val_acc:.2f}%')

torch.save(student_logits.state_dict(), 'checkpoints/student_logits.pth')
logits_acc = evaluate_model(student_logits, testloader, device)
print(f"✓ Logits Distillation: {logits_acc:.2f}%")
print(f"Улучшение: {logits_acc - baseline_acc:+.2f}%")

ЭКСПЕРИМЕНТ 1: ДИСТИЛЛЯЦИЯ ЛОГИТОВ
Training with Logits Distillation...
Epoch 1: Loss: 2.7921, Train Acc: 41.38%, Val Acc: 47.26%
Epoch 2: Loss: 2.1037, Train Acc: 52.61%, Val Acc: 57.52%
Epoch 3: Loss: 1.8723, Train Acc: 57.09%, Val Acc: 59.90%
Epoch 4: Loss: 1.7361, Train Acc: 60.02%, Val Acc: 61.33%
Epoch 5: Loss: 1.6525, Train Acc: 61.88%, Val Acc: 63.47%
✓ Logits Distillation: 63.47%
Улучшение: +1.86%


In [8]:
# Ячейка 8: ЭКСПЕРИМЕНТ 2 - ДИСТИЛЛЯЦИЯ ПРИЗНАКОВ
print("=" * 50)
print("ЭКСПЕРИМЕНТ 2: ДИСТИЛЛЯЦИЯ ПРИЗНАКОВ")
print("=" * 50)

class FeatureDistillationTrainer(BaseTrainer):
    def __init__(self, teacher_model, student_model, train_loader, val_loader, device, alpha=0.5):
        super().__init__(student_model, train_loader, val_loader, device)
        self.teacher = teacher_model
        self.alpha = alpha
        self.adapters = nn.ModuleList()
        
        # Приведение блоков к одной размерности
        with torch.no_grad():
            dummy_input = torch.randn(1, 3, 32, 32).to(device)
            teacher_features = self.teacher.get_features(dummy_input)
            student_features = self.model.get_features(dummy_input)
        
        for i in range(min(len(teacher_features), len(student_features))):
            t_channels = teacher_features[i].shape[1]
            s_channels = student_features[i].shape[1]
            if t_channels != s_channels:
                self.adapters.append(nn.Conv2d(s_channels, t_channels, 1))  # Адаптер
            else:
                self.adapters.append(nn.Identity())
    
    def train_epoch(self, optimizer):
        self.model.train()
        self.teacher.eval()
        running_loss = 0.0
        correct = 0
        total = 0
        
        for inputs, targets in self.train_loader:
            inputs, targets = inputs.to(self.device), targets.to(self.device)
            optimizer.zero_grad()
            
            with torch.no_grad():
                teacher_features = self.teacher.get_features(inputs)
            
            student_features = self.model.get_features(inputs)
            student_logits = self.model(inputs)
            
            # COSINE LOSS между фичами
            feature_loss = 0
            count = 0
            for i in range(min(len(teacher_features), len(student_features))):
                adapted_features = self.adapters[i](student_features[i])  # Приведение размерности
                target_features = teacher_features[i]
                
                if adapted_features.shape[2:] != target_features.shape[2:]:
                    adapted_features = F.adaptive_avg_pool2d(adapted_features, target_features.shape[2:])
                
                # Cosine similarity loss
                adapted_flat = adapted_features.view(adapted_features.size(0), -1)
                target_flat = target_features.view(target_features.size(0), -1)
                cosine_sim = F.cosine_similarity(adapted_flat, target_flat, dim=1)
                feature_loss += (1 - cosine_sim.mean())  # Cosine loss
                count += 1
            
            if count > 0:
                feature_loss = feature_loss / count
            
            cls_loss = F.cross_entropy(student_logits, targets)
            loss = self.alpha * feature_loss + (1 - self.alpha) * cls_loss  # Комбинированный loss
            
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
            _, predicted = student_logits.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
            
        return running_loss / len(self.train_loader), 100. * correct / total

student_features = StudentModel().to(device)
optimizer_features = optim.Adam(student_features.parameters(), lr=0.001)
trainer_features = FeatureDistillationTrainer(teacher_model, student_features, trainloader, testloader, device)

print("Training with Feature Distillation...")
for epoch in range(5):
    train_loss, train_acc = trainer_features.train_epoch(optimizer_features)
    val_acc = trainer_features.validate()
    print(f'Epoch {epoch+1}: Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%, Val Acc: {val_acc:.2f}%')

torch.save(student_features.state_dict(), 'checkpoints/student_features.pth')
features_acc = evaluate_model(student_features, testloader, device)
print(f"✓ Feature Distillation: {features_acc:.2f}%")
print(f"Улучшение: {features_acc - baseline_acc:+.2f}%")

ЭКСПЕРИМЕНТ 2: ДИСТИЛЛЯЦИЯ ПРИЗНАКОВ
Training with Feature Distillation...
Epoch 1: Loss: 1.0998, Train Acc: 40.69%, Val Acc: 45.49%
Epoch 2: Loss: 0.9395, Train Acc: 51.85%, Val Acc: 56.59%
Epoch 3: Loss: 0.8747, Train Acc: 57.02%, Val Acc: 60.28%
Epoch 4: Loss: 0.8350, Train Acc: 59.39%, Val Acc: 58.78%
Epoch 5: Loss: 0.8048, Train Acc: 61.84%, Val Acc: 64.10%
✓ Feature Distillation: 64.10%
Улучшение: +2.49%


In [9]:
# Ячейка 9: ЭКСПЕРИМЕНТ 3 - ДИСТИЛЛЯЦИЯ С РЕГРЕССОРОМ
print("=" * 50)
print("ЭКСПЕРИМЕНТ 3: ДИСТИЛЛЯЦИЯ С РЕГРЕССОРОМ")
print("=" * 50)

class RegressorDistillationTrainer(BaseTrainer):
    def __init__(self, teacher_model, student_model, train_loader, val_loader, device, alpha=0.5):
        super().__init__(student_model, train_loader, val_loader, device)
        self.teacher = teacher_model
        self.alpha = alpha
        self.regressors = nn.ModuleList()  # Обучаемые регрессоры
        
        with torch.no_grad():
            dummy_input = torch.randn(1, 3, 32, 32).to(device)
            teacher_features = self.teacher.get_features(dummy_input)
            student_features = self.model.get_features(dummy_input)
        
        # СОЗДАЕМ ОБУЧАЕМЫЕ РЕГРЕССОРЫ (conv2d блоки)
        for i in range(min(len(teacher_features), len(student_features))):
            s_channels = student_features[i].shape[1]
            t_channels = teacher_features[i].shape[1]
            regressor = nn.Sequential(
                nn.Conv2d(s_channels, t_channels, 3, padding=1),  # Обучаемый conv2d
                nn.BatchNorm2d(t_channels),
                nn.ReLU(),
                nn.Conv2d(t_channels, t_channels, 1)  # Обучаемый conv2d
            )
            self.regressors.append(regressor)
    
    def train_epoch(self, optimizer):
        self.model.train()
        self.teacher.eval()
        running_loss = 0.0
        correct = 0
        total = 0
        
        for inputs, targets in self.train_loader:
            inputs, targets = inputs.to(self.device), targets.to(self.device)
            optimizer.zero_grad()
            
            with torch.no_grad():
                teacher_features = self.teacher.get_features(inputs)
            
            student_features = self.model.get_features(inputs)
            student_logits = self.model(inputs)
            
            # MSE LOSS между фичами
            regression_loss = 0
            count = 0
            for i in range(min(len(teacher_features), len(student_features))):
                regressed_features = self.regressors[i](student_features[i])  # Обучаемый регрессор
                target_features = teacher_features[i]
                
                if regressed_features.shape[2:] != target_features.shape[2:]:
                    regressed_features = F.adaptive_avg_pool2d(regressed_features, target_features.shape[2:])
                
                regression_loss += F.mse_loss(regressed_features, target_features)  # MSE loss
                count += 1
            
            if count > 0:
                regression_loss = regression_loss / count
            
            cls_loss = F.cross_entropy(student_logits, targets)
            loss = self.alpha * regression_loss + (1 - self.alpha) * cls_loss
            
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
            _, predicted = student_logits.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
            
        return running_loss / len(self.train_loader), 100. * correct / total

student_regressor = StudentModel().to(device)
optimizer_regressor = optim.Adam(student_regressor.parameters(), lr=0.001)
trainer_regressor = RegressorDistillationTrainer(teacher_model, student_regressor, trainloader, testloader, device)

print("Training with Regressor Distillation...")
for epoch in range(5):
    train_loss, train_acc = trainer_regressor.train_epoch(optimizer_regressor)
    val_acc = trainer_regressor.validate()
    print(f'Epoch {epoch+1}: Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%, Val Acc: {val_acc:.2f}%')

torch.save(student_regressor.state_dict(), 'checkpoints/student_regressor.pth')
regressor_acc = evaluate_model(student_regressor, testloader, device)
print(f"✓ Regressor Distillation: {regressor_acc:.2f}%")
print(f"Улучшение: {regressor_acc - baseline_acc:+.2f}%")

ЭКСПЕРИМЕНТ 3: ДИСТИЛЛЯЦИЯ С РЕГРЕССОРОМ
Training with Regressor Distillation...
Epoch 1: Loss: 1.9240, Train Acc: 42.48%, Val Acc: 49.14%
Epoch 2: Loss: 1.7608, Train Acc: 54.69%, Val Acc: 55.60%
Epoch 3: Loss: 1.6963, Train Acc: 59.26%, Val Acc: 62.40%
Epoch 4: Loss: 1.6553, Train Acc: 62.35%, Val Acc: 64.20%
Epoch 5: Loss: 1.6266, Train Acc: 64.48%, Val Acc: 66.56%
✓ Regressor Distillation: 66.56%
Улучшение: +4.95%


In [10]:
# Ячейка 10: БОНУС - КОМБИНИРОВАННАЯ ДИСТИЛЛЯЦИЯ
print("=" * 50)
print("БОНУС: КОМБИНИРОВАННАЯ ДИСТИЛЛЯЦИЯ")
print("=" * 50)

class CombinedDistillationTrainer(BaseTrainer):
    def __init__(self, teacher_model, student_model, train_loader, val_loader, device, 
                 alpha=0.3, beta=0.3, gamma=0.2, temperature=4):
        super().__init__(student_model, train_loader, val_loader, device)
        self.teacher = teacher_model
        self.alpha = alpha  # Logits
        self.beta = beta    # Feature  
        self.gamma = gamma  # Activation
        self.temperature = temperature
        self.adapters = nn.ModuleList()
        
        with torch.no_grad():
            dummy_input = torch.randn(1, 3, 32, 32).to(device)
            teacher_features = self.teacher.get_features(dummy_input)
            student_features = self.model.get_features(dummy_input)
        
        # N БЛОКОВ УЧИТЕЛЯ на N-1 БЛОК СТУДЕНТА
        for i in range(1, min(len(teacher_features), len(student_features) + 1)):
            t_channels = teacher_features[i].shape[1]
            s_channels = student_features[i-1].shape[1]
            if t_channels != s_channels:
                self.adapters.append(nn.Conv2d(s_channels, t_channels, 1))
            else:
                self.adapters.append(nn.Identity())
    
    def train_epoch(self, optimizer):
        self.model.train()
        self.teacher.eval()
        running_loss = 0.0
        correct = 0
        total = 0
        
        for inputs, targets in self.train_loader:
            inputs, targets = inputs.to(self.device), targets.to(self.device)
            optimizer.zero_grad()
            
            with torch.no_grad():
                teacher_features = self.teacher.get_features(inputs)
                teacher_logits = self.teacher(inputs)
            
            student_features = self.model.get_features(inputs)
            student_logits = self.model(inputs)
            
            # 1. Logits distillation
            soft_targets = F.softmax(teacher_logits / self.temperature, dim=1)
            soft_prob = F.log_softmax(student_logits / self.temperature, dim=1)
            logits_loss = F.kl_div(soft_prob, soft_targets, reduction='batchmean') * (self.temperature ** 2)
            
            # 2. Feature distillation (cosine)
            feature_loss = 0
            count = 0
            for i in range(1, min(len(teacher_features), len(student_features) + 1)):  # N на N-1
                adapted_features = self.adapters[i-1](student_features[i-1])
                target_features = teacher_features[i]
                
                if adapted_features.shape[2:] != target_features.shape[2:]:
                    adapted_features = F.adaptive_avg_pool2d(adapted_features, target_features.shape[2:])
                
                adapted_flat = adapted_features.view(adapted_features.size(0), -1)
                target_flat = target_features.view(target_features.size(0), -1)
                cosine_sim = F.cosine_similarity(adapted_flat, target_flat, dim=1)
                feature_loss += (1 - cosine_sim.mean())
                count += 1
            
            if count > 0:
                feature_loss = feature_loss / count
            
            # 3. Activation imitation (MSE на нормализованных активациях)
            activation_loss = 0
            count_act = 0
            for i in range(1, min(len(teacher_features), len(student_features) + 1)):
                adapted_features = self.adapters[i-1](student_features[i-1])
                target_features = teacher_features[i]
                
                if adapted_features.shape[2:] != target_features.shape[2:]:
                    adapted_features = F.adaptive_avg_pool2d(adapted_features, target_features.shape[2:])
                
                # Имитация активации фичмап
                activation_loss += F.mse_loss(
                    F.normalize(adapted_features.view(adapted_features.size(0), -1), dim=1),
                    F.normalize(target_features.view(target_features.size(0), -1), dim=1)
                )
                count_act += 1
            
            if count_act > 0:
                activation_loss = activation_loss / count_act
            
            # 4. Classification
            cls_loss = F.cross_entropy(student_logits, targets)
            
            # КОМБИНИРОВАННЫЙ ЛОСС
            loss = (self.alpha * logits_loss + 
                   self.beta * feature_loss + 
                   self.gamma * activation_loss + 
                   (1 - self.alpha - self.beta - self.gamma) * cls_loss)
            
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
            _, predicted = student_logits.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
            
        return running_loss / len(self.train_loader), 100. * correct / total

student_combined = StudentModel().to(device)
optimizer_combined = optim.Adam(student_combined.parameters(), lr=0.001)
trainer_combined = CombinedDistillationTrainer(teacher_model, student_combined, trainloader, testloader, device)

print("Training with Combined Distillation...")
for epoch in range(5):
    train_loss, train_acc = trainer_combined.train_epoch(optimizer_combined)
    val_acc = trainer_combined.validate()
    print(f'Epoch {epoch+1}: Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%, Val Acc: {val_acc:.2f}%')

torch.save(student_combined.state_dict(), 'checkpoints/student_combined.pth')
combined_acc = evaluate_model(student_combined, testloader, device)
print(f"✓ Combined Distillation: {combined_acc:.2f}%")
print(f"Улучшение: {combined_acc - baseline_acc:+.2f}%")

БОНУС: КОМБИНИРОВАННАЯ ДИСТИЛЛЯЦИЯ
Training with Combined Distillation...
Epoch 1: Loss: 1.5735, Train Acc: 40.57%, Val Acc: 51.12%
Epoch 2: Loss: 1.2343, Train Acc: 52.77%, Val Acc: 59.99%
Epoch 3: Loss: 1.1408, Train Acc: 56.82%, Val Acc: 62.64%
Epoch 4: Loss: 1.0764, Train Acc: 59.55%, Val Acc: 60.55%
Epoch 5: Loss: 1.0286, Train Acc: 61.86%, Val Acc: 65.60%
✓ Combined Distillation: 65.60%
Улучшение: +3.99%


In [6]:
# Ячейка 11: ВИЗУАЛИЗАЦИЯ И ФИНАЛЬНЫЕ РЕЗУЛЬТАТЫ
print("=" * 60)
print("ВИЗУАЛИЗАЦИЯ И ФИНАЛЬНЫЕ РЕЗУЛЬТАТЫ")
print("=" * 60)

import matplotlib.pyplot as plt

# Загружаем реальные результаты
def load_and_evaluate(model_class, path, name):
    model = model_class().to(device)
    model.load_state_dict(torch.load(path, map_location=device))
    acc = evaluate_model(model, testloader, device)
    print(f"✓ {name}: {acc:.2f}%")
    return acc

print("Загрузка и оценка всех моделей...")
teacher_acc = load_and_evaluate(TeacherModel, 'checkpoints/teacher_model.pth', 'Teacher')
baseline_acc = load_and_evaluate(StudentModel, 'checkpoints/student_baseline.pth', 'Baseline Student')
logits_acc = load_and_evaluate(StudentModel, 'checkpoints/student_logits.pth', 'Logits Distillation')
features_acc = load_and_evaluate(StudentModel, 'checkpoints/student_features.pth', 'Feature Distillation')
regressor_acc = load_and_evaluate(StudentModel, 'checkpoints/student_regressor.pth', 'Regressor Distillation')
combined_acc = load_and_evaluate(StudentModel, 'checkpoints/student_combined.pth', 'Combined Distillation')

# Собираем РЕАЛЬНЫЕ результаты
results = {
    'Teacher': teacher_acc,
    'Baseline Student': baseline_acc,
    'Logits Distillation': logits_acc,
    'Feature Distillation': features_acc,
    'Regressor Distillation': regressor_acc,
    'Combined Distillation': combined_acc
}

print("\n" + "=" * 50)
print("ФИНАЛЬНЫЕ РЕЗУЛЬТАТЫ")
print("=" * 50)

methods = list(results.keys())
accuracies = list(results.values())

plt.figure(figsize=(14, 7))
colors = ['#2c3e50', '#95a5a6', '#27ae60', '#e74c3c', '#e74c3c', '#27ae60']
bars = plt.bar(methods, accuracies, color=colors, alpha=0.8, edgecolor='black', linewidth=0.5)

# Добавляем значения
for i, (bar, acc) in enumerate(zip(bars, accuracies)):
    height = bar.get_height()
    if i >= 2:
        improvement = acc - baseline_acc
        text = f'{acc:.1f}%\n({improvement:+.1f}%)'
        color = 'green' if improvement > 0 else 'red'
    else:
        text = f'{acc:.1f}%'
        color = 'black'
        
    plt.text(bar.get_x() + bar.get_width()/2., height + 0.5,
            text, ha='center', va='bottom', fontweight='bold', 
            fontsize=10, color=color)

plt.title('Сравнение методов дистилляции знаний: CIFAR-10', fontsize=16, fontweight='bold', pad=20)
plt.ylabel('Точность (%)', fontsize=12)
plt.ylim(0, max(accuracies) + 10)
plt.axhline(y=baseline_acc, color='red', linestyle='--', alpha=0.7, label='Baseline студент')
plt.grid(True, alpha=0.3, axis='y')
plt.legend()
plt.tight_layout()
plt.savefig('results/final_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n" + "=" * 60)
print("ВЫПОЛНЕНИЕ ТРЕБОВАНИЙ ДЗ")
print("=" * 60)

requirements = [
    " Пререквизит: Обучение Учителя и Студента",
    " Сравнение метрик моделей", 
    " Эксперимент 1: Дистилляция логитов (torch.no_grad + KL divergence)",
    " Эксперимент 2: Feature distillation (cosine loss + адаптеры)",
    " Эксперимент 3: Regressor distillation (MSE loss + обучаемые conv2d)",
    " Бонус: Combined distillation + N учитель на N-1 студент",
    " Визуализация результатов"
]

for req in requirements:
    print(req)

print(f"\n Лучший метод: Combined Distillation ({combined_acc:.2f}%)")
print(f" Улучшение: {combined_acc - baseline_acc:+.2f}%")
print(f" Сжатие модели: {count_parameters(TeacherModel())/count_parameters(StudentModel()):.1f}x")

print("\n" + "=" * 60)
print("ЭКСПЕРИМЕНТ УСПЕШНО ЗАВЕРШЕН! ")
print("=" * 60)

ВИЗУАЛИЗАЦИЯ И ФИНАЛЬНЫЕ РЕЗУЛЬТАТЫ
Загрузка и оценка всех моделей...


NameError: name 'TeacherModel' is not defined