In [1]:
import os

import torch
import torch.nn as nn
import torch.optim as optim
from PIL import ImageFile
from sklearn.metrics import accuracy_score, f1_score
from torch.utils.data import DataLoader
from torchvision import transforms, datasets
from tqdm import tqdm
from transformers import CLIPProcessor, CLIPModel

In [2]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
DEVICE

device(type='cuda')

In [3]:
# Root folder of the Food-101 images that the training dataset is built from.
# The location is machine-specific, so it can be overridden with the
# FOOD101_IMAGES_DIR environment variable; the original local path stays the default.
train_path = os.environ.get(
    'FOOD101_IMAGES_DIR',
    'D:\\ProgPrj\\dsProjects\\gazprom-media\\ml\\dataset_kaggle\\food-101\\images',
)
#test_path = 'D:\\ProgPrj\\dsProjects\\urfo_hack_deer\\DatasetForCLIP\\Val'

In [4]:
# Augmentation and preprocessing pipeline for the training images.
# Geometric/colour augmentations run on the PIL image, then the image is
# converted to a tensor, normalised, and finally random erasing is applied
# to the normalised tensor.
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    # NOTE(review): ColorJitter() with its default arguments leaves the image
    # unchanged; pass brightness/contrast/saturation/hue to make this step effective.
    transforms.RandomApply(torch.nn.ModuleList([transforms.ColorJitter()]), p=0.25),
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.RandomRotation(degrees=(-10, 10)),
    transforms.RandomGrayscale(p=0.2),
    transforms.ToTensor(),
    # The images feed a frozen CLIP image encoder, so they are normalised with
    # the mean/std used by the CLIP image processor rather than the ImageNet
    # statistics: the encoder cannot adapt to a different input distribution.
    transforms.Normalize(
        [0.48145466, 0.4578275, 0.40821073],
        [0.26862954, 0.26130258, 0.27577711],
    ),
    transforms.RandomErasing(p=0.1, value='random'),
])

In [5]:
# Deterministic preprocessing for validation images: resize, centre-crop,
# convert to a tensor and normalise (no augmentation).
val_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    # Normalise with the mean/std used by the CLIP image processor rather than
    # the ImageNet statistics, since the images are meant for a CLIP image encoder.
    transforms.Normalize(
        [0.48145466, 0.4578275, 0.40821073],
        [0.26862954, 0.26130258, 0.27577711],
    ),
])

In [6]:
# Build the training dataset from the image folder, applying the training
# transform to every sample that is loaded.
train_dataset = datasets.ImageFolder(root=train_path, transform=train_transform)
#val_dataset = datasets.ImageFolder(test_path, transform=val_transform)

In [7]:
# Sanity check: how many classes were discovered in the dataset folder.
discovered_class_count = len(train_dataset.class_to_idx)
print(discovered_class_count)

101


In [8]:
batch_size = 64

# Loader settings kept in one place so any further loader can reuse them.
loader_options = dict(batch_size=batch_size, num_workers=4, pin_memory=True)

# Training batches are drawn in shuffled order.
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
#val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)

In [9]:
# Load the pretrained CLIP model and its processor from the same checkpoint id.
clip_checkpoint = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(clip_checkpoint)
processor = CLIPProcessor.from_pretrained(clip_checkpoint)

  return self.fget.__get__(instance, owner)()


In [10]:
# Dimensionality of the feature vectors produced by the CLIP model,
# read from the projection size stored in its configuration.
clip_config = model.config
hidden_size = clip_config.projection_dim

In [11]:
# CLIP image encoder with a new linear classification layer on top.
class CustomCLIPModel(nn.Module):
    """Classifier that puts a trainable linear head on CLIP image features.

    The wrapped CLIP model is only used as a feature extractor: its forward
    pass runs under ``torch.no_grad()``, so no gradients reach it and only
    the linear head ``fc`` is trained.

    Args:
        clip_model: Model exposing ``get_image_features(pixel_values)`` and
            ``config.projection_dim`` (e.g. a ``transformers.CLIPModel``).
        num_classes: Number of output classes of the linear head.
    """

    def __init__(self, clip_model, num_classes):
        super().__init__()
        self.clip_model = clip_model
        # Take the feature width from the wrapped model itself rather than from
        # a notebook-level global, so the class works with any CLIP variant and
        # does not depend on another cell having been executed first.
        feature_dim = clip_model.config.projection_dim
        self.fc = nn.Linear(feature_dim, num_classes)

    def forward(self, x):
        """Return class logits for a batch of preprocessed images ``x``."""
        # Feature extraction is excluded from autograd: the backbone stays fixed.
        with torch.no_grad():
            features = self.clip_model.get_image_features(x)
        return self.fc(features)

In [12]:
# One output unit per class found in the training dataset.
num_classes = len(train_dataset.classes)

# Wrap the loaded CLIP model with the classification head and move it to DEVICE.
custom_model = CustomCLIPModel(clip_model=model, num_classes=num_classes)
custom_model = custom_model.to(DEVICE)

In [13]:
# Loss function and optimizer.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=custom_model.parameters(), lr=5e-4)

In [14]:
# Mixed-precision training utilities: gradient scaler and autocast
from torch.cuda.amp import GradScaler, autocast

In [15]:
# Number of passes over the training data.
num_epochs = 50

# Per-epoch histories; each one is its own independent list.
train_loss_history, val_accuracy_history, val_f1_history = [], [], []

# Scales the loss for the mixed-precision backward pass.
scaler = GradScaler()

In [16]:
# Training loop: one pass over train_loader per epoch, using autocast for the
# forward pass and the gradient scaler for the backward pass / optimizer step.
# The sample-weighted mean loss of each epoch is appended to train_loss_history.

# Let PIL decode truncated image files instead of raising. The flag does not
# change between epochs, so it is set once before the loop.
# NOTE(review): this is set in the notebook process; DataLoader workers started
# with the "spawn" method may not see it - confirm it takes effect in the workers.
ImageFile.LOAD_TRUNCATED_IMAGES = True

for epoch in range(num_epochs):
    custom_model.train()
    running_loss = 0.0
    train_loader_tqdm = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}')

    for inputs, labels in train_loader_tqdm:
        inputs = inputs.to(DEVICE, non_blocking=True)
        labels = labels.to(DEVICE, non_blocking=True)

        optimizer.zero_grad()

        with autocast():
            outputs = custom_model(inputs)
            loss = criterion(outputs, labels)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Read the scalar loss once per step: every .item() call forces a
        # host/device synchronisation.
        batch_loss = loss.item()
        # Weight by the batch size so a smaller final batch is averaged correctly.
        running_loss += batch_loss * inputs.size(0)
        train_loader_tqdm.set_postfix(loss=batch_loss)

    epoch_loss = running_loss / len(train_dataset)
    train_loss_history.append(epoch_loss)

Epoch 1/50: 100%|██████████| 1579/1579 [01:54<00:00, 13.77it/s, loss=1.7] 
Epoch 2/50: 100%|██████████| 1579/1579 [02:01<00:00, 12.97it/s, loss=0.499]
Epoch 3/50: 100%|██████████| 1579/1579 [01:53<00:00, 13.91it/s, loss=0.925]
Epoch 4/50: 100%|██████████| 1579/1579 [01:52<00:00, 14.02it/s, loss=0.953]
Epoch 5/50: 100%|██████████| 1579/1579 [01:56<00:00, 13.56it/s, loss=1.22] 
Epoch 6/50: 100%|██████████| 1579/1579 [01:52<00:00, 14.04it/s, loss=0.558]
Epoch 7/50: 100%|██████████| 1579/1579 [01:54<00:00, 13.80it/s, loss=1.2]  
Epoch 8/50: 100%|██████████| 1579/1579 [01:54<00:00, 13.73it/s, loss=1.03] 
Epoch 9/50: 100%|██████████| 1579/1579 [01:56<00:00, 13.52it/s, loss=0.808]
Epoch 10/50: 100%|██████████| 1579/1579 [01:56<00:00, 13.54it/s, loss=1.57] 
Epoch 11/50:   6%|▌         | 92/1579 [00:15<04:09,  5.95it/s, loss=1.03]  


FileNotFoundError: Caught FileNotFoundError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "C:\Users\Geo\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\_utils\worker.py", line 308, in _worker_loop
    data = fetcher.fetch(index)
  File "C:\Users\Geo\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\_utils\fetch.py", line 51, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "C:\Users\Geo\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\_utils\fetch.py", line 51, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "C:\Users\Geo\AppData\Local\Programs\Python\Python310\lib\site-packages\torchvision\datasets\folder.py", line 229, in __getitem__
    sample = self.loader(path)
  File "C:\Users\Geo\AppData\Local\Programs\Python\Python310\lib\site-packages\torchvision\datasets\folder.py", line 268, in default_loader
    return pil_loader(path)
  File "C:\Users\Geo\AppData\Local\Programs\Python\Python310\lib\site-packages\torchvision\datasets\folder.py", line 246, in pil_loader
    with open(path, "rb") as f:
FileNotFoundError: [Errno 2] No such file or directory: 'D:\\ProgPrj\\dsProjects\\gazprom-media\\ml\\dataset_kaggle\\food-101\\images\\apple_pie\\3501006.jpg'
