In [1]:
import warnings
warnings.filterwarnings('ignore')

import torch
import pandas as pd
import numpy as np
import torchvision
import cv2
import pytorch_lightning as pl
import torchmetrics as tm

from torch.utils.data import Dataset
from ZeroShotDataset import ZeroShotDataset
from params import *
from transformers import CLIPProcessor, CLIPModel
from LossFunc import *
from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor
from torch.utils.data import random_split
from CLIPConditionedSegFormerModel import CLIPConditionedSegFormer

In [2]:
# Annotation table for the training split; the path comes from the project's TrainParams.
train_df = pd.read_csv(TrainParams.TRAIN_CSV_PATH)

# Pretrained CLIP processor. Later cells pass both the processor itself and its
# `.tokenizer` to the datasets.
clip_checkpoint = 'openai/clip-vit-base-patch16'
clip_processor = CLIPProcessor.from_pretrained(clip_checkpoint)

In [3]:
# Preview the annotation table (pandas truncates the rendered rows for large frames).
train_df

Unnamed: 0,image,mask,label,category_id
0,000000558840.jpg,000000558840_0.jpg,hot dog,58
1,000000558840.jpg,000000558840_1.jpg,bottle,44
2,000000558840.jpg,000000558840_2.jpg,cup,47
3,000000558840.jpg,000000558840_3.jpg,person,1
4,000000558840.jpg,000000558840_4.jpg,spoon,50
...,...,...,...,...
973173,000000581929.jpg,000000581929_973173.jpg,bush,97
973174,000000581929.jpg,000000581929_973174.jpg,cage,99
973175,000000581929.jpg,000000581929_973175.jpg,clouds,106
973176,000000581929.jpg,000000581929_973176.jpg,grass,124


In [4]:
# Balance the classes by downsampling every label to the size of the rarest one.
label_freqs = train_df["label"].value_counts()
min_freq = label_freqs.min()
max_freq = label_freqs.max()

# The notebook's global seeds are only set in a later cell, so the sampling gets its
# own fixed RNG here; without it the balanced subset would change on every run.
balance_rng = np.random.RandomState(42)

# Draw `min_freq` rows per label and concatenate once. `sort=False` keeps the labels
# in order of first appearance. A single concat avoids re-copying the growing frame
# for every label, preserves the column dtypes, and does not rely on
# DataFrame.append, which no longer exists in pandas >= 2.0.
balanced_train_df = pd.concat(
    [
        label_rows.sample(n=min_freq, random_state=balance_rng)
        for _, label_rows in train_df.groupby("label", sort=False)
    ],
    ignore_index=True,
)

# Sanity check: every label should now appear exactly `min_freq` times.
balanced_train_df["label"].value_counts()


hot dog     121
cabinet     121
house       121
platform    121
railroad    121
           ... 
broccoli    121
mouse       121
keyboard    121
horse       121
moss        121
Name: label, Length: 171, dtype: int64

In [5]:
# Single seed for the whole run.
SEED = 42

# seed_everything seeds Python's `random`, NumPy and PyTorch in one call;
# `workers=True` additionally lets Lightning derive distinct, reproducible seeds
# for the DataLoader worker processes. The trailing `;` hides the returned seed.
pl.seed_everything(SEED, workers=True);

In [6]:
# Show which image and mask folders the training data is read from.
print(
    TrainParams.DATASET_IMAGE_FOLDER_TRAIN,
    TrainParams.DATASET_MASK_FOLDER_TRAIN,
)

ProcessedDatasetStuff512/images/train/ ProcessedDatasetStuff512/masks/train/


In [7]:
# Both splits are built from the same balanced frame and the same train image/mask
# folders; only the seen/unseen filter flags differ between them.
shared_dataset_kwargs = dict(
    df=balanced_train_df,
    image_folder=TrainParams.DATASET_IMAGE_FOLDER_TRAIN,
    mask_folder=TrainParams.DATASET_MASK_FOLDER_TRAIN,
    image_size=TrainParams.IMAGE_DIM,
    mask_size=TrainParams.MASK_SIZE,
    templates=TrainParams.TEMPLATES,
    unseen_classes=TrainParams.UNSEEN_CLASSES,
    image_processor=clip_processor,
    tokenizer=clip_processor.tokenizer,
)

# NOTE(review): presumably `filter_seen=True` keeps only the seen classes for
# training and `filter_unseen=True` keeps only the unseen ones for validation —
# confirm against ZeroShotDataset.
train_dataset = ZeroShotDataset(
    **shared_dataset_kwargs,
    filter_unseen=False,
    filter_seen=True,
)

val_dataset = ZeroShotDataset(
    **shared_dataset_kwargs,
    filter_unseen=True,
    filter_seen=False,
)

In [8]:
# Echo the DataLoader settings taken from TrainParams.
print(f"Batch size: {TrainParams.BATCH_SIZE}")
print(f"Num workers: {TrainParams.NUM_WORKERS}")

Batch size: 8
Num workers: 1


In [9]:
# Report how many samples each split contains.
train_sample_count = len(train_dataset)
print(f"Number of training images: {train_sample_count}")
val_sample_count = len(val_dataset)
print(f"Number of val images: {val_sample_count}")

Number of training images: 18997
Number of val images: 1694


In [10]:
# Settings common to both loaders.
shared_loader_kwargs = dict(
    batch_size=TrainParams.BATCH_SIZE,
    num_workers=TrainParams.NUM_WORKERS,
)

# Training batches are shuffled; validation batches keep dataset order.
# Each loader uses the collate function of its own dataset.
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    shuffle=True,
    collate_fn=train_dataset.collate_fn,
    **shared_loader_kwargs,
)
val_loader = torch.utils.data.DataLoader(
    val_dataset,
    shuffle=False,
    collate_fn=val_dataset.collate_fn,
    **shared_loader_kwargs,
)

# Model to be trained, built with its default constructor arguments.
test_model = CLIPConditionedSegFormer()

In [11]:
# Select the 'medium' float32 matmul precision mode.
# NOTE(review): per the PyTorch docs this trades some float32 matmul accuracy for
# speed on supported GPUs — confirm the reduced precision is acceptable here.
torch.set_float32_matmul_precision('medium')

In [12]:
# Keep the three checkpoints with the highest monitored 'val_iou' plus the most
# recent one. `dirpath` is intentionally left unset, so the default location is used.
checkpoint_callback = ModelCheckpoint(
    filename='transformer-{epoch:02d}-{val_loss:.3f}-{val_iou:.2f}',
    monitor='val_iou',
    mode='max',
    save_top_k=3,
    save_last=True,
    verbose=True,
)

# Log the learning rate at every optimisation step.
lr_monitor = LearningRateMonitor(logging_interval='step')

# GPU trainer capped at 30 epochs with the two callbacks above.
trainer = pl.Trainer(
    accelerator='gpu',
    max_epochs=30,
    callbacks=[checkpoint_callback, lr_monitor],
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [13]:
# Run training, validating on `val_loader`.
trainer.fit(
    model=test_model,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type                 | Params
---------------------------------------------------
0 | clip      | CLIPModel            | 149 M 
1 | segformer | ConditionedSegFormer | 18.5 M
2 | neloss    | NELoss               | 0     
3 | acc       | Accuracy             | 0     
4 | dice      | DiceLoss             | 0     
5 | iou       | IoULoss              | 0     
6 | f1score   | F1Score              | 0     
---------------------------------------------------
18.5 M    Trainable params
149 M     Non-trainable params
168 M     Total params
672.563   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]