## Import

In [1]:
import random
import pandas as pd
import numpy as np
import os
import cv2

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2
from torchvision.ops import StochasticDepth, sigmoid_focal_loss
import torchvision.models as models
from einops.layers.torch import Rearrange

from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings(action='ignore') 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device('cuda:1') if torch.cuda.is_available() else torch.device('cpu')

## Hyperparameter Setting

In [3]:
CFG = {
    'VIDEO_LENGTH':50, # 10프레임 * 5초
    'IMG_SIZE':256,
    'EPOCHS':10,
    'LEARNING_RATE':3e-4,
    'BATCH_SIZE':4,
    'SEED':41
}

## Fixed RandomSeed

In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CFG['SEED']) # Seed 고정

## Data Load

In [5]:
df = pd.read_csv('/root/Competitions/DACON/Carcarsh_video_classification/data/train.csv')

## Train / Validation Split

In [6]:
train, val, _, _ = train_test_split(df, df['label'], test_size=0.2, random_state=CFG['SEED'])

## CustomDataset

In [7]:
class CustomDataset(Dataset):
    def __init__(self, video_path_list, label_list):
        self.video_path_list = video_path_list
        self.label_list = label_list
        
    def __getitem__(self, index):
        frames = self.get_video(self.video_path_list[index])
        
        if self.label_list is not None:
            label = self.label_list[index]
            return frames, label
        else:
            return frames
        
    def __len__(self):
        return len(self.video_path_list)
    
    def get_video(self, path):
        frames = []
        path =  f'/root/Competitions/DACON/Carcarsh_video_classification/data/{path[2:]}'
        cap = cv2.VideoCapture(path)
        for _ in range(CFG['VIDEO_LENGTH']):
            _, img = cap.read()
            img = cv2.resize(img, (CFG['IMG_SIZE'], CFG['IMG_SIZE']))
            img = img / 255.
            frames.append(img)
        return torch.FloatTensor(np.array(frames)).permute(3, 0, 1, 2)

In [8]:
train_dataset = CustomDataset(train['video_path'].values, train['label'].values)
train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True, num_workers=0)

val_dataset = CustomDataset(val['video_path'].values, val['label'].values)
val_loader = DataLoader(val_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

## Model Define

In [9]:
class SqueezeExcitation(nn.Module):
    def __init__(self, in_dim, sqz_dim) -> None:
        super(SqueezeExcitation, self).__init__()

        self.pool = nn.AdaptiveAvgPool3d(output_size=1)
        self.fc1 = nn.Conv3d(in_dim, sqz_dim, kernel_size=1, stride=1)
        self.fc2 = nn.Conv3d(sqz_dim, in_dim, kernel_size=1, stride=1)
        self.act = nn.SiLU()
        self.scale_act = nn.Sigmoid()

    
    def forward(self, x):

        squeezed = self.pool(x)

        e = self.fc1(squeezed)
        e = self.act(e)
        e = self.fc2(e)
        e = self.scale_act(e)

        out = x * e
        
        return out

class MBConv(nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim, kernel_size, stride, padding, scale=True) -> None:
        super(MBConv, self).__init__()

        self.scale = scale

        if self.scale:
            self.bottleneck = nn.Sequential(
                nn.Conv3d(in_dim, hidden_dim, kernel_size=1, stride=1, bias=False),
                nn.BatchNorm3d(hidden_dim),
                nn.SiLU()
            )

            self.conv1 = nn.Sequential(
                nn.Conv3d(hidden_dim, hidden_dim, kernel_size=kernel_size, stride=stride, padding=padding, groups=hidden_dim, bias=False),
                nn.BatchNorm3d(hidden_dim),
                nn.SiLU()
            )
        else:
            self.conv1 = nn.Sequential(
                nn.Conv3d(in_dim, hidden_dim, kernel_size=kernel_size, stride=stride, padding=padding, groups=in_dim, bias=False),
                nn.BatchNorm3d(hidden_dim),
                nn.SiLU()
            )
        
        self.SqueezeExcitation = SqueezeExcitation(hidden_dim, 8 if hidden_dim == 32 else hidden_dim//24)

        self.conv2 = nn.Sequential(
            nn.Conv3d(hidden_dim, out_dim, kernel_size=1, stride=1, bias=False),
            nn.BatchNorm3d(out_dim)
        )
    
    def forward(self, x):

        if self.scale:
            x = self.bottleneck(x)
            
        h = self.conv1(x)
        h = self.SqueezeExcitation(h)
        h = self.conv2(h)

        return h



In [10]:
class efficientNet3D(nn.Module):
    def __init__(self, num_classes=13) -> None:
        super(efficientNet3D, self).__init__()

        self.conv1 = nn.Sequential(
            nn.Conv3d(3, 32, kernel_size=3, stride=2, padding=1, bias=False),
            nn.BatchNorm3d(32),
            nn.SiLU()
        )

        self.conv2 = nn.Sequential(
            MBConv(in_dim=32, hidden_dim=32, out_dim=16, kernel_size=3, stride=1, padding=1, scale=False),
            StochasticDepth(p=0.0, mode='row')
        )

        self.conv3 = nn.Sequential(
            MBConv(in_dim=16, hidden_dim=96, out_dim=24, kernel_size=3, stride=2, padding=1, scale=True),
            StochasticDepth(p=0.0125, mode='row'),
            MBConv(in_dim=24, hidden_dim=144, out_dim=24, kernel_size=3, stride=1, padding=1, scale=True),
            StochasticDepth(p=0.025, mode='row')
        )

        self.conv4 = nn.Sequential(
            MBConv(in_dim=24, hidden_dim=144, out_dim=40, kernel_size=5, stride=2, padding=2, scale=True),
            StochasticDepth(p=0.0375, mode='row'),
            MBConv(in_dim=40, hidden_dim=240, out_dim=40, kernel_size=5, stride=1, padding=2, scale=True),
            StochasticDepth(p=0.05, mode='row'),
        )

        self.conv5 = nn.Sequential(
            MBConv(in_dim=40, hidden_dim=240, out_dim=80, kernel_size=3, stride=2, padding=1, scale=True),
            StochasticDepth(p=0.0625, mode='row'),
            MBConv(in_dim=80, hidden_dim=480, out_dim=80, kernel_size=3, stride=1, padding=1, scale=True),
            StochasticDepth(p=0.075, mode='row'),
            MBConv(in_dim=80, hidden_dim=480, out_dim=80, kernel_size=3, stride=1, padding=1, scale=True),
            StochasticDepth(p=0.0875, mode='row'),
        )

        self.conv6 = nn.Sequential(
            MBConv(in_dim=80, hidden_dim=480, out_dim=112, kernel_size=5, stride=1, padding=2, scale=True),
            StochasticDepth(p=0.1, mode='row'),
            MBConv(in_dim=112, hidden_dim=672, out_dim=112, kernel_size=5, stride=1, padding=2, scale=True),
            StochasticDepth(p=0.1125, mode='row'),
            MBConv(in_dim=112, hidden_dim=672, out_dim=112, kernel_size=5, stride=1, padding=2, scale=True),
            StochasticDepth(p=0.125, mode='row'),
        )

        self.conv7 = nn.Sequential(
            MBConv(in_dim=112, hidden_dim=672, out_dim=192, kernel_size=5, stride=2, padding=2, scale=True),
            StochasticDepth(p=0.1375, mode='row'),
            MBConv(in_dim=192, hidden_dim=1152, out_dim=192, kernel_size=5, stride=1, padding=2, scale=True),
            StochasticDepth(p=0.15, mode='row'),
            MBConv(in_dim=192, hidden_dim=1152, out_dim=192, kernel_size=5, stride=1, padding=2, scale=True),
            StochasticDepth(p=0.1625, mode='row'),
            MBConv(in_dim=192, hidden_dim=1152, out_dim=192, kernel_size=5, stride=1, padding=2, scale=True),
            StochasticDepth(p=0.175, mode='row'),
        )

        self.conv8 = nn.Sequential(
            MBConv(in_dim=192, hidden_dim=1152, out_dim=320, kernel_size=3, stride=1, padding=1, scale=False),
            StochasticDepth(p=0.1875, mode='row')
        )

        self.conv9 = nn.Sequential(
            nn.Conv3d(320, 1280, kernel_size=1, stride=1, bias=False),
            nn.BatchNorm3d(1280),
            nn.SiLU()
        )

        self.pool = nn.AdaptiveAvgPool3d(output_size=1)
        self.drop = nn.Dropout(p=0.2)
        self.clf  = nn.Linear(1280, out_features=num_classes)
        
    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.conv5(x)
        x = self.conv6(x)
        x = self.conv7(x)
        x = self.conv8(x)
        x = self.conv9(x)

        p = self.pool(x)
        # print(p.shape)
        # p = Rearrange(p, 'b ')
        # p = Rearrange(p, ' c t w h -> (c t w h)')
        p = p.view(x.shape[0], -1)
        # [4, 1280, 1, 1, 1]

        out = self.clf(p)

        return out

        

In [11]:
class BaseModel(nn.Module):
    def __init__(self, num_classes=13):
        super(BaseModel, self).__init__()
        self.feature_extract = models.video.r3d_18(weights=models.video.R3D_18_Weights.DEFAULT)
        self.drop = nn.Dropout(p=0.2)
        self.act  = nn.SiLU()
        self.classifier = nn.Linear(400, num_classes)
        
    def forward(self, x):
        batch_size = x.size(0)
        x = self.feature_extract(x)
        # x = x.view(batch_size, -1)
        x = self.drop(x)
        x = self.act(x)
        x = self.classifier(x)
        return x

## Train

In [12]:
def train(model, optimizer, train_loader, val_loader, scheduler, device):
    model.to(device)
    criterion = FocalLoss().to(device)
    
    best_val_score = 0
    best_model = None
    
    for epoch in range(1, CFG['EPOCHS']+1):
        model.train()
        train_loss = []
        for videos, labels in tqdm(iter(train_loader)):
            videos = videos.to(device)
            labels = labels.to(device)
            
            optimizer.zero_grad()
            
            output = model(videos)
            loss = criterion(output, labels)
            
            loss.backward()
            optimizer.step()
            
            train_loss.append(loss.item())
                    
        _val_loss, _val_score = validation(model, criterion, val_loader, device)
        _train_loss = np.mean(train_loss)
        print(f'Epoch [{epoch}], Train Loss : [{_train_loss:.5f}] Val Loss : [{_val_loss:.5f}] Val F1 : [{_val_score:.5f}]')
        
        if scheduler is not None:
            scheduler.step(_val_score)
            
        if best_val_score < _val_score:
            best_val_score = _val_score
            best_model = model
    
    return best_model

In [13]:
def validation(model, criterion, val_loader, device):
    model.eval()
    val_loss = []
    preds, trues = [], []
    
    with torch.no_grad():
        for videos, labels in tqdm(iter(val_loader)):
            videos = videos.to(device)
            labels = labels.to(device)
            
            logit = model(videos)
            
            loss = criterion(logit, labels)
            
            val_loss.append(loss.item())
            
            preds += logit.argmax(1).detach().cpu().numpy().tolist()
            trues += labels.detach().cpu().numpy().tolist()
        
        _val_loss = np.mean(val_loss)
    
    _val_score = f1_score(trues, preds, average='macro')
    return _val_loss, _val_score

In [14]:
class FocalLoss(nn.Module):
    def __init__(self, alpha=0.25, gamma=2, logits=False, reduce=True):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.logits = logits
        self.reduce = reduce

    def forward(self, inputs, targets):
    
        ce_loss = nn.CrossEntropyLoss()(inputs, targets)

        pt = torch.exp(-ce_loss)
        F_loss = self.alpha * (1-pt)**self.gamma * ce_loss

        if self.reduce:
            return torch.mean(F_loss)
        else:
            return F_loss

## Run!!

In [15]:
# model = efficientNet3D()
model = BaseModel()
model.eval()
optimizer = torch.optim.Adam(params = model.parameters(), lr = CFG["LEARNING_RATE"])
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2,threshold_mode='abs',min_lr=1e-8, verbose=True)

infer_model = train(model, optimizer, train_loader, val_loader, scheduler, device)

100%|██████████| 540/540 [15:51<00:00,  1.76s/it]
100%|██████████| 135/135 [02:33<00:00,  1.14s/it]


Epoch [1], Train Loss : [0.11035] Val Loss : [0.07052] Val F1 : [0.17948]


100%|██████████| 540/540 [16:42<00:00,  1.86s/it]
100%|██████████| 135/135 [02:30<00:00,  1.12s/it]


Epoch [2], Train Loss : [0.07768] Val Loss : [0.04551] Val F1 : [0.20748]


100%|██████████| 540/540 [16:27<00:00,  1.83s/it]
100%|██████████| 135/135 [02:26<00:00,  1.09s/it]


Epoch [3], Train Loss : [0.06246] Val Loss : [0.05001] Val F1 : [0.23219]


100%|██████████| 540/540 [16:38<00:00,  1.85s/it]
100%|██████████| 135/135 [02:30<00:00,  1.12s/it]


Epoch [4], Train Loss : [0.06206] Val Loss : [0.04530] Val F1 : [0.27475]


 61%|██████▏   | 331/540 [10:04<06:26,  1.85s/it]

## Inference

In [None]:
test = pd.read_csv('/root/Competitions/DACON/Carcarsh_video_classification/data/test.csv')

In [None]:
test_dataset = CustomDataset(test['video_path'].values, None)
test_loader = DataLoader(test_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

In [None]:
def inference(model, test_loader, device):
    model.to(device)
    model.eval()
    preds = []
    with torch.no_grad():
        for videos in tqdm(iter(test_loader)):
            videos = videos.to(device)
            
            logit = model(videos)

            preds += logit.argmax(1).detach().cpu().numpy().tolist()
    return preds

In [None]:
preds = inference(model, test_loader, device)

## Submission

In [None]:
submit = pd.read_csv('/root/Competitions/DACON/Carcarsh_video_classification/data/sample_submission.csv')

In [None]:
submit['label'] = preds
submit.head()

In [None]:
submit.to_csv('/root/Competitions/DACON/Carcarsh_video_classification/prediction/efficientNet3d(x2 resolution)_submit.csv', index=False)