In [1]:
import random
import pandas as pd
import numpy as np
import os
import cv2

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

#import albumentations as A
#from albumentations.pytorch.transforms import ToTensorV2
import torchvision.models as models

from tqdm.auto import tqdm

import warnings
warnings.filterwarnings(action='ignore') 

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from torchsummary import summary

In [2]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [3]:
df = pd.read_csv('./train.csv')

In [4]:
df

Unnamed: 0,id,path,label
0,TRAIN_000,./train/TRAIN_000.mp4,3
1,TRAIN_001,./train/TRAIN_001.mp4,0
2,TRAIN_002,./train/TRAIN_002.mp4,1
3,TRAIN_003,./train/TRAIN_003.mp4,4
4,TRAIN_004,./train/TRAIN_004.mp4,4
...,...,...,...
605,TRAIN_605,./train/TRAIN_605.mp4,0
606,TRAIN_606,./train/TRAIN_606.mp4,2
607,TRAIN_607,./train/TRAIN_607.mp4,1
608,TRAIN_608,./train/TRAIN_608.mp4,4


In [5]:
class CustomDataset(Dataset):
    def __init__(self, video_path_list, label_list):
        self.video_path_list = video_path_list
        self.label_list = label_list
        
    def __getitem__(self, index):
        frames = self.get_video(self.video_path_list[index])
        
        if self.label_list is not None:
            label = self.label_list[index]
            return frames, label
        else:
            return frames
        
    def __len__(self):
        return len(self.video_path_list)
    
    def get_video(self, path):
        frames = []
        cap = cv2.VideoCapture(path)
        for _ in range(CFG['FPS']):
            _, img = cap.read()
            img = cv2.resize(img, (CFG['IMG_SIZE'], CFG['IMG_SIZE']))
            img = img / 255.
            frames.append(img)
        return torch.FloatTensor(np.array(frames)).permute(3, 0, 1, 2)


In [6]:
CFG = {
    'FPS':30,
    'IMG_SIZE':128,
    'EPOCHS':10,
    'LEARNING_RATE':3e-4,
    'BATCH_SIZE':4,
    'SEED':41
}

In [7]:
def get_video(path):
        frames = []
        cap = cv2.VideoCapture(path)
        for _ in range(CFG['FPS']):
            _, img = cap.read()
            img = cv2.resize(img, (CFG['IMG_SIZE'], CFG['IMG_SIZE']))
            frames.append(img)
        return torch.FloatTensor(np.array(frames)).permute(0, 3, 1, 2)

In [8]:
train, val, _, _ = train_test_split(df, df['label'], test_size=0.2, random_state=CFG['SEED'])

In [9]:
df

Unnamed: 0,id,path,label
0,TRAIN_000,./train/TRAIN_000.mp4,3
1,TRAIN_001,./train/TRAIN_001.mp4,0
2,TRAIN_002,./train/TRAIN_002.mp4,1
3,TRAIN_003,./train/TRAIN_003.mp4,4
4,TRAIN_004,./train/TRAIN_004.mp4,4
...,...,...,...
605,TRAIN_605,./train/TRAIN_605.mp4,0
606,TRAIN_606,./train/TRAIN_606.mp4,2
607,TRAIN_607,./train/TRAIN_607.mp4,1
608,TRAIN_608,./train/TRAIN_608.mp4,4


In [10]:
train_dataset = CustomDataset(train['path'].values, train['label'].values)
train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True, num_workers=4)

val_dataset = CustomDataset(val['path'].values, val['label'].values)
val_loader = DataLoader(val_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=4)

In [11]:
def train(model, optimizer, train_loader, val_loader, scheduler, device):
    #model을 cuda로 불러오기
    model.to(device)
    criterion = nn.CrossEntropyLoss().to(device)
    
    best_val_score = 0
    best_model = None
    
    for epoch in range(1, CFG['EPOCHS']+1):
        model.train()
        train_loss = []
        for videos, labels in tqdm(iter(train_loader)):
            #videos는 dataset(video)
            #label은 video 의 label
            videos = videos.to(device)
            labels = labels.to(device)
            
            #predict한 후 오차 계산
            #
            output = model(videos)
            loss = criterion(output, labels)
            
            #한 세트
            #grad를 0으로 바꾸고 역전파.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            train_loss.append(loss.item())
                    
        _val_loss, _val_score = validation(model, criterion, val_loader, device)
        _train_loss = np.mean(train_loss)
        print(f'Epoch [{epoch}], Train Loss : [{_train_loss:.5f}] Val Loss : [{_val_loss:.5f}] Val F1 : [{_val_score:.5f}]')
        
        if scheduler is not None:
            scheduler.step(_val_score)
            
        if best_val_score < _val_score:
            best_val_score = _val_score
            best_model = model
    
    return best_model


In [12]:
def validation(model, criterion, val_loader, device):
    model.eval()
    val_loss = []
    preds, trues = [], []
    
    with torch.no_grad():
        for videos, labels in tqdm(iter(val_loader)):
            videos = videos.to(device)
            print(videos.dtype)
            print(videos.shape)
            labels = labels.to(device)
            
            logit = model(videos)
            
            loss = criterion(logit, labels)
            
            val_loss.append(loss.item())
            
            preds += logit.argmax(1).detach().cpu().numpy().tolist()
            trues += labels.detach().cpu().numpy().tolist()
        
        _val_loss = np.mean(val_loss)
    
    _val_score = f1_score(trues, preds, average='macro')
    return _val_loss, _val_score


In [13]:
model = models.video.r3d_18(pretrained=True) # https://pytorch.org/vision/0.8/models.html
num_features = model.fc.in_features
model.fc = nn.Linear(num_features, 5)

model.eval()
optimizer = torch.optim.Adam(params = model.parameters(), lr = CFG["LEARNING_RATE"])
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2,threshold_mode='abs',min_lr=1e-8, verbose=True)

infer_model = train(model, optimizer, train_loader, val_loader, scheduler, device)

  0%|          | 0/122 [00:00<?, ?it/s]

RuntimeError: DataLoader worker (pid(s) 37588, 18136, 19612, 33068) exited unexpectedly

In [16]:
model

VideoResNet(
  (stem): BasicStem(
    (0): Conv3d(3, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2), padding=(1, 3, 3), bias=False)
    (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
  )
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU(inplace=True)
      )
      (conv2): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (relu): ReLU(inplace=True)
    )
    (1): BasicBlock(
      (conv1): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1):

In [17]:
summary(model, (3, 60, 124, 124))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv3d-1       [-1, 64, 60, 62, 62]          28,224
       BatchNorm3d-2       [-1, 64, 60, 62, 62]             128
              ReLU-3       [-1, 64, 60, 62, 62]               0
      Conv3DSimple-4       [-1, 64, 60, 62, 62]         110,592
       BatchNorm3d-5       [-1, 64, 60, 62, 62]             128
              ReLU-6       [-1, 64, 60, 62, 62]               0
      Conv3DSimple-7       [-1, 64, 60, 62, 62]         110,592
       BatchNorm3d-8       [-1, 64, 60, 62, 62]             128
              ReLU-9       [-1, 64, 60, 62, 62]               0
       BasicBlock-10       [-1, 64, 60, 62, 62]               0
     Conv3DSimple-11       [-1, 64, 60, 62, 62]         110,592
      BatchNorm3d-12       [-1, 64, 60, 62, 62]             128
             ReLU-13       [-1, 64, 60, 62, 62]               0
     Conv3DSimple-14       [-1, 64, 60,

In [51]:
test = pd.read_csv('./test.csv')


In [52]:
test_dataset = CustomDataset(test['path'].values, None)
test_loader = DataLoader(test_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)


In [53]:
def inference(model, test_loader, device):
    model.to(device)
    model.eval()
    preds = []
    with torch.no_grad():
        for videos in tqdm(iter(test_loader)):
            videos = videos.to(device)
            logit = model(videos)

            preds += logit.argmax(1).detach().cpu().numpy().tolist()
    return preds


In [54]:
preds = inference(model, test_loader, device)

  0%|          | 0/39 [00:00<?, ?it/s]

In [55]:
submit = pd.read_csv('./sample_submission.csv')


In [56]:
submit['label'] = preds
submit.head()


Unnamed: 0,id,label
0,TEST_000,1
1,TEST_001,3
2,TEST_002,0
3,TEST_003,2
4,TEST_004,4


In [57]:
submit.to_csv('./baseline_submit.csv', index=False) CNN + LSTM