In [1]:
import pickle
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from torchvision import transforms
import numpy as np
from einops import rearrange

In [2]:
with open('test_set.dat', "rb") as training_file:
    train_set_data = pickle.load(training_file)

In [3]:
import math
import torch
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F
from torch.autograd import Variable
from functools import partial


class C3D(nn.Module):

    """
    This is the c3d implementation with batch norm.

    [1] Tran, Du, et al. "Learning spatiotemporal features with 3d convolutional networks."
    Proceedings of the IEEE international conference on computer vision. 2015.
    """

    def __init__(self,num_classes=10, in_channels=3):

        super(C3D, self).__init__()
        self.group1 = nn.Sequential(
            nn.Conv3d(in_channels, 64, kernel_size=3, padding=1),
            nn.BatchNorm3d(64),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(1, 2, 2)))
        self.group2 = nn.Sequential(
            nn.Conv3d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm3d(128),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2)))
        self.group3 = nn.Sequential(
            nn.Conv3d(128, 256, kernel_size=3, padding=1),
            nn.BatchNorm3d(256),
            nn.ReLU(),
            nn.Conv3d(256, 256, kernel_size=3, padding=1),
            nn.BatchNorm3d(256),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2)))
        self.group4 = nn.Sequential(
            nn.Conv3d(256, 512, kernel_size=3, padding=1),
            nn.BatchNorm3d(512),
            nn.ReLU(),
            nn.Conv3d(512, 512, kernel_size=3, padding=1),
            nn.BatchNorm3d(512),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2)))
        self.group5 = nn.Sequential(
            nn.Conv3d(512, 512, kernel_size=3, padding=1),
            nn.BatchNorm3d(512),
            nn.ReLU(),
            nn.Conv3d(512, 512, kernel_size=3, padding=1),
            nn.BatchNorm3d(512),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(2, 2, 2), padding=(0, 1, 1)))

        self.fc1 = nn.Sequential(
            nn.Linear(65536, 4096),
            nn.ReLU(),
            nn.Dropout(0.5))
        self.fc2 = nn.Sequential(
            nn.Linear(4096, 4096),
            nn.ReLU(),
            nn.Dropout(0.5))
        self.fc = nn.Sequential(
            nn.Linear(4096, num_classes))         

    def forward(self, x):
        out = self.group1(x)
        out = self.group2(out)
        out = self.group3(out)
        out = self.group4(out)
        out = self.group5(out)
        out = out.view(out.size(0), -1)
        out = self.fc1(out)
        out = self.fc2(out)
        out = self.fc(out)
        return out


In [4]:
USE_CUDA = torch.cuda.is_available()
print(USE_CUDA)

device = torch.device('cuda:1' if USE_CUDA else 'cpu')
print('학습을 진행하는 기기:', device)

True
학습을 진행하는 기기: cuda:1


In [5]:
cnn = C3D(num_classes=10, in_channels=3).to(device)
criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(cnn.parameters(), lr=0.01)

In [6]:
from torchinfo import summary

summary(cnn, input_size = (16,3,30,224,224), col_names = ['input_size','output_size','num_params'])

Layer (type:depth-idx)                   Input Shape               Output Shape              Param #
C3D                                      [16, 3, 30, 224, 224]     [16, 10]                  --
├─Sequential: 1-1                        [16, 3, 30, 224, 224]     [16, 64, 29, 112, 112]    --
│    └─Conv3d: 2-1                       [16, 3, 30, 224, 224]     [16, 64, 30, 224, 224]    5,248
│    └─BatchNorm3d: 2-2                  [16, 64, 30, 224, 224]    [16, 64, 30, 224, 224]    128
│    └─ReLU: 2-3                         [16, 64, 30, 224, 224]    [16, 64, 30, 224, 224]    --
│    └─MaxPool3d: 2-4                    [16, 64, 30, 224, 224]    [16, 64, 29, 112, 112]    --
├─Sequential: 1-2                        [16, 64, 29, 112, 112]    [16, 128, 14, 56, 56]     --
│    └─Conv3d: 2-5                       [16, 64, 29, 112, 112]    [16, 128, 29, 112, 112]   221,312
│    └─BatchNorm3d: 2-6                  [16, 128, 29, 112, 112]   [16, 128, 29, 112, 112]   256
│    └─ReLU: 2-7         

In [7]:
transform = transforms.Compose([
    # transforms.ToPILImage(),
    # transforms.CenterCrop(224),
    transforms.ToTensor(),
    # transforms.Normalize(mean=[0.485, 0.456, 0.406],
    #                      std=[0.229, 0.224, 0.225]),
])


In [8]:
class SignLanGuageDataset(Dataset):
    def __init__(self,imagedata,tagdata,transform):
        self.imagedata=imagedata
        self.tagdata=tagdata
        self.transform=transform

    def __len__(self):
        return len(self.imagedata)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        image_data=(self.imagedata[idx])
        image_data=torch.FloatTensor(image_data)
        label=self.tagdata[idx]
        label = torch.FloatTensor(label)
        return image_data,label

In [9]:
batchsz = 16
num_workerssz = 4
epochs = 120

In [10]:
train_dataset = SignLanGuageDataset(imagedata=train_set_data[0],tagdata=train_set_data[1],transform=transform)
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batchsz, shuffle=True, num_workers=num_workerssz)

In [12]:
with torch.cuda.device(1):
    cnn.train()
    for epoch in range(epochs):
        for data, target in tqdm(train_dataloader):
            data = rearrange(data, 'b d h w c -> b c d h w')
            data = data.to(device)
            optimizer.zero_grad()
            output = cnn(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

  0%|          | 0/2 [00:00<?, ?it/s]


RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/ssrlab/anaconda3/envs/kyuwon_video_swin_transformer/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/ssrlab/anaconda3/envs/kyuwon_video_swin_transformer/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
    return self.collate_fn(data)
  File "/home/ssrlab/anaconda3/envs/kyuwon_video_swin_transformer/lib/python3.10/site-packages/torch/utils/data/_utils/collate.py", line 265, in default_collate
    return collate(batch, collate_fn_map=default_collate_fn_map)
  File "/home/ssrlab/anaconda3/envs/kyuwon_video_swin_transformer/lib/python3.10/site-packages/torch/utils/data/_utils/collate.py", line 142, in collate
    return [collate(samples, collate_fn_map=collate_fn_map) for samples in transposed]  # Backwards compatibility.
  File "/home/ssrlab/anaconda3/envs/kyuwon_video_swin_transformer/lib/python3.10/site-packages/torch/utils/data/_utils/collate.py", line 142, in <listcomp>
    return [collate(samples, collate_fn_map=collate_fn_map) for samples in transposed]  # Backwards compatibility.
  File "/home/ssrlab/anaconda3/envs/kyuwon_video_swin_transformer/lib/python3.10/site-packages/torch/utils/data/_utils/collate.py", line 119, in collate
    return collate_fn_map[elem_type](batch, collate_fn_map=collate_fn_map)
  File "/home/ssrlab/anaconda3/envs/kyuwon_video_swin_transformer/lib/python3.10/site-packages/torch/utils/data/_utils/collate.py", line 162, in collate_tensor_fn
    return torch.stack(batch, 0, out=out)
RuntimeError: stack expects each tensor to be equal size, but got [4] at entry 0 and [2] at entry 1


In [None]:
input_var = Variable(torch.randn(8, 3, 30, 224, 224)).to(device)
output = cnn(input_var)
print(output.shape)

torch.Size([8, 10])
