In [1]:
import os
import pytorch_lightning
import pytorchvideo.data
import torch.utils.data
from torch.utils.data import DistributedSampler
from pytorchvideo.transforms import ApplyTransformToKey
from torchvision.transforms import Compose, RandomCrop

In [3]:
class DataLoader(pytorch_lightning.LightningDataModule):
    _DATA_PATH = "D:\\AlphaAI\\kinetics400\\kinetics400_1\\"
    _CLIP_DURATION = 5
    _BATCH_SIZE = 8
    def __init__(self):
        super().__init__()
        self.transform = Compose([ApplyTransformToKey(
                key="video",
                transform=Compose([RandomCrop(128)]))]
            )
    def train_dataloader(self):
        train_dataset = pytorchvideo.data.Kinetics(
            data_path=os.path.join(self._DATA_PATH, "train"),
            clip_sampler=pytorchvideo.data.make_clip_sampler("random", self._CLIP_DURATION),
            decode_audio=False,
            transform=self.transform,
        )
        return torch.utils.data.DataLoader(train_dataset,batch_size=self._BATCH_SIZE)

    def val_dataloader(self):
        val_dataset = pytorchvideo.data.Kinetics(
            data_path=os.path.join(self._DATA_PATH, "valid"),
            clip_sampler=pytorchvideo.data.make_clip_sampler("uniform", self._CLIP_DURATION),
            decode_audio=False,
            transform=self.transform,
        )
        return torch.utils.data.DataLoader(val_dataset,batch_size=self._BATCH_SIZE)

In [4]:

import pytorchvideo.models.resnet

In [10]:
def make_kinetics_resnet():
  return pytorchvideo.models.resnet.create_resnet(
      input_channel=3,
      model_depth=50,
      model_num_class=2,
      norm=nn.BatchNorm3d,
      activation=nn.ReLU,
  )

In [11]:

import torch
import torch.nn as nn
import torch.nn.functional as F

In [12]:
class VideoClassify(pytorch_lightning.LightningModule):
  def __init__(self):
      super().__init__()
      self.model = make_kinetics_resnet()

  def forward(self, x):
      return self.model(x)

  def training_step(self, batch, batch_idx):
      y_hat = self.model(batch["video"])
      loss = F.cross_entropy(y_hat, batch["label"])
      self.log("train_loss", loss.item())
      return loss

  def validation_step(self, batch, batch_idx):
      y_hat = self.model(batch["video"])
      loss = F.cross_entropy(y_hat, batch["label"])
      self.log("val_loss", loss)
      return loss

  def configure_optimizers(self):
      return torch.optim.Adam(self.parameters(), lr=1e-1)

In [13]:
def train():
    classification_module = VideoClassify()
    data_module = DataLoader()
    trainer = pytorch_lightning.Trainer(gpus=1)
    trainer.fit(classification_module, data_module)

In [14]:
if __name__ ==  '__main__':
    train()

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type | Params
-------------------------------
0 | model | Net  | 31.7 M
-------------------------------
31.7 M    Trainable params
0         Non-trainable params
31.7 M    Total params
126.630   Total estimated model params size (MB)


Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]

  rank_zero_warn(


RuntimeError: stack expects each tensor to be equal size, but got [3, 128, 128, 128] at entry 0 and [3, 127, 128, 128] at entry 1