In [2]:
import os
import numpy as np
import torch
from torch import optim
import torch.nn as nn
import torch.nn.functional as F
# from torch.nn.functional import InterpolationMode
import torchvision.models as models
from torchvision import transforms, utils
from torch.utils.data import Dataset, DataLoader
import cv2
import torch
from pytorchvideo.data.encoded_video import EncodedVideo
import torchviz
from pytorchvideo.data import LabeledVideoDataset, make_clip_sampler
from torchvision.models import squeezenet1_1, SqueezeNet1_1_Weights
from torchvision import transforms

from pytorch_lightning import LightningModule, seed_everything, Trainer
from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor
from torch.optim.lr_scheduler import CosineAnnealingLR, ReduceLROnPlateau, CosineAnnealingWarmRestarts
from sklearn.metrics import classification_report
import torchmetrics

from pytorchvideo.data import labeled_video_dataset

from pytorchvideo.transforms import (
    ApplyTransformToKey,
    Normalize,
    RandomShortSideScale,
    UniformTemporalSubsample,
    Permute,   
)

from torchvision.transforms import (
    Compose,
    Lambda,
    RandomCrop,
    RandomAdjustSharpness,
    Resize,
    RandomHorizontalFlip
)

from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo
)



In [3]:

video_transform = Compose([
    ApplyTransformToKey(key="video",
    transform=Compose([
        UniformTemporalSubsample(25),
        Lambda(lambda x: x/255),
        Normalize((0.45, 0.45, 0.45), (0.225, 0.225, 0.225)),
        # RandomShortSideScale(min_size=256, max_size=512),
        CenterCropVideo(256),
        RandomHorizontalFlip(p=0.5),
    ]),
    ),
])

In [4]:

class VideoModel(LightningModule):
    def __init__(self):
        super(VideoModel, self).__init__()
        
        self.video_model = torch.hub.load('facebookresearch/pytorchvideo', 'efficient_x3d_xs', pretrained=True)
        self.relu = nn.ReLU()
        self.fc = nn.Linear(400, 1)

        self.lr = 1e-3
        self.batch_size = 8
        self.num_worker = 4
        self.num_steps_train = 0
        self.num_steps_val = 0

        # self.metric = torchmetrics.classification.MultilabelAccuracy(num_labels=num_classes)
        self.metric = torchmetrics.Accuracy()
        
        #loss
        self.criterion = nn.BCEWithLogitsLoss()

    def forward(self, x):
        x = self.video_model(x)
        x = self.relu(x)
        x = self.fc(x)
        return x

    def configure_optimizers(self):
        opt = torch.optim.AdamW(params=self.parameters(), lr = self.lr)
        scheduler = ReduceLROnPlateau(opt, mode="min", factor=0.05, patience=2, min_lr=1e-6)
        # scheduler = CosineAnnealingLR(opt, T_max=10, eta_min=1e-6, last_epoch=-1)
        return {'optimizer': opt,
                'lr_scheduler': scheduler, 
                "monitor": "val_loss"}



In [5]:


def word_level_prediction(path_to_model, frames_list):
    
    model = VideoModel()
    model.load_state_dict(torch.load(path_to_model))
    # model = VideoModel.load_from_checkpoint(
    # checkpoint_path="/home/toghrul/SLR/sign-lang/checkpoints/epoch=14-step=375.ckpt",
    # hparams_file="/home/toghrul/SLR/sign-lang/lightning_logs/version_45/hparams.yaml",
    # map_location=None,
    # )
    
    model = model.cuda()
    
    # print(video['label'])
    video = torch.stack(frames_list)
    video = video.permute(3, 0, 1, 2)
    video_data = {"video": video}
    video_data = video_transform(video_data)

    inputs = video_data["video"].cuda()
    inputs = inputs.unsqueeze(0)
    
    preds = model(inputs).detach().cpu().numpy()
    # preds = np.where(preds > 0, 1, 0)
    
    return preds

In [6]:
import cv2

In [7]:
model_path = "../models/eff3d_bin.pt"
video_path = "../data/binary-data/val/VAR/2022-05-21 15-53-16.mp4"

In [8]:
word_level_prediction(model_path, video_path)

Using cache found in /home/toghrul/.cache/torch/hub/facebookresearch_pytorchvideo_main


RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [22]:
cap = cv2.VideoCapture(video_path)
frames = []

if (cap.isOpened()== False): 
  print("Error opening video stream or file")
ret = True
# # Read until video is completed
while(ret):
  # Capture frame-by-frame
    ret, frame = cap.read()
    if ret:
        frames.append(torch.from_numpy(frame))
 
    # Display the resulting frame
    # cv2.imshow('Frame',frame)

    # Press Q on keyboard to  exit
    if cv2.waitKey(25) & 0xFF == ord('q'):
        break
 
cap.release()
 
cv2.destroyAllWindows()

In [25]:
video = torch.stack(frames)
print(video.shape)
video = video.permute(3, 0, 1, 2)
print(video.shape)

torch.Size([230, 960, 1280, 3])
torch.Size([3, 230, 960, 1280])


In [29]:
import pprint

In [35]:
pp = pprint.PrettyPrinter(indent=4)
frame = frames[0]
pp.pprint(frame)
print(frame.shape)
frame_perm = frames[0].permute(2, 0, 1)
pp.pprint(frame_perm)
print(frame_perm.shape)

tensor([[[210, 188, 185],
         [210, 188, 185],
         [206, 188, 185],
         ...,
         [174, 171, 170],
         [174, 171, 170],
         [174, 171, 170]],

        [[210, 188, 185],
         [210, 188, 185],
         [206, 188, 185],
         ...,
         [174, 171, 170],
         [174, 171, 170],
         [174, 171, 170]],

        [[210, 188, 185],
         [210, 188, 185],
         [206, 188, 185],
         ...,
         [174, 171, 170],
         [174, 171, 170],
         [174, 171, 170]],

        ...,

        [[144, 141, 142],
         [144, 141, 142],
         [142, 139, 140],
         ...,
         [ 94, 100, 109],
         [ 95, 101, 110],
         [ 96, 102, 111]],

        [[160, 157, 158],
         [160, 157, 158],
         [159, 156, 157],
         ...,
         [ 99, 105, 114],
         [ 98, 104, 113],
         [ 96, 102, 111]],

        [[154, 151, 152],
         [154, 151, 152],
         [153, 150, 151],
         ...,
         [106, 112, 121],
        

In [1]:
from pytorchvideo.data.encoded_video import EncodedVideo

In [9]:
video_path = "../data/cam2/100/2022-05-31 12-49-06.mp4"

In [15]:
data = cv2.VideoCapture(video_path)

# count the number of frames
frames = data.get(cv2.CAP_PROP_FRAME_COUNT)
fps = data.get(cv2.CAP_PROP_FPS)

# calculate duration of the video
seconds = round(frames / fps)
seconds

8

In [16]:
video = EncodedVideo.from_path(video_path)

video_data = video.get_clip(0, 9)
print(video_data['video'].shape)
# video_data = video_transform(video_data)

# inputs = video_data["video"].cuda()
# inputs = inputs.unsqueeze(0)
# inputs.shape

torch.Size([3, 230, 960, 1280])


In [19]:
clip_duration = 50

In [20]:
for i in range(0, video_data['video'].shape[1], clip_duration):
    
    video = video_data['video'][:, i: i+clip_duration, :, :]
    print(video.shape)

torch.Size([3, 50, 960, 1280])
torch.Size([3, 50, 960, 1280])
torch.Size([3, 50, 960, 1280])
torch.Size([3, 50, 960, 1280])
torch.Size([3, 30, 960, 1280])


In [46]:
video_data = {"video": video}

model = VideoModel()
# model.load_state_dict(torch.load(path_to_model))

model = VideoModel.load_from_checkpoint(
checkpoint_path="/home/toghrul/SLR/sign-lang/checkpoints/epoch=14-step=375.ckpt",
hparams_file="/home/toghrul/SLR/sign-lang/lightning_logs/version_45/hparams.yaml",
map_location=None,
)

model = model.cuda()

# video = EncodedVideo.from_path(path_to_video)
# print(video['label'])
# video_data = video.get_clip(0, 1)
video_data = video_transform(video_data)

inputs = video_data["video"].cuda()
inputs = inputs.unsqueeze(0)

preds = model(inputs).detach().cpu().numpy()
preds

Using cache found in /home/toghrul/.cache/torch/hub/facebookresearch_pytorchvideo_main
Using cache found in /home/toghrul/.cache/torch/hub/facebookresearch_pytorchvideo_main


array([[-5.427988]], dtype=float32)

In [47]:
video_data['video'].shape

torch.Size([3, 25, 256, 256])