In [1]:
# Mount Google Drive (Colab only) so that the paths under /content/drive used below resolve.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import sys

# Make the project's modules stored on Drive importable.
# Guarded so that re-running this cell does not keep appending duplicate entries.
MODULE_DIR = '/content/drive/MyDrive/hackathon_swf/module/'
if MODULE_DIR not in sys.path:
    sys.path.append(MODULE_DIR)

In [3]:
# Enable IPython's autoreload extension in mode 2 so edits to imported modules are picked up without restarting.
%load_ext autoreload
%autoreload 2

In [4]:
import cv2
import pandas as pd
import numpy as np

from pathlib import Path

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

import torchvision
from torchvision import transforms

from model import EventDetector
from eval import ToTensor, Normalize

In [5]:
# Build the event detector and restore the trained weights stored on Drive.
detector_kwargs = dict(
    pretrain=True,
    width_mult=1.,
    lstm_layers=1,
    lstm_hidden=256,
    bidirectional=True,
    dropout=False,
)
model = EventDetector(**detector_kwargs)

save_dict = torch.load('/content/drive/MyDrive/hackathon_swf/weights/swingnet_1800.pth.tar')
model.load_state_dict(save_dict['model_state_dict'])

# Move the network to the GPU and switch it to evaluation mode; the trailing
# expression is what makes the notebook display the module summary.
model.cuda()
model.eval()

EventDetector(
  (cnn): Sequential(
    (0): Sequential(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU6(inplace=True)
    )
    (1): InvertedResidual(
      (conv): Sequential(
        (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
        (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
        (3): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (4): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (2): InvertedResidual(
      (conv): Sequential(
        (0): Conv2d(16, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
        (3

In [26]:
class SampleVideo(Dataset):
    """Single-item dataset that decodes one video file into preprocessed frames.

    Each frame is scaled so that its longer side matches ``input_size`` (aspect
    ratio preserved), padded to an ``input_size`` x ``input_size`` square with
    the ImageNet mean colour, and converted from BGR to RGB.

    Args:
        path: Location of the video file, handed to ``cv2.VideoCapture``.
        input_size: Side length, in pixels, of the square output frames.
        transform: Optional callable applied to the sample dict before it is
            returned.
    """

    def __init__(self, path, input_size=160, transform=None):
        self.path = path
        self.input_size = input_size
        self.transform = transform

    def __len__(self):
        """Return 1: the whole video is the only sample."""
        return 1

    def __getitem__(self, idx):
        """Decode and preprocess the video.

        Args:
            idx: Ignored, because the dataset holds a single sample.

        Returns:
            A dict with ``'images'`` (the stacked preprocessed RGB frames) and
            ``'labels'`` (an all-zero placeholder, one entry per frame), passed
            through ``transform`` when one was given.

        Raises:
            IOError: If the video cannot be opened or reports an empty frame size.
        """
        cap = cv2.VideoCapture(self.path)
        try:
            # Fail with a clear message instead of a ZeroDivisionError further down.
            if not cap.isOpened():
                raise IOError('Could not open video: {}'.format(self.path))

            frame_size = [cap.get(cv2.CAP_PROP_FRAME_HEIGHT), cap.get(cv2.CAP_PROP_FRAME_WIDTH)]
            if max(frame_size) <= 0:
                raise IOError('Video reports an empty frame size: {}'.format(self.path))

            ratio = self.input_size / max(frame_size)
            new_size = tuple([int(x * ratio) for x in frame_size])
            delta_w = self.input_size - new_size[1]
            delta_h = self.input_size - new_size[0]
            top, bottom = delta_h // 2, delta_h - (delta_h // 2)
            left, right = delta_w // 2, delta_w - (delta_w // 2)
            border_value = [0.406 * 255, 0.456 * 255, 0.485 * 255]  # ImageNet means (BGR)

            # preprocess and return frames
            images = []
            for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
                ret, img = cap.read()
                if not ret:
                    break
                # cv2.resize expects (width, height)
                resized = cv2.resize(img, (new_size[1], new_size[0]))
                b_img = cv2.copyMakeBorder(resized, top, bottom, left, right, cv2.BORDER_CONSTANT,
                                           value=border_value)
                images.append(cv2.cvtColor(b_img, cv2.COLOR_BGR2RGB))
        finally:
            # Release the capture even when decoding fails part-way through.
            cap.release()

        labels = np.zeros(len(images))  # only for compatibility with transforms
        sample = {'images': np.asarray(images), 'labels': np.asarray(labels)}
        if self.transform:
            sample = self.transform(sample)
        return sample

In [27]:
# Preprocessing pipeline: ToTensor followed by Normalize with the ImageNet
# channel statistics.
preprocess = transforms.Compose([
    ToTensor(),
    Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])
ds = SampleVideo('/content/drive/MyDrive/hackathon_swf/test_video_golf.mp4', transform=preprocess)

# The dataset exposes the whole video as its single sample, so one batch holds it all.
dl = DataLoader(ds, batch_size=1, shuffle=False, drop_last=False)


In [28]:
# Number of frames sent through the model per forward pass.
seq_length = 64

In [29]:
print('Testing...')
# Inference only: no_grad() stops autograd from recording a graph for every
# chunk, which would otherwise hold on to GPU memory for nothing.
with torch.no_grad():
    for sample in dl:
        images = sample['images']
        # full samples do not fit into GPU memory so evaluate sample in 'seq_length' batches
        chunk_probs = []
        for start in range(0, images.shape[1], seq_length):
            # Slicing clamps at the end of the video, so the final short chunk needs no special case.
            image_batch = images[:, start:start + seq_length, :, :, :]
            logits = model(image_batch.cuda())
            chunk_probs.append(F.softmax(logits, dim=1).cpu().numpy())
        # Concatenate once instead of re-copying the whole array per chunk with np.append.
        probs = np.concatenate(chunk_probs, axis=0)

# For each class column pick the frame with the highest probability; the last
# column is dropped (assumed to be the "no event" class - TODO confirm).
events = np.argmax(probs, axis=0)[:-1]
print('Predicted event frames: {}'.format(events))

# Probability of each event class at its selected frame.
confidence = [probs[e, i] for i, e in enumerate(events)]
print('Condifence: {}'.format([np.round(c, 3) for c in confidence]))


Testing...
Predicted event frames: [127 249 255 320 256 328 528 448]
Condifence: [0.008, 0.015, 0.017, 0.006, 0.077, 0.03, 0.089, 0.018]


### ONNX export and test

In [None]:
import torch.onnx

# Dummy input for the export, shaped like the chunks fed to the model during inference.
X = torch.randn(1, 64, 3, 160, 160).cuda()
# Only the number of frames per chunk varies at inference time. That is axis 1
# of the input and axis 0 of the output (the exported model returns one row of
# class scores per frame), so those are the axes to declare dynamic.
dynamic_axes = {'input': {1: 'seq_length'}, 'output': {0: 'seq_length'}}
torch.onnx.export(model, X, "swing_stages.onnx", input_names=["input"], output_names=["output"], dynamic_axes=dynamic_axes)



verbose: False, log level: Level.ERROR



In [None]:
import onnx

# Load the exported file back and run ONNX's model checker on it.
onnx_model = onnx.load("swing_stages.onnx")
onnx.checker.check_model(onnx_model)

In [None]:
import onnxruntime as ort

In [None]:
# Create an ONNX Runtime session for the exported model, requesting the CUDA execution provider.
ort_sess = ort.InferenceSession('swing_stages.onnx', providers=['CUDAExecutionProvider'])

In [None]:
from scipy.special import softmax

In [None]:
for sample in dl:
    images = sample['images']
    # The whole video is too large for one pass, so feed it in windows of 'seq_length' frames.
    for batch, start in enumerate(range(0, images.shape[1], seq_length)):
        # Slicing past the end is clamped, which covers the final, shorter window.
        image_batch = images[:, start:start + seq_length, :, :, :]
        logits = ort_sess.run(["output"], {'input': image_batch.numpy()})[0]

        window_probs = softmax(logits, axis=1)
        probs = window_probs if batch == 0 else np.append(probs, window_probs, 0)

# Frame index with the highest probability per class column, last column excluded.
events = np.argmax(probs, axis=0)[:-1]
print('Predicted event frames: {}'.format(events))

# Probability of each class at the frame selected for it.
confidence = [probs[frame, column] for column, frame in enumerate(events)]
print('Condifence: {}'.format([np.round(c, 3) for c in confidence]))

Predicted event frames: [ 74  86  98 114 132 143 151 236]
Condifence: [0.095, 0.593, 0.795, 0.718, 0.865, 0.975, 0.765, 0.159]


In [None]:
# Show the name of the session's first input (the key used in the feed dict passed to run()).
ort_sess.get_inputs()[0].name

'input'

In [None]:
# Run the session once on `image_batch`.
# NOTE(review): `image_batch` is only bound as a loop variable in other cells,
# so this cell depends on leftover kernel state.
logits = ort_sess.run(["output"], {'input': image_batch.numpy()})[0]

In [None]:
# Inspect the shape of the ONNX output.
logits.shape

(64, 9)

In [None]:
# Sanity check of the exported model on the most recent `image_batch`.
# The previous version read `test_data` and `classes`, which are not defined
# anywhere in this notebook, so the cell could only raise NameError. There is
# also no ground-truth label to compare against (the dataset labels are
# placeholders), so only the prediction is reported.
ort_sess = ort.InferenceSession('swing_stages.onnx')
outputs = ort_sess.run(None, {'input': image_batch.numpy()})

# Print Result
predicted = outputs[0][0].argmax(0)
print(f'Predicted class index for first frame: {predicted}')

# TEST DATASET EVAL

In [13]:
from pathlib import Path

In [15]:
from zipfile import ZipFile

# Unpack the test dataset archive into its own folder on Drive.
with ZipFile('/content/drive/MyDrive/hackathon_swf/test_dataset.zip', 'r') as archive:
    archive.extractall(path='/content/drive/MyDrive/hackathon_swf/test_dataset/')

In [30]:
class SampleVideo(Dataset):
    """Single-item dataset that decodes one video file into preprocessed frames.

    Each frame is scaled so that its longer side matches ``input_size`` (aspect
    ratio preserved), padded to an ``input_size`` x ``input_size`` square with
    the ImageNet mean colour, and converted from BGR to RGB.

    Args:
        path: Location of the video file, handed to ``cv2.VideoCapture``.
        input_size: Side length, in pixels, of the square output frames.
        transform: Optional callable applied to the sample dict before it is
            returned.
    """

    def __init__(self, path, input_size=160, transform=None):
        self.path = path
        self.input_size = input_size
        self.transform = transform

    def __len__(self):
        """Return 1: the whole video is the only sample."""
        return 1

    def __getitem__(self, idx):
        """Decode and preprocess the video.

        Args:
            idx: Ignored, because the dataset holds a single sample.

        Returns:
            A dict with ``'images'`` (the stacked preprocessed RGB frames) and
            ``'labels'`` (an all-zero placeholder, one entry per frame), passed
            through ``transform`` when one was given.

        Raises:
            IOError: If the video cannot be opened or reports an empty frame size.
        """
        cap = cv2.VideoCapture(self.path)
        try:
            # Fail with a clear message instead of a ZeroDivisionError further down.
            if not cap.isOpened():
                raise IOError('Could not open video: {}'.format(self.path))

            frame_size = [cap.get(cv2.CAP_PROP_FRAME_HEIGHT), cap.get(cv2.CAP_PROP_FRAME_WIDTH)]
            if max(frame_size) <= 0:
                raise IOError('Video reports an empty frame size: {}'.format(self.path))

            ratio = self.input_size / max(frame_size)
            new_size = tuple([int(x * ratio) for x in frame_size])
            delta_w = self.input_size - new_size[1]
            delta_h = self.input_size - new_size[0]
            top, bottom = delta_h // 2, delta_h - (delta_h // 2)
            left, right = delta_w // 2, delta_w - (delta_w // 2)
            border_value = [0.406 * 255, 0.456 * 255, 0.485 * 255]  # ImageNet means (BGR)

            # preprocess and return frames
            images = []
            for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
                ret, img = cap.read()
                if not ret:
                    break
                # cv2.resize expects (width, height)
                resized = cv2.resize(img, (new_size[1], new_size[0]))
                b_img = cv2.copyMakeBorder(resized, top, bottom, left, right, cv2.BORDER_CONSTANT,
                                           value=border_value)
                images.append(cv2.cvtColor(b_img, cv2.COLOR_BGR2RGB))
        finally:
            # Release the capture even when decoding fails part-way through.
            cap.release()

        labels = np.zeros(len(images))  # only for compatibility with transforms
        sample = {'images': np.asarray(images), 'labels': np.asarray(labels)}
        if self.transform:
            sample = self.transform(sample)
        return sample

In [31]:
def _inference_device(model):
    """Return the device of ``model``'s first parameter, defaulting to CUDA.

    The CUDA default preserves the earlier behaviour for callables that expose
    no parameters.
    """
    parameters = getattr(model, 'parameters', None)
    first = next(iter(parameters()), None) if callable(parameters) else None
    return first.device if first is not None else torch.device('cuda')


def process_video(model, dl, seq_length):
    """Run ``model`` over a video in chunks and pick one frame per event class.

    Args:
        model: Callable taking a chunk of frames and returning per-frame logits
            (frames on axis 0, classes on axis 1).
        dl: Iterable of samples; each sample's ``'images'`` entry is a
            5-dimensional tensor whose axis 1 indexes frames. If it yields
            several samples, only the last non-empty one determines the result.
        seq_length: Maximum number of frames per forward pass.

    Returns:
        ``(events, confidence)``: ``events`` is an array holding, for every
        class except the last one, the index of the frame with the highest
        probability (the last class is assumed to be "no event" - TODO
        confirm); ``confidence`` lists the probability at each of those frames.

    Raises:
        ValueError: If no frames were available to process.
    """
    device = _inference_device(model)
    probs = None
    # Inference only: avoid building an autograd graph for every chunk.
    with torch.no_grad():
        for sample in dl:
            images = sample['images']
            # full samples do not fit into GPU memory so evaluate sample in 'seq_length' batches
            chunk_probs = []
            for start in range(0, images.shape[1], seq_length):
                # Slicing clamps at the end, so the last short chunk needs no special case.
                image_batch = images[:, start:start + seq_length, :, :, :]
                logits = model(image_batch.to(device))
                chunk_probs.append(F.softmax(logits, dim=1).cpu().numpy())
            if chunk_probs:
                # Concatenate once rather than re-copying with np.append per chunk.
                probs = np.concatenate(chunk_probs, axis=0)

    if probs is None:
        raise ValueError('No frames to process: the data loader yielded no frames')

    events = np.argmax(probs, axis=0)[:-1]
    print('Predicted event frames: {}'.format(events))

    confidence = [probs[e, i] for i, e in enumerate(events)]
    print('Condifence: {}'.format([np.round(c, 3) for c in confidence]))

    return events, confidence


In [None]:
# Build the event detector and restore the trained weights stored on Drive.
detector_kwargs = dict(
    pretrain=True,
    width_mult=1.,
    lstm_layers=1,
    lstm_hidden=256,
    bidirectional=True,
    dropout=False,
)
model = EventDetector(**detector_kwargs)

save_dict = torch.load('/content/drive/MyDrive/hackathon_swf/weights/swingnet_1800.pth.tar')
model.load_state_dict(save_dict['model_state_dict'])

# Move the network to the GPU and switch it to evaluation mode; the trailing
# expression is what makes the notebook display the module summary.
model.cuda()
model.eval()

In [33]:
import csv

# Start a fresh submission file containing only the header row.
header = ["VideoName", "P1", "P2", "P3", "P4", "P5", "P7", "P8", "P10"]
with open("/content/drive/MyDrive/hackathon_swf/test_submission.csv", 'w') as csv_file:
    csv.writer(csv_file).writerow(header)


In [None]:
seq_length = 64
test_video_path = Path("/content/drive/MyDrive/hackathon_swf/test_dataset/files/")
# Sorted so that the rows are written in a reproducible order.
for vp in sorted(test_video_path.iterdir()):
    print(vp.name)
    ds = SampleVideo(str(vp), transform=transforms.Compose([ToTensor(),
                                Normalize([0.485, 0.456, 0.406],
                                          [0.229, 0.224, 0.225])]))

    dl = DataLoader(ds, batch_size=1, shuffle=False, drop_last=False)
    events, confidence = process_video(model, dl, seq_length)
    # One CSV field per event frame, so each row lines up with the
    # "VideoName, P1, ..., P10" header. Wrapping the array in a list wrote its
    # whole string representation into a single field.
    row = [str(vp.name)] + [int(frame) for frame in events]

    # Append after every video so finished rows survive an interrupted run.
    with open("/content/drive/MyDrive/hackathon_swf/test_submission.csv", 'a') as f:
        # create the csv writer
        writer = csv.writer(f)
        writer.writerow(row)

00858b25-c5c3-4092-8ae7-5c61ada25097.mp4
Predicted event frames: [ 0 40 49 40 61 71 92 40]
Condifence: [0.0, 0.023, 0.049, 0.06, 0.112, 0.019, 0.016, 0.051]
0ae17d29-242b-40ec-bff6-fe6b23ac8824.mp4
Predicted event frames: [66  5 12 61 67 49 55 67]
Condifence: [0.01, 0.07, 0.089, 0.526, 0.016, 0.025, 0.024, 0.021]
1366a9b8-ec83-4f9a-9593-235ee4623269.mp4
Predicted event frames: [497 655 665 641 641 650 665 641]
Condifence: [0.006, 0.023, 0.009, 0.003, 0.022, 0.037, 0.031, 0.006]
229f1a5d-a573-41d2-a1c5-155bd3b643dc.mp4
Predicted event frames: [67 25 67 46 67 54 56 64]
Condifence: [0.018, 0.011, 0.012, 0.014, 0.016, 0.064, 0.056, 0.004]
27e9571c-2170-4178-90a8-a1108aa3a977.mp4


In [None]:
# Preprocessing pipeline: ToTensor followed by Normalize with the ImageNet
# channel statistics.
preprocess = transforms.Compose([
    ToTensor(),
    Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])
ds = SampleVideo('/content/drive/MyDrive/hackathon_swf/videos/test_video.mp4', transform=preprocess)

# The dataset exposes the whole video as its single sample, so one batch holds it all.
dl = DataLoader(ds, batch_size=1, shuffle=False, drop_last=False)


In [None]:
# Run the chunked inference on the current data loader, 64 frames per forward pass.
seq_length = 64
events, confidence = process_video(model, dl, seq_length)

Predicted event frames: [ 74  86  98 114 132 143 151 236]
Condifence: [0.095, 0.593, 0.795, 0.718, 0.865, 0.975, 0.765, 0.159]
