In [1]:
import pandas as pd

In [2]:
def change_time(x):
    hour, minutes, seconds = x.split(':')
    return int(hour) * 3600 + int(minutes) * 60 + int(seconds)

df = pd.read_excel('../data/разметка 1 тайм.xlsx')
df['time'] = df['time'].apply(change_time)

s = {x: y for x,y in zip(df['time'], df['action'])}
max_value = df['time'].max()

In [3]:
import cv2

video_path = '../data/хоккей_матч.mp4'
cap = cv2.VideoCapture(video_path)
    
video_fps = cap.get(cv2.CAP_PROP_FPS)
frame_interval = int(video_fps / 1)

current_frame = 0
extracted_frames = 0

frames = []
labels = []
cur_pos = -1
while True:
    ret, frame = cap.read()
    if not ret:
        break
    
    # Если текущий кадр кратен интервалу, сохраняем его
    if current_frame % frame_interval == 0:
        frame_num = current_frame / frame_interval

        if frame_num > max_value:
            break
        else:

            if cur_pos == -1 and frame_num not in s:
                current_frame += 1
                continue
            
            if frame_num in s:
                cur_pos = s[frame_num]


            labels.append(cur_pos) 
            frames.append(frame)

        extracted_frames += 1

        print(frame_num / max_value)
    
    current_frame += 1

cap.release()

0.20226757369614512
0.20272108843537415
0.20317460317460317
0.2036281179138322
0.20408163265306123
0.20453514739229026
0.20498866213151928
0.2054421768707483
0.2058956916099773
0.20634920634920634
0.20680272108843537
0.2072562358276644
0.20770975056689342
0.20816326530612245
0.20861678004535147
0.2090702947845805
0.20952380952380953
0.20997732426303856
0.21043083900226758
0.2108843537414966
0.2113378684807256
0.21179138321995464
0.21224489795918366
0.2126984126984127
0.21315192743764172
0.21360544217687075
0.21405895691609977
0.2145124716553288
0.21496598639455783
0.21541950113378686
0.21587301587301588
0.2163265306122449
0.2167800453514739
0.21723356009070294
0.21768707482993196
0.218140589569161
0.21859410430839002
0.21904761904761905
0.21950113378684807
0.2199546485260771
0.22040816326530613
0.22086167800453516
0.22131519274376418
0.2217687074829932
0.2222222222222222
0.22267573696145124
0.22312925170068026
0.2235827664399093
0.22403628117913832
0.22448979591836735
0.224943310657596

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models, transforms
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import albumentations as albu
from albumentations.pytorch import ToTensorV2

DEVICE = torch.device('mps')

# Custom dataset class for loading video frames
class VideoDataset(Dataset):
    def __init__(self, frames, labels, video_len=5, transform=None):
        self.frames = frames
        self.labels = labels
        self.transform = transform
        self.video_len = video_len

    def __len__(self):
        return len(self.frames) - self.video_len

    def __getitem__(self, idx):
        video = [self.frames[idx + i] for i in range(self.video_len)]
        label = self.labels[idx + self.video_len - 1]
        if self.transform:
            # print(self.transform, video[0])
            video = [self.transform(image=frame)['image'] for frame in video]
        video = torch.stack(video)
        return video, torch.tensor(label)

transform = albu.Compose([
    albu.Resize(368, 368),
    albu.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ToTensorV2(),
])

In [5]:
new_frames = [frames[i] for i in range(len(frames)) if i % 1 == 0]
new_labels = [labels[i] for i in range(len(frames)) if i % 1 == 0]

test_size = 0.3


train_frames = new_frames[:int(len(new_frames) * (1 - test_size))]
train_labels = new_labels[:int(len(new_labels) * (1 - test_size))]
val_frames = new_frames[int(len(new_frames) * (1 - test_size)):]
val_labels = new_labels[int(len(new_labels) * (1 - test_size)):]

train_ds = VideoDataset(train_frames, train_labels, transform=transform)
valid_ds = VideoDataset(train_frames, train_labels, transform=transform)

train_dataloader = DataLoader(train_ds, batch_size=8)
valid_dataloader = DataLoader(valid_ds, batch_size=8)

In [6]:
# Define the CNN-LSTM model
class CNN_LSTM(nn.Module):
    def __init__(self, cnn_model, hidden_dim, lstm_layers, num_classes):
        super(CNN_LSTM, self).__init__()
        self.cnn = cnn_model
        self.lstm = nn.LSTM(input_size=cnn_model.classifier[0].out_features, 
                            hidden_size=hidden_dim, 
                            num_layers=lstm_layers, 
                            batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x, lstm_input=None):
        batch_size, seq_len, C, H, W = x.size()
        c_in = x.view(batch_size * seq_len, C, H, W)  # Flatten to [batch_size*seq_len, C, H, W]
        c_out = self.cnn(c_in)  # Pass through the CNN
        c_out = c_out.view(batch_size, seq_len, -1)  # Reshape to [batch_size, seq_len, feature_dim]

        if lstm_input is None:
            lstm_out, hidden_lstm = self.lstm(c_out)  # LSTM output
        else:
            lstm_out, hidden_lstm = self.lstm(c_out, (lstm_input[0][:,-1:,:], lstm_input[1][:,-1:,:]))

        # lstm_out, _ = self.lstm(c_out)  # LSTM output
        lstm_out = lstm_out[:, -1, :]  # Take the last output of the LSTM
        out = self.fc(lstm_out)  # Fully connected layer
        return out, hidden_lstm


# Load the pretrained MobileNetV3 and modify it
mobilenet_v3 = models.mobilenet_v3_small(pretrained=True)
# Remove the last classifier layer
mobilenet_v3.classifier = nn.Sequential(*list(mobilenet_v3.classifier.children())[:-1])

# Model hyperparameters
hidden_dim = 128
lstm_layers = 1
num_classes = 2

# Instantiate the model
model = CNN_LSTM(mobilenet_v3, hidden_dim, lstm_layers, num_classes)
model.to(DEVICE)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0007)



In [10]:
from tqdm import tqdm

model.train()

for epoch in range(5):
    running_loss = 0.0
    for inputs, labels in tqdm(train_dataloader, total=len(train_ds)):
        optimizer.zero_grad()
        inputs = inputs.to(DEVICE)
        outputs, _ = model(inputs)
        loss = criterion(outputs.cpu(), labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f'Epoch {epoch+1}/{3}, Loss: {running_loss/len(train_dataloader)}')

 13%|█▎        | 154/1227 [00:36<04:16,  4.19it/s]


Epoch 1/3, Loss: 0.6297362709278845


 13%|█▎        | 154/1227 [00:32<03:45,  4.76it/s]


Epoch 2/3, Loss: 0.4809111713510394


 13%|█▎        | 154/1227 [00:33<03:51,  4.64it/s]


Epoch 3/3, Loss: 0.5562541012216795


 13%|█▎        | 154/1227 [00:32<03:48,  4.69it/s]


Epoch 4/3, Loss: 0.5757207312173658


 13%|█▎        | 154/1227 [00:32<03:47,  4.71it/s]

Epoch 5/3, Loss: 0.44281669198112056





In [11]:
torch.save(mobilenet_v3.state_dict(), 'model.pth')

In [14]:
model.eval()

preds = []
val_labels = []
for inputs, labels in tqdm(valid_dataloader, total=len(valid_dataloader)):
    optimizer.zero_grad()
    inputs = inputs.to(DEVICE)
    with torch.no_grad():
        outputs, _ = model(inputs)
        outputs = outputs.cpu().numpy()[:, 1]
    val_labels.append(labels)
    preds.append(outputs)

100%|██████████| 154/154 [00:13<00:00, 11.30it/s]


In [15]:
import numpy as np
from sklearn import metrics

metrics.roc_auc_score(np.concatenate(val_labels), np.concatenate(preds))

0.905142405063291

In [16]:
val_frames_transforms = torch.stack([transform(image=x)['image'] for x in val_frames])

In [19]:
mobilenet_v3(val_frames_transforms[:5].to(DEVICE))

tensor([[ 1.2413,  0.5902, -0.1082,  ...,  0.2751, -0.2755,  0.0563],
        [ 1.3169,  0.4147,  0.3289,  ...,  0.1845, -0.2745,  0.1564],
        [ 2.2707,  0.1501,  0.4816,  ...,  0.2332, -0.3149,  0.6129],
        [ 1.1871, -0.0776,  0.5309,  ...,  0.6983,  0.0854,  0.4708],
        [ 0.1075,  0.0744,  0.1393,  ...,  0.9069,  0.5010,  0.2018]],
       device='mps:0', grad_fn=<HardswishBackward0>)

In [24]:
val_frames_transforms[:5].unsqueeze(0).shape

torch.Size([1, 5, 3, 368, 368])

In [25]:
output, hidden_state = model(val_frames_transforms[:5].unsqueeze(0).to(DEVICE))

In [31]:
for i in range(len(val_frames_transforms)):
    output, hidden_state = model(val_frames_transforms[5+i:6+i].unsqueeze(0).to(DEVICE), hidden_state)

RuntimeError: MPS backend out of memory (MPS allocated: 18.12 GB, other allocations: 9.86 MB, max allowed: 18.13 GB). Tried to allocate 793.50 KB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).