In [1]:
!pip install opendatasets --quiet

In [2]:
# Connect to Kaggle through its API and download the competition dataset.
# In Colab, upload a kaggle.json file (it holds the username and the API key) first;
# without it, opendatasets asks for the credentials interactively.
# After the download, a folder with the data appears in the Colab file browser.
import opendatasets as od
import pandas as pd

# SECURITY: a real Kaggle username / API key used to be pasted in a comment here.
# Never keep credentials in the notebook; the key that was exposed should be revoked.
od.download("https://www.kaggle.com/competitions/what-on-the-video")

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: daniillosev
Your Kaggle Key: ··········
Downloading what-on-the-video.zip to ./what-on-the-video


100%|██████████| 690M/690M [00:03<00:00, 206MB/s]



Extracting archive ./what-on-the-video/what-on-the-video.zip to ./what-on-the-video


In [3]:
import os
import cv2
import torch
import torch.nn as nn
import numpy as np
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, f1_score

In [4]:
# Delete this clip from the training folder before the datasets are built.
# NOTE(review): the reason is not recorded here — presumably the file is broken
# or has no annotation; confirm.
# Guarded so the cell can be re-run: a bare os.remove raises FileNotFoundError
# once the file is already gone.
excluded_video = '/content/what-on-the-video/train/Humming_Bird_1_preview.mp4'
if os.path.exists(excluded_video):
    os.remove(excluded_video)

In [None]:
# Report how many files each split directory holds.
for split_label, split_dir in (
    ("train:", '/content/what-on-the-video/train'),
    ("test:", '/content/what-on-the-video/test'),
):
    print(split_label, len(os.listdir(split_dir)))

train: 427
test: 435


In [5]:
# Read the training annotations. The mistyped separator 'cloud. water' is fixed
# first so that every label string can be split on ', ' into a list of labels.
df = pd.read_csv('/content/what-on-the-video/train.csv')
df['labels'] = df['labels'].apply(
    lambda raw: raw.replace('cloud. water', 'cloud, water').split(', ')
)

# Multi-hot encode the label lists: one 0/1 column per class, appended to the frame.
mlb = MultiLabelBinarizer()
y = pd.DataFrame(mlb.fit_transform(df['labels']), columns=mlb.classes_)
df = pd.concat([df, y], axis=1)

# Only the first label of each video is kept; it is the stratification key for
# the split and is dropped from both resulting frames afterwards.
df['labels'] = [tags[0] for tags in df['labels']]
train_df, val_df = (
    part.drop(columns=['labels']).reset_index(drop=True)
    for part in train_test_split(df, test_size=0.15, random_state=42, stratify=df['labels'])
)
print(train_df.shape, val_df.shape)

(246, 10) (44, 10)


In [6]:
# Index the test videos by file name, in sorted order.
test_path = '/content/what-on-the-video/test'

test_files = sorted(os.listdir(test_path))
test_df = pd.DataFrame({'path': test_files})
test_df

Unnamed: 0,path
0,000464896-guatemala-antigua-church-festi_previ...
1,000691821-mexico-puerto-vallarta-ocean_preview...
2,000692230-panama-canal-clouds-over-gatun_previ...
3,000745494-florida-anhinga-dead-tree_preview.mp4
4,000764644-sunset-and-boat_preview.mp4
...,...
430,myrdalssandur_iceland_one_preview.mp4
431,reed1_preview.mp4
432,rifugio_becco_preview.mp4
433,rover_and_rocks_medium_preview.mp4


In [8]:
class VideoDataset(Dataset):
    """Video dataset that yields a fixed number of evenly spaced frames per clip.

    Every item is a tensor built by stacking ``frames_per_clip`` transformed
    frames. In ``'train'``/``'val'`` mode the second element is a float32
    multi-hot vector read from ``label_columns``; in any other mode it is the
    row's ``path`` value.

    Args:
        dataframe: Table with a ``path`` column (file name inside ``video_dir``)
            and, for the labelled modes, one column per ``label_columns`` entry.
        video_dir: Directory that contains the video files.
        transform: Callable applied to each RGB frame separately, so a random
            transform is re-sampled for every frame of a clip. When ``None``
            the frame is converted to a float CHW tensor divided by 255.
        frames_per_clip: Number of frames returned for every video (must be > 0).
        resize: Size handed to ``cv2.resize`` before ``transform`` runs.
        mode: ``'train'`` or ``'val'`` to return labels; anything else returns
            the file name instead.

    Raises:
        ValueError: If ``frames_per_clip`` is not positive.
    """

    def __init__(self, dataframe, video_dir, transform=None, frames_per_clip=64, resize=(128, 128), mode='train'):
        if frames_per_clip <= 0:
            raise ValueError(f"frames_per_clip must be positive, got {frames_per_clip}")
        self.data = dataframe.reset_index(drop=True)
        self.video_dir = video_dir
        self.transform = transform
        self.frames_per_clip = frames_per_clip
        self.resize = resize
        self.mode = mode
        # Column order defines the order of the entries in the label vector.
        self.label_columns = ['animal', 'car', 'cloud',
                             'dance', 'fire', 'flower',
                             'food', 'sunset', 'water']

    def __len__(self):
        """Return the number of videos listed in the dataframe."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(clip_tensor, label_vector)`` or ``(clip_tensor, path)`` for row ``idx``."""
        row = self.data.iloc[idx]
        video_path = os.path.join(self.video_dir, row['path'])
        frames = self.get_uniform_frames(video_path)
        video_tensor = torch.stack([self._frame_to_tensor(frame) for frame in frames])

        if self.mode in ['train', 'val']:
            label = torch.tensor(row[self.label_columns].values.astype(np.float32))
            return video_tensor, label
        else:
            return video_tensor, row['path']

    def _frame_to_tensor(self, frame):
        """Apply the configured transform, or a plain HWC -> CHW float conversion."""
        if self.transform is not None:
            return self.transform(frame)
        # No transform configured: still return a tensor so stacking works.
        chw = torch.from_numpy(np.ascontiguousarray(frame)).permute(2, 0, 1)
        return chw.float().div(255.0)

    def get_uniform_frames(self, video_path):
        """Decode exactly ``frames_per_clip`` RGB frames spread evenly over the video.

        Frames are read sequentially and converted from BGR to RGB, then resized
        to ``self.resize``. When the video has fewer frames than requested, or
        decoding stops before the reported frame count is reached, frames are
        repeated so that the returned list always has ``frames_per_clip`` items.

        Args:
            video_path: Path of the video file to decode.

        Returns:
            List of ``frames_per_clip`` frame arrays.

        Raises:
            RuntimeError: If the video cannot be opened or yields no frame.
        """
        cap = cv2.VideoCapture(video_path)
        try:
            if not cap.isOpened():
                raise RuntimeError(f"Could not open video: {video_path}")

            # The reported frame count is container metadata: it can be missing
            # or inexact, so the loop below never relies on it being right.
            total_frames = max(int(cap.get(cv2.CAP_PROP_FRAME_COUNT)), 0)
            step = total_frames / self.frames_per_clip
            indices = [int(i * step) for i in range(self.frames_per_clip)]

            # For short clips several output slots map to the same frame index;
            # remember how many copies of each decoded frame are needed.
            copies_wanted = {}
            for index in indices:
                copies_wanted[index] = copies_wanted.get(index, 0) + 1

            frames = []
            current_idx = 0
            last_wanted = indices[-1]
            while current_idx <= last_wanted:
                ret, frame = cap.read()
                if not ret:
                    break
                copies = copies_wanted.get(current_idx, 0)
                if copies:
                    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    frame = cv2.resize(frame, self.resize)
                    frames.extend([frame] * copies)
                current_idx += 1
        finally:
            # Release the capture even when opening or decoding raised.
            cap.release()

        if not frames:
            raise RuntimeError(f"No frames could be decoded from video: {video_path}")

        # Decoding ended early: pad with the last good frame to keep clips equal-length.
        frames.extend([frames[-1]] * (self.frames_per_clip - len(frames)))
        return frames


In [9]:
from torch.utils.data import DataLoader
import torchvision.transforms as transforms

# Steps shared by both pipelines: final 112x112 resize, tensor conversion and
# per-channel normalisation.
shared_tail = [
    transforms.Resize((112, 112)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
]

# Training pipeline: random flip, rotation and colour jitter ahead of the shared steps.
train_transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(degrees=15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    *shared_tail,
])

# Validation / test pipeline: no random augmentation.
test_transform = transforms.Compose([transforms.ToPILImage(), *shared_tail])

# Train and validation clips live in the same folder; the dataframes decide the split.
train_dir = "/content/what-on-the-video/train"
test_dir = "/content/what-on-the-video/test"
train_dataset = VideoDataset(train_df, train_dir, transform=train_transform)
val_dataset = VideoDataset(val_df, train_dir, transform=test_transform, mode='val')
test_dataset = VideoDataset(test_df, test_dir, transform=test_transform, mode='test')

# Only the training loader shuffles.
train_loader, val_loader, test_loader = (
    DataLoader(dataset, batch_size=8, shuffle=reshuffle)
    for dataset, reshuffle in (
        (train_dataset, True),
        (val_dataset, False),
        (test_dataset, False),
    )
)

In [10]:
# Sanity check: pull a single training batch and show its shapes and a few label rows.
first_batch = next(iter(train_loader), None)
if first_batch is not None:
    images, labels = first_batch
    print("Batch shape:", images.shape)
    print("Labels shape:", labels.shape)
    print("Пример меток:", labels[:5])


Batch shape: torch.Size([8, 64, 3, 112, 112])
Labels shape: torch.Size([8, 9])
Пример меток: tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1.]])


In [11]:
class ResNetGRU(nn.Module):
    """Frame-wise ResNet-50 encoder followed by a GRU and a linear classifier.

    Args:
        hidden_dim: Hidden size of the GRU.
        num_layers: Number of stacked GRU layers.
        num_classes: Length of the logit vector produced per clip.
        pretrained: Forwarded to ``models.resnet50`` as its ``pretrained`` argument.
    """

    def __init__(self, hidden_dim=256, num_layers=2, num_classes=9, pretrained=True):
        super().__init__()
        backbone = models.resnet50(pretrained=pretrained)
        # Keep every child module of the backbone except the last one.
        self.resnet = nn.Sequential(*list(backbone.children())[:-1])
        self.feature_dim = backbone.fc.in_features

        self.gru = nn.GRU(input_size=self.feature_dim, hidden_size=hidden_dim,
                          num_layers=num_layers, batch_first=True)
        self.classifier = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Map a batch shaped (B, T, C, H, W) to one logit vector per clip."""
        batch, steps, channels, height, width = x.size()
        # Fold time into the batch axis so the backbone sees individual frames.
        frame_feats = self.resnet(x.view(batch * steps, channels, height, width))
        sequence_out, _ = self.gru(frame_feats.view(batch, steps, -1))
        # Classify from the GRU output at the last time step.
        return self.classifier(sequence_out[:, -1, :])

# Seed torch's global RNG before any weights are created so that the random
# layer initialisation repeats across runs; loader shuffling and the random
# augmentations are expected to draw from the same generator afterwards.
# Some GPU kernels remain nondeterministic, so small differences are still possible.
SEED = 42
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ResNetGRU().to(device)
# Multi-label setup: an independent sigmoid/BCE term per class, applied to raw logits.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:01<00:00, 94.2MB/s]


In [12]:
from sklearn.metrics import accuracy_score, f1_score
import torch

def train_epoch(model, dataloader, criterion, optimizer, device, threshold=0.5):
    """Run one optimisation pass over ``dataloader`` and report epoch metrics.

    Args:
        model: Network that maps a batch of videos to logits.
        dataloader: Iterable of ``(videos, labels)`` batches.
        criterion: Loss called as ``criterion(outputs, labels)``; its value is
            weighted by the batch size, which assumes it returns a batch mean.
        optimizer: Optimizer stepped once per batch.
        device: Device the batches are moved to.
        threshold: Sigmoid probability above which a label counts as predicted.

    Returns:
        Tuple ``(loss, accuracy, f1)``: the loss averaged over the samples that
        were actually processed, ``accuracy_score`` over the multi-hot rows, and
        the macro-averaged F1 score.
    """
    model.train()
    running_loss = 0.0
    samples_seen = 0
    all_labels = []
    all_preds = []

    for videos, labels in dataloader:
        videos = videos.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(videos)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_size = videos.size(0)
        running_loss += loss.item() * batch_size
        samples_seen += batch_size

        # Metrics need no gradient: threshold the detached sigmoid outputs.
        probs = torch.sigmoid(outputs.detach())
        preds = (probs > threshold).float()

        all_labels.append(labels.cpu())
        all_preds.append(preds.cpu())

    all_labels = torch.cat(all_labels).numpy()
    all_preds = torch.cat(all_preds).numpy()

    # Divide by the samples seen, not len(dataloader.dataset): the two differ when
    # the loader drops the last batch or samples only part of the dataset.
    epoch_loss = running_loss / samples_seen
    epoch_acc = accuracy_score(all_labels, all_preds)
    # zero_division=0 keeps the score of a never-predicted class at 0 without
    # emitting a warning for it on every epoch.
    epoch_f1 = f1_score(all_labels, all_preds, average='macro', zero_division=0)

    return epoch_loss, epoch_acc, epoch_f1



In [13]:
from sklearn.metrics import accuracy_score, f1_score
import torch

def validate_epoch(model, dataloader, criterion, device, threshold=0.5):
    """Evaluate ``model`` on ``dataloader`` without updating its weights.

    Args:
        model: Network that maps a batch of videos to logits.
        dataloader: Iterable of ``(videos, labels)`` batches.
        criterion: Loss called as ``criterion(outputs, labels)``; its value is
            weighted by the batch size, which assumes it returns a batch mean.
        device: Device the batches are moved to.
        threshold: Sigmoid probability above which a label counts as predicted.

    Returns:
        Tuple ``(loss, accuracy, f1)``: the loss averaged over the samples that
        were actually processed, ``accuracy_score`` over the multi-hot rows, and
        the macro-averaged F1 score.
    """
    model.eval()
    running_loss = 0.0
    samples_seen = 0
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for videos, labels in dataloader:
            videos = videos.to(device)
            labels = labels.to(device)

            outputs = model(videos)
            loss = criterion(outputs, labels)

            batch_size = videos.size(0)
            running_loss += loss.item() * batch_size
            samples_seen += batch_size

            probs = torch.sigmoid(outputs)
            preds = (probs > threshold).float()

            all_labels.append(labels.cpu())
            all_preds.append(preds.cpu())

    all_labels = torch.cat(all_labels).numpy()
    all_preds = torch.cat(all_preds).numpy()

    # Divide by the samples seen, not len(dataloader.dataset): the two differ when
    # the loader drops the last batch or samples only part of the dataset.
    epoch_loss = running_loss / samples_seen
    epoch_acc = accuracy_score(all_labels, all_preds)
    # zero_division=0 keeps the score of a never-predicted class at 0 without
    # emitting a warning for it on every epoch.
    epoch_f1 = f1_score(all_labels, all_preds, average='macro', zero_division=0)

    return epoch_loss, epoch_acc, epoch_f1


In [14]:
num_epochs = 20

# One training pass and one validation pass per epoch, summarised on a single line.
for epoch in range(num_epochs):
    train_loss, train_acc, train_f1 = train_epoch(
        model, train_loader, criterion, optimizer, device
    )
    val_loss, val_acc, val_f1 = validate_epoch(model, val_loader, criterion, device)

    epoch_part = f"Epoch {epoch+1}/{num_epochs}"
    train_part = f"Train loss: {train_loss:.4f}, Train acc: {train_acc:.4f}, Train F1: {train_f1:.4f}"
    val_part = f"Val loss: {val_loss:.4f}, Val acc: {val_acc:.4f}, Val F1: {val_f1:.4f}"
    print(" — ".join([epoch_part, train_part, val_part]))

Epoch 1/20 — Train loss: 0.4331, Train acc: 0.0000, Train F1: 0.0556 — Val loss: 0.3296, Val acc: 0.0000, Val F1: 0.0000
Epoch 2/20 — Train loss: 0.2992, Train acc: 0.0041, Train F1: 0.0091 — Val loss: 0.2919, Val acc: 0.0682, Val F1: 0.0838
Epoch 3/20 — Train loss: 0.2497, Train acc: 0.2236, Train F1: 0.2333 — Val loss: 0.2766, Val acc: 0.3182, Val F1: 0.2923
Epoch 4/20 — Train loss: 0.2015, Train acc: 0.5000, Train F1: 0.4860 — Val loss: 0.2857, Val acc: 0.3636, Val F1: 0.3101
Epoch 5/20 — Train loss: 0.1677, Train acc: 0.7033, Train F1: 0.6547 — Val loss: 0.2796, Val acc: 0.3636, Val F1: 0.2810
Epoch 6/20 — Train loss: 0.1403, Train acc: 0.7520, Train F1: 0.7141 — Val loss: 0.2901, Val acc: 0.3409, Val F1: 0.2764
Epoch 7/20 — Train loss: 0.1126, Train acc: 0.8293, Train F1: 0.8023 — Val loss: 0.2794, Val acc: 0.4318, Val F1: 0.3634
Epoch 8/20 — Train loss: 0.1111, Train acc: 0.8049, Train F1: 0.7876 — Val loss: 0.3109, Val acc: 0.3636, Val F1: 0.2792
Epoch 9/20 — Train loss: 0.0879,

In [15]:
# Label every test clip with the trained model and write the submission file.
model.eval()
# Position in the logit vector -> class name.
idx_to_label = dict(enumerate([
    'animal', 'car', 'cloud',
    'dance', 'fire', 'flower',
    'food', 'sunset', 'water',
]))

threshold = 0.5
all_preds, all_probs, all_filenames = [], [], []

with torch.no_grad():
    for videos, filenames in test_loader:
        probs = torch.sigmoid(model(videos.to(device)))

        all_preds.extend((probs > threshold).cpu().numpy())
        all_probs.extend(probs.cpu().numpy())
        all_filenames.extend(filenames)

# Turn each row of thresholded predictions into a comma-separated label string;
# when no class passes the threshold, fall back to the most probable class.
all_labels = []
for pred, prob in zip(all_preds, all_probs):
    names = [idx_to_label[i] for i, flag in enumerate(pred) if flag] or [idx_to_label[prob.argmax()]]
    all_labels.append(", ".join(names))

submission_df = pd.DataFrame({'file_name': all_filenames, 'label': all_labels})
submission_df.index.name = 'index'
submission_df.to_csv('submission.csv', index=True)
