In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import cv2
from torch.utils.data import Dataset

In [None]:
# Install the `opendatasets` helper (IPython shell escape, so this cell only
# runs inside a notebook kernel).
!pip install opendatasets --quiet
import opendatasets as od

# Fetch the competition archive. Per the recorded output, this prompts for
# Kaggle credentials and extracts into ./automated-video-captioning.
od.download( "https://www.kaggle.com/competitions/automated-video-captioning/data")

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: anizuro
Your Kaggle Key: ··········
Downloading automated-video-captioning.zip to ./automated-video-captioning


100%|██████████| 1.08G/1.08G [00:08<00:00, 132MB/s]



Extracting archive ./automated-video-captioning/automated-video-captioning.zip to ./automated-video-captioning


In [None]:
# Load the competition tables from the extracted archive. The training table
# carries `file_name` and `caption` columns (see the displayed output);
# the test table is later read for `file_name` only.
train = pd.read_csv("/content/automated-video-captioning/train.csv")
test = pd.read_csv("/content/automated-video-captioning/test.csv")
# Last expression of the cell: rich display of the training table.
train

Unnamed: 0,index,file_name,caption
0,0,0.mp4,A man is working out on a seated chest press m...
1,1,1.mp4,Preparing a bowl with yogurt and assorted fres...
2,2,2.mp4,A man with a muscular build is seen from behin...
3,3,3.mp4,Man exercising by jogging on a pedestrian brid...
4,4,4.mp4,Wristwatch hands moving forward close-up views.
...,...,...,...
598,598,598.mp4,Terrifying Halloween pumpkin with a carved evi...
599,599,599.mp4,Cheerful girl sending messages on cell phone w...
600,600,600.mp4,A person in a light blue shirt is using a lapt...
601,601,601.mp4,"A top-down view of an aesthetic desk setup, wi..."


In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# NLTK resources requested for tokenization ('punkt', 'punkt_tab') and
# lemmatization ('wordnet').
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

# Length every token-id sequence is padded/truncated to.
MAX_LENGTH = 50
# Special vocabulary markers: padding, start of caption, end of caption and
# out-of-vocabulary.
PAD_TOKEN = '<PAD>'
SOS_TOKEN = '<SOS>'
EOS_TOKEN = '<EOS>'
OOV_TOKEN = '<UNK>'

def tokenize_and_lemmatize(text):
    """Lower-case ``text``, split it into NLTK word tokens and lemmatize each one.

    ``text`` is passed through ``str()`` first, so non-string values are
    processed via their string form.

    Returns:
        list[str]: the lemmatized tokens in their original order.
    """
    wordnet = WordNetLemmatizer()
    lemmas = []
    for word in word_tokenize(str(text).lower()):
        lemmas.append(wordnet.lemmatize(word))
    return lemmas

# Tokenize + lemmatize every caption and wrap it in <SOS> ... <EOS> markers.
train['tokens'] = train['caption'].apply(
    lambda x: [SOS_TOKEN] + tokenize_and_lemmatize(x) + [EOS_TOKEN]
)
train['text_for_keras'] = train['tokens'].apply(' '.join)

# lower=False is essential. The caption words are already lower-cased by
# tokenize_and_lemmatize, but with the Keras default (lower=True) the markers
# would be learned as '<sos>'/'<eos>' while every later lookup uses
# word_index['<SOS>'] / word_index['<EOS>'] -- i.e. ids that never occur in the
# training sequences.
# The custom `filters` string deliberately omits '<' and '>' so the markers
# survive as single tokens.
tokenizer = Tokenizer(
    num_words=None,
    oov_token=OOV_TOKEN,
    filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n',
    lower=False,
)
tokenizer.fit_on_texts(train['text_for_keras'])

# Append any special token the fit did not see. <PAD> never appears in the
# captions, so it is always added here; <SOS>/<EOS> are normally already present.
for special in (PAD_TOKEN, SOS_TOKEN, EOS_TOKEN):
    if special not in tokenizer.word_index:
        tokenizer.word_index[special] = len(tokenizer.word_index) + 1

# Rebuild the reverse mapping so it includes the entries added above.
tokenizer.index_word = {v: k for k, v in tokenizer.word_index.items()}

sequences = tokenizer.texts_to_sequences(train['text_for_keras'])

# Sanity check: the ids used for generation must be the ones found in the data.
sos_id = tokenizer.word_index[SOS_TOKEN]
eos_id = tokenizer.word_index[EOS_TOKEN]
assert all(seq and seq[0] == sos_id and seq[-1] == eos_id for seq in sequences), \
    "every caption sequence must start with <SOS> and end with <EOS>"

# Plain post-truncation would cut the closing <EOS> off captions longer than
# MAX_LENGTH; keep the end marker as the last token of truncated captions.
sequences = [
    seq if len(seq) <= MAX_LENGTH else seq[:MAX_LENGTH - 1] + [eos_id]
    for seq in sequences
]

padded_sequences = pad_sequences(
    sequences,
    maxlen=MAX_LENGTH,
    padding='post',
    truncating='post',
    value=tokenizer.word_index[PAD_TOKEN]
)

# One fixed-length id array per row.
train['token_ids'] = list(padded_sequences)
# NOTE: dropping 'caption' makes this cell non re-runnable without reloading
# `train` from the CSV first.
train = train.drop(columns=['caption', 'text_for_keras'])

# Show the first rows of the tokenized captions next to their id sequences.
print("Результат (первые 5 строк):")
_token_preview = train[['tokens', 'token_ids']].head()
print(_token_preview)

# Report the vocabulary ids assigned to the special markers.
print("\nСловарь (ключевые токены):")
special_tokens = {
    marker: tokenizer.word_index[marker]
    for marker in (PAD_TOKEN, SOS_TOKEN, EOS_TOKEN, OOV_TOKEN)
}
print(special_tokens)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Результат (первые 5 строк):
                                              tokens  \
0  [<SOS>, a, man, is, working, out, on, a, seate...   
1  [<SOS>, preparing, a, bowl, with, yogurt, and,...   
2  [<SOS>, a, man, with, a, muscular, build, is, ...   
3  [<SOS>, man, exercising, by, jogging, on, a, p...   
4  [<SOS>, wristwatch, hand, moving, forward, clo...   

                                           token_ids  
0  [3, 2, 16, 24, 102, 54, 10, 2, 320, 432, 204, ...  
1  [3, 205, 2, 321, 8, 371, 7, 794, 1066, 1067, 8...  
2  [3, 2, 16, 8, 2, 90, 434, 24, 112, 26, 113, 24...  
3  [3, 16, 104, 27, 127, 10, 2, 323, 186, 6, 5, 6...  
4  [3, 1068, 23, 249, 518, 33, 30, 36, 4, 1943, 1...  

Словарь (ключевые токены):
{'<PAD>': 1943, '<SOS>': 1944, '<EOS>': 1945, '<UNK>': 1}


In [None]:
class VideoCaptionDataset(Dataset):
    """Training dataset pairing sampled video frames with caption token ids.

    Args:
        dataframe: table with a ``file_name`` column (video file relative to
            ``video_dir``) and a ``token_ids`` column (one id sequence per row).
        video_dir: directory containing the video files.
        transform: optional callable applied to every RGB frame; it must
            return a tensor so the frames can be stacked. When omitted, the
            raw frames are converted to tensors unchanged.
        num_frames: number of frames sampled per video.

    Each item is a dict with:
        ``frames``      stacked per-frame tensors (``num_frames`` along dim 0),
        ``src_tokens``  ``token_ids[:-1]``,
        ``tgt_tokens``  ``token_ids[:-1]`` (same values as ``src_tokens``),
        ``targets``     ``token_ids[1:]`` (inputs shifted by one position),
        ``video_path``  full path of the video file.
    """

    def __init__(self, dataframe, video_dir, transform=None, num_frames=16):
        self.dataframe = dataframe
        self.video_dir = video_dir
        self.transform = transform
        self.num_frames = num_frames

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        video_path = os.path.join(self.video_dir, row['file_name'])
        token_ids = row['token_ids']

        frames = self.extract_frames(video_path)

        if self.transform:
            frames = [self.transform(frame) for frame in frames]
        else:
            # torch.stack only accepts tensors; without a transform the frames
            # are still numpy arrays.
            frames = [torch.from_numpy(np.ascontiguousarray(frame)) for frame in frames]

        frames = torch.stack(frames)

        # Teacher forcing layout: inputs are every token but the last one,
        # targets are the same sequence shifted left by one.
        input_tokens = token_ids[:-1]
        targets = token_ids[1:]

        return {
            'frames': frames,
            'src_tokens': torch.tensor(input_tokens, dtype=torch.long),
            'tgt_tokens': torch.tensor(input_tokens, dtype=torch.long),
            'targets': torch.tensor(targets, dtype=torch.long),
            'video_path': video_path
        }

    def extract_frames(self, video_path):
        """Return exactly ``num_frames`` RGB frames sampled evenly over the video.

        A frame that cannot be decoded is replaced by the closest previously
        decoded frame (or the first decodable one for leading failures), so all
        samples have the same length and can be batched.

        Raises:
            RuntimeError: if the video cannot be opened or no frame can be read.
        """
        cap = cv2.VideoCapture(video_path)
        slots = []
        try:
            if not cap.isOpened():
                raise RuntimeError(f"Could not open video: {video_path}")

            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            # Guard against a reported frame count of 0, which would otherwise
            # produce negative frame indices.
            last_index = max(total_frames - 1, 0)
            frame_indices = np.linspace(0, last_index, self.num_frames, dtype=int)

            for frame_idx in frame_indices:
                cap.set(cv2.CAP_PROP_POS_FRAMES, int(frame_idx))
                ret, frame = cap.read()
                # OpenCV decodes to BGR; downstream transforms expect RGB.
                slots.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) if ret else None)
        finally:
            # Always free the decoder, even when reading fails part-way.
            cap.release()

        decoded = [frame for frame in slots if frame is not None]
        if not decoded:
            raise RuntimeError(f"Could not read any frame from video: {video_path}")

        frames = []
        previous = decoded[0]
        for frame in slots:
            if frame is not None:
                previous = frame
            frames.append(previous)
        return frames

In [None]:
class VideoTestDataset(Dataset):
    """Inference dataset yielding sampled video frames without captions.

    Args:
        dataframe: table with a ``file_name`` column (video file relative to
            ``video_dir``).
        video_dir: directory containing the video files.
        transform: optional callable applied to every RGB frame; it must
            return a tensor so the frames can be stacked. When omitted, the
            raw frames are converted to tensors unchanged.
        num_frames: number of frames sampled per video.

    Each item is a dict with ``frames`` (stacked per-frame tensors,
    ``num_frames`` along dim 0) and ``video_path`` (full path of the file).
    """

    def __init__(self, dataframe, video_dir, transform=None, num_frames=16):
        self.dataframe = dataframe
        self.video_dir = video_dir
        self.transform = transform
        self.num_frames = num_frames

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        video_path = os.path.join(self.video_dir, self.dataframe.iloc[idx]['file_name'])
        frames = self.extract_frames(video_path)

        if self.transform:
            frames = [self.transform(frame) for frame in frames]
        else:
            # torch.stack only accepts tensors; without a transform the frames
            # are still numpy arrays.
            frames = [torch.from_numpy(np.ascontiguousarray(frame)) for frame in frames]

        frames = torch.stack(frames)

        return {
            'frames': frames,
            'video_path': video_path
        }

    def extract_frames(self, video_path):
        """Return exactly ``num_frames`` RGB frames sampled evenly over the video.

        A frame that cannot be decoded is replaced by the closest previously
        decoded frame (or the first decodable one for leading failures), so all
        samples have the same length and can be batched.

        Raises:
            RuntimeError: if the video cannot be opened or no frame can be read.
        """
        cap = cv2.VideoCapture(video_path)
        slots = []
        try:
            if not cap.isOpened():
                raise RuntimeError(f"Could not open video: {video_path}")

            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            # Guard against a reported frame count of 0, which would otherwise
            # produce negative frame indices.
            last_index = max(total_frames - 1, 0)
            frame_indices = np.linspace(0, last_index, self.num_frames, dtype=int)

            for frame_idx in frame_indices:
                cap.set(cv2.CAP_PROP_POS_FRAMES, int(frame_idx))
                ret, frame = cap.read()
                # OpenCV decodes to BGR; downstream transforms expect RGB.
                slots.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) if ret else None)
        finally:
            # Always free the decoder, even when reading fails part-way.
            cap.release()

        decoded = [frame for frame in slots if frame is not None]
        if not decoded:
            raise RuntimeError(f"Could not read any frame from video: {video_path}")

        frames = []
        previous = decoded[0]
        for frame in slots:
            if frame is not None:
                previous = frame
            frames.append(previous)
        return frames

In [None]:
from torchvision import transforms

# Per-frame preprocessing: numpy frame -> PIL image -> 224x224 -> float tensor
# -> channel-wise normalization with the mean/std listed below.
_frame_steps = [
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
transform = transforms.Compose(_frame_steps)

In [None]:
from torch.utils.data import DataLoader

def collate_fn(batch):
    """Merge a list of training samples into one batch dict.

    Every tensor field is stacked along a new leading batch dimension and the
    per-sample ``video_path`` strings are gathered under ``video_paths``.
    Any failure is printed and then re-raised unchanged.
    """
    stacked_fields = ('frames', 'src_tokens', 'tgt_tokens', 'targets')
    try:
        collated = {
            field: torch.stack([sample[field] for sample in batch])
            for field in stacked_fields
        }
        collated['video_paths'] = [sample['video_path'] for sample in batch]
        return collated
    except Exception as e:
        print("Ошибка при создании батча:", e)
        raise

# Training dataset: 16 evenly spaced, preprocessed frames per video plus the
# caption token ids stored in `train`.
dataset = VideoCaptionDataset(
    train,
    '/content/automated-video-captioning/train_videos',
    transform=transform,
    num_frames=16,
)

# Shuffled mini-batches of 4 videos, loaded by 2 worker processes.
_train_loader_options = dict(
    batch_size=4,
    shuffle=True,
    num_workers=2,
    collate_fn=collate_fn,
)
dataloader = DataLoader(dataset, **_train_loader_options)

In [None]:
def collate_fn_test(batch):
    """Merge a list of inference samples into one batch dict.

    ``frames`` tensors are stacked along a new leading batch dimension and the
    per-sample ``video_path`` strings are gathered under ``video_paths``.
    Any failure is printed and then re-raised unchanged.
    """
    try:
        stacked_frames = torch.stack([sample['frames'] for sample in batch])
        paths = [sample['video_path'] for sample in batch]
        return dict(frames=stacked_frames, video_paths=paths)
    except Exception as e:
        print("Ошибка при создании батча (test):", e)
        raise

# Inference dataset: same frame sampling and preprocessing as for training,
# but without captions.
test_dataset = VideoTestDataset(
    test,
    '/content/automated-video-captioning/test_videos',
    transform=transform,
    num_frames=16,
)

# Keep the file order (shuffle=False) so predictions line up with `test`.
_test_loader_options = dict(
    batch_size=4,
    shuffle=False,
    num_workers=2,
    collate_fn=collate_fn_test,
)
test_dataloader = DataLoader(test_dataset, **_test_loader_options)

In [None]:
# Quick sanity check of the first row's token ids: container type, length and
# raw values.
print("Проверка данных:")
for _label, _describe in (
    ("Тип token_ids:", type),
    ("Длина token_ids:", len),
    ("Пример token_ids:", lambda ids: ids),
):
    print(_label, _describe(train['token_ids'].iloc[0]))

Проверка данных:
Тип token_ids: <class 'numpy.ndarray'>
Длина token_ids: 50
Пример token_ids: [   3    2   16   24  102   54   10    2  320  432  204   88    6    2
   80   17   85   24   52    7   48    2   35  793  247    7  370    5
   17  433  222  514  184    7  149   59    6    5   21    4 1943 1943
 1943 1943 1943 1943 1943 1943 1943 1943]


In [None]:
# Pull a single batch from the training loader and report its tensor shapes
# together with the first sample decoded back to words.
batch = next(iter(dataloader), None)
if batch is not None:
    def _ids_to_words(ids):
        return [tokenizer.index_word[i.item()] for i in ids]

    print("Размеры батча:")
    print("Кадры:", batch['frames'].shape)
    for _field in ('src_tokens', 'tgt_tokens', 'targets'):
        print(_field + ":", batch[_field].shape)
    print("Пример src_tokens:", _ids_to_words(batch['src_tokens'][0]))
    print("Пример targets:", _ids_to_words(batch['targets'][0]))

Размеры батча:
Кадры: torch.Size([4, 16, 3, 224, 224])
src_tokens: torch.Size([4, 49])
tgt_tokens: torch.Size([4, 49])
targets: torch.Size([4, 49])
Пример src_tokens: ['<sos>', 'a', 'young', 'shirtless', 'man', 'work', 'out', 'on', 'the', 'elliptical', 'machine', 'his', 'gaze', 'fixed', 'ahead', 'with', 'unwavering', 'determination', 'his', 'movement', 'are', 'steady', 'and', 'rhythmic', 'each', 'stride', 'a', 'testament', 'to', 'his', 'commitment', 'to', 'cardiovascular', 'fitness', '<eos>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
Пример targets: ['a', 'young', 'shirtless', 'man', 'work', 'out', 'on', 'the', 'elliptical', 'machine', 'his', 'gaze', 'fixed', 'ahead', 'with', 'unwavering', 'determination', 'his', 'movement', 'are', 'steady', 'and', 'rhythmic', 'each', 'stride', 'a', 'testament', 'to', 'his', 'commitment', 'to', 'cardiovascular', 'fitness', '<eos>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<

Модель

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.models import resnet50

class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position signals to embeddings, then applies dropout.

    Args:
        d_model: embedding size (last dimension of the input).
        dropout: dropout probability applied after the addition.
        max_len: number of positions precomputed in the ``pe`` buffer.
    """

    def __init__(self, d_model, dropout=0.1, max_len=500):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        # Even feature indices carry the sine, odd ones the cosine.
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        # Buffer: moves with .to(device) and is saved in the state dict, but is
        # not a trainable parameter.
        self.register_buffer('pe', pe)

    def forward(self, x, offset=0):
        """Encode ``x`` of shape (batch, seq, d_model).

        Args:
            x: embeddings to encode.
            offset: position index of ``x[:, 0]``. Step-by-step decoding passes
                the current step here so a single-token input gets the same
                position it had during training.

        Raises:
            ValueError: if the requested positions fall outside the buffer.
        """
        end = offset + x.size(1)
        if offset < 0 or end > self.pe.size(1):
            # Slicing past the buffer would silently broadcast to an empty tensor.
            raise ValueError(
                f"positions [{offset}, {end}) exceed the precomputed range [0, {self.pe.size(1)})"
            )
        x = x + self.pe[:, offset:end]
        return self.dropout(x)


class Bumblebee(nn.Module):
    """Video captioning model: frozen ResNet-50 features + LSTM encoder/decoder.

    Frame features from the ResNet-50 trunk (its last child module is dropped)
    are averaged over time and projected to ``d_model``. The encoder LSTM reads
    that video vector followed by the embedded ``src_tokens``; its final state
    initializes the decoder LSTM, which predicts one vocabulary distribution
    per ``tgt_tokens`` position.

    Note: the encoder consumes ``src_tokens``, so callers must feed the same
    kind of source sequence at training and at inference time.

    Args:
        vocab_size: number of rows of the embedding / size of the output layer.
        d_model: embedding and LSTM input size.
        lstm_hidden: hidden size of both LSTMs.
        lstm_layers: number of layers of both LSTMs.
        dropout: dropout used in the LSTMs and the positional encoding.
        max_len: number of positions available to the positional encoding.
    """

    def __init__(self, vocab_size, d_model=512, lstm_hidden=512, lstm_layers=2, dropout=0.1, max_len=100):
        super().__init__()
        resnet = resnet50(pretrained=True)
        modules = list(resnet.children())[:-1]
        self.cnn = nn.Sequential(*modules)
        self.cnn_out_dim = 2048
        # The CNN is only ever run under no_grad, so mark it frozen explicitly
        # and keep it in eval mode (see train()).
        for param in self.cnn.parameters():
            param.requires_grad_(False)
        self.cnn.eval()

        self.visual_projection = nn.Linear(self.cnn_out_dim, d_model)

        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, dropout, max_len)

        self.lstm_encoder = nn.LSTM(
            input_size=d_model, hidden_size=lstm_hidden, num_layers=lstm_layers,
            batch_first=True, dropout=dropout, bidirectional=False
        )

        self.lstm_decoder = nn.LSTM(
            input_size=d_model, hidden_size=lstm_hidden, num_layers=lstm_layers,
            batch_first=True, dropout=dropout, bidirectional=False
        )

        self.output_layer = nn.Linear(lstm_hidden, vocab_size)

        self.max_len = max_len
        self.d_model = d_model
        self.lstm_hidden = lstm_hidden
        self.lstm_layers = lstm_layers

    def train(self, mode=True):
        """Switch train/eval mode while keeping the frozen CNN in eval mode.

        Without this, ``model.train()`` would put the ResNet's BatchNorm layers
        in training mode and let their running statistics drift even though
        the CNN weights are never updated.
        """
        super().train(mode)
        self.cnn.eval()
        return self

    def _encode_video(self, frames):
        """Map (B, T, C, H, W) frames to one projected (B, 1, d_model) video vector."""
        B, T, C, H, W = frames.shape
        with torch.no_grad():
            cnn_feats = self.cnn(frames.reshape(B * T, C, H, W)).flatten(1)
        cnn_feats = cnn_feats.view(B, T, self.cnn_out_dim)
        # Temporal average pooling over the T sampled frames.
        video_feat = cnn_feats.mean(dim=1)
        return self.visual_projection(video_feat).unsqueeze(1)

    def forward(self, frames, src_tokens, tgt_tokens):
        """Teacher-forced pass.

        Args:
            frames: (B, T, C, H, W) video frames.
            src_tokens: (B, S) token ids read by the encoder after the video vector.
            tgt_tokens: (B, L) decoder input token ids.

        Returns:
            (B, L, vocab_size) logits, one distribution per decoder position.
        """
        video_feat = self._encode_video(frames)

        src_emb = self.token_embedding(src_tokens)
        src_emb = self.positional_encoding(src_emb)

        tgt_emb = self.token_embedding(tgt_tokens)
        tgt_emb = self.positional_encoding(tgt_emb)

        encoder_input = torch.cat([video_feat, src_emb], dim=1)

        _, (h_n, c_n) = self.lstm_encoder(encoder_input)

        decoder_outputs, _ = self.lstm_decoder(tgt_emb, (h_n, c_n))

        logits = self.output_layer(decoder_outputs)
        return logits

    def generate(self, frames, src_tokens, start_token_id, end_token_id, max_length=20, temperature=1.0):
        """Decode one caption per video, token by token.

        Puts the model in eval mode. Tokens are sampled from the
        temperature-scaled softmax; ``temperature <= 0`` selects greedy
        (argmax) decoding instead. Once a sequence emits ``end_token_id`` all
        its later positions are filled with ``end_token_id``.

        Returns:
            list[list[int]]: generated ids per video, without the start token.
        """
        self.eval()
        B = frames.size(0)
        device = frames.device
        # Last position available in the positional-encoding buffer.
        last_position = self.positional_encoding.pe.size(1) - 1

        with torch.no_grad():
            video_feat = self._encode_video(frames)

            src_emb = self.token_embedding(src_tokens)
            src_emb = self.positional_encoding(src_emb)

            encoder_input = torch.cat([video_feat, src_emb], dim=1)
            _, hidden = self.lstm_encoder(encoder_input)

            generated = torch.full((B, 1), start_token_id, dtype=torch.long, device=device)

            finished = torch.zeros(B, dtype=torch.bool, device=device)

            for step in range(max_length):
                tgt_emb = self.token_embedding(generated[:, -1:])
                # The token fed at decoding step t sat at position t during
                # teacher-forced training, so encode it with that position
                # (clamped so very long generations keep working).
                tgt_emb = self.positional_encoding(tgt_emb, offset=min(step, last_position))

                output, hidden = self.lstm_decoder(tgt_emb, hidden)
                logits = self.output_layer(output.squeeze(1))

                if temperature <= 0:
                    next_tokens = logits.argmax(dim=-1)
                else:
                    probs = F.softmax(logits / temperature, dim=-1)
                    next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
                next_tokens = next_tokens.masked_fill(finished, end_token_id)

                generated = torch.cat([generated, next_tokens.unsqueeze(1)], dim=1)
                finished |= next_tokens == end_token_id

                if finished.all():
                    break

            return [seq.tolist()[1:] for seq in generated]

In [None]:
import nltk
from tqdm import tqdm
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from tqdm import tqdm

# Request the 'punkt' resource again; the preprocessing cell already downloads
# it, and the recorded output reports it as up-to-date here.
nltk.download('punkt')

def decode_tokens(token_ids, tokenizer):
    """Convert a sequence of token ids into the list of caption words.

    Padding and start-of-sequence markers are skipped, and decoding stops at
    the first end-of-sequence marker. Special markers are matched
    case-insensitively because the tokenizer may have stored them lower-cased
    (``<sos>`` / ``<eos>``). Ids missing from ``tokenizer.index_word`` decode
    to ``'<unk>'``.

    Args:
        token_ids: 1-D tensor or iterable of integer ids.
        tokenizer: object exposing ``word_index`` (word -> id) and
            ``index_word`` (id -> word) dictionaries.

    Returns:
        list[str]: decoded words without any special marker.
    """
    if isinstance(token_ids, torch.Tensor):
        token_ids = token_ids.cpu().tolist()

    # Looked up once instead of once per token.
    pad_id = tokenizer.word_index.get('<PAD>')

    words = []
    for idx in token_ids:
        if idx == pad_id:
            continue
        word = tokenizer.index_word.get(idx, '<unk>')
        marker = word.upper()
        if marker == '<EOS>':
            # Everything after the end marker is not part of the caption.
            break
        if marker in ('<PAD>', '<SOS>'):
            continue
        words.append(word)
    return words

# Maximum number of batches scored by calculate_bleu.
bleu_N = 10000

def calculate_bleu(model, dataloader, tokenizer, device):
    """Corpus-level BLEU-4 of generated captions against the reference captions.

    For at most ``bleu_N`` batches a caption is generated for every video from
    its frames only and compared with the caption decoded from
    ``batch['targets']``. Smoothing method 4 and uniform 1- to 4-gram weights
    are used.

    Args:
        model: captioning model exposing ``generate(frames, src_tokens,
            start_token_id, end_token_id)``.
        dataloader: yields dicts with ``frames`` and ``targets``.
        tokenizer: object exposing ``word_index`` / ``index_word``.
        device: device the frames are moved to.

    Returns:
        float: the corpus BLEU score.
    """
    print('Calculating BLEU...', end=' ')

    def special_id(name):
        # The tokenizer may have stored the marker lower-cased; that entry is
        # the one actually present in the training sequences, so prefer it.
        vocab = tokenizer.word_index
        return vocab[name.lower()] if name.lower() in vocab else vocab[name]

    start_id = special_id('<SOS>')
    end_id = special_id('<EOS>')

    model.eval()
    references = []
    hypotheses = []
    smoothing = SmoothingFunction().method4

    with torch.no_grad():
        for batch_idx, batch in enumerate(tqdm(dataloader)):
            if batch_idx >= bleu_N:
                break

            frames = batch['frames'].to(device)
            targets = batch['targets']

            # Condition generation on the video only: passing the batch's
            # src_tokens would hand the reference caption to the encoder.
            src_tokens = torch.full((frames.size(0), 1), start_id, dtype=torch.long, device=device)

            preds = model.generate(frames, src_tokens, start_id, end_id)

            for sample_idx in range(targets.size(0)):
                ref = decode_tokens(targets[sample_idx], tokenizer)
                hyp = decode_tokens(preds[sample_idx], tokenizer)

                # corpus_bleu needs a non-empty hypothesis.
                if len(hyp) == 0:
                    hyp = ['<unk>']

                references.append([ref])
                hypotheses.append(hyp)

    bleu_score = corpus_bleu(
        references,
        hypotheses,
        smoothing_function=smoothing,
        weights=(0.25, 0.25, 0.25, 0.25)
    )
    return bleu_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Use the GPU when the runtime has one.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# +1 because the tokenizer's ids start at 1 (index 0 is never assigned).
vocab_size = len(tokenizer.word_index) + 1
model = Bumblebee(vocab_size=vocab_size).to(device)

# Padding positions must not contribute to the loss. (An earlier
# `ignore_index=0` criterion was dead code: it was overwritten by this one and
# 0 is not the padding id.)
pad_idx = tokenizer.word_index['<PAD>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

EPOCHS = 10

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 193MB/s]


In [None]:
# Mount Google Drive under /content/drive; the training loop writes its
# per-epoch checkpoints to /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Teacher-forced training: one pass over `dataloader` per epoch, with a
# checkpoint written to Google Drive after every epoch.
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader):
        frames = batch['frames'].to(device)
        tgt_tokens = batch['tgt_tokens'].to(device)
        targets = batch['targets'].to(device)

        # The encoder must see the same source at training and at inference.
        # The generation cell feeds only the start token, so do the same here;
        # feeding the whole ground-truth caption (batch['src_tokens']) would
        # leak the answer into the encoder state and leave the model unable to
        # caption from the video alone.
        src_tokens = tgt_tokens[:, :1]

        outputs = model(frames, src_tokens, tgt_tokens)
        # Flatten (batch, seq, vocab) -> (batch*seq, vocab) for the loss.
        loss = criterion(outputs.reshape(-1, vocab_size), targets.reshape(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    # NOTE: the checkpoint name uses the 0-based epoch, the log line the 1-based one.
    torch.save(model.state_dict(), f'/content/drive/MyDrive/model_weights_epoch_{epoch}.pth')
    print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}")

100%|██████████| 151/151 [10:58<00:00,  4.36s/it]


Epoch 1, Loss: 6.1607


100%|██████████| 151/151 [11:03<00:00,  4.39s/it]


Epoch 2, Loss: 5.6171


100%|██████████| 151/151 [11:00<00:00,  4.37s/it]


Epoch 3, Loss: 5.4490


100%|██████████| 151/151 [11:00<00:00,  4.38s/it]


Epoch 4, Loss: 5.3005


100%|██████████| 151/151 [10:56<00:00,  4.35s/it]


Epoch 5, Loss: 5.1379


100%|██████████| 151/151 [10:57<00:00,  4.35s/it]


Epoch 6, Loss: 4.9914


  9%|▊         | 13/151 [01:02<11:03,  4.81s/it]


KeyboardInterrupt: 

In [None]:
# Save the current weights to the working directory (in addition to the
# per-epoch checkpoints written by the training loop).
torch.save(model.state_dict(), 'model_weights_2.pth')

Генерация ответа

In [None]:
model.eval()

# Resolve the start/end marker ids. If the tokenizer stored the markers
# lower-cased ('<sos>'/'<eos>'), those are the ids that occur in the training
# sequences, so prefer them over the upper-case vocabulary entries.
_vocab = tokenizer.word_index
start_token_id = _vocab['<sos>'] if '<sos>' in _vocab else _vocab['<SOS>']
end_token_id = _vocab['<eos>'] if '<eos>' in _vocab else _vocab['<EOS>']

results = []
index = 0

for batch in tqdm(test_dataloader):
    frames = batch['frames'].to(device)
    video_paths = batch['video_paths']

    # The encoder is given the video plus a single start token per sample.
    B = frames.size(0)
    src_tokens = torch.full((B, 1), start_token_id, dtype=torch.long, device=device)

    generated_ids = model.generate(
        frames=frames,
        src_tokens=src_tokens,
        start_token_id=start_token_id,
        end_token_id=end_token_id,
        max_length=30
    )
    for video_path, token_ids in zip(video_paths, generated_ids):
        # `caption` is a list of words here; it is joined into a string in a
        # later cell before the final submission file is written.
        caption = decode_tokens(token_ids, tokenizer)
        file_name = os.path.basename(video_path)
        results.append((index, file_name, caption))
        index += 1

df = pd.DataFrame(results, columns=["index", "file_name", "caption"])
df.to_csv("submission.csv", index=False)

100%|██████████| 131/131 [08:59<00:00,  4.12s/it]


In [None]:
# Display the predictions table (captions are still lists of words here).
df

Unnamed: 0,index,file_name,caption
0,0,0.mp4,"[skyline, highlight, entire, lip, <eos>, <eos>..."
1,1,1.mp4,"[sunny, give, companion, she, for, <eos>, jade..."
2,2,2.mp4,"[valentine, enhancing, during, <eos>, <eos>, f..."
3,3,3.mp4,"[cinematic, powerfully, weight, colorful, brid..."
4,4,4.mp4,"[magnificent, cell, underwear, summer, banana,..."
...,...,...,...
516,516,516.mp4,"[sweat, smoothly, desk, lift, with, a, <eos>, ..."
517,517,517.mp4,"[sprint, los, fluently, training, while, intwi..."
518,518,518.mp4,"[amidst, start, through, afternoon, background..."
519,519,519.mp4,"[with, reporter, stable, while, ups, <eos>, <e..."


In [None]:
# Turn each caption's word list into a single space-separated string.
# Values that are already strings are left untouched, so re-running this cell
# does not insert a space between every character.
df['caption'] = df['caption'].apply(
    lambda x: x if isinstance(x, str) else ' '.join(x)
)

In [None]:
# Overwrite submission.csv now that the captions are plain strings.
df.to_csv("submission.csv", index=False)