## Contrastive Learning CLIP

In [9]:
import numpy as np
import torch
import datetime
import torch.nn.functional as F
from torch import nn
from transformers import AutoImageProcessor, TimesformerModel, TimesformerConfig
from torch import einsum
from einops import rearrange, reduce, repeat
from torch.utils.data._utils.collate import default_collate
import timesformer.models.optimizer as optim

In [10]:
from timesformer.datasets.rtmri75s import Rtmri75s
from timesformer.utils.parser import load_config, parse_args
# from timesformer.datasets.loader import detection_collate
from torch.utils.data._utils.collate import default_collate

In [11]:
class CLIP(nn.Module):
    def __init__(self,
                 device = 'cuda:0'):
        super().__init__()
        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
        self.device = device
    def build_attention_mask(self):
        # lazily create causal attention mask, with full attention between the vision tokens
        # pytorch uses additive attention mask; fill with -inf
        mask = torch.empty(self.context_length, self.context_length)
        mask.fill_(float("-inf"))
        mask.triu_(1)  # zero out the lower diagonal
        return mask

    @property
    def dtype(self):
        return self.visual.conv1.weight.dtype

    def encode_image(self, video):
        cfg = TimesformerConfig(hidden_size=1024, num_attention_heads = 16)
        video_model = TimesformerModel(cfg).to(self.device)
        # video_model = TimesformerModel.from_pretrained("facebook/timesformer-hr-finetuned-k600")
        return video_model(video)

    def forward(self, video, audio_features):
        video_features = torch.mean(self.encode_image(video).last_hidden_state, dim=1)
        # print(video_features.shape)
        # audio_features = self.encode_text(text)

        # normalized features
        # print(video_features.shape)
        video_features = torch.nn.functional.normalize(video_features, p=2, dim=1)#video_features / video_features.norm(dim=1, keepdim=True) 
        audio_features = torch.nn.functional.normalize(audio_features, p=2, dim=1)#audio_features / audio_features.norm(dim=1, keepdim=True)

        # cosine similarity as logits
        logit_scale = self.logit_scale.exp()
        logits_per_image = logit_scale * video_features @ audio_features.t()
        logits_per_text = logits_per_image.t()

        # shape = [global_batch_size, global_batch_size]
        return logits_per_image, logits_per_text

class ClipLoss(nn.Module):

    def __init__(
            self,
            local_loss=False,
            gather_with_grad=False,
            cache_labels=False,
            rank=0,
            world_size=1,
            use_horovod=False,
    ):
        super().__init__()
        self.local_loss = local_loss
        self.gather_with_grad = gather_with_grad
        self.cache_labels = cache_labels
        self.rank = rank
        self.world_size = world_size
        self.use_horovod = use_horovod

        # cache state
        self.prev_num_logits = 0
        self.labels = {}

    def get_ground_truth(self, device, num_logits) -> torch.Tensor:
        # calculated ground-truth and cache if enabled
        if self.prev_num_logits != num_logits or device not in self.labels:
            labels = torch.arange(num_logits, device=device, dtype=torch.long)
            if self.world_size > 1 and self.local_loss:
                labels = labels + num_logits * self.rank
            if self.cache_labels:
                self.labels[device] = labels
                self.prev_num_logits = num_logits
        else:
            labels = self.labels[device]
        return labels

    def forward(self, logits_per_image, logits_per_text, output_dict=False):
        device = logits_per_image.device
        # logits_per_image, logits_per_text = self.get_logits(image_features, text_features, logit_scale)

        labels = self.get_ground_truth(device, logits_per_image.shape[0])

        total_loss = (
            F.cross_entropy(logits_per_image, labels) +
            F.cross_entropy(logits_per_text, labels)
        ) / 2

        return {"contrastive_loss": total_loss} if output_dict else total_loss
    
# model = CLIP()
# loss_fun = ClipLoss()
# videos, audios =dataset[2]

In [12]:
# logits_per_image, logits_per_text = model(rearrange(videos[1].unsqueeze(0), 'b c t h w -> b t c h w'), audios[0].unsqueeze(0))
# loss = loss_fun(logits_per_image, logits_per_text)
# loss.backward()

In [13]:
# video_model = TimesformerModel.from_pretrained("facebook/timesformer-base-finetuned-k600")
# video_model(rearrange(videos[1].unsqueeze(0), 'b c t h w -> b t c h w')).last_hidden_state.shape

In [14]:
# import timesformer.models.optimizer as optim
# optimizer = optim.construct_optimizer(model, cfg)

In [15]:
from tqdm import tqdm

In [25]:
def train_epoch(
    train_loader, model, optimizer, cur_epoch, cfg
):
    total_loss = 0
    loss_fun = ClipLoss()
    cur_iter = 0
    for videos, audios, _, _ in tqdm(train_loader):
        data_size = len(train_loader)
        # print(data_size)
        # model = model.train()
        # model = model.cuda()

        videos = rearrange(videos[1], 'b c t h w -> b t c h w').cuda(non_blocking=True)
        audios_embs = audios.cuda(non_blocking=True)

        lr = optim.get_epoch_lr(cur_epoch + float(cur_iter) / data_size, cfg)

        logits_per_image, logits_per_text = model(videos, audios_embs)

        loss = loss_fun(logits_per_image, logits_per_text)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        cur_iter += 1
        
    return total_loss

def train(cfg):
    batch_size = int(cfg.TRAIN.BATCH_SIZE / max(1, cfg.NUM_GPUS))
    # Construct the dataset
    dataset = Rtmri75s(cfg, "train")

    train_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        num_workers=cfg.DATA_LOADER.NUM_WORKERS,
        pin_memory=cfg.DATA_LOADER.PIN_MEMORY,
        collate_fn=default_collate,
    )

    model = CLIP()
    # loss_fun = ClipLoss()
    device = 'cuda:0'
    optimizer = optim.construct_optimizer(model, cfg)
    
    model = torch.load("/data2/hongn/TimeSformer/lowest_lost_2024-12-13.pth")
    model.train()
    model = model.to(device)

    min_loss = 100000

    for e in range(1000):
        total_loss = train_epoch(train_loader, model, optimizer, cur_epoch = e, cfg=cfg)
        print(f"EPOCH {e} with total loss {total_loss}")
        if(total_loss < min_loss):
            min_loss = total_loss
            today = datetime.datetime.today().strftime('%Y-%m-%d')
            torch.save(model, f'lowest_lost_{today}.pth')

In [26]:
cfg = load_config(cfg_file = "/data2/hongn/TimeSformer/configs/Rtmri75s/simple_cfg.yaml")
train(cfg)

  0%|          | 0/75 [00:00<?, ?it/s]

100%|██████████| 75/75 [03:58<00:00,  3.18s/it]


EPOCH 0 with total loss 257.564825296402


 40%|████      | 30/75 [01:37<02:24,  3.22s/it]

Failed to meta load audio idx 1490 from /data1/span_data/rtmri75s/sub050/2drt/video/sub050_2drt_01_vcv1_r1_video.mp4; trial 0


100%|██████████| 75/75 [04:01<00:00,  3.22s/it]


EPOCH 1 with total loss 257.56669449806213


 49%|████▉     | 37/75 [02:00<02:01,  3.21s/it]

Failed to meta load audio idx 1709 from /data1/span_data/rtmri75s/sub068/2drt/video/sub068_2drt_09_northwind1_r2_video.mp4; trial 0


 76%|███████▌  | 57/75 [03:04<00:57,  3.21s/it]

Failed to meta load audio idx 2358 from /data1/span_data/rtmri75s/sub047/2drt/video/sub047_2drt_15_picture4_video.mp4; trial 0


100%|██████████| 75/75 [04:01<00:00,  3.22s/it]


EPOCH 2 with total loss 257.56507873535156


100%|██████████| 75/75 [04:01<00:00,  3.21s/it]


EPOCH 3 with total loss 257.5650157928467


  0%|          | 0/75 [00:00<?, ?it/s]

Failed to meta load audio idx 24 from /data1/span_data/rtmri75s/sub007/2drt/video/sub007_2drt_07_grandfather1_r1_video.mp4; trial 0


100%|██████████| 75/75 [04:00<00:00,  3.21s/it]


EPOCH 4 with total loss 257.56096029281616


100%|██████████| 75/75 [04:01<00:00,  3.21s/it]


EPOCH 5 with total loss 257.56329321861267


  0%|          | 0/75 [00:00<?, ?it/s]

Failed to meta load audio idx 19 from /data1/span_data/rtmri75s/sub007/2drt/video/sub007_2drt_14_picture3_video.mp4; trial 0
Failed to meta load audio idx 94 from /data1/span_data/rtmri75s/sub063/2drt/video/sub063_2drt_03_vcv3_r1_video.mp4; trial 0


100%|██████████| 75/75 [04:01<00:00,  3.22s/it]


EPOCH 6 with total loss 257.5659623146057


100%|██████████| 75/75 [04:01<00:00,  3.22s/it]


EPOCH 7 with total loss 257.56448125839233


100%|██████████| 75/75 [04:01<00:00,  3.22s/it]


EPOCH 8 with total loss 257.5644246339798


  3%|▎         | 2/75 [00:08<04:47,  3.93s/it]

Failed to meta load audio idx 599 from /data1/span_data/rtmri75s/sub062/2drt/video/sub062_2drt_03_vcv3_r1_video.mp4; trial 0


 21%|██▏       | 16/75 [00:55<03:25,  3.49s/it]


KeyboardInterrupt: 

In [11]:
data_size

NameError: name 'data_size' is not defined

In [None]:
from transformers import TimesformerConfig, TimesformerModel
cfg = TimesformerConfig(hidden_size=1024, num_attention_heads = 16)
video_model = TimesformerModel(cfg)
video_model(rearrange(videos[1].unsqueeze(0), 'b c t h w -> b t c h w')).last_hidden_state.shape

torch.Size([1, 1569, 1024])

In [None]:
videos[1]

In [None]:
aud = dataset[2][1]
aud.shape

/data1/span_data/rtmri75s/sub007/2drt/video/sub007_2drt_13_picture2_video.mp4
video_size 2738 start_idx 305.5214309103746 end_idx 331.17034256210314
video_start_pts 238612 video_end_pts 258644 frames_length 2738 duration 2138378 start_idx 305.5214309103746 end_idx 331.17034256210314
video_size 27 start_idx 0.0 end_idx 25.648911651728554


torch.Size([16, 1024])

In [None]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from datasets import load_dataset
import torch

# load model and processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")
    
# load dummy dataset and read soundfiles
ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")

# tokenize
input_values = processor(ds[0]["audio"]["array"], return_tensors="pt").input_values

Generating validation split: 0 examples [00:00, ? examples/s]


DatasetGenerationError: An error occurred while generating the dataset

In [None]:
videos[1][:,0,:]

tensor([[[-1.9145, -1.9044, -1.8860,  ..., -0.8597, -0.9200, -1.0656],
         [-1.9020, -1.8946, -1.8788,  ..., -0.8741, -0.9341, -1.0755],
         [-1.8912, -1.8882, -1.8748,  ..., -0.9746, -1.0234, -1.1309],
         ...,
         [-1.7798, -1.7348, -1.6976,  ..., -1.3505, -1.3566, -1.3685],
         [-1.7778, -1.7343, -1.7171,  ..., -1.3809, -1.3952, -1.4233],
         [-1.7759, -1.7339, -1.7366,  ..., -1.4113, -1.4337, -1.4780]],

        [[-1.8797, -1.8695, -1.8511,  ..., -0.8249, -0.8852, -1.0308],
         [-1.8671, -1.8597, -1.8440,  ..., -0.8392, -0.8993, -1.0406],
         [-1.8563, -1.8533, -1.8399,  ..., -0.9398, -0.9885, -1.0960],
         ...,
         [-1.7449, -1.6999, -1.6628,  ..., -1.3157, -1.3217, -1.3336],
         [-1.7430, -1.6995, -1.6822,  ..., -1.3461, -1.3603, -1.3884],
         [-1.7411, -1.6990, -1.7017,  ..., -1.3765, -1.3989, -1.4432]],

        [[-1.9319, -1.9218, -1.9034,  ..., -0.8771, -0.9375, -1.0831],
         [-1.9194, -1.9120, -1.8963,  ..., -0

In [None]:
model

TimesformerModel(
  (embeddings): TimesformerEmbeddings(
    (patch_embeddings): TimesformerPatchEmbeddings(
      (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (time_drop): Dropout(p=0.0, inplace=False)
  )
  (encoder): TimesformerEncoder(
    (layer): ModuleList(
      (0): TimesformerLayer(
        (drop_path): Identity()
        (attention): TimeSformerAttention(
          (attention): TimesformerSelfAttention(
            (qkv): Linear(in_features=768, out_features=2304, bias=True)
            (attn_drop): Dropout(p=0.0, inplace=False)
          )
          (output): TimesformerSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
        (intermediate): TimesformerIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (dropout): Dropout(p=0.0, inplace=False

##  Sample of Wav2vec Phoneme

In [37]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from datasets import load_dataset
import torch

# load model and processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")
    
# load dummy dataset and read soundfiles
ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")

# tokenize
input_values = processor(ds[0]["audio"]["array"], return_tensors="pt").input_values

# retrieve logits
with torch.no_grad():
    logits = model(input_values).logits

# take argmax and decode
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)

It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


In [38]:
import librosa
in_path = "/data1/span_data/rtmri75s/sub001/2drt/audio/"
name = "sub001_2drt_19_topic3_audio"
input_audio, sample_rate = librosa.load(f"{in_path}/{name}.wav",  sr=16000)

input_values = processor(input_audio, return_tensors="pt", sampling_rate=sample_rate).input_values

# retrieve logits
with torch.no_grad():
    logits = model(input_values).logits

# take argmax and decode
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)

In [42]:
librosa.load("/data1/span_data/rtmri75s/sub007/2drt/audio/sub007_2drt_12_picture1_audio.wav", sr=16000)

(array([ 1.5315111e-04, -1.8992375e-03, -4.2232666e-03, ...,
         2.5964671e-01,  4.1573516e-01,  3.2189554e-01], dtype=float32),
 16000)

In [36]:
o.last_hidden_state

AttributeError: 'CausalLMOutput' object has no attribute 'last_hidden_state'

In [30]:
predicted_ids[0][1100:1150]

tensor([ 0,  0,  0,  0,  5,  0,  5,  0,  0, 33,  0,  0,  0,  0, 13,  0,  0,  0,
         0,  0, 27, 27,  0, 33,  0,  0,  0,  0,  0,  0,  0, 23,  0,  0,  0,  0,
         6,  0,  0,  0, 44,  0,  0,  0,  0,  0,  0,  0,  0,  0])

In [20]:
1682 * 0.02

33.64

## Data Processing

Divide wav clips into 200ms video-audio segments? Or should I random sample video-audiop pairs

In [45]:
import glob
import os
import pandas as pd

In [46]:
folders = glob.glob("/data1/span_data/rtmri75s/sub0[6-8]*")
video_paths = []
audio_paths = []
for i in folders:
    sub_video_paths = glob.glob(i + "/2drt/video/*")
    video_paths = video_paths + sub_video_paths


In [47]:
audio_paths = []
for path in video_paths:
    # name = "/data1/hongn/wav2vec_75speaker/" + path.split('/')[-1].split('.')[0] + '.pt'
    name = path.split('video')[0] + 'audio' + path.split('video')[1] + 'audio.wav'
    audio_paths.append(name)
    try:
        # name = path.split('video')[0] + 'audio' + path.split('video')[1] + 'audio.wav'
        assert os.path.exists(name), f"Path not exist {name}"
    except Exception as e:
        print(f"WARNING: {e}")
        os.system(f"ffmpeg -i {path} {name}")
        continue

In [49]:
audio_paths

['/data1/span_data/rtmri75s/sub063/2drt/audio/sub063_2drt_06_rainbow_r2_audio.wav',
 '/data1/span_data/rtmri75s/sub063/2drt/audio/sub063_2drt_12_picture1_audio.wav',
 '/data1/span_data/rtmri75s/sub063/2drt/audio/sub063_2drt_10_northwind2_r2_audio.wav',
 '/data1/span_data/rtmri75s/sub063/2drt/audio/sub063_2drt_14_picture3_audio.wav',
 '/data1/span_data/rtmri75s/sub063/2drt/audio/sub063_2drt_21_topic5_audio.wav',
 '/data1/span_data/rtmri75s/sub063/2drt/audio/sub063_2drt_17_topic1_audio.wav',
 '/data1/span_data/rtmri75s/sub063/2drt/audio/sub063_2drt_05_shibboleth_r2_audio.wav',
 '/data1/span_data/rtmri75s/sub063/2drt/audio/sub063_2drt_09_northwind1_r1_audio.wav',
 '/data1/span_data/rtmri75s/sub063/2drt/audio/sub063_2drt_09_northwind1_r2_audio.wav',
 '/data1/span_data/rtmri75s/sub063/2drt/audio/sub063_2drt_07_grandfather1_r1_audio.wav',
 '/data1/span_data/rtmri75s/sub063/2drt/audio/sub063_2drt_11_postures_r1_audio.wav',
 '/data1/span_data/rtmri75s/sub063/2drt/audio/sub063_2drt_04_bvt_r1_au

In [68]:
video_audio_paths = pd.DataFrame(
    {'video_paths': video_paths,
     'audio_paths': audio_paths,
    })
video_audio_paths.head()

video_audio_paths.to_csv('/data1/hongn/val.csv', index=False, header=False)

Unnamed: 0,video_paths,audio_paths
0,/data1/span_data/rtmri75s/sub063/2drt/video/su...,/data1/span_data/rtmri75s/sub063/2drt/audio/su...
1,/data1/span_data/rtmri75s/sub063/2drt/video/su...,/data1/span_data/rtmri75s/sub063/2drt/audio/su...
2,/data1/span_data/rtmri75s/sub063/2drt/video/su...,/data1/span_data/rtmri75s/sub063/2drt/audio/su...
3,/data1/span_data/rtmri75s/sub063/2drt/video/su...,/data1/span_data/rtmri75s/sub063/2drt/audio/su...
4,/data1/span_data/rtmri75s/sub063/2drt/video/su...,/data1/span_data/rtmri75s/sub063/2drt/audio/su...


In [69]:
video_audio_paths.to_csv('/data1/hongn/val.csv', index=False, header=False)

In [75]:
# pd.read_csv("/data1/hongn/train.csv")

## Split videos into chunks

In [102]:
import cv2

def video_len_with_opencv(filename):
    video = cv2.VideoCapture(filename)

    fps = video.get(cv2.CAP_PROP_FPS)
    frame_count = video.get(cv2.CAP_PROP_FRAME_COUNT)
    duration = frame_count/fps
    return duration, frame_count, fps

In [103]:
video_len_with_opencv(video_paths[0])

(42.82051045510455, 3566.0, 83.27784891165173)

In [104]:
video_paths[0]

'/data1/span_data/rtmri75s/sub007/2drt/video/sub007_2drt_01_vcv1_r2_video.mp4'

In [None]:
for vidpath in video_paths:
    i = 0
    duration, frame_count, fps = video_len_with_opencv(vidpath)
    while i < duration:
        chunk_name = '/data1/hongn/rtmri75s_processed/video/' + video_paths[0].split('/')[-1].split('.')[0] + '_chunk_{:.1f}.mp4'.format(i)
        os.system(f"ffmpeg -i {vidpath} -ss {i} -t {i + 0.2} {chunk_name}")
        i += 0.2

ffmpeg version 6.1.1 Copyright (c) 2000-2023 the FFmpeg developers
  built with gcc 12.3.0 (conda-forge gcc 12.3.0-7)
  configuration: --prefix=/data2/hongn/miniconda3/envs/sapiens_lite --cc=/home/conda/feedstock_root/build_artifacts/ffmpeg_1716145014501/_build_env/bin/x86_64-conda-linux-gnu-cc --cxx=/home/conda/feedstock_root/build_artifacts/ffmpeg_1716145014501/_build_env/bin/x86_64-conda-linux-gnu-c++ --nm=/home/conda/feedstock_root/build_artifacts/ffmpeg_1716145014501/_build_env/bin/x86_64-conda-linux-gnu-nm --ar=/home/conda/feedstock_root/build_artifacts/ffmpeg_1716145014501/_build_env/bin/x86_64-conda-linux-gnu-ar --disable-doc --disable-openssl --enable-demuxer=dash --enable-hardcoded-tables --enable-libfreetype --enable-libharfbuzz --enable-libfontconfig --enable-libopenh264 --enable-libdav1d --enable-gnutls --enable-libmp3lame --enable-libvpx --enable-libass --enable-pthreads --enable-vaapi --enable-libopenvino --enable-gpl --enable-libx264 --enable-libx265 --enable-libaom --e

In [136]:
i = 0
'/data1/hongn/rtmri75s_processed/video/' + video_paths[0].split('/')[-1].split('.')[0] + '_chunk_{:.1f}.mp4'.format(i)

'/data1/hongn/rtmri75s_processed/video/sub007_2drt_01_vcv1_r2_video_chunk_0.0.mp4'

In [87]:
os.system("ffmpeg -i source-file.mp4 -ss 0 -t 1 first-10-min.m4v")

'/data1/span_data/rtmri75s/sub007/2drt/audio/sub007_2drt_01_vcv1_r2_audio.wav'

In [112]:
import numpy as np
np.concatenate(([0], [0], [0]))

array([0, 0, 0])

## pre-Extract Wav2Vec Phoneme logits to .pt files

In [51]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from datasets import load_dataset
import torch
import librosa

# load model and processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")



for audio_path in audio_paths:
    input_audio, sample_rate = librosa.load(audio_path,  sr=16000)

    input_values = processor(input_audio, return_tensors="pt", sampling_rate=sample_rate).input_values

    # retrieve logits
    with torch.no_grad():
        logits = model(input_values).logits

    # take argmax and decode
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)

    torch.save(predicted_ids, '/data1/hongn/rtmri75s_processed/audio/' + audio_path.split('.')[0].split('/')[-1] + '.pt')

In [99]:
'/data1/hongn/rtmri75s_processed/audio/' + audio_paths[0].split('.')[0].split('/')[-1] + '.pt'

'/data1/hongn/rtmri75s_processed/audio/sub007_2drt_01_vcv1_r2_audio.pt'

In [100]:
torch.save(predicted_ids, '/data1/hongn/rtmri75s_processed/audio/' + audio_paths[0].split('.')[0].split('/')[-1] + '.pt')

## Create train.py val.py for rtmridata

In [115]:
import glob
import torch

chunks_paths = glob.glob('/data1/hongn/rtmri75s_processed/video/sub0[0-5]*')
chunks_paths[0]
# audio_emb_paths = glob.glob('/data1/hongn/rtmri75s_processed/audio/*')

'/data1/hongn/rtmri75s_processed/video/sub036_2drt_19_topic3_video_chunk_23.2.mp4'

In [73]:
# temp = chunks_paths[430].split('/')[-1].split('_chunk_')
# start_idx = int(float(temp[-1].split('.mp4')[0])/0.02)
# audio_path = '/data1/hongn/rtmri75s_processed/audio/' + temp[0].split('_video')[0] + '_audio.pt'
# audio_logits = torch.load(audio_path)
# audio_logits[0,start_idx:start_idx+10].numpy()

In [77]:
# start_idx = 0
# logit_not0 = [x for x in audio_logits[0,start_idx:start_idx+10].numpy() if x != 0]
# if logit_not0 == []:
#     logit_not0 = [0]
# logit_not0

[0]

In [116]:
video_chunks_processed = []
audio_labels_processed = []

for path in chunks_paths:
    temp = path.split('/')[-1].split('_chunk_')
    start_idx = int(float(temp[-1].split('.mp4')[0])/0.02)
    audio_path = '/data1/hongn/rtmri75s_processed/audio/' + temp[0].split('_video')[0] + '_audio.pt'
    audio_logits = torch.load(audio_path)

    logit_not0 = [x for x in audio_logits[0,start_idx:start_idx+10].numpy() if x != 0]
    if logit_not0 == []:
        logit_not0 = [0]
    for logit in logit_not0:
        video_chunks_processed.append(path)
        audio_labels_processed.append(logit)

In [117]:
len(video_chunks_processed)

441388

In [118]:
video_audio_paths = pd.DataFrame(
    {'video_paths': video_chunks_processed,
     'audio_paths': audio_labels_processed,
    })

video_audio_paths.to_csv('/data1/hongn/rtmri75s_processed/train.csv', index=False, header=False)

In [27]:
audio_logits[0,start_idx:start_idx+10]

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [4]:
from timesformer.models.vit import MOOSE
import argparse
from timesformer.models.moose import MOOSE_Encoder, CustomAttentionWithResidual
# from 
parser = argparse.ArgumentParser()
parser.add_argument('--model', help="restore checkpoint")
parser.add_argument('--path', help="dataset for evaluation")
parser.add_argument('--small', action='store_true', help='use small model')
parser.add_argument('--mixed_precision', action='store_true', help='use mixed precision')
parser.add_argument('--alternate_corr', action='store_true', help='use efficent correlation implementation')

raft_args = parser.parse_args(['--model', '/data2/hongn/RAFT/models/raft-things.pth', 
                        '--path', '/data2/hongn/RAFT/demo-frames/care'])
moose = MOOSE(raft_args)

  warn(f"Failed to load image Python extension: {e}")
  motion_model.load_state_dict(torch.load(args.model))


Since no pretrained weights have been provided, we load the reference pretrained DINO weights.


In [5]:
moose

MOOSE(
  (crossatt): CustomAttentionWithResidual(
    (attention): CustomAttention(
      (query): Linear(in_features=768, out_features=768, bias=True)
      (key): Linear(in_features=768, out_features=768, bias=True)
      (value): Linear(in_features=768, out_features=768, bias=True)
      (fc_out): Linear(in_features=768, out_features=768, bias=True)
    )
    (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (moose_encoder): MOOSE_Encoder(
    (motion_model): RAFT(
      (fnet): BasicEncoder(
        (norm1): InstanceNorm2d(64, eps=1e-05, momentum=0.1, affine=False, track_running_stats=False)
        (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3))
        (relu1): ReLU(inplace=True)
        (layer1): Sequential(
          (0): ResidualBlock(
            (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1)

In [7]:
for name, p in moose.named_parameters():
    print(name)

crossatt.attention.query.weight
crossatt.attention.query.bias
crossatt.attention.key.weight
crossatt.attention.key.bias
crossatt.attention.value.weight
crossatt.attention.value.bias
crossatt.attention.fc_out.weight
crossatt.attention.fc_out.bias
crossatt.norm.weight
crossatt.norm.bias
moose_encoder.motion_model.fnet.conv1.weight
moose_encoder.motion_model.fnet.conv1.bias
moose_encoder.motion_model.fnet.layer1.0.conv1.weight
moose_encoder.motion_model.fnet.layer1.0.conv1.bias
moose_encoder.motion_model.fnet.layer1.0.conv2.weight
moose_encoder.motion_model.fnet.layer1.0.conv2.bias
moose_encoder.motion_model.fnet.layer1.1.conv1.weight
moose_encoder.motion_model.fnet.layer1.1.conv1.bias
moose_encoder.motion_model.fnet.layer1.1.conv2.weight
moose_encoder.motion_model.fnet.layer1.1.conv2.bias
moose_encoder.motion_model.fnet.layer2.0.conv1.weight
moose_encoder.motion_model.fnet.layer2.0.conv1.bias
moose_encoder.motion_model.fnet.layer2.0.conv2.weight
moose_encoder.motion_model.fnet.layer2.0.c