In [3]:
import warnings
warnings.filterwarnings("ignore")


import sys
sys.path.append("models/tensorflow_models/research/audioset/")
sys.path.append("models/tensorflow_models/research/audioset/vggish")

import torch
torch.set_grad_enabled(False)

import scipy.io.wavfile as scio
import io
import numpy as np
import vggish.vggish_input as vggish_input
import subprocess
# tqdm is used by the embedding loops further down; import it here so the
# notebook survives Restart & Run All.
from tqdm import tqdm


from dumper import ffmpeg_audio_reader
from dumper import read_frames_center_crop_batch

from models.vggish_model import VGGish
from models.vmz_model import VMZ_irCSN_152
from models.clip_model import CLIP
from models.mmt import BertTXT, BertVID

# Code for showing a video segment in the notebook

In [4]:
from IPython.display import Javascript
from IPython.display import HTML, display
import base64

def read_video_segm(abspath, t_beg, t_end):
    """Transcode a segment of a video into an in-memory fragmented MP4.

    Runs ffmpeg on ``abspath``, seeking to ``t_beg`` and keeping
    ``t_end - t_beg`` seconds, scaled to a height of 112 px with an even
    width, and returns the bytes ffmpeg writes to stdout.

    Args:
        abspath: Path of the input video file.
        t_beg: Segment start, in seconds.
        t_end: Segment end, in seconds.

    Returns:
        bytes: The encoded MP4 stream.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status.
    """
    # An argv list (no shell) keeps paths with spaces or shell metacharacters
    # intact and avoids command injection through `abspath`.
    cmd = [
        'ffmpeg', '-y',
        '-ss', str(t_beg),
        '-i', str(abspath),
        '-max_muxing_queue_size', '9999',
        '-loglevel', 'error',
        '-f', 'mp4',
        '-vf', 'scale=(floor(112/ih * iw/2))*2:112',
        '-c:a', 'copy',
        # Fragmented MP4 is required because stdout is not seekable.
        '-movflags', 'frag_keyframe+empty_moov',
        '-t', str(t_end - t_beg),
        'pipe:1',
        '-nostats', '-hide_banner', '-nostdin',
    ]
    p = subprocess.run(cmd, stdout=subprocess.PIPE)
    # Explicit raise instead of `assert`, which is stripped under `python -O`.
    if p.returncode != 0:
        raise RuntimeError(f'ffmpeg exited with code {p.returncode}: {" ".join(cmd)}')
    return p.stdout

# Monotonic counter used to give every displayed <video> element a unique DOM id.
video_id_cnt = 0
class VideoElem:
    """Inline, looping, muted preview of a video segment in the notebook output.

    Constructing an instance transcodes ``[t_start, t_end]`` of ``fname`` with
    ``read_video_segm``, embeds the result as a base64 data URI inside a
    ``<video>`` tag and displays it immediately. ``hide``, ``show`` and
    ``remove`` act on that element through small JavaScript snippets.

    Args:
        fname: Path of the video file.
        t_start: Segment start in seconds.
        t_end: Segment end in seconds.

    Raises:
        OSError: If ``fname`` cannot be opened for reading.
    """

    def __init__(self, fname, t_start=0, t_end=999):
        global video_id_cnt
        # Fail fast with a clear OSError (e.g. FileNotFoundError) before
        # spawning ffmpeg; the handle is closed again right away because the
        # bytes are read by ffmpeg, not by this process.
        with open(fname, 'rb'):
            pass
        buf = read_video_segm(fname, t_start, t_end)
        data = base64.standard_b64encode(buf)
        video_id_cnt += 1
        self.video_id_cnt = video_id_cnt
        elem = HTML(f"""
            <video id="video_{self.video_id_cnt}" autoplay loop muted>
                <source src="data:video/mp4;base64,{data.decode('ascii')}" type="video/mp4">
            </video>
        """)
        display(elem)

    def _run_js(self, statement):
        """Run ``statement`` in the browser with ``el`` bound to this video element.

        Uses the plain DOM API instead of jQuery's ``$`` so it also works in
        front-ends that do not expose jQuery; a missing element is a no-op.
        """
        js = (
            '(function() {'
            f' var el = document.getElementById("video_{self.video_id_cnt}");'
            f' if (el) {{ {statement} }}'
            ' })();'
        )
        display(Javascript(js))

    def hide(self):
        """Hide the video element without removing it from the page."""
        self._run_js('el.style.display = "none";')

    def show(self):
        """Make a previously hidden video element visible again."""
        self._run_js('el.style.display = "";')

    def remove(self):
        """Delete the video element from the page."""
        self._run_js('el.remove();')

# Audio utils

In [5]:
class NoAudio(Exception):
    """Signals that no audio could be read from the input (no audio channel)."""

def unpack_wav(wav):
    """Decode WAV bytes into VGGish input examples plus their time windows.

    Args:
        wav: Raw bytes of a WAV file.

    Returns:
        (timings, segms): ``timings`` is a float32 array of shape (nsegm, 2)
        holding ``[start, end]`` seconds for each example, laid out as
        consecutive 0.96 s windows starting at 0; ``segms`` is whatever
        ``vggish_input.waveform_to_examples`` returns for the waveform.
    """
    sample_rate, samples = scio.read(io.BytesIO(wav))
    # Scale by 2**15 — assumes 16-bit PCM samples; TODO confirm against the
    # format produced by the audio reader.
    waveform = samples / 32768.0
    segms = vggish_input.waveform_to_examples(waveform, sample_rate)

    window = 0.96
    starts = np.arange(len(segms), dtype=np.float32) * window
    ends = starts + window
    timings = np.concatenate([starts[:, None], ends[:, None]], axis=1)  # (nsegm, 2)
    return timings, segms


def vggish_compute_embs(model, path, t_start, t_end, batch_size=32):
    """Compute audio embeddings for a segment of a media file.

    Args:
        model: Callable mapping a batch of audio examples to an array of
            embeddings with the batch on axis 0.
        path: Path of the media file.
        t_start: Segment start, forwarded to ``ffmpeg_audio_reader``.
        t_end: Segment end, forwarded to ``ffmpeg_audio_reader``.
        batch_size: Number of examples passed to ``model`` per call.

    Returns:
        (timings, embs): per-example ``[start, end]`` times of shape
        (nsegm, 2) and the embeddings concatenated along axis 0.

    Raises:
        NoAudio: If the file has no audio channel, or the audio is too short
            to produce a single example.
    """
    wav = ffmpeg_audio_reader(path, t_start, t_end)
    if wav is None:
        # no audio channel
        raise NoAudio
    timings, segms = unpack_wav(wav)
    if len(segms) == 0:
        # Nothing to embed: without this guard np.concatenate([]) below would
        # raise an opaque ValueError. Callers already handle NoAudio.
        raise NoAudio

    embs = [
        model(segms[idx: idx + batch_size])
        for idx in range(0, len(segms), batch_size)
    ]
    return timings, np.concatenate(embs, axis=0)


# Video utils

In [6]:
def visual_compute_embs(
    model,
    path,
    t_start,
    t_end,
    fps=32,
    frame_size=224,
    frame_crop_size=224,
    per_batch_size=4,
    frames_per_clip=32):
    """Compute visual embeddings for consecutive clips of a video.

    Frames are read in batches of ``per_batch_size * frames_per_clip`` via
    ``read_frames_center_crop_batch``, grouped into clips of
    ``frames_per_clip`` frames and passed to ``model``.

    Args:
        model: Callable taking an array of shape
            (nclips, frames_per_clip, frame_crop_size, frame_crop_size, 3)
            and returning an array of embeddings with clips on axis 0.
        path: Path of the video file.
        t_start: Segment start, forwarded to the frame reader.
        t_end: Segment end, forwarded to the frame reader.
        fps: Frame rate requested from the reader.
        frame_size: Frame size forwarded to the reader.
        frame_crop_size: Side length of the cropped frames.
        per_batch_size: Number of clips per model call.
        frames_per_clip: Number of frames per clip.

    Returns:
        (timings, embs): ``timings`` has shape (nsegm, 2) with ``[start, end]``
        seconds per clip, assuming back-to-back clips of
        ``frames_per_clip / fps`` seconds starting at 0; ``embs`` is the
        concatenation of the model outputs along axis 0.

    Raises:
        ValueError: If the reader yields no frames at all.
    """
    frames_batch_iter = read_frames_center_crop_batch(
        path,
        fps=fps,
        frame_size=frame_size,
        frame_crop_size=frame_crop_size,
        batch_num_frames=per_batch_size*frames_per_clip,
        t_start=t_start,
        t_end=t_end)
    embs = []
    timings = []
    t = 0
    delta = frames_per_clip / fps
    for frames in frames_batch_iter:
        n = len(frames)
        n_full = n // frames_per_clip * frames_per_clip
        if n_full < n:
            # The tail is shorter than one clip: resample it (with repeats) to
            # exactly frames_per_clip frames, i.e. increase the frame rate in
            # the last video segment.
            # np.int64 replaces np.long, which was removed in NumPy 1.24.
            idxs = np.ceil(np.linspace(n_full, n - 1, frames_per_clip)).astype(np.int64)
            frames = np.concatenate([frames[:n_full], frames[idxs]], axis=0)
        batch_frames = frames.reshape(-1, frames_per_clip, frame_crop_size, frame_crop_size, 3)
        for _ in range(len(batch_frames)):
            timings.append((t, t + delta))
            t += delta
        embs.append(model(batch_frames))
    if not embs:
        # Same exception type np.concatenate([]) would raise, but actionable.
        raise ValueError(f'no frames could be read from {path!r}')
    embs = np.concatenate(embs, axis=0)
    timings = np.array(timings) # (nsegm, 2)
    return timings, embs

In [50]:
def prepare_features(features, features_t):
    """Pack per-expert features into fixed-size, zero-padded torch tensors.

    Reads the module-level ``experts_info`` mapping for each expert's
    ``max_tok`` and ``dim``.

    Args:
        features: Maps expert name to an array of token features with tokens
            on axis 0, or ``None`` when the expert has no data.
        features_t: Maps expert name to per-token ``[start, end]`` times
            (ntok, 2), or ``None``.

    Returns:
        (all_features, all_features_t, all_features_mask): dicts keyed by every
        expert in ``experts_info`` holding tensors of shape
        (1, max_tok, dim), (1, max_tok) and (1, max_tok). Slots beyond the
        available tokens, and experts without data, stay zero.
    """
    packed_feat, packed_time, packed_mask = {}, {}, {}

    # Every known expert starts as an all-zero, fully masked slot.
    for name, info in experts_info.items():
        n_slots = info["max_tok"]
        width = info["dim"]
        packed_feat[name] = torch.zeros(1, n_slots, width)
        packed_time[name] = torch.zeros(1, n_slots)
        packed_mask[name] = torch.zeros(1, n_slots)

    for name, feat in features.items():
        n_slots = experts_info[name]["max_tok"]
        times = np.array(features_t[name])
        if feat is None:
            continue
        n_tok = len(feat)
        assert n_tok == len(times), (n_tok, len(times))
        if np.isnan(times.sum()):
            # Unknown timings: every token gets the constant time id 1.
            tok_time = np.zeros(len(times))
            tok_time += 1
        else:
            # Shift so the earliest start is 0, then use the window midpoint
            # offset by 2.
            shifted = times - times[:, 0].min()
            tok_time = 2 + (shifted[:, 1] + shifted[:, 0]) / 2 # (ntok,)
        # The destination slice is clipped to max_tok by tensor slicing, which
        # matches the explicit [:max_tok] truncation of the source.
        packed_feat[name][0, :n_tok] = torch.from_numpy(feat[:n_slots].copy())
        packed_time[name][0, :n_tok] = torch.from_numpy(tok_time[:n_slots].copy())
        packed_mask[name][0, :n_tok] = 1
    return packed_feat, packed_time, packed_mask

def dict_to_cuda(d):
    """Return a new dict with the same keys and each value replaced by ``value.cuda()``."""
    moved = {}
    for key, value in d.items():
        moved[key] = value.cuda()
    return moved


def encode_video(vggish_model, vmz_model, clip_model, model_vid, path, t_start=None, t_end=None):
    """Embed a video (segment) with the audio, motion and image experts.

    Args:
        vggish_model: Audio expert passed to ``vggish_compute_embs``.
        vmz_model: Video expert, run on 32-frame clips at 24 fps.
        clip_model: Image expert, run on single frames at 24 fps.
        model_vid: Aggregator called with the packed features, times and masks.
        path: Path of the video file.
        t_start: Optional segment start forwarded to the extractors.
        t_end: Optional segment end forwarded to the extractors.

    Returns:
        The first element of the aggregator output (the embedding of this
        single video).
    """
    # Audio is optional: a file without audio leaves the audio expert empty.
    try:
        timings_vggish, embs_vggish = vggish_compute_embs(vggish_model, path, t_start, t_end)
    except NoAudio:
        timings_vggish = embs_vggish = None

    # Both visual experts share the sampling rate and frame geometry.
    visual_kwargs = dict(fps=24, frame_crop_size=224, frame_size=224)
    timings_vmz, embs_vmz = visual_compute_embs(
        vmz_model, path, t_start, t_end, frames_per_clip=32, **visual_kwargs)
    timings_clip, embs_clip = visual_compute_embs(
        clip_model, path, t_start, t_end, frames_per_clip=1, **visual_kwargs)

    expert_names = ('VIDEO', 'CLIP', 'tf_vggish')
    features = dict(zip(expert_names, (embs_vmz, embs_clip, embs_vggish)))
    features_t = dict(zip(expert_names, (timings_vmz, timings_clip, timings_vggish)))

    packed = prepare_features(features, features_t)
    all_features, all_features_t, all_features_mask = (dict_to_cuda(part) for part in packed)

    out = model_vid(all_features, all_features_t, all_features_mask) # (1, 512*3)
    return out[0]

def encode_text(text):
    """Embed one caption with the module-level ``model_txt``.

    The text is wrapped in a single-element batch and the first row of the
    result is returned.
    """
    return model_txt([text])[0]

def batch_encode_text(texts):
    """Embed a list of captions in one call to the module-level ``model_txt``."""
    return model_txt(texts)

def sim(x1, x2):
    """Dot-product similarity: the sum of the element-wise product of x1 and x2."""
    elementwise = x1 * x2
    return elementwise.sum()

# Models

In [51]:
# Instantiate the three feature extractors that encode_video() takes as its
# audio, video and image models. Checkpoint paths are relative to the
# notebook's working directory.
vggish_model = VGGish(ckpt_path='ckpts/vggish_model.ckpt', per_batch_size=32)
vmz_model = VMZ_irCSN_152('ckpts/irCSN_152_ig65m_from_scratch_f125286141.pth')
clip_model = CLIP()

INFO:tensorflow:Restoring parameters from ckpts/vggish_model.ckpt


In [52]:
from transformers import AutoModel, AutoTokenizer 


# Hugging Face model id of the text backbone.
model_name = "bert-base-cased" 


# Load (downloading if necessary) the PyTorch weights and the tokenizer.
# NOTE(review): the globals `model` and `tokenizer` are not referenced by any
# other cell visible in this notebook — presumably this cell only pre-fetches
# the weights for BertTXT; confirm before removing.
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)



Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [53]:

# Load the checkpoint on CPU; it holds both 'vid_state_dict' and
# 'txt_state_dict', which are loaded into the two models below.
# NOTE(review): torch.load unpickles the file — only load trusted checkpoints.
# NOTE(review): absolute, user-specific path; adjust for your environment.
state = torch.load('/home/dimas/mdmmt_test-master/checkpoint/mdmmt_3mod.pth', map_location='cpu')

# Per-expert configuration. prepare_features() reads this module-level dict:
# `dim` is the feature width and `max_tok` the number of token slots per expert.
experts_info = {
    'VIDEO': dict(dim=2048, idx=1, max_tok=30),
    'CLIP': dict(dim=512, idx=2, max_tok=30),
    'tf_vggish': dict(dim=128, idx=3, max_tok=30),
}
# Configuration forwarded to BertVID for the video-side aggregator.
vid_bert_params = {
    'vocab_size_or_config_json_file': 10,
    'hidden_size': 512,
    'num_hidden_layers': 9,
    'intermediate_size': 3072,
    'hidden_act': 'gelu',
    'hidden_dropout_prob': 0.2,
    'attention_probs_dropout_prob': 0.2,
    'max_position_embeddings': 32,
    'type_vocab_size': 19,
    'initializer_range': 0.02,
    'layer_norm_eps': 1e-12,
    'num_attention_heads': 8,
}
# Video aggregator: build, switch to eval mode, load weights, move to the GPU.
model_vid = BertVID(expert_dims=experts_info, vid_bert_params=vid_bert_params)
model_vid = model_vid.eval()
model_vid.load_state_dict(state['vid_state_dict'])
model_vid = model_vid.cuda()

# Text encoder; encode_text() and batch_encode_text() use it as the
# module-level `model_txt`.
txt_bert_params = {
    'hidden_dropout_prob': 0.2,
    'attention_probs_dropout_prob': 0.2,
}
model_txt = BertTXT(
    modalities=list(experts_info.keys()),
    add_special_tokens=True,
    txt_bert_params=txt_bert_params,
)
model_txt = model_txt.eval()
model_txt.load_state_dict(state['txt_state_dict'])
model_txt = model_txt.cuda()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Compute scores

In [65]:
# Demo clip and the segment (in seconds) that is previewed here and encoded
# in the next cell.
# NOTE(review): absolute, user-specific paths; adjust for your environment.
path = '/home/dimas/mdmmt_test-master/1010_TITANIC_00_41_32_072-00_41_40_196.mp4'
#path = '/home/wx587276/SML_office_videos/IMG_1497.MOV'
#path = '/home/wx587276/SML_office_videos/IMG_1498.MOV'
t_start=0
t_end=2
# Constructing VideoElem displays the segment as a side effect; as the last
# expression of the cell it also echoes the object's repr.
VideoElem(path, t_start, t_end)

<__main__.VideoElem at 0x7f77900dea50>

In [66]:
# Embed the demo segment; `vemb` is scored against captions in the next cell.
vemb = encode_video(
    vggish_model, # audio modality
    vmz_model, # video modality
    clip_model, # image modality
    model_vid, # aggregator
    path, t_start, t_end)

In [68]:
# Candidate captions to score against the demo clip embedding `vemb`.
texts = [
    'actor stands closely behind a red haired woman',
    'this scene was filmed on a cathedral balcony',
    'a man in t-shirt sits near the computer',
    'a man in shirt sits near the computer',
    'a man in a shirt sits in front of a computer',
    'a man in a t-shirt sits in front of a computer',
    'woman is resting her hand on the ship rail and conversing with someone',
    'a women is standing near men on the boat',
    'woman is dressed in finery and hangs on the edge of the boat looking sad',
    'a man is walking',
    'this scene is on a large boat',
    'man in red jacket',
    'a man walks by a chair',
    'A man is jumping near the chair',
    'A man is jumping',
    'A man walks',
    'this scene was filmed on a ship deck',
    'the man jumps'
]
# One embedding row per caption; the matrix-vector product gives one
# dot-product score per caption.
tembs = batch_encode_text(texts)
scores = tembs @ vemb
for txt, score in zip(texts, scores):
    print(f'{score.item()} {txt}')

0.22126026451587677 actor stands closely behind a red haired woman
0.09978814423084259 this scene was filmed on a cathedral balcony
-0.11414779722690582 a man in t-shirt sits near the computer
-0.11777155846357346 a man in shirt sits near the computer
-0.11854524910449982 a man in a shirt sits in front of a computer
-0.11945630609989166 a man in a t-shirt sits in front of a computer
0.2930251657962799 woman is resting her hand on the ship rail and conversing with someone
0.3165666162967682 a women is standing near men on the boat
0.287163645029068 woman is dressed in finery and hangs on the edge of the boat looking sad
0.022646740078926086 a man is walking
0.23212887346744537 this scene is on a large boat
0.03144485503435135 man in red jacket
-0.028973018750548363 a man walks by a chair
-0.03816576302051544 A man is jumping near the chair
-0.04523948207497597 A man is jumping
0.10681651532649994 A man walks
0.304289847612381 this scene was filmed on a ship deck
0.1555849313735962 the man jumps

# MSRVTT 1kA test

In [135]:
import json
import os  # needed for os.path below; it was never imported anywhere in the notebook
import pickle
from collections import defaultdict

# For each test video, the index of the caption used for evaluation.
# NOTE(security): pickle.load can execute arbitrary code — only open trusted files.
with open('/home/wx587276/level2/rb_mixin_clip/msrvtt/symlinked-feats/jsfusion_val_caption_idx.pkl', 'rb') as f:
    vid_2_capidx = pickle.load(f)

with open('/ssd/ssd_srv79/datasets/MSR_VTT/msrvtt/test_videodatainfo.json') as f:
    j = json.load(f)

# Group all captions by video id, preserving their order in the JSON file.
video_2_texts = defaultdict(list)
for x in j['sentences']:
    video_2_texts[x['video_id']].append(x['caption'])

# Keep exactly one caption per evaluated video.
video_2_text = {}
for vid, cap_idx in vid_2_capidx.items():
    video_2_text[vid] = video_2_texts[vid][cap_idx]

# Map each video file path to its caption, checking the file is present.
root = '/ssd/ssd_srv79/datasets/MSR_VTT/msrvtt/AllVideo/'
path_2_text = {}
for vid, caption in video_2_text.items():
    path = os.path.join(root, f'{vid}.mp4')
    assert os.path.exists(path), f'missing video file: {path}'
    path_2_text[path] = caption

paths = list(path_2_text.keys())

## prepare video embs

In [164]:
# Encode every test video over its full duration (no t_start / t_end) and keep
# the embeddings on the CPU.
vembs = []
for path in tqdm(paths):
    vemb = encode_video(
        vggish_model, # audio modality
        vmz_model, # video modality
        clip_model, # image modality
        model_vid, # aggregator
        path)
    vembs.append(vemb.cpu())
# Add a leading axis to each embedding and join them into one matrix.
vembs = torch.cat([emb[None, :] for emb in vembs], dim=0) # (nvid, dim)

100%|██████████| 1000/1000 [2:16:16<00:00,  8.18s/it] 


In [149]:
# Keep a second reference (not a copy) to the current video embedding matrix.
# NOTE(review): this cell's execution count (149) is lower than that of the
# cell that builds `vembs` (164), so it captured an earlier run.
vembs_old = vembs

## prepare text embs

In [137]:
# Encode the caption of every test video, in the same order as `paths`, so
# that row i of `tembs` is the caption belonging to video i.
tembs = []
for path in tqdm(paths):
    text = path_2_text[path]
    temb = encode_text(text).cpu()
    tembs.append(temb)
# Add a leading axis to each embedding and join them into one matrix.
tembs = torch.cat([emb[None, :] for emb in tembs], dim=0) # (ntxt, dim)

100%|██████████| 1000/1000 [00:12<00:00, 78.97it/s]


## compute results

In [165]:
# Text-to-video similarity matrix: row i holds the scores of caption i against
# every video; the matching video sits on the diagonal.
S = tembs @ vembs.t()
pos = S.diag()
pos_text2video = pos[:, None]
# Rank of the matching video = number of videos scoring at least as high
# (ties count against the match).
ranks = (S >= pos_text2video).sum(dim=1) # (ntxt,)

ntxt = len(ranks)
hits_at_1 = (ranks <= 1).sum()
hits_at_5 = (ranks <= 5).sum()
R1 = (100 * hits_at_1 / ntxt).item()
R5 = (100 * hits_at_5 / ntxt).item()
print(f'R@1={R1} R@5={R5}')

R@1=38.0 R@5=67.5999984741211


In [160]:
import numpy as np

# Read the first 13 float32 CLIP embeddings (512 values each) from the
# previously dumped feature file.
# NOTE(review): presumably these rows belong to video10 — confirm against the
# dump's index.
n_old_frames = 13
clip_dim = 512
with open('/ssd/ssd_srv79/dumps/msrvtt/CLIP/msrvtt-0.emb', 'rb') as f:
    raw = f.read(np.dtype(np.float32).itemsize * clip_dim * n_old_frames)
# np.frombuffer over bytes yields a read-only array; copy it so the tensor
# created by torch.from_numpy is backed by writable memory.
clip_embs_old = torch.from_numpy(
    np.frombuffer(raw, dtype=np.float32).reshape(n_old_frames, clip_dim).copy())

In [154]:
# Video whose freshly computed CLIP embeddings are compared with the old dump.
# This rebinds the global `path` used by earlier cells.
path = '/ssd/ssd_srv79/datasets/MSR_VTT/msrvtt/AllVideo/video10.mp4'

In [158]:
# Recompute CLIP embeddings for the whole video, one frame per clip.
# Note the settings differ from encode_video(): fps=1 and 228-pixel frames here
# versus fps=24 and 224 there.
# NOTE(review): presumably chosen to match how the old dump was produced — confirm.
timings_clip, embs_clip = visual_compute_embs(clip_model, path, t_start=None, t_end=None,
                                            fps=1, frames_per_clip=1, frame_crop_size=228, frame_size=228)

In [163]:
# Row-wise cosine similarity between the recomputed and the dumped CLIP
# embeddings: dot product divided by both L2 norms.
embs_clip_t = torch.from_numpy(embs_clip)
(embs_clip_t * clip_embs_old).sum(dim=1) / embs_clip_t.norm(dim=1) / clip_embs_old.norm(dim=1)

tensor([0.9719, 0.9750, 0.9719, 0.9708, 0.9534, 0.9640, 0.9745, 0.9689, 0.9735,
        0.9607, 0.9747, 0.9678, 0.9627])