# Tests

This notebook is concerned with the results from the paper *A Straightforward Framework For Video Retrieval Using CLIP* [1].

Recall metrics are calculated using a library provided by the author. Results are stored in `test.csv`.

In [1]:
# Modules
from pathlib import Path

import pandas as pd
import torch

# MSR-VTT
Split:
* Validation: 1000 videos from val set for single frame, average frame and $k$-means experiments.
  * 30th frame as video representation.
  * $k$-means for video representation.
* Full Test: 2990 videos, 20 captions each.
  * Average frame video representation.
* JSFusion Test: 1000 sampled video-text pairs.
  * Average frame video representation.


In [2]:
# Download the precomputed CLIP feature files from Google Drive (addressed by
# file id) and save each one under the name given to -O.
# MSR-VTT test set: visual and text features.
!gdown --id 1Gp3_I_OvcKwjOQmn334-T4wfwQk29TCp -O "MSRVTT_test_dict_CLIP_visual.pt"
!gdown --id 1-3tpfZzo1_D18WdrioQzc-iogEl-KSnA -O "MSRVTT_test_dict_CLIP_text.pt"
# K-means centers, plus the text and visual features of the 1000 validation videos.
!gdown --id 1-7_zAogZjLLoaUvZa9i0OotvK81BDVDS -O "MSRVTT_train_dict_Kmeans_centers_CLIP_video.pt"
!gdown --id 1-3PBg8qnLxe7AH008fl_WqfnPHi5Ror2 -O "MSRVTT_val_1000_dict_sentence_0_CLIP_text.pt"
!gdown --id 1AiptvnIiObxGd_K9zDoOPvMjbJ1p2Rzr -O "MSRVTT_val_1000_dict_CLIP_visual.pt"
# Text features used in the "JS Test Set" section.
!gdown --id 15mvFQxrWLNvBvFg4_9rr_Kqyzsy9dudj -O "JS_test_dict_CLIP_text.pt"
# MSVD test set: visual and text features.
!gdown --id 1DCPlt4zHJhatd3E2_9SW9ipeMKzy5Z0v -O "MSVD_test_dict_CLIP_visual.pt"
!gdown --id 1ZkyeC1spejKXo8eAG3cCa2_RGEJg2MSV -O "MSVD_test_dict_CLIP_text.pt"

Downloading...
From: https://drive.google.com/uc?id=1Gp3_I_OvcKwjOQmn334-T4wfwQk29TCp
To: /content/MSRVTT_test_dict_CLIP_visual.pt
2.53GB [00:15, 162MB/s]
Downloading...
From: https://drive.google.com/uc?id=1-3tpfZzo1_D18WdrioQzc-iogEl-KSnA
To: /content/MSRVTT_test_dict_CLIP_text.pt
123MB [00:00, 151MB/s]
Downloading...
From: https://drive.google.com/uc?id=1-7_zAogZjLLoaUvZa9i0OotvK81BDVDS
To: /content/MSRVTT_train_dict_Kmeans_centers_CLIP_video.pt
116MB [00:01, 99.0MB/s] 
Downloading...
From: https://drive.google.com/uc?id=1-3PBg8qnLxe7AH008fl_WqfnPHi5Ror2
To: /content/MSRVTT_val_1000_dict_sentence_0_CLIP_text.pt
41.3MB [00:00, 113MB/s] 
Downloading...
From: https://drive.google.com/uc?id=1AiptvnIiObxGd_K9zDoOPvMjbJ1p2Rzr
To: /content/MSRVTT_val_1000_dict_CLIP_visual.pt
819MB [00:08, 98.8MB/s]
Downloading...
From: https://drive.google.com/uc?id=15mvFQxrWLNvBvFg4_9rr_Kqyzsy9dudj
To: /content/JS_test_dict_CLIP_text.pt
2.17MB [00:00, 68.1MB/s]
Downloading...
From: https://drive.google.co

In [3]:
# Fetch the repository from which the `metrics` helpers are imported below.
!git clone https://github.com/Deferf/Experiments

Cloning into 'Experiments'...
remote: Enumerating objects: 140, done.[K
remote: Counting objects: 100% (140/140), done.[K
remote: Compressing objects: 100% (92/92), done.[K
remote: Total 140 (delta 84), reused 101 (delta 45), pack-reused 0[K
Receiving objects: 100% (140/140), 22.34 KiB | 7.45 MiB/s, done.
Resolving deltas: 100% (84/84), done.


In [12]:
%cd Experiments
from metrics import rank_at_k_precomputed,stack_encoded_dict,generate_sim_tensor,tensor_video_to_text_sim,tensor_text_to_video_metrics,normalize_matrix
%cd "/content"

/content/Experiments
/content


In [13]:
# Directory prefix of each dataset's feature files; all three point at the
# same working directory.
MSR_path = MSVD_path = LSMDC_path = "/content/"

In [14]:
# Load the MSR-VTT test visual/text feature dictionaries and the k-means
# centers. Every load is mapped to CPU, matching the other torch.load calls in
# this notebook: the test video features are later multiplied with CPU-mapped
# text features, so all tensors must live on the same device, and the files
# must stay loadable on a runtime without a GPU.
MSR_test_video_dict = torch.load(MSR_path + "MSRVTT_test_dict_CLIP_visual.pt", map_location = "cpu")
MSR_test_text_dict = torch.load(MSR_path + "MSRVTT_test_dict_CLIP_text.pt", map_location = "cpu")
MSR_cluster_centers_dict = torch.load(MSR_path + "MSRVTT_train_dict_Kmeans_centers_CLIP_video.pt", map_location = "cpu")

## Preliminary Experiments

In [15]:
# Load the text features of the 1000 validation videos, mapped to CPU
# (the file name suggests one sentence, "sentence_0", per video — TODO confirm).
val_1000_text_dict = torch.load(MSR_path + "MSRVTT_val_1000_dict_sentence_0_CLIP_text.pt", map_location = "cpu")

### Single frame

In [16]:
# Fix one key order (taken from the entry stored under key 1) so that the text
# and video features are stacked row-for-row in the same order below.
val_order = list(MSR_cluster_centers_dict[1].keys())

In [17]:
# Load the visual features of the 1000 validation videos, mapped to CPU.
val_1000_videos_dict = torch.load(MSR_path + "MSRVTT_val_1000_dict_CLIP_visual.pt", map_location = "cpu")

In [18]:
# Zero-based index of the sampled frame: the 30th frame of each video.
FRAME = 30 - 1

In [19]:
# Take the entry at index FRAME from every validation video's features and
# stack them in val_order.
val_1000_videos_30th = torch.stack([val_1000_videos_dict[key][FRAME] for key in val_order])

In [20]:
# Stack the text features in the same val_order as the video features.
val_1000_text = torch.stack([val_1000_text_dict[key] for key in val_order])

In [21]:
# Compute the metrics from the text-by-video similarity matrix
# (rows follow the sentences, columns follow the videos).
MSR_val_metrics_ttv_30th = rank_at_k_precomputed(val_1000_text @ val_1000_videos_30th.T)

In [22]:
# Label the single-frame run and append it to test.csv as one row.
MSR_val_metrics_ttv_30th["description"] = [f"MSR-VTT Validation 1000 x (Val Set[{FRAME + 1} frame], First sentence)"]
MSR_val_metrics_ttv_30th["task"] = ["Text-to-video"]
MSR_val_metrics_ttv_30th["dataset"] = ["MSR-VTT"]
MSR_val_metrics_ttv_30th = pd.DataFrame(MSR_val_metrics_ttv_30th)
# This is the first write to test.csv, so it carries the column header.
# Because the file is opened in append mode, emit the header only when the
# file does not exist yet; otherwise a re-run would insert a second header row
# in the middle of the results.
MSR_val_metrics_ttv_30th.to_csv("test.csv", mode='a', header=not Path("test.csv").exists())

### K-Means

In [23]:
# Repeat the text-to-video evaluation for each entry of MSR_cluster_centers_dict;
# the key is reported as the number of centroids in the description.
for center in MSR_cluster_centers_dict:
  # Stack this entry's per-video features in the shared val_order.
  video_features = torch.stack([MSR_cluster_centers_dict[center][key] for key in val_order])
  if center > 1:
    # NOTE(review): presumably video_features is (videos, centroids, dim) here, so the
    # product is (centroids, sentences, videos) — confirm against the saved file.
    similarity_map = (val_1000_text @ video_features.permute(1,2,0))
    # For every sentence/video pair keep the highest similarity along dim 0.
    similarity_map, _ = torch.max(similarity_map, dim = 0)
  else:
    similarity_map = (val_1000_text @ video_features.T)
  MSR_val_metrics_ttv = rank_at_k_precomputed(similarity_map)
  MSR_val_metrics_ttv["description"] = ["MSR-VTT Validation 1000 x (Val Set[" + str(center) + " centroids], First sentence)"]
  MSR_val_metrics_ttv["task"] = ["Text-to-video"]
  MSR_val_metrics_ttv["dataset"] = ["MSR-VTT"]
  MSR_val_metrics_ttv = pd.DataFrame(MSR_val_metrics_ttv)
  # Append one row per entry; the header was written by the single-frame cell.
  MSR_val_metrics_ttv.to_csv("test.csv", mode='a', header=False)

### Full Test Set


In [24]:
# Build the similarity tensor between the test text and test video features,
# iterating in the key order of the text dictionary.
MSR_full_test_sim_tensor = generate_sim_tensor(MSR_test_text_dict, MSR_test_video_dict, MSR_test_text_dict.keys())

In [25]:
# Display the dimensions of the similarity tensor as a sanity check.
MSR_full_test_sim_tensor.shape

torch.Size([2990, 20, 2990])

Video to text retrieval

In [26]:
# Derive the video-to-text similarity matrix from the similarity tensor.
MSR_video_text_sim = tensor_video_to_text_sim(MSR_full_test_sim_tensor)

In [27]:
# Display the dimensions of the video-to-text similarity matrix.
MSR_video_text_sim.shape

torch.Size([2990, 2990])

In [28]:
# Video-to-text metrics on the full MSR-VTT test set: compute, label, show,
# and append one row to test.csv.
MSR_full_metrics_vtt = rank_at_k_precomputed(MSR_video_text_sim)
MSR_full_metrics_vtt.update({
    "description": ["MSR-VTT Test 2990 x (Test Set[mean frame], Corresponding sentences)"],
    "task": ["Video-to-text"],
    "dataset": ["MSR-VTT"],
})
MSR_full_metrics_vtt = pd.DataFrame(MSR_full_metrics_vtt)
print(MSR_full_metrics_vtt)
MSR_full_metrics_vtt.to_csv("test.csv", header=False, mode="a")

         R@1        R@5  ...           task  dataset
0  40.301003  69.732445  ...  Video-to-text  MSR-VTT

[1 rows x 9 columns]


Text to video retrieval

In [29]:
# Text-to-video metrics on the full MSR-VTT test set; the ranks returned
# alongside the metrics are kept in MSR_diagonal.
MSR_full_metrics_ttv, MSR_diagonal = tensor_text_to_video_metrics(
    MSR_full_test_sim_tensor,
    return_ranks=True,
)
MSR_full_metrics_ttv.update({
    "description": ["MSR-VTT Test 2990 x (Test Set[mean frame], Corresponding sentences)"],
    "task": ["Text-to-video"],
    "dataset": ["MSR-VTT"],
})
MSR_full_metrics_ttv = pd.DataFrame(MSR_full_metrics_ttv)
print(MSR_full_metrics_ttv)
MSR_full_metrics_ttv.to_csv("test.csv", header=False, mode="a")

         R@1        R@5  ...           task  dataset
0  21.367893  41.138798  ...  Text-to-video  MSR-VTT

[1 rows x 9 columns]


### JS Test Set

In [30]:
# Load the text features for the "JS Test Set" evaluation, mapped to CPU.
JS_text_encoded_dict = torch.load(MSR_path + "JS_test_dict_CLIP_text.pt", map_location=torch.device('cpu'))

In [31]:
# For each key of the JS text dictionary, take the matching test video
# features, average them over dim 0 and pass the mean through normalize_matrix.
# The second return value of stack_encoded_dict is discarded.
JS_video_encoded_mean, _ = stack_encoded_dict(MSR_test_video_dict, JS_text_encoded_dict.keys(), lambda x : normalize_matrix(torch.mean(x, dim = 0, keepdim = True)))

In [32]:
# Stack the text features in the dictionary's own key order, i.e. the same
# order used for the video features above.
JS_text_encoded = torch.stack([JS_text_encoded_dict[key] for key in JS_text_encoded_dict])

In [33]:
# Display both shapes to check that text and video matrices line up.
JS_text_encoded.shape, JS_video_encoded_mean.shape

(torch.Size([1000, 512]), torch.Size([1000, 512]))

In [34]:
# Video-to-text metrics on the JS split: the similarity matrix has videos as
# rows and sentences as columns.
JS_metrics_vtt = rank_at_k_precomputed(JS_video_encoded_mean @ JS_text_encoded.T)
JS_metrics_vtt.update({
    "description": ["MSR-VTT Test 1000 x (JS_Fusion_Split[mean frame], sampled sentence)"],
    "task": ["Video-to-text"],
    "dataset": ["MSR-VTT"],
})
JS_metrics_vtt = pd.DataFrame(JS_metrics_vtt)
print(JS_metrics_vtt)
JS_metrics_vtt.to_csv("test.csv", header=False, mode="a")

         R@1        R@5  ...           task  dataset
0  27.200001  51.700001  ...  Video-to-text  MSR-VTT

[1 rows x 9 columns]


In [35]:
# Text-to-video metrics on the JS split: sentences are rows, videos are
# columns. The second value returned with diag=True is kept in JS_diagonal_ttv.
JS_metrics_ttv, JS_diagonal_ttv = rank_at_k_precomputed(
    JS_text_encoded @ JS_video_encoded_mean.T,
    diag=True,
)
JS_metrics_ttv.update({
    "description": ["MSR-VTT Test 1000 x (JS_Fusion_Split[mean frame], sampled sentence)"],
    "task": ["Text-to-video"],
    "dataset": ["MSR-VTT"],
})
JS_metrics_ttv = pd.DataFrame(JS_metrics_ttv)
print(JS_metrics_ttv)
JS_metrics_ttv.to_csv("test.csv", header=False, mode="a")

         R@1        R@5  ...           task  dataset
0  31.200001  53.700001  ...  Text-to-video  MSR-VTT

[1 rows x 9 columns]


# MSVD

In [36]:
# Load the MSVD test visual features, mapped to CPU.
multiple_video_dict = torch.load(MSVD_path + "MSVD_test_dict_CLIP_visual.pt", map_location=torch.device('cpu'))

In [37]:
# Load the MSVD test text features, mapped to CPU.
test_text_dict = torch.load(MSVD_path + "MSVD_test_dict_CLIP_text.pt", map_location=torch.device('cpu'))

In [38]:
# Build the similarity tensor between MSVD test text and video features,
# iterating in the key order of the text dictionary.
MSVD_test_sim_tensor = generate_sim_tensor(test_text_dict, multiple_video_dict, test_text_dict.keys())

In [39]:
# Display the dimensions of the MSVD similarity tensor as a sanity check.
MSVD_test_sim_tensor.shape

torch.Size([670, 81, 670])

Video to text retrieval

In [40]:
# Derive the video-to-text similarity matrix from the MSVD similarity tensor.
MSVD_video_text_sim = tensor_video_to_text_sim(MSVD_test_sim_tensor)

In [41]:
# Display the dimensions of the MSVD video-to-text similarity matrix.
MSVD_video_text_sim.shape

torch.Size([670, 670])

In [42]:
# Video-to-text metrics on the MSVD test set: compute, label, show, and append
# one row to test.csv.
MSVD_full_metrics_vtt = rank_at_k_precomputed(MSVD_video_text_sim)
MSVD_full_metrics_vtt.update({
    "description": ["MSVD Test 670 x (Test Set[mean frame], Corresponding sentences)"],
    "task": ["Video-to-text"],
    "dataset": ["MSVD"],
})
MSVD_full_metrics_vtt = pd.DataFrame(MSVD_full_metrics_vtt)
print(MSVD_full_metrics_vtt)
MSVD_full_metrics_vtt.to_csv("test.csv", header=False, mode="a")

         R@1        R@5  ...           task  dataset
0  59.850746  85.223877  ...  Video-to-text     MSVD

[1 rows x 9 columns]


In [43]:
# Text-to-video metrics on the MSVD test set, computed from the similarity
# tensor, then labelled, shown, and appended to test.csv.
MSVD_full_metrics_ttv = tensor_text_to_video_metrics(MSVD_test_sim_tensor)
MSVD_full_metrics_ttv.update({
    "description": ["MSVD Test 670 x (Test Set[mean frame], Corresponding sentences)"],
    "task": ["Text-to-video"],
    "dataset": ["MSVD"],
})
MSVD_full_metrics_ttv = pd.DataFrame(MSVD_full_metrics_ttv)
print(MSVD_full_metrics_ttv)
MSVD_full_metrics_ttv.to_csv("test.csv", header=False, mode="a")

         R@1        R@5  ...           task  dataset
0  37.009689  64.103302  ...  Text-to-video     MSVD

[1 rows x 9 columns]



# LSMDC

Access to LSMDC is restricted; please obtain access to the dataset first. We use features extracted from the files listed in:

```
LSMDC16_challenge_1000_publictect.csv
```

Feel free to use the video processing code in the library.


In [44]:
# Disabled LSMDC evaluation: the code below is wrapped in a string literal, so
# it is NOT executed — the cell only echoes the string. The two feature files
# it loads from LSMDC_path are not among the downloads at the top of this
# notebook. Remove the surrounding triple quotes to run it once they are
# available.
"""LSMDC_test_video_dict = torch.load(LSMDC_path + "LSMDC_test_CLIP_visual_1_2.pt",map_location=torch.device('cpu'))
LSMDC_test_text_dict = torch.load(LSMDC_path + "LSMDC_test_dict_CLIP_text.pt",map_location=torch.device('cpu'))
LSMDC_text_matrix, LSMDC_aux_text = stack_encoded_dict(LSMDC_test_text_dict, LSMDC_test_text_dict.keys())
LSMDC_video_matrix, LSMDC_aux_video = stack_encoded_dict(LSMDC_test_video_dict, LSMDC_test_text_dict.keys(), lambda x : normalize_matrix(torch.mean(x, dim = 0, keepdim = True)))
LSMDC_text_matrix.shape, LSMDC_video_matrix.shape
LSMDC_metrics_vtt = rank_at_k_precomputed(LSMDC_video_matrix @ LSMDC_text_matrix.T)

LSMDC_metrics_vtt["description"] = ["LSMDC Test 1000 x (Test Set[mean frame], Corresponding sentences)"]
LSMDC_metrics_vtt["task"] = ["Video-to-text"]
LSMDC_metrics_vtt["dataset"] = ["LSMDC"]
LSMDC_metrics_vtt = pd.DataFrame(LSMDC_metrics_vtt)
print(LSMDC_metrics_vtt)
LSMDC_metrics_vtt.to_csv("test.csv", mode='a', header=False)
LSMDC_metrics_ttv, LSMDC_diagonal_ttv = rank_at_k_precomputed(LSMDC_text_matrix @ LSMDC_video_matrix.T, diag = True)

LSMDC_metrics_ttv["description"] = ["LSMDC Test 1000 x (Test Set[mean frame], Corresponding sentences)"]
LSMDC_metrics_ttv["task"] = ["Text-to-video"]
LSMDC_metrics_ttv["dataset"] = ["LSMDC"]
LSMDC_metrics_ttv = pd.DataFrame(LSMDC_metrics_ttv)
print(LSMDC_metrics_ttv)
LSMDC_metrics_ttv.to_csv("test.csv", mode='a', header=False)"""

'LSMDC_test_video_dict = torch.load(LSMDC_path + "LSMDC_test_CLIP_visual_1_2.pt",map_location=torch.device(\'cpu\'))\nLSMDC_test_text_dict = torch.load(LSMDC_path + "LSMDC_test_dict_CLIP_text.pt",map_location=torch.device(\'cpu\'))\nLSMDC_text_matrix, LSMDC_aux_text = stack_encoded_dict(LSMDC_test_text_dict, LSMDC_test_text_dict.keys())\nLSMDC_video_matrix, LSMDC_aux_video = stack_encoded_dict(LSMDC_test_video_dict, LSMDC_test_text_dict.keys(), lambda x : normalize_matrix(torch.mean(x, dim = 0, keepdim = True)))\nLSMDC_text_matrix.shape, LSMDC_video_matrix.shape\nLSMDC_metrics_vtt = rank_at_k_precomputed(LSMDC_video_matrix @ LSMDC_text_matrix.T)\n\nLSMDC_metrics_vtt["description"] = ["LSMDC Test 1000 x (Test Set[mean frame], Corresponding sentences)"]\nLSMDC_metrics_vtt["task"] = ["Video-to-text"]\nLSMDC_metrics_vtt["dataset"] = ["LSMDC"]\nLSMDC_metrics_vtt = pd.DataFrame(LSMDC_metrics_vtt)\nprint(LSMDC_metrics_vtt)\nLSMDC_metrics_vtt.to_csv("test.csv", mode=\'a\', header=False)\nLSMDC