In [1]:
!pip install pycocotools

Collecting pycocotools
  Downloading pycocotools-2.0.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.1 kB)
Downloading pycocotools-2.0.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (426 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m426.2/426.2 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: pycocotools
Successfully installed pycocotools-2.0.7


In [6]:
import os
import json
import numpy as np
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

# File paths and directories
IMAGE_DIR = '/kaggle/input/vcr-dset/vcr/vcr1images/vcr1images'
ANNOTATION_FILE = '/kaggle/input/vcr-dset/vcr/vcr1annots/val.jsonl'
OUTPUT_PATH = '/kaggle/working/'

# Load VCR annotations
with open(ANNOTATION_FILE, 'r') as f:
    annotations = [json.loads(line) for line in f]

# Load the pretrained CLIP Processor and CLIP model
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")

# Move model to GPU if available
model.to(device)

# Function to extract features using CLIP model
def extract_features(image_path):
    try:
        image = Image.open(image_path).convert("RGB")
        inputs = processor(images=image, return_tensors="pt")
        
        # Move inputs to GPU if available
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        with torch.no_grad():
            outputs = model.get_image_features(**inputs)
            features = outputs.cpu().numpy()
        return features
    except Exception as e:
        print(f"Error extracting features for image {image_path}: {str(e)}")
        return np.array([])  # Return empty array on error

# Feature extraction loop
features_list = []
labels_list = []

# Check if there are any previously saved features and labels
if os.path.exists(os.path.join(OUTPUT_PATH, "checkpoint_clip.npy")):
    checkpoint = np.load(os.path.join(OUTPUT_PATH, "checkpoint_clip.npy"), allow_pickle=True).item()
    start_index = checkpoint['last_processed_index']
    features_list = checkpoint['features_list']
    labels_list = checkpoint['labels_list']
    print(f"Checkpoint found. Resuming from index {start_index}.")
else:
    start_index = 0

for i, ann in enumerate(annotations[start_index:], start=start_index):
    image_filename = ann['img_fn']
    image_path = os.path.join(IMAGE_DIR, image_filename)
    try:
        features = extract_features(image_path)
        if features.size == 0:
            continue
        labels = ann['answer_label']  # Adjust this part based on actual label structure in VCR
        features_list.append(features)
        labels_list.append(labels)
    except FileNotFoundError:
        print(f"File not found: {image_path}, skipping.")
    except Exception as e:
        print(f"An error occurred with file {image_path}: {e}")
    
    if (i + 1) % 1000 == 0:
        print(f"{i + 1} images processed")

    # Save checkpoint every 500 images processed
    if (i + 1) % 500 == 0:
        checkpoint = {
            'last_processed_index': i + 1,
            'features_list': features_list,
            'labels_list': labels_list
        }
        np.save(os.path.join(OUTPUT_PATH, "checkpoint_clip.npy"), checkpoint)

# Determine all unique labels (categories)
unique_labels = sorted(set(labels_list))

# Convert labels to one-hot encoding
label_map = {label: idx for idx, label in enumerate(unique_labels)}

def convert_to_one_hot(label, label_map):
    one_hot_labels = np.zeros(len(label_map))
    one_hot_labels[label_map[label]] = 1
    return one_hot_labels

# Convert labels_list to one-hot encoding
one_hot_labels_list = [convert_to_one_hot(label, label_map) for label in labels_list]

# Reshape extracted features and labels
val_features_array = np.array(features_list)
val_labels_array = np.array(one_hot_labels_list)

# Save the features and labels
np.save(os.path.join(OUTPUT_PATH, "clip_val_features.npy"), val_features_array)
np.save(os.path.join(OUTPUT_PATH, "clip_val_labels.npy"), val_labels_array)

print("Features array shape:", val_features_array.shape)
print("Labels array shape:", val_labels_array.shape)

# Remove checkpoint file if processing completed successfully
if os.path.exists(os.path.join(OUTPUT_PATH, "checkpoint_clip.npy")):
    os.remove(os.path.join(OUTPUT_PATH, "checkpoint_clip.npy"))
    print("Removed checkpoint file.")


Using device: cuda
Checkpoint found. Resuming from index 22000.
An error occurred with file /kaggle/input/vcr-dset/vcr/vcr1images/vcr1images/movieclips_The_Magnificent_Seven_Ride/3ynO7Oaj2oY@22.jpg: 'answer_label'
An error occurred with file /kaggle/input/vcr-dset/vcr/vcr1images/vcr1images/movieclips_Mallrats/C6k9TFjWiGs@22.jpg: 'answer_label'
An error occurred with file /kaggle/input/vcr-dset/vcr/vcr1images/vcr1images/movieclips_Mallrats/C6k9TFjWiGs@22.jpg: 'answer_label'
An error occurred with file /kaggle/input/vcr-dset/vcr/vcr1images/vcr1images/movieclips_Mallrats/C6k9TFjWiGs@22.jpg: 'answer_label'
An error occurred with file /kaggle/input/vcr-dset/vcr/vcr1images/vcr1images/movieclips_Bad_News_Bears_2/sloo9PMVoRE@23.jpg: 'answer_label'
An error occurred with file /kaggle/input/vcr-dset/vcr/vcr1images/vcr1images/movieclips_Bad_News_Bears_2/qq4gK8PkKNM@26.jpg: 'answer_label'
An error occurred with file /kaggle/input/vcr-dset/vcr/vcr1images/vcr1images/movieclips_Proof/MdTaJDKTgUc@7.jp

In [9]:
import os
import json
import numpy as np
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

# File paths and directories
IMAGE_DIR = '/kaggle/input/vcr-dset/vcr/vcr1images/vcr1images'
ANNOTATION_FILE = '/kaggle/input/vcr-dset/vcr/vcr1annots/val.jsonl'
OUTPUT_PATH = '/kaggle/working/'

# Load VCR annotations
with open(ANNOTATION_FILE, 'r') as f:
    annotations = [json.loads(line) for line in f]

# Load the pretrained CLIP Processor and CLIP model
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")

# Move model to GPU if available
model.to(device)

# Function to extract features using CLIP model
def extract_features(image_path):
    try:
        image = Image.open(image_path).convert("RGB")
        inputs = processor(images=image, return_tensors="pt")
        
        # Move inputs to GPU if available
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        with torch.no_grad():
            outputs = model.get_image_features(**inputs)
            features = outputs.cpu().numpy()
        return features
    except Exception as e:
        print(f"Error extracting features for image {image_path}: {str(e)}")
        return np.array([])  # Return empty array on error

# Feature extraction loop
features_list = []
labels_list = []

# Check if there are any previously saved features and labels
if os.path.exists(os.path.join(OUTPUT_PATH, "checkpoint_clip.npy")):
    checkpoint = np.load(os.path.join(OUTPUT_PATH, "checkpoint_clip.npy"), allow_pickle=True).item()
    start_index = checkpoint['last_processed_index']
    features_list = checkpoint['features_list']
    labels_list = checkpoint['labels_list']
    print(f"Checkpoint found. Resuming from index {start_index}.")
else:
    start_index = 0

for i, ann in enumerate(annotations[start_index:], start=start_index):
    image_filename = ann['img_fn']
    image_path = os.path.join(IMAGE_DIR, image_filename)
    try:
        features = extract_features(image_path)
        if features.size == 0:
            continue
        labels = ann['answer_label']  # Adjust this part based on actual label structure in VCR
        features_list.append(features)
        labels_list.append(labels)
    except FileNotFoundError:
        print(f"File not found: {image_path}, skipping.")
    except Exception as e:
        print(f"An error occurred with file {image_path}: {e}")
    
    if (i + 1) % 1000 == 0:
        print(f"{i + 1} images processed")

    # Save checkpoint every 500 images processed
    if (i + 1) % 500 == 0:
        checkpoint = {
            'last_processed_index': i + 1,
            'features_list': features_list,
            'labels_list': labels_list
        }
        np.save(os.path.join(OUTPUT_PATH, "checkpoint_clip.npy"), checkpoint)

# Determine all unique labels (categories)
unique_labels = sorted(set(labels_list))

# Convert labels to one-hot encoding
label_map = {label: idx for idx, label in enumerate(unique_labels)}

def convert_to_one_hot(label, label_map):
    one_hot_labels = np.zeros(len(label_map))
    one_hot_labels[label_map[label]] = 1
    return one_hot_labels

# Convert labels_list to one-hot encoding
one_hot_labels_list = [convert_to_one_hot(label, label_map) for label in labels_list]

# Reshape extracted features and labels
val_features_array = np.array(features_list)
val_labels_array = np.array(one_hot_labels_list)

# Save the features and labels
np.save(os.path.join(OUTPUT_PATH, "clip_val_features.npy"), val_features_array)
np.save(os.path.join(OUTPUT_PATH, "clip_val_labels.npy"), val_labels_array)

print("Features array shape:", val_features_array.shape)
print("Labels array shape:", val_labels_array.shape)

# Remove checkpoint file if processing completed successfully
if os.path.exists(os.path.join(OUTPUT_PATH, "checkpoint_clip.npy")):
    os.remove(os.path.join(OUTPUT_PATH, "checkpoint_clip.npy"))
    print("Removed checkpoint file.")


Using device: cuda
1000 images processed
2000 images processed
3000 images processed
4000 images processed
5000 images processed
6000 images processed
7000 images processed
8000 images processed
9000 images processed
10000 images processed
Error extracting features for image /kaggle/input/vcr-dset/vcr/vcr1images/vcr1images/lsmdc_1010_TITANIC/1010_TITANIC_02.33.49.825-02.33.53.701@0.jpg: [Errno 2] No such file or directory: '/kaggle/input/vcr-dset/vcr/vcr1images/vcr1images/lsmdc_1010_TITANIC/1010_TITANIC_02.33.49.825-02.33.53.701@0.jpg'
Error extracting features for image /kaggle/input/vcr-dset/vcr/vcr1images/vcr1images/lsmdc_1010_TITANIC/1010_TITANIC_02.33.49.825-02.33.53.701@0.jpg: [Errno 2] No such file or directory: '/kaggle/input/vcr-dset/vcr/vcr1images/vcr1images/lsmdc_1010_TITANIC/1010_TITANIC_02.33.49.825-02.33.53.701@0.jpg'
Error extracting features for image /kaggle/input/vcr-dset/vcr/vcr1images/vcr1images/lsmdc_1010_TITANIC/1010_TITANIC_02.33.49.825-02.33.53.701@0.jpg: [Errno

In [8]:
import os
import json
import numpy as np
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

# File paths and directories
IMAGE_DIR = '/kaggle/input/vcr-dset/vcr/vcr1images/vcr1images'
ANNOTATION_FILE = '/kaggle/input/vcr-dset/vcr/vcr1annots/train.jsonl'
OUTPUT_PATH = '/kaggle/working/'

# Load VCR annotations
with open(ANNOTATION_FILE, 'r') as f:
    annotations = [json.loads(line) for line in f]

# Load the pretrained CLIP Processor and CLIP model
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")

# Move model to GPU if available
model.to(device)

# Function to extract features using CLIP model
def extract_features(image_path):
    try:
        image = Image.open(image_path).convert("RGB")
        inputs = processor(images=image, return_tensors="pt")
        
        # Move inputs to GPU if available
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        with torch.no_grad():
            outputs = model.get_image_features(**inputs)
            features = outputs.cpu().numpy()
        return features
    except Exception as e:
        print(f"Error extracting features for image {image_path}: {str(e)}")
        return np.array([])  # Return empty array on error

# Feature extraction loop
features_list = []
labels_list = []

# Check if there are any previously saved features and labels
if os.path.exists(os.path.join(OUTPUT_PATH, "checkpoint_clip.npy")):
    checkpoint = np.load(os.path.join(OUTPUT_PATH, "checkpoint_clip.npy"), allow_pickle=True).item()
    start_index = checkpoint['last_processed_index']
    features_list = checkpoint['features_list']
    labels_list = checkpoint['labels_list']
    print(f"Checkpoint found. Resuming from index {start_index}.")
else:
    start_index = 0

for i, ann in enumerate(annotations[start_index:], start=start_index):
    image_filename = ann['img_fn']
    image_path = os.path.join(IMAGE_DIR, image_filename)
    try:
        features = extract_features(image_path)
        if features.size == 0:
            continue
        labels = ann['answer_label']  # Adjust this part based on actual label structure in VCR
        features_list.append(features)
        labels_list.append(labels)
    except FileNotFoundError:
        print(f"File not found: {image_path}, skipping.")
    except Exception as e:
        print(f"An error occurred with file {image_path}: {e}")
    
    if (i + 1) % 1000 == 0:
        print(f"{i + 1} images processed")

    # Save checkpoint every 500 images processed
    if (i + 1) % 500 == 0:
        checkpoint = {
            'last_processed_index': i + 1,
            'features_list': features_list,
            'labels_list': labels_list
        }
        np.save(os.path.join(OUTPUT_PATH, "checkpoint_clip.npy"), checkpoint)

# Determine all unique labels (categories)
unique_labels = sorted(set(labels_list))

# Convert labels to one-hot encoding
label_map = {label: idx for idx, label in enumerate(unique_labels)}

def convert_to_one_hot(label, label_map):
    one_hot_labels = np.zeros(len(label_map))
    one_hot_labels[label_map[label]] = 1
    return one_hot_labels

# Convert labels_list to one-hot encoding
one_hot_labels_list = [convert_to_one_hot(label, label_map) for label in labels_list]

# Reshape extracted features and labels
train_features_array = np.array(features_list)
train_labels_array = np.array(one_hot_labels_list)

# Save the features and labels
np.save(os.path.join(OUTPUT_PATH, "clip_train_features.npy"), train_features_array)
np.save(os.path.join(OUTPUT_PATH, "clip_train_labels.npy"), train_labels_array)

print("Features array shape:", train_features_array.shape)
print("Labels array shape:", train_labels_array.shape)

# Remove checkpoint file if processing completed successfully
if os.path.exists(os.path.join(OUTPUT_PATH, "checkpoint_clip.npy")):
    os.remove(os.path.join(OUTPUT_PATH, "checkpoint_clip.npy"))
    print("Removed checkpoint file.")


Using device: cuda
1000 images processed
2000 images processed
3000 images processed
4000 images processed
5000 images processed
6000 images processed
7000 images processed
8000 images processed
9000 images processed
10000 images processed
11000 images processed
12000 images processed
13000 images processed
Error extracting features for image /kaggle/input/vcr-dset/vcr/vcr1images/vcr1images/movieclips_Roman_J._Israel_Esq./c9oE47YW6YM@3.jpg: [Errno 2] No such file or directory: '/kaggle/input/vcr-dset/vcr/vcr1images/vcr1images/movieclips_Roman_J._Israel_Esq./c9oE47YW6YM@3.jpg'
Error extracting features for image /kaggle/input/vcr-dset/vcr/vcr1images/vcr1images/movieclips_Roman_J._Israel_Esq./c9oE47YW6YM@3.jpg: [Errno 2] No such file or directory: '/kaggle/input/vcr-dset/vcr/vcr1images/vcr1images/movieclips_Roman_J._Israel_Esq./c9oE47YW6YM@3.jpg'
Error extracting features for image /kaggle/input/vcr-dset/vcr/vcr1images/vcr1images/movieclips_The_Stoning_of_Soraya_M./5CuKjYdDd50@7.jpg: [Er