In [None]:
!pip install torch torchvision torchaudio
!pip install transformers
!pip install sentence-transformers
!pip install librosa
!pip install opencv-python

In [None]:
import torch
import numpy as np
import librosa
import cv2
from sentence_transformers import SentenceTransformer
from transformers import ASTModel, ASTFeatureExtractor, ResNetModel, ResNetConfig
from torchvision.transforms import Resize, CenterCrop, Normalize, Compose, ToTensor


In [None]:
# Pick CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Pretrained sentence-transformers model used by extract_text_embedding for the text modality.
text_model = SentenceTransformer('all-mpnet-base-v2')

In [None]:
# Audio branch: the feature extractor and the model are loaded from the same checkpoint.
AST_CHECKPOINT = "MIT/ast-finetuned-audioset-10-10-0.4593"
ast_feature_extractor = ASTFeatureExtractor.from_pretrained(AST_CHECKPOINT)
ast_model = ASTModel.from_pretrained(AST_CHECKPOINT)

# Relocate the weights only when a CUDA device was selected.
if device.type == "cuda":
    ast_model.to(device)

preprocessor_config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

In [None]:
# Visual branch: load the ResNet-50 config and weights, then place the model on `device`.
RESNET_CHECKPOINT = "microsoft/resnet-50"
resnet_config = ResNetConfig.from_pretrained(RESNET_CHECKPOINT)
resnet_model = ResNetModel.from_pretrained(RESNET_CHECKPOINT).to(device)

config.json:   0%|          | 0.00/69.6k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

In [None]:
# Per-channel statistics used to normalise frame tensors (the standard ImageNet values).
_NORM_MEAN = [0.485, 0.456, 0.406]
_NORM_STD = [0.229, 0.224, 0.225]

# Stage 1 (applied to PIL images): resize to 256, then take the central 224x224 crop.
video_preprocessing = Compose([Resize(256), CenterCrop(224)])

# Stage 2 (applied to float tensors): channel-wise normalisation.
video_tensor_transforms = Compose([Normalize(mean=_NORM_MEAN, std=_NORM_STD)])

In [None]:
def extract_text_embedding(text):
    """Encode ``text`` with the notebook-level ``text_model`` and return its embedding."""
    return text_model.encode(text)

In [None]:
import os
import tempfile
import subprocess
import librosa
import torch

def extract_audio_embedding(video_path, sample_rate=16000, temp_audio_path=None, use_gpu=True):
    """
    Extract an audio embedding from a video file with the AST model.

    The audio track is written to a WAV file with ffmpeg, loaded as a mono
    waveform at ``sample_rate`` with librosa, converted to model inputs by
    ``ast_feature_extractor`` and passed through ``ast_model``.

    Args:
        video_path: Path to the video file
        sample_rate: Target sample rate for audio processing
        temp_audio_path: Optional path for the intermediate WAV file. When a
            path is given the file is left in place for the caller. When None,
            a uniquely named temporary file is created and removed afterwards.
        use_gpu: Kept for backward compatibility. Inference always runs on the
            device ``ast_model`` currently lives on, so that inputs and weights
            can never end up on different devices.

    Returns:
        numpy.ndarray: The audio embedding vector (squeezed pooler output)

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status
            (for example when the video has no audio stream).
        FileNotFoundError: If the ffmpeg executable is not installed.
    """
    # Follow the model rather than re-deriving the device, so the inputs
    # always match the weights even if the model was moved after loading.
    model_device = next(ast_model.parameters()).device

    # Remember ownership *before* overwriting temp_audio_path; testing the
    # variable for None after reassignment can never succeed.
    owns_temp_file = temp_audio_path is None
    if owns_temp_file:
        # Unique name: a fixed "temp_audio.wav" would be shared by concurrent runs.
        fd, temp_audio_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)  # ffmpeg reopens the path itself (-y overwrites the empty file)

    try:
        # Extract audio from video using ffmpeg
        subprocess.run([
            "ffmpeg", "-i", video_path, "-q:a", "0", "-map", "a",
            "-vn", "-y", temp_audio_path
        ], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        # Load the extracted audio file
        waveform, _ = librosa.load(temp_audio_path, sr=sample_rate, mono=True)

        # Extract features
        inputs = ast_feature_extractor(
            waveform,
            sampling_rate=sample_rate,
            return_tensors="pt"
        )

        # Put every input tensor on the model's device (a no-op on CPU)
        inputs = {k: v.to(model_device) for k, v in inputs.items()}

        # Extract embeddings
        with torch.no_grad():
            outputs = ast_model(**inputs)

        # Use pooled output as embedding; move to CPU before converting to numpy
        return outputs.pooler_output.squeeze().cpu().numpy()

    finally:
        # Remove the intermediate WAV only when this function created it
        if owns_temp_file and os.path.exists(temp_audio_path):
            os.remove(temp_audio_path)

In [None]:
!pip install pillow



In [None]:
from PIL import Image
def extract_video_embedding(video_path, frame_sampling=10, use_gpu=True):
    """Extract video embeddings using ResNet model on sampled frames.

    Frames are sampled at uniformly spaced indices, converted from BGR to RGB,
    resized/cropped with ``video_preprocessing``, scaled to [0, 1], normalised
    with ``video_tensor_transforms`` and passed one at a time through
    ``resnet_model``. The per-frame pooled outputs are averaged.

    Args:
        video_path (str): Path to the video file
        frame_sampling (int): Number of frames to sample from the video
        use_gpu (bool): Kept for backward compatibility. Inference always runs
            on the device ``resnet_model`` currently lives on.

    Returns:
        numpy.ndarray: Average embedding vector for the video

    Raises:
        ValueError: If the video cannot be opened or no frame could be decoded.
    """
    # Follow the model so frame tensors always land on the same device as the weights
    model_device = next(resnet_model.parameters()).device

    cap = cv2.VideoCapture(video_path)
    frame_embeddings = []
    try:
        if not cap.isOpened():
            raise ValueError(f"Cannot open video file: {video_path}")

        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        # Sample frames uniformly. Clamp at 0 so that a missing/negative frame
        # count ends in the "no frames" ValueError below instead of failing
        # inside np.linspace with an unrelated message.
        frames_to_sample = max(0, min(frame_sampling, frame_count))
        frame_indices = np.linspace(0, frame_count - 1, frames_to_sample, dtype=int)

        for idx in frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
            ret, frame = cap.read()
            if not ret:
                # Frame could not be decoded; skip it and keep the others
                continue

            # OpenCV decodes to BGR; convert to RGB before building the PIL image
            frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

            # PIL-level transforms (resize + center crop)
            frame_preprocessed = video_preprocessing(frame_pil)

            # HWC uint8 -> CHW float in [0, 1]
            frame_tensor = (
                torch.from_numpy(np.array(frame_preprocessed)).permute(2, 0, 1).float() / 255.0
            )

            # Tensor-level transforms, then add the batch dimension
            frame_tensor = video_tensor_transforms(frame_tensor).unsqueeze(0).to(model_device)

            with torch.no_grad():
                outputs = resnet_model(frame_tensor)

            # Use pooled output and move back to CPU for numpy conversion
            frame_embeddings.append(outputs.pooler_output.squeeze().cpu().numpy())
    finally:
        # Always free the capture handle, including when a frame fails mid-loop
        cap.release()

    if not frame_embeddings:
        raise ValueError("No frames could be extracted from the video")

    # Average the frame embeddings
    return np.mean(frame_embeddings, axis=0)

In [None]:
!unzip /content/utter.zip

In [None]:
# One example utterance (transcript + clip) used to try out the extractors below.
dialogue = (
    "So he looked down again and he said, hey, "
    "I'm Jewish. I know the difference."
)
video_path = "/content/Utterances Akash 2.0/utterances/22-001/22-001-006.mp4"

In [None]:
# Text modality for the example utterance.
emb_text = extract_text_embedding(text=dialogue)

In [None]:
# Sanity check: length of the text embedding vector.
print(len(emb_text))

768


In [None]:
# Audio modality for the example clip.
emb_audio = extract_audio_embedding(video_path=video_path)

In [None]:
# Inspect the raw audio embedding vector.
print(emb_audio)

In [None]:
# Video modality for the example clip (default frame sampling).
emb_video = extract_video_embedding(video_path)

In [None]:
# Sanity check: length of the averaged video embedding vector.
print(len(emb_video))

2048


In [None]:
# prompt: def get_utterance_emb(video_path,dialogue):
#   emb_text =  extract_text_embedding(dialogue)
#   emb_audio = extract_audio_embedding(video_path = video_path )
#   emb_video = extract_video_embedding(video_path=video_path)
#   return
# return dictionary

def get_utterance_emb(video_path, dialogue):
    """Compute all three modality embeddings for a single utterance.

    Args:
        video_path: Path to the utterance's video clip (source of audio and frames).
        dialogue: Transcript text of the utterance.

    Returns:
        dict: ``{"text": ..., "audio": ..., "video": ...}`` holding the outputs of
        the text, audio and video extractors, in that order.
    """
    return {
        "text": extract_text_embedding(dialogue),
        "audio": extract_audio_embedding(video_path=video_path),
        "video": extract_video_embedding(video_path=video_path),
    }


In [None]:
# All three modalities for the example utterance in one call.
emb = get_utterance_emb(video_path=video_path, dialogue=dialogue)

In [None]:
# Number of entries in the result dict (one per modality: text, audio, video).
len(emb)

3

In [None]:
!unzip /content/utter.zip

In [None]:
# prompt: Read the csv into a df "/content/Utterances Akash 2.0/utterances.csv"

import pandas as pd

# Utterance metadata table; the batch loop below reads its
# 'subclip_name' and 'utterance_text' columns.
df = pd.read_csv(
    "/content/Utterances Akash 2.0/utterances.csv"
)
print(df.head(10))


In [None]:
from tqdm import tqdm

In [None]:
# Compute text/audio/video embeddings for every clip found under the utterances folder.
# Layout expected by this loop: <utterances_root>/<directory>/<subclip file>

import os

utterances_root = "/content/Utterances Akash 2.0/utterances"

# Build the clip -> transcript lookup once instead of filtering the DataFrame
# for every clip. keep="first" matches the previous `.iloc[0]` choice.
utterance_by_clip = (
    df.drop_duplicates(subset="subclip_name", keep="first")
      .set_index("subclip_name")["utterance_text"]
      .to_dict()
)

embs = {}
skipped_clips = []  # files on disk that have no row in the CSV

# sorted() makes the processing (and therefore output) order reproducible.
for dirname in tqdm(sorted(os.listdir(utterances_root)), desc="Processing directories"):
    dirpath = os.path.join(utterances_root, dirname)
    if not os.path.isdir(dirpath):
        # Stray files next to the clip folders would make os.listdir raise
        continue

    for subclip_name in sorted(os.listdir(dirpath)):
        # Distinct loop names so the example `video_path` / `dialogue`
        # defined earlier in the notebook are not overwritten.
        clip_path = os.path.join(dirpath, subclip_name)

        if subclip_name not in utterance_by_clip:
            # Previously an IndexError here aborted the whole multi-minute run
            skipped_clips.append(clip_path)
            continue

        embs[subclip_name] = get_utterance_emb(clip_path, utterance_by_clip[subclip_name])

if skipped_clips:
    print(f"Skipped {len(skipped_clips)} file(s) with no matching row in utterances.csv:")
    for skipped_path in skipped_clips:
        print(f"  {skipped_path}")


Processing directories: 100%|██████████| 525/525 [14:02<00:00,  1.60s/it]


In [None]:
# prompt: save the dict as a csv file

import pandas as pd

# Assumes the 'embs' dictionary has already been populated by the batch loop.

# One row per clip: [subclip_name, text vector, audio vector, video vector].
# ndarray.tolist() yields plain Python floats. list(ndarray) keeps NumPy scalar
# objects, which NumPy >= 2 renders as "np.float32(...)" when pandas stringifies
# the cell, so the saved CSV could not be parsed back with json / ast.literal_eval.
data = [
    [
        subclip_name,
        np.asarray(embeddings['text']).tolist(),
        np.asarray(embeddings['audio']).tolist(),
        np.asarray(embeddings['video']).tolist(),
    ]
    for subclip_name, embeddings in embs.items()
]

# Create a DataFrame from the rows
df_embeddings = pd.DataFrame(
    data,
    columns=['subclip_name', 'text_embedding', 'audio_embedding', 'video_embedding'],
)

# Each embedding column is written as a bracketed list literal, e.g. "[0.1, 0.2, ...]"
df_embeddings.to_csv('embeddings.csv', index=False)


In [None]:
# Show the last 10 rows of the exported embeddings table.
df_embeddings.tail(10)

In [None]:
# prompt: store the dict using pickle

import pickle

# Expects `embs` to have been filled by the batch loop earlier in the notebook.
# The pickle keeps the NumPy arrays as-is (no text round-trip as with the CSV).
# Only load pickle files that come from a trusted source.
with open('embeddings.pkl', 'wb') as pickle_file:
    pickle.dump(embs, pickle_file)
