In [1]:
import os 
import torch
import numpy as np 

In [2]:
def clean_and_float_data(data_path: str, remove_invalid: bool = True):
    """Collect ``(entry_name, score)`` pairs from the entries of ``data_path``.

    The score is the text after the last ``-`` in the entry name with
    ``.mp4`` stripped, converted to ``float``
    (e.g. ``'2-11-3.5-2.7.mp4'`` -> ``2.7``).

    Args:
        data_path: Directory whose entries are inspected.
        remove_invalid: When True (the default, matching the historical
            behaviour) regular files whose name does not end in a parsable
            score are deleted from disk. Pass False for a read-only scan.

    Returns:
        List of ``(entry_name, score)`` tuples, sorted by entry name.
    """
    clean_score = []
    # os.listdir returns entries in arbitrary order; sorting keeps the result
    # (and anything derived from it, such as a random split) reproducible.
    for vid in sorted(os.listdir(data_path)):
        try:
            score = float(vid.split('-')[-1].replace('.mp4', ''))
        except ValueError:
            # Only a failed float conversion means "bad name"; any other
            # error should surface instead of silently deleting data.
            entry_path = os.path.join(data_path, vid)
            # os.remove cannot delete directories, so only regular files are removed.
            if remove_invalid and os.path.isfile(entry_path):
                os.remove(entry_path)
            continue
        clean_score.append((vid, score))
    return clean_score

In [3]:
# Parse a score out of every entry name under 'data' and preview the first ten (name, score) pairs.
# NOTE(review): with the definition above, entries whose name cannot be parsed are deleted from disk.
clean_data = clean_and_float_data('data')
clean_data[:10]

[('2-1-2.25-3.mp4', 3.0),
 ('2-10-3-3.mp4', 3.0),
 ('2-11-3.5-2.7.mp4', 2.7),
 ('2-12-3.8-2.75.mp4', 2.75),
 ('2-13-3.5-2.75.mp4', 2.75),
 ('2-14-3-3.mp4', 3.0),
 ('2-15-3.8-3.mp4', 3.0),
 ('2-16-3-3.mp4', 3.0),
 ('2-17-3-3.mp4', 3.0),
 ('2-18-3.8-3.mp4', 3.0)]

In [4]:
from pytorchvideo.data.encoded_video import EncodedVideo
from torchvision.transforms import ToPILImage

def capture_and_save_frames(data_path: str, save_dir: str, start_sec: float = 0, end_sec: float = 1):
    """Decode a clip from every video in ``data_path`` and save its frames as JPEGs.

    For each entry ``name`` in ``data_path`` the frames of the clip
    ``[start_sec, end_sec]`` are written to ``save_dir/name/<i>.jpg`` where
    ``i`` is the frame index inside that clip, starting at 0 for every video.
    Numbering per video keeps the output independent of directory listing
    order, so re-running the function overwrites the same files instead of
    adding differently numbered duplicates.

    Args:
        data_path: Directory containing the video files.
        save_dir: Directory under which one sub-folder per video is created.
        start_sec: Clip start passed to ``EncodedVideo.get_clip`` (default 0).
        end_sec: Clip end passed to ``EncodedVideo.get_clip`` (default 1).
    """
    # One converter is enough; it holds no per-video state.
    to_pil = ToPILImage()

    for name in sorted(os.listdir(data_path)):
        # EncodedVideo decodes the requested clip into a single tensor.
        video = EncodedVideo.from_path(os.path.join(data_path, name))
        clip = video.get_clip(start_sec, end_sec)['video']

        frames_path = os.path.join(save_dir, name)
        os.makedirs(frames_path, exist_ok=True)

        # The frame axis is dimension 1 (the original author notes a shape like
        # [3, 25, ...]), so slicing along it yields one image tensor per frame.
        for frame_idx in range(clip.shape[1]):
            frame = clip[:, frame_idx, :, :]
            # Dividing by 255 follows the original author's note that ToPILImage
            # needs normalised input — presumably pixel values are 0..255 floats.
            to_pil(frame / 255).save(os.path.join(frames_path, f'{frame_idx}.jpg'))


In [5]:
# capture_and_save_frames('data', save_dir='frames')

In [None]:
from torch.utils.data import Dataset
from torch import nn
from PIL import Image
from torchvision import transforms 

class MyDataSet(Dataset):
    """Dataset of per-video frame folders whose names end in a score.

    ``root_dir`` is scanned once at construction. Every entry whose name ends
    in ``-<score>.mp4`` becomes one sample, and indexing a sample returns the
    frames stored in that folder as RGB PIL images.
    """

    # Only files with these extensions are loaded as frames, so other files
    # stored in the same folder (e.g. a saved 'feature.pt') are ignored.
    IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

    def __init__(self, root_dir: str):
        self.root_dir = root_dir
        self.samples = self.clean_and_float_data()

    def clean_and_float_data(self):
        """Return sorted ``(entry_name, score)`` pairs for the entries of ``root_dir``.

        Regular files whose name does not end in a parsable score are removed
        from disk (historical behaviour). Directories with such names are
        skipped, because ``os.remove`` cannot delete a directory.
        """
        clean_score = []
        # Sorted listing keeps sample indices stable between runs.
        for vid in sorted(os.listdir(self.root_dir)):
            try:
                score = float(vid.split('-')[-1].replace('.mp4', ''))
            except ValueError:
                entry_path = os.path.join(self.root_dir, vid)
                if os.path.isfile(entry_path):
                    os.remove(entry_path)
                continue
            clean_score.append((vid, score))
        return clean_score

    def __len__(self):
        return len(self.samples)

    @staticmethod
    def _frame_sort_key(file_name):
        """Order numerically named frames by number; other names sort after them."""
        stem = os.path.splitext(file_name)[0]
        try:
            return (0, int(stem), file_name)
        except ValueError:
            return (1, 0, file_name)

    def _frame_paths(self, folder_name):
        """Return the image paths inside ``root_dir/folder_name`` in frame order."""
        images_dir = os.path.join(self.root_dir, folder_name)
        img_names = [
            fname for fname in os.listdir(images_dir)
            if fname.lower().endswith(self.IMAGE_EXTENSIONS)
        ]
        # A plain string sort would place '10.jpg' before '2.jpg'.
        img_names.sort(key=self._frame_sort_key)
        return [os.path.join(images_dir, fname) for fname in img_names]

    def __getitem__(self, index):
        """Return ``(frames, score, folder_name)`` for the sample at ``index``."""
        img, score = self.samples[index]
        frames = []
        for path in self._frame_paths(img):
            # convert() returns a new, fully loaded image, so the file handle
            # can be closed as soon as the with-block ends.
            with Image.open(path) as handle:
                frames.append(handle.convert("RGB"))
        # Frames stay as raw PIL images (per the original author's note that
        # BLIP-2 preprocessing takes raw images rather than tensors or arrays).
        return frames, score, img

         
    

In [7]:
# Open one saved frame (0.jpg of video '2-1-2.25-3.mp4') — presumably a quick check that the frames are readable.
test = Image.open('frames/2-1-2.25-3.mp4/0.jpg')

In [8]:
# Build the dataset over the 'frames' directory and show how many samples it found.
dataset = MyDataSet('frames') 
len(dataset)

56

In [9]:
from torch.utils.data import random_split
# 80/20 train/test split. The explicitly seeded generator makes the split
# identical after every kernel restart; without it random_split draws from the
# global RNG and the two subsets change from run to run.
train_size = int(.80 * len(dataset))
test_size = len(dataset) - train_size

split_generator = torch.Generator().manual_seed(42)
train_set, test_set = random_split(
    dataset, [train_size, test_size], generator=split_generator
)
len(train_set), len(test_set)


(44, 12)

In [26]:
# Peek at the first test sample. Each dataset item is a 3-tuple
# (frames, score, folder_name), so all three values must be unpacked;
# unpacking only two raises ValueError.
for images, score, _folder in test_set:
    print(len(images), score)
    break

25 2.75


In [27]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
import torch.nn as nn

class ScoringMLP(nn.Module):
    """Fully connected network: in_channel -> 512 -> 128 -> out_channel.

    A ReLU follows each of the first two linear layers; the output layer has
    no activation.
    """

    def __init__(self, in_channel, out_channel):
        super().__init__()
        widths = [in_channel, 512, 128, out_channel]
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        layers.pop()  # drop the activation that would follow the output layer
        self.mlp = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the stacked layers to ``x`` and return the result."""
        return self.mlp(x)


In [None]:
# The loader lives in the `lavis.models` package (plural); `lavis.model` does not exist.
from lavis.models import load_model_and_preprocess

# Load the BLIP-2 feature extractor in eval mode together with its image preprocessors.
model, vis_processors, _ = load_model_and_preprocess(
    name="blip2_feature_extractor", model_type="pretrain", is_eval=True, device=device
)

# For every video folder, encode each frame with the visual encoder, keep the
# token at position 0 of each frame's output, and save the stacked per-frame
# vectors as 'feature.pt' next to the frames.
with torch.inference_mode():
    for images, _score, folder_name in dataset:
        if not images:
            # torch.stack raises on an empty list; skip folders without frames.
            continue

        frame_features = []
        for frame in images:
            image_tensor = vis_processors["eval"](frame).unsqueeze(0).to(device)
            # Original author's note: output is [1, 257, 1408] = [batch, patches, vectors].
            features = model.visual_encoder(image_tensor)
            cls_token = features[:, 0, :]  # first token of the sequence, [1, 1408]
            # Move to CPU so the saved file loads on machines without a GPU.
            frame_features.append(cls_token.squeeze(0).cpu())

        video_tensor = torch.stack(frame_features)  # [num_frames, 1408]

        # Save under the dataset's own root instead of a hard-coded 'frames'.
        save_path = os.path.join(dataset.root_dir, folder_name, 'feature.pt')
        torch.save(video_tensor, save_path)