### Set up the dataset

In [10]:
# Label vocabularies, one inner list per annotation group (class index = position in the list).
target = [
    # responded / did not respond
    ["有回应", "无回应"],
    # positive / neutral / negative
    ["积极", "中性", "消极"],
    # focused (on task) / distracted (off task)
    ["专注（任务中）", "走神（任务外）"],
    # leading, supporting, onlooking, conflict, playing, chatting,
    # one tries alone while the other slacks off, both zoned out
    ["主导", "支持", "旁观", "冲突", "玩乐", "闲聊", "一人独立尝试一人摸鱼", "各自神游"],
]

In [8]:
import torch
from torch.utils.data import Dataset
import pandas as pd
import os
import numpy as np

class SiblingsVideoDatasetT1(Dataset):
    """Video dataset whose labels come from two label CSV files.

    The CSV named ``path_to_labels`` is read once from ``labelA/`` and once from
    ``label_2/``; the two tables are stacked row-wise. The first column holds the
    clip directory, the following ``num_of_cls`` columns hold the per-class values;
    the class of a clip is the column with the largest value.

    Args:
        path_to_labels: CSV file name, appended to ``'labelA/'`` and ``'label_2/'``.
        num_of_cls: Number of class columns kept after the first (path) column.
        image_processor: Callable turning decoded frames into model inputs; it is
            called as ``image_processor(frames, return_tensors="pt")``.
        video_reader: Optional callable ``video_reader(video_path) -> frames``.
            When omitted, ``image_processor.read_video`` is used if it exists.

    Note:
        Class names are mapped to indices through the module-level ``target[0]``.
        NOTE(review): column names missing from ``target[0]`` map to NaN — confirm
        that the CSV headers match that list for the chosen ``num_of_cls``.
    """

    def __init__(self, path_to_labels, num_of_cls, image_processor, video_reader=None):
        # header=1: the column names are taken from the second line of each CSV.
        label_frames = [
            pd.read_csv(label_dir + path_to_labels, header=1).iloc[:, :num_of_cls + 1]
            for label_dir in ('labelA/', 'label_2/')
        ]
        # label1_3 = pd.read_csv('label_3/'+path_to_labels)

        # ignore_index=True gives the stacked table a fresh 0..n-1 index; without it
        # both files contribute rows labelled 0, 1, 2, ... and positional access breaks.
        label = pd.concat(label_frames, axis=0, ignore_index=True)
        label = label.rename(columns={"Unnamed: 0": "path"})
        label['path'] = label['path'].apply(self._to_video_path)

        # Winning class = name of the column holding the row-wise maximum.
        class_mapping = {class_name: i for i, class_name in enumerate(target[0])}
        cls_1 = label.iloc[:, 1:].idxmax(axis=1).map(class_mapping)

        self.video_paths = np.array(label['path'])
        self.labels = cls_1
        self.image_processor = image_processor
        self.video_reader = video_reader

    @staticmethod
    def _to_video_path(directory):
        """Turn a label-sheet entry into the path of the corresponding .mp4 file.

        ``'15YS_20230317_01/VCAM_0000_1'`` becomes
        ``'video_data/15YS_20230317_01/VCAM_0000/VCAM_0000_1.mp4'``.
        """
        parts = directory.split('/')
        date = parts[0]               # e.g. 15YS_20230317_01
        vcam = parts[-1]              # e.g. VCAM_xxxx_xx
        vcam_id = vcam.split('_')[1]  # xxxx
        return os.path.join('video_data', date, 'VCAM_' + vcam_id, vcam + '.mp4')

    def __len__(self):
        """Return the number of clips."""
        return len(self.video_paths)

    def __getitem__(self, idx):
        """Return ``(inputs, label)`` for the clip at position ``idx``.

        Raises:
            AttributeError: If no ``video_reader`` was supplied and the image
                processor has no ``read_video`` method.
        """
        video_path = self.video_paths[idx]
        # Positional lookup: idx is a position, not an index label.
        label = self.labels.iloc[idx]

        reader = getattr(self, "video_reader", None)
        if reader is None:
            reader = getattr(self.image_processor, "read_video", None)
        if reader is None:
            raise AttributeError(
                f"{type(self.image_processor).__name__!r} object has no attribute 'read_video'; "
                "pass video_reader=<callable returning the frames of a video path> "
                "when constructing the dataset"
            )

        video_frames = reader(video_path)
        # NOTE(review): with return_tensors="pt" the processor output presumably carries
        # a leading batch axis of size 1 per sample — confirm before batching.
        inputs = self.image_processor(video_frames, return_tensors="pt")
        return inputs, torch.tensor(label)



We use a state-of-the-art (SOTA) video classification model from the Hugging Face Transformers library:  
[VideoMAE](https://huggingface.co/docs/transformers/main/en/model_doc/videomae#transformers.VideoMAEForVideoClassification)   
[notebook](https://github.com/huggingface/notebooks/blob/main/examples/video_classification.ipynb)  

In [3]:
from transformers import AutoFeatureExtractor, AutoModelForVideoClassification

# VideoMAE checkpoint already fine-tuned for human-activity classification.
# Extractor and model are loaded from the same checkpoint id.
activity_ckpt = "meermoazzam41/videomae-base-finetuned-human-activity-classification"

extractor = AutoFeatureExtractor.from_pretrained(activity_ckpt)
model = AutoModelForVideoClassification.from_pretrained(activity_ckpt)



In [5]:
from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification, VideoMAEConfig


# Self-supervised VideoMAE base checkpoint; its classification head is newly initialised.
model_ckpt = "MCG-NJU/videomae-base"
config = VideoMAEConfig.from_pretrained(model_ckpt)

# Size the classification head from the label set the dataset class maps to (`target[0]`)
# instead of relying on the config default, and keep readable class names in the config.
label2id = {class_name: i for i, class_name in enumerate(target[0])}
id2label = {i: class_name for class_name, i in label2id.items()}

image_processor = VideoMAEImageProcessor.from_pretrained(model_ckpt)
model = VideoMAEForVideoClassification.from_pretrained(
    model_ckpt,
    label2id=label2id,
    id2label=id2label,
)

Some weights of the model checkpoint at MCG-NJU/videomae-base were not used when initializing VideoMAEForVideoClassification: ['decoder.decoder_layers.3.attention.attention.query.weight', 'decoder.decoder_layers.2.output.dense.bias', 'decoder.decoder_layers.3.attention.output.dense.weight', 'decoder.decoder_layers.1.output.dense.bias', 'decoder.decoder_layers.2.attention.output.dense.bias', 'decoder.decoder_layers.1.intermediate.dense.weight', 'decoder.decoder_layers.3.layernorm_before.weight', 'decoder.decoder_layers.0.layernorm_after.weight', 'decoder.decoder_layers.1.attention.output.dense.weight', 'decoder.decoder_layers.3.attention.attention.value.weight', 'mask_token', 'decoder.decoder_layers.2.attention.output.dense.weight', 'decoder.decoder_layers.2.layernorm_after.weight', 'decoder.decoder_layers.2.layernorm_before.bias', 'decoder.decoder_layers.3.attention.attention.v_bias', 'decoder.decoder_layers.2.output.dense.weight', 'decoder.decoder_layers.0.attention.output.dense.bias'

In [11]:
# Label sheet file name ("response situation - table 1"); the dataset class
# prepends the label folders itself.
path_to_labels = "回应情况-表格 1.csv"
train_dataset = SiblingsVideoDatasetT1(
    path_to_labels=path_to_labels,
    num_of_cls=4,
    image_processor=image_processor,
)
# TODO: no validation dataset is built yet.
# val_dataset = SiblingsVideoDatasetT1(val_video_paths, val_labels, image_processor)

In [13]:
from torch.utils.data import DataLoader

# Shuffled mini-batches of 8 clips for training.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=8,
    shuffle=True,
)
# TODO: enable once a validation dataset exists.
# val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

In [14]:
# `transformers.AdamW` is deprecated in favour of the PyTorch implementation.
from torch.optim import AdamW

# Set up the optimizer.
# eps / weight_decay are pinned to what are believed to be the defaults of the former
# transformers implementation, so the optimisation hyper-parameters stay the same.
optimizer = AdamW(model.parameters(), lr=1e-4, eps=1e-6, weight_decay=0.0)


def _batch_loss(model, inputs, labels):
    """Run one forward pass and return the classification loss.

    The labels must be handed to the model, otherwise ``outputs.loss`` is ``None``.
    """
    batch = dict(inputs)
    pixel_values = batch.get("pixel_values")
    # Presumably each sample arrives with its own leading batch axis of size 1, so the
    # collated tensor has six dimensions; drop that axis when it is present.
    if pixel_values is not None and pixel_values.dim() == 6 and pixel_values.size(1) == 1:
        batch["pixel_values"] = pixel_values.squeeze(1)
    return model(**batch, labels=labels).loss


# The validation loader only exists once its cell is enabled; skip validation otherwise
# instead of failing with a NameError.
eval_loader = globals().get("val_loader")

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        loss = _batch_loss(model, inputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    if eval_loader is None or len(eval_loader) == 0:
        avg_train_loss = train_loss / max(len(train_loader), 1)
        print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}")
        continue

    # Validation loop
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for inputs, labels in eval_loader:
            total_loss += _batch_loss(model, inputs, labels).item()

    avg_loss = total_loss / len(eval_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Validation Loss: {avg_loss:.4f}")



AttributeError: 'VideoMAEImageProcessor' object has no attribute 'read_video'

In [None]:
video_path = "path/to/your/video.mp4"

# The image processor only preprocesses frames that are already decoded; it offers no
# `read_video` method (see the AttributeError raised by the training cell). Fail with
# an actionable message instead of a bare attribute lookup error.
read_video = getattr(image_processor, "read_video", None)
if read_video is None:
    raise AttributeError(
        f"{type(image_processor).__name__!r} object has no attribute 'read_video'; "
        "decode the video into a list of frames with a video-decoding library first"
    )
video_frames = read_video(video_path)

inputs = image_processor(video_frames, return_tensors="pt")

# Make sure train-time behaviour such as dropout is disabled for prediction.
model.eval()
with torch.no_grad():
    logits = model(**inputs).logits

# Index of the highest-scoring class for the single input clip.
predicted_class = logits.argmax(dim=-1).item()