In [109]:
import json, os, cv2, random, tqdm

# Root of the raw FunQA data (videos and annotation files are both read from
# here) and root under which extracted frames are written.
dataset_path = '/cpfs/shared/research-llm/instruc_data_en/multimodal_instruct_tuning/funqa/raw/'
frame_dir = '/cpfs/shared/research-llm/instruc_data_en/multimodal_instruct_tuning/funqa/frames/'

# Split to convert; it selects the video folder, the per-split frame folder
# and the annotation file name.
split = 'train'
video_dir = os.path.join(dataset_path, split)
frame_dir = os.path.join(frame_dir, split)
os.makedirs(frame_dir, exist_ok=True)

# Read the annotations inside a context manager so the file handle is closed
# as soon as parsing finishes, and decode explicitly as UTF-8 instead of
# relying on the platform's default encoding.
annotation_path = os.path.join(dataset_path, f'annotation_with_ID/funqa_{split}.json')
with open(annotation_path, encoding='utf-8') as annotation_file:
    samples = json.load(annotation_file)
print(len(samples))

219685


In [110]:
import math

def extract_frames(video_path, frame_dir):
    """Sample evenly spaced frames from a video and save them as 336x336 JPEGs.

    A sub-folder named after the video file (basename without extension) is
    created inside ``frame_dir`` and the sampled frames are written there as
    ``frame_0000.jpg``, ``frame_0001.jpg``, ...

    Up to 8 frames are sampled when the video is shorter than 16 seconds and
    up to 16 otherwise. If the video has fewer frames than that, every frame
    is sampled once instead of repeating the first frame.

    Args:
        video_path: Path of the video file to read.
        frame_dir: Directory in which the per-video frame folder is created.

    Returns:
        A tuple ``(frame_paths, frame_seconds)`` of equal-length lists: the
        path of each saved frame and its timestamp in seconds, rounded to two
        decimals. Both lists are empty when the video cannot be opened or
        reports an unusable frame rate / frame count.
    """
    # Create a folder named after the video's basename.
    video_name = os.path.basename(video_path)
    output_folder = os.path.join(frame_dir, os.path.splitext(video_name)[0])
    os.makedirs(output_folder, exist_ok=True)

    # Saved frame paths and the timestamp (seconds) of each saved frame.
    frame_paths = []
    frame_seconds = []

    cap = cv2.VideoCapture(video_path)
    try:
        # Bail out if the video could not be opened; include the path so the
        # failure can be traced back to a specific file.
        if not cap.isOpened():
            print("无法打开视频文件。", video_path)
            return [], []

        # Frame rate and total frame count as reported by the container.
        fps = cap.get(cv2.CAP_PROP_FPS)
        raw_frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT)

        # Reject zero / negative / NaN metadata up front: it would otherwise
        # cause a ZeroDivisionError (or ValueError on int(NaN)) below.
        if not (fps > 0 and raw_frame_count >= 1):
            print(f"Invalid video metadata (fps={fps}, frames={raw_frame_count}): {video_path}")
            return [], []
        total_frames = int(raw_frame_count)

        # Pick the sampling budget from the duration, but never ask for more
        # frames than exist; otherwise the interval becomes 0 and frame 0
        # would be written repeatedly.
        total_seconds = total_frames / fps
        num_frames = 8 if total_seconds < 16 else 16
        num_frames = min(num_frames, total_frames)

        # Stride (in frames) between consecutive samples.
        interval = total_frames // num_frames

        for i in range(num_frames):
            frame_idx = i * interval

            # Seek to the target frame and decode it.
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
            ret, frame = cap.read()
            if not ret:
                break

            # Only record frames that were actually written to disk, so that
            # callers never receive a path to a missing file.
            frame_filename = os.path.join(output_folder, f"frame_{i:04d}.jpg")
            if not cv2.imwrite(frame_filename, cv2.resize(frame, (336, 336))):
                continue

            frame_paths.append(frame_filename)
            # Timestamp of this frame in seconds, kept to two decimals.
            frame_seconds.append(round(frame_idx / fps, 2))
    finally:
        # Release the capture on every exit path, including early returns
        # and exceptions.
        cap.release()

    return frame_paths, frame_seconds

In [115]:
# First character of the video file name -> name of the category sub-folder.
subdir_mapping = {
    'H': 'humor',
    'C': 'creative',
    'M': 'magic'
}

# Private, seeded RNG so the instruction/frames ordering is reproducible
# across runs and the global `random` state is left untouched.
rng = random.Random(42)

# Cache of per-video extraction results: many samples share the same video,
# so each video is decoded at most once.
extracted_videos = {}
all_samples = []
for sample in tqdm.tqdm(samples):
    video_name = sample['visual_input']
    sub_dir = subdir_mapping[video_name[0]]
    video_path = os.path.join(video_dir, f'{split}_{sub_dir}', video_name)

    if video_name not in extracted_videos:
        extracted_videos[video_name] = extract_frames(video_path, frame_dir)
    frame_paths, timestamps = extracted_videos[video_name]

    # One "<timestamp>s: <img_path>...<img_path>" line per extracted frame.
    # NOTE(review): the closing tag is "<img_path>" rather than "</img_path>";
    # kept byte-for-byte here — confirm this matches what the consumer expects.
    # NOTE(review): when extraction fails this string is empty and the sample
    # is still emitted without frames — confirm that is intended.
    video_frames_str = ''.join(
        f'{seconds}s: <img_path>{frame_path}<img_path>\n'
        for seconds, frame_path in zip(timestamps, frame_paths)
    )

    # Randomly place the instruction before or after the frames.
    if rng.random() < 0.5:
        input_str = sample['instruction'] + '\n' + video_frames_str
    else:
        input_str = video_frames_str + sample['instruction']

    # Drop a single leading space from the answer. Work on a local copy so
    # `samples` is not mutated and re-running the cell gives the same result.
    output_str = sample['output']
    if output_str.startswith(' '):
        output_str = output_str[1:]

    # Keep every sample except those whose task code has '1' as its second
    # character.
    if sample['task'][1] != '1':
        all_samples.append({
            'input': input_str,
            'output': output_str
        })

print(len(all_samples))
os.makedirs('converted_datasets/funqa', exist_ok=True)
# Write through a context manager so the file is flushed and closed
# deterministically once the dump completes.
with open(f'converted_datasets/funqa/funqa-{split}.json', 'w', encoding='utf-8') as out_file:
    json.dump(all_samples, out_file, indent=4)

  0%|          | 0/219685 [00:00<?, ?it/s]

100%|██████████| 219685/219685 [00:01<00:00, 110507.84it/s]


204325
