For Pedestrians with Behavior Attributes

In [30]:
import json
import pickle
import os

# Generate one JSON prompt file per pedestrian that carries behavior annotations.
#
# Prompt construction lives in build_behavior_prompts(), which has no side effects;
# all file I/O happens in the guarded section at the bottom of this cell.

annotations_file = "./data_cache/jaad_database.pkl"
images_dir = "./images"
output_main_dir = "./Prompts_for_Pedestrians"

# Upper bound on how many videos (in database iteration order) are processed.
num_videos_to_process = 1
videos_processed = 0

# Integer annotation code -> label inserted into the prompt text.
occlusion_mapping = {0: 'none', 1: 'part', 2: 'full'}
action_mapping = {0: 'standing', 1: 'walking'}
nod_mapping = {0: 'not-nodding', 1: 'nodding'}
look_mapping = {0: 'not-looking', 1: 'looking'}
hand_gesture_mapping = {0: 'undefined', 1: 'greet', 2: 'yield', 3: 'rightofway', 4: 'other'}
reaction_mapping = {0: 'undefined', 1: 'clear_path', 2: 'speed_up', 3: 'slow_down'}
cross_mapping = {0: 'not-crossing', 1: 'crossing', -1: 'irrelevant'}
age_mapping = {0: 'child', 1: 'young', 2: 'adult', 3: 'senior'}
designated_mapping = {0: 'ND', 1: 'D'}
gender_mapping = {0: 'n/a', 1: 'female', 2: 'male'}
intersection_mapping = {0: 'no', 1: 'yes'}
motion_direction_mapping = {0: 'n/a', 1: 'LATITUDE', 2: 'LONGITUDE'}
traffic_direction_mapping = {0: 'OW', 1: 'TW'}
signalized_mapping = {0: 'n/a', 1: 'NS', 2: 'S'}
vehicle_mapping = {0: 'stopped', 1: 'moving_slow', 2: 'moving_fast', 3: 'decelerating', 4: 'accelerating'}
road_type_mapping = {0: 'street', 1: 'parking_lot', 2: 'garage'}
traffic_light_mapping = {0: 'n/a', 1: 'red', 2: 'green'}
pedestrian_crossing_mapping = {0: 'Absent', 1: 'Present'}
pedestrian_sign_mapping = {0: 'Absent', 1: 'Present'}
stop_sign_mapping = {0: 'Absent', 1: 'Present'}


def _label(mapping, values, key, default='Unknown'):
    """Translate ``values[key]`` through ``mapping``; return ``default`` if either lookup misses."""
    try:
        code = values[key]
    except (KeyError, IndexError):
        return default
    return mapping.get(code, default)


def build_behavior_prompts(video_id, video_data):
    """Build the per-frame prompts for every pedestrian of one video that has behavior labels.

    Args:
        video_id: Identifier of the video; used in the prompt text and image path.
        video_data: Annotation record of that video. The keys read here are
            'vehicle_annotations', 'traffic_annotations', 'ped_annotations'
            and 'num_frames'.

    Returns:
        dict mapping pedestrian id -> list of prompt dicts (one per usable frame).
        Pedestrians without a non-empty behavior['action'] list, or for which no
        frame is usable, are left out.

    Raises:
        KeyError: if a required key is missing from ``video_data``, from a
            pedestrian record ('frames', 'bbox'), or if the traffic annotations
            have no entry for frame 0.
    """
    vehicle_annotations = video_data['vehicle_annotations']
    traffic_annotations = video_data['traffic_annotations']
    num_frames = video_data['num_frames']

    # Scene attributes are read once, from the first frame, and reused for every prompt.
    first_frame_traffic_attributes = traffic_annotations[0]
    road_type = road_type_mapping.get(traffic_annotations.get('road_type', 0), 'Unknown')
    pedestrian_crossing = pedestrian_crossing_mapping.get(first_frame_traffic_attributes.get('ped_crossing', 0), 'Unknown')
    pedestrian_sign = pedestrian_sign_mapping.get(first_frame_traffic_attributes.get('ped_sign', 0), 'Unknown')
    stop_sign = stop_sign_mapping.get(first_frame_traffic_attributes.get('stop_sign', 0), 'Unknown')
    traffic_light = traffic_light_mapping.get(first_frame_traffic_attributes.get('traffic_light', 0), 'Unknown')

    prompts_by_pedestrian = {}

    for pedestrian_id, pedestrian_data in video_data['ped_annotations'].items():
        behavior = pedestrian_data.get('behavior') or {}
        actions = behavior.get('action', [])
        if not actions:
            # This cell only covers pedestrians that carry behavior annotations.
            continue

        frames = pedestrian_data['frames']
        # 'bbox' is treated as parallel to 'frames': entry i belongs to frames[i].
        # It must therefore be indexed by list position, never by frame number.
        bboxes = pedestrian_data['bbox']
        motion_direction = motion_direction_mapping.get(
            pedestrian_data.get('attributes', {}).get('motion_direction', 0), 'Unknown')

        pedestrian_prompts = []
        # enumerate() replaces frames.index(frame_num), which rescanned the list per frame.
        for frame_index, frame_num in enumerate(frames):
            if frame_num > num_frames:
                continue
            if frame_index >= len(actions) or frame_index >= len(bboxes):
                continue

            action = action_mapping.get(actions[frame_index], 'Unknown')
            cross = _label(cross_mapping, behavior.get('cross', []), frame_index)
            reaction = _label(reaction_mapping, behavior.get('reaction', []), frame_index)
            hand_gesture = _label(hand_gesture_mapping, behavior.get('hand_gesture', []), frame_index)
            look = _label(look_mapping, behavior.get('look', []), frame_index)
            nod = _label(nod_mapping, behavior.get('nod', []), frame_index)
            # Vehicle state is stored per video frame, so look it up by frame number.
            vehicle = _label(vehicle_mapping, vehicle_annotations, frame_num)

            current_bbox = bboxes[frame_index]
            # Up to five preceding boxes, most recent first (order kept from the original loop).
            previous_bboxes = list(reversed(bboxes[max(frame_index - 5, 0):frame_index]))
            # Up to five following boxes, limited to positions that also exist in 'frames'.
            future_bboxes = list(bboxes[frame_index + 1:min(frame_index + 6, len(frames))])

            prompt = {
                "id": pedestrian_id,
                "image": f"./images_with_boxes_Pedestrians Focused_/{video_id}/Pedestrian_{pedestrian_id}/Pedestrian_{pedestrian_id}_Image_{frame_num}.png",
                "conversations": [
                    {
                        "from": "user",
                        "value": f"<image> Can you provide insights on pedestrian {pedestrian_id} in {video_id}? Specifically, what is the coordinate of the pedestrian at present and the past 5 frames? Is the pedestrian currently engaged in crossing the road? What is the pedestrian's motion direction? Can you tell the pedestrian's action? What is the pedestrian's reaction? Is the pedestrian making any specific hand gestures? Is the pedestrian looking at the vehicle? Is the pedestrian nodding? Also, can you tell me about the vehicle movement? Moreover can you describe the entire traffic scene? Can you also assist me in predicting the trajectories of pedestrians in the next 5 frames?"
                    },
                    {
                        "from": "assistant",
                        "value": f"The pedestrian's current location is within the bounding box: {current_bbox}. The 5 bounding boxes for previous frames are: {previous_bboxes}. The pedestrian is {cross} the road. The pedestrian's motion direction is {motion_direction}. The pedestrian is currently {action}. The pedestrian's reaction is {reaction}. The pedestrian maintains a {hand_gesture} hand gesture throughout the video. The pedestrian is {look} at the vehicle and the pedestrian is {nod}. For the vehicle attributes, it is '{vehicle}'. While, for the scene attributes, the road type is '{road_type}', the pedestrian crossing is '{pedestrian_crossing}', pedestrian sign is '{pedestrian_sign}', stop sign is '{stop_sign}', and the traffic light is '{traffic_light}'. The future 5 bounding boxes are: {future_bboxes}."
                    }
                ]
            }
            pedestrian_prompts.append(prompt)

        if pedestrian_prompts:
            prompts_by_pedestrian[pedestrian_id] = pedestrian_prompts

    return prompts_by_pedestrian


# The guard is true when run as a notebook cell or as a script, so the cell behaves
# as before there; it only stops the I/O from firing when this code is imported.
if __name__ == "__main__":
    # NOTE(security): pickle.load can execute arbitrary code; only open trusted caches.
    with open(annotations_file, 'rb') as f:
        database = pickle.load(f)

    for video_id, video_data in database.items():
        # Check the limit first so no empty directory is created for a skipped video.
        if videos_processed >= num_videos_to_process:
            break

        output_video_dir = os.path.join(output_main_dir, f"{video_id}")
        os.makedirs(output_video_dir, exist_ok=True)

        for pedestrian_id, pedestrian_prompts in build_behavior_prompts(video_id, video_data).items():
            with open(os.path.join(output_video_dir, f'pedestrian_{pedestrian_id}_prompts.json'), 'w') as f:
                json.dump(pedestrian_prompts, f, indent=4)

        videos_processed += 1

    print("Prompts generated and saved to respective folders.")


Prompts generated and saved to respective folders.


For Pedestrians without Behavior Attributes

In [4]:
# Reference only: the bare string literal below is never assigned or evaluated as code,
# so it has no effect at runtime. It holds a prompt dict layout that includes the
# behavior questions -- presumably kept for comparison with the simplified prompt
# built later in this cell.
"""
                    prompt = {
                        "id": pedestrian_counter,
                        "image": f"./images_with_boxes_Pedestrians Focused_/{video_id}/Pedestrian_{pedestrian_id}/Pedestrian_{pedestrian_id}_Image_{frame_num}.png",
                        "conversations": [
                            {
                                "from": "user",
                                "value": f"<image> Can you provide insights on pedestrian {pedestrian_id} in {video_id}? Specifically, what is the coordinate of the pedestrian at present and the past 5 frames? Is the pedestrian currently engaged in crossing the road? What is the pedestrian's motion direction? Can you tell the pedestrian's action? What is the pedestrian's reaction? Is the pedestrian making any specific hand gestures? Is the pedestrian looking at the vehicle? Is the pedestrian nodding? Also, can you tell me about the vehicle movement? Moreover can you describe the entire traffic scene? Can you also assist me in predicting the trajectories of pedestrians in the next 5 frames?"
                            },
                            {
                                "from": "assistant",
                                "value": f"The pedestrian's current location is within the bounding box: {pedestrian_data['bbox'][frame_index]}. The 5 bounding boxes for previous frames are: {previous_bboxes}. The pedestrian is {cross} the road. The pedestrian's motion direction is {motion_direction}. The pedestrian is currently {action}. The pedestrian's reaction is {reaction}. The pedestrian maintains a {hand_gesture} hand gesture throughout the video. The pedestrian is {look} at the vehicle and the pedestrian is {nod}. For the vehicle attributes, it is '{vehicle}'. While, for the scene attributes, the road type is '{road_type}', the pedestrian crossing is '{pedestrian_crossing}', pedestrian sign is '{pedestrian_sign}', stop sign is '{stop_sign}', and the traffic light is '{traffic_light}'. The future 5 bounding boxes are: {future_bboxes}."
                            }
                        ]
                    }"""



import json
import pickle
import os

# Generate one JSON prompt file per pedestrian that has NO behavior annotations.
#
# Prompt construction lives in build_simplified_prompts(), which has no side effects;
# all file I/O happens in the guarded section at the bottom of this cell.

annotations_file = "./data_cache/jaad_database.pkl"
images_dir = "./images"
output_main_dir = "./Prompts_for_Pedestrians"

# Upper bound on how many videos (in database iteration order) are processed.
num_videos_to_process = 1
videos_processed = 0

# Integer annotation code -> label inserted into the prompt text.
occlusion_mapping = {0: 'none', 1: 'part', 2: 'full'}
action_mapping = {0: 'standing', 1: 'walking'}
nod_mapping = {0: 'not-nodding', 1: 'nodding'}
look_mapping = {0: 'not-looking', 1: 'looking'}
hand_gesture_mapping = {0: 'undefined', 1: 'greet', 2: 'yield', 3: 'rightofway', 4: 'other'}
reaction_mapping = {0: 'undefined', 1: 'clear_path', 2: 'speed_up', 3: 'slow_down'}
cross_mapping = {0: 'not-crossing', 1: 'crossing', -1: 'irrelevant'}
age_mapping = {0: 'child', 1: 'young', 2: 'adult', 3: 'senior'}
designated_mapping = {0: 'ND', 1: 'D'}
gender_mapping = {0: 'n/a', 1: 'female', 2: 'male'}
intersection_mapping = {0: 'no', 1: 'yes'}
motion_direction_mapping = {0: 'n/a', 1: 'LATITUDE', 2: 'LONGITUDE'}
traffic_direction_mapping = {0: 'OW', 1: 'TW'}
signalized_mapping = {0: 'n/a', 1: 'NS', 2: 'S'}
vehicle_mapping = {0: 'stopped', 1: 'moving_slow', 2: 'moving_fast', 3: 'decelerating', 4: 'accelerating'}
road_type_mapping = {0: 'street', 1: 'parking_lot', 2: 'garage'}
traffic_light_mapping = {0: 'n/a', 1: 'red', 2: 'green'}
pedestrian_crossing_mapping = {0: 'Absent', 1: 'Present'}
pedestrian_sign_mapping = {0: 'Absent', 1: 'Present'}
stop_sign_mapping = {0: 'Absent', 1: 'Present'}


def _label(mapping, values, key, default='Unknown'):
    """Translate ``values[key]`` through ``mapping``; return ``default`` if either lookup misses."""
    try:
        code = values[key]
    except (KeyError, IndexError):
        return default
    return mapping.get(code, default)


def build_simplified_prompts(video_id, video_data):
    """Build the per-frame prompts for every pedestrian of one video that lacks behavior labels.

    Args:
        video_id: Identifier of the video; used in the prompt text and image path.
        video_data: Annotation record of that video. The keys read here are
            'vehicle_annotations', 'traffic_annotations', 'ped_annotations'
            and 'num_frames'.

    Returns:
        dict mapping pedestrian id -> list of prompt dicts (one per usable frame).
        Pedestrians with a non-empty 'behavior' entry, or for which no frame is
        usable, are left out.

    Raises:
        KeyError: if a required key is missing from ``video_data``, from a
            pedestrian record ('frames', 'bbox'), or if the traffic annotations
            have no entry for frame 0.
    """
    vehicle_annotations = video_data['vehicle_annotations']
    traffic_annotations = video_data['traffic_annotations']
    num_frames = video_data['num_frames']

    # Scene attributes are read once, from the first frame, and reused for every prompt.
    first_frame_traffic_attributes = traffic_annotations[0]
    road_type = road_type_mapping.get(traffic_annotations.get('road_type', 0), 'Unknown')
    pedestrian_crossing = pedestrian_crossing_mapping.get(first_frame_traffic_attributes.get('ped_crossing', 0), 'Unknown')
    pedestrian_sign = pedestrian_sign_mapping.get(first_frame_traffic_attributes.get('ped_sign', 0), 'Unknown')
    stop_sign = stop_sign_mapping.get(first_frame_traffic_attributes.get('stop_sign', 0), 'Unknown')
    traffic_light = traffic_light_mapping.get(first_frame_traffic_attributes.get('traffic_light', 0), 'Unknown')

    prompts_by_pedestrian = {}

    for pedestrian_id, pedestrian_data in video_data['ped_annotations'].items():
        if pedestrian_data.get('behavior'):
            # Pedestrians with behavior labels are handled by the behavior-prompt cell.
            continue

        frames = pedestrian_data['frames']
        # 'bbox' is indexed by position in 'frames', not by frame number.
        bboxes = pedestrian_data['bbox']

        simplified_prompts = []
        # enumerate() replaces frames.index(frame_num), which rescanned the list per frame.
        for frame_index, frame_num in enumerate(frames):
            if frame_num > num_frames or frame_index >= len(bboxes):
                continue

            current_bbox = bboxes[frame_index]
            # Up to five preceding boxes, oldest first (order kept from the original loop).
            previous_bboxes = list(bboxes[max(frame_index - 5, 0):frame_index])
            # Up to five following boxes; slicing clamps at the end of the list.
            future_bboxes = list(bboxes[frame_index + 1:frame_index + 6])
            # Vehicle state is looked up for the frame being described rather than
            # being frozen at the first frame of the video.
            vehicle = _label(vehicle_mapping, vehicle_annotations, frame_num)

            simplified_prompt = {
                "id": pedestrian_id,
                "image": f"./images_with_boxes_Pedestrians Focused_/{video_id}/Pedestrian_{pedestrian_id}/Pedestrian_{pedestrian_id}_Image_{frame_num}.png",
                "conversations": [
                    {
                        "from": "user",
                        # A stray leading ": " before the <image> token was removed so the
                        # question starts the same way as in the behavior prompts.
                        "value": f"<image> Can you provide insights on pedestrian {pedestrian_id} in {video_id}? Specifically, what is the coordinate of the pedestrian at present and for 5 previous frames?  Also, can you tell me about the vehicle movement? Moreover can you describe the entire traffic scene? Can you also assist me in predicting the trajectories of pedestrians in the next 5 frames?"
                    },
                    {
                        "from": "assistant",
                        "value": f"The pedestrian's current location is within the bounding box: {current_bbox}. The 5 bounding boxes for previous frames are: {previous_bboxes}. For the vehicle attributes, it is '{vehicle}'. While, for the scene attributes, the road type is '{road_type}', the pedestrian crossing is '{pedestrian_crossing}', pedestrian sign is '{pedestrian_sign}', stop sign is '{stop_sign}', and the traffic light is '{traffic_light}'. The future 5 bounding boxes are: {future_bboxes}."
                    }
                ]
            }
            simplified_prompts.append(simplified_prompt)

        # Skip pedestrians that produced nothing, so no empty JSON file is written.
        if simplified_prompts:
            prompts_by_pedestrian[pedestrian_id] = simplified_prompts

    return prompts_by_pedestrian


# The guard is true when run as a notebook cell or as a script, so the cell behaves
# as before there; it only stops the I/O from firing when this code is imported.
if __name__ == "__main__":
    # NOTE(security): pickle.load can execute arbitrary code; only open trusted caches.
    with open(annotations_file, 'rb') as f:
        database = pickle.load(f)

    for video_id, video_data in database.items():
        # Check the limit first so no empty directory is created for a skipped video.
        if videos_processed >= num_videos_to_process:
            break

        output_video_dir = os.path.join(output_main_dir, f"{video_id}")
        os.makedirs(output_video_dir, exist_ok=True)

        for pedestrian_id, simplified_prompts in build_simplified_prompts(video_id, video_data).items():
            with open(os.path.join(output_video_dir, f'pedestrian_{pedestrian_id}_prompts.json'), 'w') as f:
                json.dump(simplified_prompts, f, indent=4)

        # Count once per video; the counter used to be bumped once per pedestrian.
        videos_processed += 1

    print("Simplified prompts generated and saved to respective folders.")


Simplified prompts generated and saved to respective folders.
