# Creation of Prompts for the Dataset

In [None]:
import json
import pickle

# Mapping of numerical annotation codes to words.
# The gender, age, motion-direction, vehicle and road-type tables use the
# same coding as the "New mapping" tables in the later cells of this
# notebook, which read the same pickle.  The previous tables disagreed with
# them: 'male'/'female' were swapped and the age, vehicle and road-type codes
# were shifted by one.
gender_mapping = {0: 'n/a', 1: 'female', 2: 'male'}
age_mapping = {0: 'child', 1: 'young', 2: 'adult', 3: 'senior'}
# NOTE(review): group_size may be a raw head count rather than a category
# code -- confirm against the dataset before relying on codes 2 and 3.
group_size_mapping = {0: 'Unknown', 1: 'Alone', 2: 'Small Group', 3: 'Large Group'}
motion_direction_mapping = {0: 'n/a', 1: 'LATITUDE', 2: 'LONGITUDE'}
vehicle_action_mapping = {0: 'stopped', 1: 'moving_slow', 2: 'moving_fast', 3: 'decelerating', 4: 'accelerating'}
presence_absence_mapping = {0: 'Absent', 1: 'Present'}
road_type_mapping = {0: 'street', 1: 'parking_lot', 2: 'garage'}

# Where the pickled annotation database lives
pkl_file_path = "./data_cache/jaad_database.pkl"

# Where the generated prompts are written
json_file_path = "./jaad_prompts_final.json"

# Deserialize the annotation database.
# NOTE: unpickling can execute arbitrary code -- only load a cache file that
# was produced locally.
with open(pkl_file_path, 'rb') as pkl_handle:
    data = pickle.load(pkl_handle)

# Collects one prompt string per sampled frame
prompts = list()

# Build one prompt for every 50th frame of every video.
for vid, vid_data in data.items():
    num_frames = vid_data['num_frames']
    pedestrian_annotations = vid_data.get('ped_annotations', {})
    vehicle_annotations = vid_data.get('vehicle_annotations', {})
    traffic_by_frame = vid_data.get('traffic_annotations', {})
    # The road type is read once per video from the top level of the traffic
    # annotations, which is where the later cells of this notebook read it.
    road_type = road_type_mapping.get(traffic_by_frame.get('road_type', 0), 'Unknown')

    for frame_num in range(0, num_frames, 50):
        # Describe every pedestrian that is annotated in this frame.
        pedestrian_lines = []
        for ped_data in pedestrian_annotations.values():
            ped_frames = ped_data.get('frames', [])
            if frame_num not in ped_frames:
                continue
            # 'bbox' is indexed by the position of the frame inside 'frames'
            # (as the later cells do), not by the frame number itself.
            frame_pos = ped_frames.index(frame_num)
            bboxes = ped_data.get('bbox', [])
            bbox_info = bboxes[frame_pos] if frame_pos < len(bboxes) else "Not available"

            attributes = ped_data.get('attributes', {})
            # The 'Unknown' default keeps unmapped codes from printing "None".
            pedestrian_lines += [
                f"    - Gender: {gender_mapping.get(attributes.get('gender', 0), 'Unknown')}",
                f"    - Age: {age_mapping.get(attributes.get('age', 0), 'Unknown')}",
                f"    - Group Size: {group_size_mapping.get(attributes.get('group_size', 0), 'Unknown')}",
                f"    - Motion Direction: {motion_direction_mapping.get(attributes.get('motion_direction', 0), 'Unknown')}",
                f"    - Bounding Box: {bbox_info}",
            ]

        # Report presence only when a pedestrian really is annotated here.
        presence = presence_absence_mapping[1 if pedestrian_lines else 0]
        lines = [
            f"1. We are working on {vid}.",
            f"2. We examine each pedestrian's behavior in frame {frame_num}. In this frame, pedestrians are {presence}.",
        ]
        lines.extend(pedestrian_lines)

        # Vehicle action for this frame, if one is annotated.
        vehicle_action = vehicle_annotations.get(frame_num, 'Unknown')
        if vehicle_action != 'Unknown':
            vehicle_action_word = vehicle_action_mapping.get(vehicle_action, 'Unknown')
            lines.append(f"3. Next, we inspect vehicle actions observed in the video frames. In this frame, the vehicle is {vehicle_action_word}.")
        else:
            lines.append("3. There are no vehicle annotations available.")

        # Traffic-related attributes for the current frame.
        frame_traffic = traffic_by_frame.get(frame_num, {})
        lines.append(f"4. Finally, we explore traffic-related attributes. In this frame, the road type is {road_type}. It has the following attributes:")
        for label, key in (
            ("Pedestrian Crossings", 'ped_crossing'),
            ("Pedestrian Signs", 'ped_sign'),
            ("Stop Signs", 'stop_sign'),
            ("Traffic Lights", 'traffic_light'),
        ):
            state = presence_absence_mapping.get(frame_traffic.get(key, 0), 'Unknown')
            lines.append(f"    - {label}: {state}")

        # Every line, including the last one, ends with a newline.
        prompts.append("\n".join(lines) + "\n")

# Write prompts to a JSON file.
# json.dump stores the list as a JSON array of strings, so the ".json" file
# can be read back with json.load.  Joining the prompts with newlines wrote
# plain text that no JSON parser accepts.
with open(json_file_path, 'w') as json_file:
    json.dump(prompts, json_file, indent=4)

print("Prompts saved to", json_file_path)


Prompts for the entire video, focusing only on the first pedestrian of each video


In [None]:
import json
import pickle

# Mapping of numerical annotation codes to words.
# These tables follow the coding of the "New mapping" tables that the later
# cells of this notebook apply to the same pickle; the earlier values
# disagreed with them (gender labels swapped, age / vehicle / road-type codes
# shifted by one).
gender_mapping = {0: 'n/a', 1: 'female', 2: 'male'}
age_mapping = {0: 'child', 1: 'young', 2: 'adult', 3: 'senior'}
# NOTE(review): group_size may be a raw head count rather than a category
# code -- confirm against the dataset before relying on codes 2 and 3.
group_size_mapping = {0: 'Unknown', 1: 'Alone', 2: 'Small Group', 3: 'Large Group'}
motion_direction_mapping = {0: 'n/a', 1: 'LATITUDE', 2: 'LONGITUDE'}
vehicle_action_mapping = {0: 'stopped', 1: 'moving_slow', 2: 'moving_fast', 3: 'decelerating', 4: 'accelerating'}
presence_absence_mapping = {0: 'Absent', 1: 'Present'}
road_type_mapping = {0: 'street', 1: 'parking_lot', 2: 'garage'}

# Where the pickled annotation database lives
pkl_file_path = "./data_cache/jaad_database.pkl"

# Where the generated prompts are written
json_file_path = "./jaad_prompts_revised.json"

# Deserialize the annotation database.
# NOTE: unpickling can execute arbitrary code -- only load a cache file that
# was produced locally.
with open(pkl_file_path, 'rb') as pkl_handle:
    data = pickle.load(pkl_handle)

# Collects one prompt string per video
all_video_prompts = list()

# Build one prompt per video, describing the first pedestrian that is
# annotated in the video's middle frame.
for video_id, video_data in data.items():
    middle_frame_num = video_data['num_frames'] // 2

    # First pedestrian whose track contains the middle frame; videos without
    # such a pedestrian produce no prompt.
    first_pedestrian = next(
        (
            ped_data
            for ped_data in video_data.get('ped_annotations', {}).values()
            if middle_frame_num in ped_data.get('frames', [])
        ),
        None,
    )
    if first_pedestrian is None:
        continue

    # Pedestrian attributes now come from the annotations instead of the
    # fixed "Alone" / "Backward" placeholders.
    attributes = first_pedestrian.get('attributes', {})
    pedestrian_type = group_size_mapping.get(attributes.get('group_size', 0), 'Unknown')
    motion_direction = motion_direction_mapping.get(attributes.get('motion_direction', 0), 'Unknown')

    # Vehicle action in the middle frame, translated from its numeric code so
    # the prompt no longer shows the raw number.
    vehicle_code = video_data.get('vehicle_annotations', {}).get(middle_frame_num, 'Unknown')
    vehicle_action = vehicle_action_mapping.get(vehicle_code, 'Unknown')

    # Traffic data: the road type is read from the top level of the traffic
    # annotations (as the later cells do), the rest from the middle frame.
    traffic_annotations = video_data.get('traffic_annotations', {})
    middle_frame_traffic = traffic_annotations.get(middle_frame_num, {})
    road_type = road_type_mapping.get(traffic_annotations.get('road_type', 0), 'Unknown')
    pedestrian_crossings = presence_absence_mapping.get(middle_frame_traffic.get('ped_crossing', 0), 'Unknown')
    pedestrian_signs = presence_absence_mapping.get(middle_frame_traffic.get('ped_sign', 0), 'Unknown')
    stop_signs = presence_absence_mapping.get(middle_frame_traffic.get('stop_sign', 0), 'Unknown')
    traffic_lights = presence_absence_mapping.get(middle_frame_traffic.get('traffic_light', 0), 'Unknown')

    # Generate the prompt for the current video
    prompt = (
        "Role: You are tasked with enhancing the pedestrian detection system for an autonomous vehicle. Develop a strategy to improve the accuracy and efficiency of pedestrian detection while minimizing false positives.\n\n"
        "Context:\n"
        f"• Pedestrian Behavior: We are examining {video_id}. In this video, the pedestrians are Present and have the following attributes:\n"
        f"  - Types of Pedestrians: {pedestrian_type}\n"
        f"  - Motion Direction: {motion_direction}\n"
        f"• Vehicle Actions: Now we examine the vehicle actions. In this video, the vehicle is {vehicle_action}.\n"
        f"• Traffic Attributes: Now we examine the traffic attributes. In this video, the road type is {road_type}, with the following attributes:\n"
        f"  - Road type: {road_type}\n"
        f"  - Pedestrian Crossings: {pedestrian_crossings}\n"
        f"  - Pedestrian Signs: {pedestrian_signs}\n"
        f"  - Stop Signs: {stop_signs}\n"
        f"  - Traffic Lights: {traffic_lights}\n\n"
        "Task:\n"
        "• Analysis: Review pedestrian behavior and vehicle status in the entire video.\n"
        # "trajectories of predictions" was a slip for "pedestrians"
        "• Trajectory Prediction: Predict the trajectories of pedestrians in the next 3 seconds or 6 next bounding boxes.\n\n"
    )

    # Append the prompt to the list of all video prompts
    all_video_prompts.append(prompt)

# Write prompts to a JSON file.
# json.dump stores the prompts as a JSON array of strings, so the ".json"
# file can be read back with json.load.  Writing the prompts back to back
# produced plain text that no JSON parser accepts.
with open(json_file_path, 'w') as json_file:
    json.dump(all_video_prompts, json_file, indent=4)

print("Prompts saved to", json_file_path)

In [None]:
import json
import pickle
import os

# Pickled annotation database
annotations_file = "./data_cache/jaad_database.pkl"

# Root directory holding one sub-directory of frame images per video
images_dir = "./images"

# Root of the per-video / per-pedestrian output directory tree
output_main_dir = "./images_with_boxes(Pedestrians Focused)"

# Collects one conversation prompt per pedestrian
all_video_prompts = list()

# Deserialize the database.
# NOTE: unpickling can execute arbitrary code -- only load a cache file that
# was produced locally.
with open(annotations_file, 'rb') as db_handle:
    database = pickle.load(db_handle)

# Upper bound on the number of videos handled, and the running count
num_videos_to_process = 10
videos_processed = 0

# Upper bound on the number of images referenced per pedestrian
max_images_per_pedestrian = 5

# New mapping: code -> label lookup tables.  Where the codes run 0, 1, 2, ...
# each code is simply the label's position in the tuple.
occlusion_mapping = dict(enumerate(('none', 'part', 'full')))
action_mapping = dict(enumerate(('standing', 'walking')))
nod_mapping = dict(enumerate(('undefined', 'nodding')))
look_mapping = dict(enumerate(('not-looking', 'looking')))
hand_gesture_mapping = dict(enumerate(('undefined', 'greet', 'yield', 'rightofway', 'other')))
reaction_mapping = dict(enumerate(('undefined', 'clear_path', 'speed_up', 'slow_down')))
# 'cross' also uses the code -1, so it is spelled out explicitly
cross_mapping = {
    0: 'not-crossing',
    1: 'crossing',
    -1: 'irrelevant',
}
age_mapping = dict(enumerate(('child', 'young', 'adult', 'senior')))
designated_mapping = dict(enumerate(('ND', 'D')))
gender_mapping = dict(enumerate(('n/a', 'female', 'male')))
intersection_mapping = dict(enumerate(('no', 'yes')))
motion_direction_mapping = dict(enumerate(('n/a', 'LATITUDE', 'LONGITUDE')))
traffic_direction_mapping = dict(enumerate(('OW', 'TW')))
signalized_mapping = dict(enumerate(('n/a', 'NS', 'S')))
vehicle_mapping = dict(enumerate(('stopped', 'moving_slow', 'moving_fast', 'decelerating', 'accelerating')))
road_type_mapping = dict(enumerate(('street', 'parking_lot', 'garage')))
traffic_light_mapping = dict(enumerate(('n/a', 'red', 'green')))
# Three separate (equal) presence tables, one per scene element
pedestrian_crossing_mapping = dict(enumerate(('Absent', 'Present')))
pedestrian_sign_mapping = dict(enumerate(('Absent', 'Present')))
stop_sign_mapping = dict(enumerate(('Absent', 'Present')))

def _code_at(series, position):
    """Return series[position], or None when that position is not annotated."""
    try:
        return series[position]
    except (IndexError, KeyError, TypeError):
        return None


# Build one conversation prompt per pedestrian for the first
# `num_videos_to_process` videos.
for video_id, video_data in database.items():
    if videos_processed >= num_videos_to_process:
        break

    # Create a directory for the current video
    output_video_dir = os.path.join(output_main_dir, f"{video_id}")
    os.makedirs(output_video_dir, exist_ok=True)

    # Get the number of frames in the video
    num_frames = video_data['num_frames']

    # Vehicle and scene annotations belong to the video, not to the
    # pedestrian record; looking them up on the pedestrian always fell back
    # to code 0.
    vehicle_annotations = video_data.get('vehicle_annotations', {})
    traffic_annotations = video_data.get('traffic_annotations', {})
    road_type = road_type_mapping.get(traffic_annotations.get('road_type', 0), 'Unknown')

    # Process each pedestrian in the video
    for pedestrian_id, pedestrian_data in video_data['ped_annotations'].items():
        frames = pedestrian_data['frames']

        # Everything below describes the pedestrian at the middle of its track.
        middle_frame_index = len(frames) // 2
        middle_frame_num = _code_at(frames, middle_frame_index)
        middle_frame_attributes = pedestrian_data['attributes']

        age = age_mapping.get(middle_frame_attributes.get('age', 0), 'Unknown')
        gender = gender_mapping.get(middle_frame_attributes.get('gender', 0), 'Unknown')
        motion_direction = motion_direction_mapping.get(middle_frame_attributes.get('motion_direction', 0), 'Unknown')

        # Behaviour labels are per-frame lists under 'behavior', indexed by
        # position in `frames` (the layout the per-frame cell of this
        # notebook reads).  A missing label maps to 'Unknown'.
        behavior = pedestrian_data.get('behavior', {})
        action = action_mapping.get(_code_at(behavior.get('action', []), middle_frame_index), 'Unknown')
        cross = cross_mapping.get(_code_at(behavior.get('cross', []), middle_frame_index), 'Unknown')
        reaction = reaction_mapping.get(_code_at(behavior.get('reaction', []), middle_frame_index), 'Unknown')
        hand_gesture = hand_gesture_mapping.get(_code_at(behavior.get('hand_gesture', []), middle_frame_index), 'Unknown')
        look = look_mapping.get(_code_at(behavior.get('look', []), middle_frame_index), 'Unknown')
        nod = nod_mapping.get(_code_at(behavior.get('nod', []), middle_frame_index), 'Unknown')

        # Vehicle and per-frame scene attributes are keyed by frame number.
        vehicle = vehicle_mapping.get(vehicle_annotations.get(middle_frame_num), 'Unknown')
        frame_traffic = traffic_annotations.get(middle_frame_num, {})
        pedestrian_crossing = pedestrian_crossing_mapping.get(frame_traffic.get('ped_crossing', 0), 'Unknown')
        pedestrian_sign = pedestrian_sign_mapping.get(frame_traffic.get('ped_sign', 0), 'Unknown')
        stop_sign = stop_sign_mapping.get(frame_traffic.get('stop_sign', 0), 'Unknown')
        traffic_light = traffic_light_mapping.get(frame_traffic.get('traffic_light', 0), 'Unknown')

        # Create a directory for the current pedestrian inside the output_video_dir
        output_pedestrian_dir = os.path.join(output_video_dir, f"Pedestrian_{pedestrian_id}")
        os.makedirs(output_pedestrian_dir, exist_ok=True)

        # Pick up to `max_images_per_pedestrian` existing images, one every
        # 10 frames (frame numbers 1, 11, 21, ...).
        image_paths = []
        for frame_num in frames:
            if len(image_paths) >= max_images_per_pedestrian:
                break
            if frame_num > num_frames or frame_num % 10 != 1:
                continue
            image_file = f"{frame_num:05d}.png"
            image_path = os.path.join(images_dir, video_id, image_file)
            if not os.path.exists(image_path):
                print(f"Image file {image_file} does not exist for video {video_id}. Skipping...")
                continue
            image_paths.append(image_path)

        # Pad the image_paths list with empty strings if needed
        image_paths += [""] * (max_images_per_pedestrian - len(image_paths))

        # The first user turn interpolates these two names, which were never
        # assigned, so building the prompt raised NameError.
        input_images_line = ", ".join(
            f"Picture {number}: <img>{path}</img>"
            for number, path in enumerate(image_paths, start=1)
        )
        middle_bbox = _code_at(pedestrian_data.get('bbox', []), middle_frame_index)
        if middle_bbox is None:
            bounding_box_info = ""
        else:
            bounding_box_info = f" The pedestrian of interest is in the bounding box: {middle_bbox}."

        # Generate the prompt for the current pedestrian
        prompt = {
            "Video ID": video_id,
            "Ped_id": pedestrian_id,
            "conversations": [
                {
                    "from": "user",
                    "value": f"Role: You are an autonomous vehicle that uses front-camera images to interact with pedestrians. Input: {input_images_line}. Above are 5 sequential ego-vehicle front-camera view images extraced from a 2 second video that you can see behind the wheel.{bounding_box_info}\nTask: predict the trajectory of the pedestrian of interest for the next 1 second.\nExpected output: coordinates of 6 bounding box indicating the trajectory of pedestrain for the next 1 second(in the form of [((al1,bl1), (ar1,br1))],[((al6, b6), (ar6, br6))])."
                },
                {
                    "from": "user",
                    "value": "Role: You are tasked with enhancing the pedestrian detection and its immediate trajectory prediction system for an autonomous vehicle. Firstly, can you provide insights on pedestrian {} in {}, as well as the vehicle movement and the entire traffic scene?".format(pedestrian_id, video_id)
                },
                {
                    "from": "assistant",
                    "value": "Of course. The pedestrian identified as '{}' is of the age group: {}. The gender is: {}. Its motion direction is {}, and it is currently {}. The pedestrian is {} the road, while its reaction is {}. Also, The pedestrian maintains a {} hand gesture throughout. The pedestrian is {}. The pedestrian nodding state is {}. For the vehicle, it is '{}'. Moreover, for the scene attributes, the road type is '{}', the pedestrian crossing is '{}', pedestrian sign is '{}', stop sign is '{}', and the traffic light is '{}'.".format(
                        pedestrian_id, age, gender, motion_direction, action, cross, reaction, hand_gesture, look,  nod, vehicle, road_type, pedestrian_crossing, pedestrian_sign, stop_sign, traffic_light)
                },
                {
                    "from": "user",
                    "value": "Thank you for the information. Can you also assist me in predicting the trajectories of pedestrians in the next 0.5 and 1 seconds?"
                },
                {
                    "from": "assistant",
                    "value": "Sure, I can help with that. Let's analyze the pedestrian behavior and predict their trajectories for the next 0.5 and 1 seconds."
                }
            ]
        }

        # Append the prompt to the list of all video prompts
        all_video_prompts.append(prompt)

    # Increment the counter for videos processed
    videos_processed += 1

# Serialize every collected prompt into one indented JSON document
with open('output_prompts_updated.json', 'w') as out_file:
    json.dump(obj=all_video_prompts, fp=out_file, indent=4)

# Tell the user where the result went
print("Prompts generated and saved to 'output_prompts_updated.json'.")


In [None]:
import json
import pickle
import os

# Pickled annotation database
annotations_file = "./data_cache/jaad_database.pkl"

# Root directory holding one sub-directory of frame images per video
images_dir = "./images"

# Root of the per-video / per-pedestrian output directory tree
output_main_dir = "./images_with_boxes(Pedestrians Focused)"

# Collects one conversation prompt per pedestrian
all_video_prompts = list()

# Deserialize the database.
# NOTE: unpickling can execute arbitrary code -- only load a cache file that
# was produced locally.
with open(annotations_file, 'rb') as db_handle:
    database = pickle.load(db_handle)

# Only the first `num_videos_to_process` videos are handled
num_videos_to_process = 5
videos_processed = 0

# Upper bound on the number of images referenced per pedestrian
max_images_per_pedestrian = 5

# Mapping of numerical annotation codes to words.
# These tables follow the coding of the "New mapping" tables that the other
# cells of this notebook apply to the same pickle; the earlier values
# disagreed with them (gender labels swapped, age / vehicle / road-type codes
# shifted by one).
gender_mapping = {0: 'n/a', 1: 'female', 2: 'male'}
age_mapping = {0: 'child', 1: 'young', 2: 'adult', 3: 'senior'}
# NOTE(review): group_size may be a raw head count rather than a category
# code -- confirm against the dataset before relying on codes 2 and 3.
group_size_mapping = {0: 'Unknown', 1: 'Alone', 2: 'Small Group', 3: 'Large Group'}
motion_direction_mapping = {0: 'n/a', 1: 'LATITUDE', 2: 'LONGITUDE'}
vehicle_action_mapping = {0: 'stopped', 1: 'moving_slow', 2: 'moving_fast', 3: 'decelerating', 4: 'accelerating'}
presence_absence_mapping = {0: 'Absent', 1: 'Present'}
road_type_mapping = {0: 'street', 1: 'parking_lot', 2: 'garage'}

# Build one conversation prompt per pedestrian for the first
# `num_videos_to_process` videos.
for video_id, video_data in database.items():
    if videos_processed >= num_videos_to_process:
        break

    # Create a directory for the current video
    output_video_dir = os.path.join(output_main_dir, f"{video_id}")
    os.makedirs(output_video_dir, exist_ok=True)

    # Get the number of frames in the video
    num_frames = video_data['num_frames']

    vehicle_annotations = video_data['vehicle_annotations']
    video_traffic = video_data['traffic_annotations']
    # The road type is read from the top level of the traffic annotations,
    # as the per-frame cell of this notebook does.
    road_type = road_type_mapping.get(video_traffic.get('road_type', 0), 'Unknown')

    # Process each pedestrian in the video
    for pedestrian_id, pedestrian_data in video_data['ped_annotations'].items():
        frames = pedestrian_data['frames']

        # Everything below describes the pedestrian at the middle of its track.
        middle_frame_index = len(frames) // 2
        # Vehicle and traffic annotations are keyed by frame number, so the
        # list position has to be translated first.
        middle_frame_num = frames[middle_frame_index] if frames else None
        middle_frame_attributes = pedestrian_data['attributes']

        age = age_mapping.get(middle_frame_attributes.get('age', 0), 'Unknown')
        gender = gender_mapping.get(middle_frame_attributes.get('gender', 0), 'Unknown')
        motion_direction = motion_direction_mapping.get(middle_frame_attributes.get('motion_direction', 0), 'Unknown')

        # Vehicle action at the middle frame (frame number -> action code)
        vehicle_action = vehicle_action_mapping.get(vehicle_annotations.get(middle_frame_num, 0), 'Unknown')

        # Scene attributes at the middle frame
        traffic_annotations = video_traffic.get(middle_frame_num, {})
        ped_crossing = presence_absence_mapping.get(traffic_annotations.get('ped_crossing', 0), 'Unknown')
        ped_sign = presence_absence_mapping.get(traffic_annotations.get('ped_sign', 0), 'Unknown')
        stop_sign = presence_absence_mapping.get(traffic_annotations.get('stop_sign', 0), 'Unknown')
        # NOTE(review): a traffic-light code other than 0/1 is reported as
        # 'Unknown' by this presence table -- confirm the intended wording.
        traffic_light = presence_absence_mapping.get(traffic_annotations.get('traffic_light', 0), 'Unknown')

        # Create a directory for the current pedestrian
        output_pedestrian_dir = os.path.join(output_video_dir, f"Pedestrian_{pedestrian_id}")
        os.makedirs(output_pedestrian_dir, exist_ok=True)

        # Pick up to `max_images_per_pedestrian` existing images, one every
        # 10 frames (frame numbers 1, 11, 21, ...).
        image_paths = []
        for frame_num in frames:
            if len(image_paths) >= max_images_per_pedestrian:
                break
            if frame_num > num_frames or frame_num % 10 != 1:
                continue
            image_file = f"{frame_num:05d}.png"
            image_path = os.path.join(images_dir, video_id, image_file)
            if not os.path.exists(image_path):
                print(f"Image file {image_file} does not exist for video {video_id}. Skipping...")
                continue
            image_paths.append(image_path)

        # The picture template below has five slots; pad with empty strings so
        # a pedestrian with fewer images does not raise IndexError in format().
        image_paths += [""] * (max_images_per_pedestrian - len(image_paths))

        # Generate the prompt for the current pedestrian
        prompt = {
            "Video ID": video_id,
            "Ped_id": pedestrian_id,
            "conversations": [
                {
                    "from": "user",
                    "value": " Picture 1: <img>{}</img>\nPicture 2: <img>{}</img>\nPicture 3: <img>{}</img>\nPicture 4: <img>{}</img>\nPicture 5: <img>{}</img>".format(*image_paths)
                },
                {
                    "from": "user",
                    "value": "Role: You are tasked with enhancing the pedestrian detection and its immediate trajectory prediction system for an autonomous vehicle. Firstly, can you provide insights on pedestrian {} in {}, as well as the vehicle movement and the entire traffic scene?".format(pedestrian_id, video_id)
                },
                {
                    "from": "assistant",
                    # NOTE(review): the behaviour sentences are fixed text and
                    # are not derived from the annotations.  The vehicle and
                    # scene sentence now reports the values looked up above
                    # instead of the 'a/no' placeholders.
                    "value": "Of course. The pedestrian identified as '{}' exhibits the following characteristics:\n- Age: {}\n- Gender: {}\n- Motion Direction: {}\nThe pedestrian does not cross the road in any frame.\nThere is no reaction observed from the pedestrian in any frame.\nThe pedestrian maintains a neutral hand gesture throughout.\nThe pedestrian is not looking in any frame.\nNo action is taken by the pedestrian in any frame.\nThere is no nodding observed from the pedestrian in any frame.\nFor the vehicle, it is '{}'. Moreover, for the scene attributes, the road type is '{}', the pedestrian crossing is '{}', pedestrian sign is '{}', stop sign is '{}', and the traffic light is '{}'.".format(
                        pedestrian_id, age, gender, motion_direction, vehicle_action, road_type, ped_crossing, ped_sign, stop_sign, traffic_light)
                },
                {
                    "from": "user",
                    "value": "Thank you for the information. Can you also assist me in predicting the trajectories of pedestrians in the next 0.5 and 1 seconds?"
                },
                {
                    "from": "assistant",
                    "value": "Sure, I can help with that. Let's analyze the pedestrian behavior and predict their trajectories for the next 0.5 and 1 seconds."
                }
            ]
        }

        # Append the prompt to the list of all video prompts
        all_video_prompts.append(prompt)

    # Increment the counter for videos processed
    videos_processed += 1

# Serialize every collected prompt into one indented JSON document
with open('output_prompts_old.json', 'w') as out_file:
    json.dump(obj=all_video_prompts, fp=out_file, indent=4)

# Tell the user where the result went
print("Prompts generated and saved to 'output_prompts_old.json'.")


Five prompts per pedestrian, sampled at an interval of 10 frames, for every pedestrian in every video.

In [None]:
import json
import pickle
import os

# Pickled annotation database
annotations_file = "./data_cache/jaad_database.pkl"

# Root directory holding the frame images
images_dir = "./images"

# Root of the per-video / per-pedestrian image directory tree
output_main_dir = "./images_with_boxes(Pedestrians Focused)"

# Deserialize the database.
# NOTE: unpickling can execute arbitrary code -- only load a cache file that
# was produced locally.
with open(annotations_file, 'rb') as db_handle:
    database = pickle.load(db_handle)

# Only the first `num_videos_to_process` videos are handled
num_videos_to_process = 1
videos_processed = 0

# Upper bound on the number of sampled frames per pedestrian
max_frames_per_pedestrian = 5

# Code -> label lookup tables.  Where the codes run 0, 1, 2, ... each code is
# simply the label's position in the tuple.
occlusion_mapping = dict(enumerate(('none', 'part', 'full')))
action_mapping = dict(enumerate(('standing', 'walking')))
nod_mapping = dict(enumerate(('undefined', 'nodding')))
look_mapping = dict(enumerate(('not-looking', 'looking')))
hand_gesture_mapping = dict(enumerate(('undefined', 'greet', 'yield', 'rightofway', 'other')))
reaction_mapping = dict(enumerate(('undefined', 'clear_path', 'speed_up', 'slow_down')))
# 'cross' also uses the code -1, so it is spelled out explicitly
cross_mapping = {
    0: 'not-crossing',
    1: 'crossing',
    -1: 'irrelevant',
}
age_mapping = dict(enumerate(('child', 'young', 'adult', 'senior')))
designated_mapping = dict(enumerate(('ND', 'D')))
gender_mapping = dict(enumerate(('n/a', 'female', 'male')))
intersection_mapping = dict(enumerate(('no', 'yes')))
motion_direction_mapping = dict(enumerate(('n/a', 'LATITUDE', 'LONGITUDE')))
traffic_direction_mapping = dict(enumerate(('OW', 'TW')))
signalized_mapping = dict(enumerate(('n/a', 'NS', 'S')))
vehicle_mapping = dict(enumerate(('stopped', 'moving_slow', 'moving_fast', 'decelerating', 'accelerating')))
road_type_mapping = dict(enumerate(('street', 'parking_lot', 'garage')))
traffic_light_mapping = dict(enumerate(('n/a', 'red', 'green')))
# Three separate (equal) presence tables, one per scene element
pedestrian_crossing_mapping = dict(enumerate(('Absent', 'Present')))
pedestrian_sign_mapping = dict(enumerate(('Absent', 'Present')))
stop_sign_mapping = dict(enumerate(('Absent', 'Present')))

# Collects one conversation prompt per sampled pedestrian frame
all_video_prompts = list()

def _behavior_code(behavior, name, position):
    """Return behavior[name][position], or None when it is not annotated."""
    series = behavior.get(name, [])
    return series[position] if position < len(series) else None


# Build one conversation prompt per sampled frame of every pedestrian: every
# 10th annotated frame of a track, at most `max_frames_per_pedestrian` each.
for video_id, video_data in database.items():
    output_video_dir = os.path.join(output_main_dir, f"{video_id}")
    if videos_processed >= num_videos_to_process:
        break

    vehicle_annotations = video_data['vehicle_annotations']
    traffic_annotations = video_data['traffic_annotations']
    ped_annotations = video_data['ped_annotations']

    # NOTE(review): the scene attributes of frame 0 are reused for every
    # prompt of the video, whichever frame the prompt describes -- confirm
    # this is intended.
    first_frame_index = 0
    first_frame_traffic_attributes = traffic_annotations.get(first_frame_index, {})
    road_type = road_type_mapping.get(traffic_annotations.get('road_type', 0), 'Unknown')
    pedestrian_crossing = pedestrian_crossing_mapping.get(first_frame_traffic_attributes.get('ped_crossing', 0), 'Unknown')
    pedestrian_sign = pedestrian_sign_mapping.get(first_frame_traffic_attributes.get('ped_sign', 0), 'Unknown')
    stop_sign = stop_sign_mapping.get(first_frame_traffic_attributes.get('stop_sign', 0), 'Unknown')
    traffic_light = traffic_light_mapping.get(first_frame_traffic_attributes.get('traffic_light', 0), 'Unknown')

    num_frames = video_data['num_frames']

    for pedestrian_id, pedestrian_data in ped_annotations.items():
        output_pedestrian_dir = os.path.join(output_video_dir, f"Pedestrian_{pedestrian_id}")

        # Per-pedestrian attributes do not change from frame to frame.
        attributes = pedestrian_data['attributes']
        age = age_mapping.get(attributes.get('age', 0), 'Unknown')
        gender = gender_mapping.get(attributes.get('gender', 0), 'Unknown')
        motion_direction = motion_direction_mapping.get(attributes.get('motion_direction', 0), 'Unknown')

        frames = pedestrian_data['frames']
        behavior = pedestrian_data.get('behavior', {})
        actions = behavior.get('action', [])

        # Positions 0, 10, 20, ... inside the pedestrian's frame list, capped
        # at `max_frames_per_pedestrian` samples.
        for frame_index in range(0, len(frames), 10)[:max_frames_per_pedestrian]:
            frame_num = frames[frame_index]
            # Skip frames beyond the video or without an action label.
            if frame_num > num_frames or frame_index >= len(actions):
                continue

            action = action_mapping.get(actions[frame_index], 'Unknown')
            cross = cross_mapping.get(_behavior_code(behavior, 'cross', frame_index), 'Unknown')
            reaction = reaction_mapping.get(_behavior_code(behavior, 'reaction', frame_index), 'Unknown')
            hand_gesture = hand_gesture_mapping.get(_behavior_code(behavior, 'hand_gesture', frame_index), 'Unknown')
            look = look_mapping.get(_behavior_code(behavior, 'look', frame_index), 'Unknown')
            nod = nod_mapping.get(_behavior_code(behavior, 'nod', frame_index), 'Unknown')

            # Vehicle annotations are keyed by frame number; indexing them
            # with the position in the pedestrian's frame list reported the
            # vehicle state of an unrelated frame.
            vehicle = vehicle_mapping.get(vehicle_annotations.get(frame_num), 'Unknown')

            # 'bbox' is parallel to 'frames', hence indexed by position.
            bbox = pedestrian_data['bbox'][frame_index]
            picture_path = os.path.join(output_pedestrian_dir, f"Pedestrian_{pedestrian_id}_Image_{frame_num}.png")

            prompt = {
                "Video ID": video_id,
                "Ped_id": pedestrian_id,
                "Frame Number": frame_num,
                "conversations": [
                    {
                        "from": "user",
                        "value": f" Picture: {picture_path}"
                    },
                    {
                        "from": "user",
                        "value": f"Role: You are tasked with enhancing the pedestrian detection and its immediate trajectory prediction system for an autonomous vehicle. Firstly, can you provide insights on pedestrian {pedestrian_id} in {video_id}, as well as the vehicle movement and the entire traffic scene?"
                    },
                    {
                        "from": "assistant",
                        "value": f"Of course. The pedestrian identified as '{pedestrian_id}' is present in the bounding box: {bbox}, is of the age group: {age}, and has gender is: {gender}. Its motion direction is {motion_direction}, and it is currently {action}. The pedestrian is {cross} the road, while its reaction is {reaction}. Also, The pedestrian maintains a {hand_gesture} hand gesture throughout. The pedestrian is {look}. The pedestrian nodding state is {nod}. For the vehicle, it is '{vehicle}'. Moreover, for the scene attributes, the road type is '{road_type}', the pedestrian crossing is '{pedestrian_crossing}', pedestrian sign is '{pedestrian_sign}', stop sign is '{stop_sign}', and the traffic light is '{traffic_light}'."
                    },
                    {
                        "from": "user",
                        "value": "Thank you for the information. Can you also assist me in predicting the trajectories of pedestrians in the next 0.5 and 1 seconds?"
                    },
                    {
                        "from": "assistant",
                        "value": "Sure, I can help with that. Let's analyze the pedestrian behavior and predict their trajectories for the next 0.5 and 1 seconds."
                    }
                ]
            }

            all_video_prompts.append(prompt)

    videos_processed += 1

# Serialize every collected prompt into one indented JSON document
with open('output_prompts_Kiran(Original).json', 'w') as out_file:
    json.dump(obj=all_video_prompts, fp=out_file, indent=4)

# Tell the user where the result went
print("Prompts generated and saved to 'output_prompts_Kiran(Original).json'.")


In [None]:
import json
import pickle
import os

# Input locations: the pickled annotation database and the image directory.
annotations_file = "./data_cache/jaad_database.pkl"
images_dir = "./images"

# Root folder of the per-pedestrian images whose paths are embedded in the prompts.
output_main_dir = "./images_with_boxes(Pedestrians Focused)"

# Deserialize the whole annotation database into memory.
# NOTE: pickle executes code on load; only use it on files you trust.
with open(annotations_file, mode='rb') as database_file:
    database = pickle.load(database_file)

# Run limits: number of videos to visit and sampled frames kept per pedestrian.
num_videos_to_process, videos_processed = 346, 0
max_frames_per_pedestrian = 5

# Integer annotation code -> text label tables. A code is the position of its
# label in the tuple (0, 1, 2, ...); crossing additionally defines the code -1.
occlusion_mapping = dict(enumerate(('none', 'part', 'full')))
action_mapping = dict(enumerate(('standing', 'walking')))
nod_mapping = dict(enumerate(('not-nodding', 'nodding')))
look_mapping = dict(enumerate(('not-looking', 'looking')))
hand_gesture_mapping = dict(enumerate(('undefined', 'greet', 'yield', 'rightofway', 'other')))
reaction_mapping = dict(enumerate(('undefined', 'clear_path', 'speed_up', 'slow_down')))
cross_mapping = {**dict(enumerate(('not-crossing', 'crossing'))), -1: 'irrelevant'}
age_mapping = dict(enumerate(('child', 'young', 'adult', 'senior')))
designated_mapping = dict(enumerate(('ND', 'D')))
gender_mapping = dict(enumerate(('n/a', 'female', 'male')))
intersection_mapping = dict(enumerate(('no', 'yes')))
# NOTE(review): 'LATITUDE'/'LONGITUDE' may be meant as lateral/longitudinal - confirm wording.
motion_direction_mapping = dict(enumerate(('n/a', 'LATITUDE', 'LONGITUDE')))
traffic_direction_mapping = dict(enumerate(('OW', 'TW')))
signalized_mapping = dict(enumerate(('n/a', 'NS', 'S')))
vehicle_mapping = dict(enumerate(('stopped', 'moving_slow', 'moving_fast', 'decelerating', 'accelerating')))
road_type_mapping = dict(enumerate(('street', 'parking_lot', 'garage')))
traffic_light_mapping = dict(enumerate(('n/a', 'red', 'green')))
pedestrian_crossing_mapping = dict(enumerate(('Absent', 'Present')))
pedestrian_sign_mapping = dict(enumerate(('Absent', 'Present')))
stop_sign_mapping = dict(enumerate(('Absent', 'Present')))

# Accumulates one prompt dict per processed (pedestrian, frame) pair.
all_video_prompts = []

# Distance, in annotated frames, between two sampled frames; the same step is
# used when looking ahead for the future bounding boxes.
frame_stride = 10


def _behavior_label(behavior, key, index, mapping):
    """Translate the code stored at ``behavior[key][index]`` into its text label.

    Returns 'Unknown' when the track ``key`` is missing, does not reach
    ``index``, or holds a code that ``mapping`` does not define.
    """
    codes = behavior.get(key, [])
    if index >= len(codes):
        return 'Unknown'
    return mapping.get(codes[index], 'Unknown')


# Build one multi-turn "Type2" conversation per sampled (pedestrian, frame) pair:
# pedestrian description, vehicle state, scene description, future boxes.
for video_id, video_data in database.items():
    if videos_processed >= num_videos_to_process:
        break

    output_video_dir = os.path.join(output_main_dir, f"{video_id}")

    vehicle_annotations = video_data['vehicle_annotations']
    traffic_annotations = video_data['traffic_annotations']
    num_frames = video_data['num_frames']

    # Scene attributes are read from frame 0 and reused for every prompt of the video.
    first_frame_traffic_attributes = traffic_annotations[0]
    road_type = road_type_mapping.get(traffic_annotations.get('road_type', 0), 'Unknown')
    pedestrian_crossing = pedestrian_crossing_mapping.get(first_frame_traffic_attributes.get('ped_crossing', 0), 'Unknown')
    pedestrian_sign = pedestrian_sign_mapping.get(first_frame_traffic_attributes.get('ped_sign', 0), 'Unknown')
    stop_sign = stop_sign_mapping.get(first_frame_traffic_attributes.get('stop_sign', 0), 'Unknown')
    traffic_light = traffic_light_mapping.get(first_frame_traffic_attributes.get('traffic_light', 0), 'Unknown')
    scene_summary = (
        f"Yes, for the scene attributes, the road type is '{road_type}', "
        f"the pedestrian crossing is '{pedestrian_crossing}', pedestrian sign is '{pedestrian_sign}', "
        f"stop sign is '{stop_sign}', and the traffic light is '{traffic_light}'."
    )

    for pedestrian_id, pedestrian_data in video_data['ped_annotations'].items():
        output_pedestrian_dir = os.path.join(output_video_dir, f"Pedestrian_{pedestrian_id}")

        # Attributes are stored once per pedestrian, so resolve them outside the frame loop.
        attributes = pedestrian_data['attributes']
        age = age_mapping.get(attributes.get('age', 0), 'Unknown')
        gender = gender_mapping.get(attributes.get('gender', 0), 'Unknown')
        motion_direction = motion_direction_mapping.get(attributes.get('motion_direction', 0), 'Unknown')

        frames = pedestrian_data['frames']
        behavior = pedestrian_data.get('behavior', {})
        action_codes = behavior.get('action', [])

        # Every frame_stride-th annotated frame, capped at max_frames_per_pedestrian.
        # Walking list positions avoids a linear frames.index() search per frame.
        sampled_indices = range(0, len(frames), frame_stride)[:max_frames_per_pedestrian]
        for frame_index in sampled_indices:
            frame_num = frames[frame_index]

            # Skip frames beyond the video length or without an action annotation.
            if frame_num > num_frames or frame_index >= len(action_codes):
                continue

            bboxes = pedestrian_data['bbox']
            current_bbox = bboxes[frame_index]

            action = _behavior_label(behavior, 'action', frame_index, action_mapping)
            cross = _behavior_label(behavior, 'cross', frame_index, cross_mapping)
            reaction = _behavior_label(behavior, 'reaction', frame_index, reaction_mapping)
            hand_gesture = _behavior_label(behavior, 'hand_gesture', frame_index, hand_gesture_mapping)
            look = _behavior_label(behavior, 'look', frame_index, look_mapping)
            nod = _behavior_label(behavior, 'nod', frame_index, nod_mapping)

            # Use the video frame number here: frame_index is only the position inside
            # this pedestrian's own track and does not line up with the per-video
            # vehicle annotations unless the track starts at frame 0.
            vehicle = vehicle_mapping.get(vehicle_annotations[frame_num], 'Unknown')

            # Ground-truth boxes one and two strides ahead, as far as the track reaches.
            future_bboxes = []
            for step in (1, 2):
                future_index = frame_index + step * frame_stride
                if future_index >= len(frames):
                    break
                future_bboxes.append(bboxes[future_index])

            # Stays empty when the track ends before the next sampled position.
            next_bounding_box_info = ""
            if future_bboxes:
                next_bounding_box_info = (
                    "The predicted trajectory for the pedestrian for the next 1 second is: "
                    + ', '.join(f"[{bbox}]" for bbox in future_bboxes if bbox)
                    + " respectively."
                )

            image_path = os.path.join(output_pedestrian_dir, f"Pedestrian_{pedestrian_id}_Image_{frame_num}.png")
            pedestrian_summary = (
                f"Of course. The pedestrian identified as '{pedestrian_id}' is present in the bounding box: {current_bbox}, "
                f"is of the {age} age group, and its gender is {gender}. "
                f"Its motion direction is {motion_direction}, and it is currently {action}. "
                f"The pedestrian is {cross} the road, while its reaction is {reaction}. "
                f"Also, The pedestrian maintains a {hand_gesture} hand gesture throughout the video. "
                f"Moreover, the pedestrian is {look} at the vehicle and {nod}."
            )

            # (user question, assistant answer) pairs that follow the picture turn.
            exchanges = [
                (
                    f"Role: You are tasked with enhancing the pedestrian detection and its immediate trajectory prediction system for an autonomous vehicle. Firstly, can you provide insights on pedestrian {pedestrian_id} in {video_id}?",
                    pedestrian_summary,
                ),
                (
                    "Great! Now can you tell me about the vehicle movement?",
                    f"For the vehicle, it is '{vehicle}'",
                ),
                (
                    "Great! Now can you describe the entire traffic scene?",
                    scene_summary,
                ),
                (
                    "Thank you for the information. Can you also assist me in predicting the trajectories of pedestrians in the next 0.5 and 1 seconds?",
                    next_bounding_box_info,
                ),
            ]

            conversations = [{"from": "user", "value": f" Picture: {image_path}"}]
            for question, answer in exchanges:
                conversations.append({"from": "user", "value": question})
                conversations.append({"from": "assistant", "value": answer})

            all_video_prompts.append({
                "Video ID": video_id,
                "Ped_id": pedestrian_id,
                "Frame Number": frame_num,
                "conversations": conversations,
            })

    videos_processed += 1

# Persist every collected prompt as one indented JSON array, then report where it went.
with open('output_prompts_Kiran(Type2).json', mode='w') as output_file:
    json.dump(all_video_prompts, output_file, indent=4)

print("Prompts generated and saved to 'output_prompts_Kiran(Type2).json'.")


In [None]:
import json
import pickle
import os

# Input locations: the pickled annotation database and the image directory.
annotations_file = "./data_cache/jaad_database.pkl"
images_dir = "./images"

# Root folder of the per-pedestrian images whose paths are embedded in the prompts.
output_main_dir = "./images_with_boxes(Pedestrians Focused)"

# Deserialize the whole annotation database into memory.
# NOTE: pickle executes code on load; only use it on files you trust.
with open(annotations_file, mode='rb') as database_file:
    database = pickle.load(database_file)

# Run limits: number of videos to visit and sampled frames kept per pedestrian.
num_videos_to_process, videos_processed = 1, 0
max_frames_per_pedestrian = 5

# Integer annotation code -> text label tables. A code is the position of its
# label in the tuple (0, 1, 2, ...); crossing additionally defines the code -1.
occlusion_mapping = dict(enumerate(('none', 'part', 'full')))
action_mapping = dict(enumerate(('standing', 'walking')))
nod_mapping = dict(enumerate(('not-nodding', 'nodding')))
look_mapping = dict(enumerate(('not-looking', 'looking')))
hand_gesture_mapping = dict(enumerate(('undefined', 'greet', 'yield', 'rightofway', 'other')))
reaction_mapping = dict(enumerate(('undefined', 'clear_path', 'speed_up', 'slow_down')))
cross_mapping = {**dict(enumerate(('not-crossing', 'crossing'))), -1: 'irrelevant'}
age_mapping = dict(enumerate(('child', 'young', 'adult', 'senior')))
designated_mapping = dict(enumerate(('ND', 'D')))
gender_mapping = dict(enumerate(('n/a', 'female', 'male')))
intersection_mapping = dict(enumerate(('no', 'yes')))
# NOTE(review): 'LATITUDE'/'LONGITUDE' may be meant as lateral/longitudinal - confirm wording.
motion_direction_mapping = dict(enumerate(('n/a', 'LATITUDE', 'LONGITUDE')))
traffic_direction_mapping = dict(enumerate(('OW', 'TW')))
signalized_mapping = dict(enumerate(('n/a', 'NS', 'S')))
vehicle_mapping = dict(enumerate(('stopped', 'moving_slow', 'moving_fast', 'decelerating', 'accelerating')))
road_type_mapping = dict(enumerate(('street', 'parking_lot', 'garage')))
traffic_light_mapping = dict(enumerate(('n/a', 'red', 'green')))
pedestrian_crossing_mapping = dict(enumerate(('Absent', 'Present')))
pedestrian_sign_mapping = dict(enumerate(('Absent', 'Present')))
stop_sign_mapping = dict(enumerate(('Absent', 'Present')))

# Accumulates one prompt dict per processed (pedestrian, frame) pair.
all_video_prompts = []

# Distance, in annotated frames, between two sampled frames; the same step is
# used when looking ahead for the future bounding boxes.
frame_stride = 10


def _behavior_label(behavior, key, index, mapping):
    """Translate the code stored at ``behavior[key][index]`` into its text label.

    Returns 'Unknown' when the track ``key`` is missing, does not reach
    ``index``, or holds a code that ``mapping`` does not define.
    """
    codes = behavior.get(key, [])
    if index >= len(codes):
        return 'Unknown'
    return mapping.get(codes[index], 'Unknown')


# Build one fine-grained "Type3" conversation per sampled (pedestrian, frame)
# pair: one question/answer exchange per attribute, then vehicle, scene and
# future boxes.
for video_id, video_data in database.items():
    if videos_processed >= num_videos_to_process:
        break

    output_video_dir = os.path.join(output_main_dir, f"{video_id}")

    vehicle_annotations = video_data['vehicle_annotations']
    traffic_annotations = video_data['traffic_annotations']
    num_frames = video_data['num_frames']

    # Scene attributes are read from frame 0 and reused for every prompt of the video.
    first_frame_traffic_attributes = traffic_annotations[0]
    road_type = road_type_mapping.get(traffic_annotations.get('road_type', 0), 'Unknown')
    pedestrian_crossing = pedestrian_crossing_mapping.get(first_frame_traffic_attributes.get('ped_crossing', 0), 'Unknown')
    pedestrian_sign = pedestrian_sign_mapping.get(first_frame_traffic_attributes.get('ped_sign', 0), 'Unknown')
    stop_sign = stop_sign_mapping.get(first_frame_traffic_attributes.get('stop_sign', 0), 'Unknown')
    traffic_light = traffic_light_mapping.get(first_frame_traffic_attributes.get('traffic_light', 0), 'Unknown')
    scene_summary = (
        f"Yes, for the scene attributes, the road type is '{road_type}', "
        f"the pedestrian crossing is '{pedestrian_crossing}', pedestrian sign is '{pedestrian_sign}', "
        f"stop sign is '{stop_sign}', and the traffic light is '{traffic_light}'."
    )

    for pedestrian_id, pedestrian_data in video_data['ped_annotations'].items():
        output_pedestrian_dir = os.path.join(output_video_dir, f"Pedestrian_{pedestrian_id}")

        # Attributes are stored once per pedestrian, so resolve them outside the frame loop.
        attributes = pedestrian_data['attributes']
        motion_direction = motion_direction_mapping.get(attributes.get('motion_direction', 0), 'Unknown')

        frames = pedestrian_data['frames']
        behavior = pedestrian_data.get('behavior', {})
        action_codes = behavior.get('action', [])

        # Every frame_stride-th annotated frame, capped at max_frames_per_pedestrian.
        # Walking list positions avoids a linear frames.index() search per frame.
        sampled_indices = range(0, len(frames), frame_stride)[:max_frames_per_pedestrian]
        for frame_index in sampled_indices:
            frame_num = frames[frame_index]

            # Skip frames beyond the video length or without an action annotation.
            if frame_num > num_frames or frame_index >= len(action_codes):
                continue

            bboxes = pedestrian_data['bbox']
            current_bbox = bboxes[frame_index]

            action = _behavior_label(behavior, 'action', frame_index, action_mapping)
            cross = _behavior_label(behavior, 'cross', frame_index, cross_mapping)
            reaction = _behavior_label(behavior, 'reaction', frame_index, reaction_mapping)
            hand_gesture = _behavior_label(behavior, 'hand_gesture', frame_index, hand_gesture_mapping)
            look = _behavior_label(behavior, 'look', frame_index, look_mapping)
            nod = _behavior_label(behavior, 'nod', frame_index, nod_mapping)

            # Use the video frame number here: frame_index is only the position inside
            # this pedestrian's own track and does not line up with the per-video
            # vehicle annotations unless the track starts at frame 0.
            vehicle = vehicle_mapping.get(vehicle_annotations[frame_num], 'Unknown')

            # The yes/no answers must agree with the label; a fixed "Yes" would
            # contradict 'not-looking' / 'not-nodding'.
            look_answer = "Yes" if look == 'looking' else "No"
            nod_answer = "Yes" if nod == 'nodding' else "No"

            # Ground-truth boxes one and two strides ahead, as far as the track reaches.
            future_bboxes = []
            for step in (1, 2):
                future_index = frame_index + step * frame_stride
                if future_index >= len(frames):
                    break
                future_bboxes.append(bboxes[future_index])

            # Stays empty when the track ends before the next sampled position.
            next_bounding_box_info = ""
            if future_bboxes:
                next_bounding_box_info = (
                    "The predicted trajectory for the pedestrian for the next 1 second is: "
                    + ', '.join(f"[{bbox}]" for bbox in future_bboxes if bbox)
                    + " respectively."
                )

            image_path = os.path.join(output_pedestrian_dir, f"Pedestrian_{pedestrian_id}_Image_{frame_num}.png")

            # (user question, assistant answer) pairs that follow the picture turn.
            exchanges = [
                (
                    f"Role: You are tasked with enhancing the pedestrian detection and its immediate trajectory prediction system for an autonomous vehicle. Firstly, can you provide insights on pedestrian {pedestrian_id} in {video_id}? Specifically, what is the coordinate of the pedestrian at present?",
                    f"The pedestrian's current location is within the bounding box: {current_bbox}.",
                ),
                (
                    "Is the pedestrian currently engaged in crossing the road?",
                    f"The pedestrian is {cross} the road.",
                ),
                (
                    "What is the pedestrian's motion direction?",
                    f"The pedestrian's motion direction is {motion_direction}.",
                ),
                (
                    "Can you describe the pedestrian's action?",
                    f"The pedestrian is currently {action}.",
                ),
                (
                    "What is the pedestrian's reaction?",
                    f"The pedestrian's reaction is {reaction}.",
                ),
                (
                    "Is the pedestrian making any specific hand gestures?",
                    f"The pedestrian maintains a {hand_gesture} hand gesture throughout the video.",
                ),
                (
                    "Is the pedestrian looking at the vehicle?",
                    f"{look_answer}, the pedestrian is {look} at the vehicle.",
                ),
                (
                    "Is the pedestrian nodding?",
                    f"{nod_answer}, the pedestrian is {nod}.",
                ),
                (
                    "Great! Now can you tell me about the vehicle movement?",
                    f"For the vehicle, it is '{vehicle}'.",
                ),
                (
                    "Great! Now can you describe the entire traffic scene?",
                    scene_summary,
                ),
                (
                    "Thank you for the information. Can you also assist me in predicting the trajectories of pedestrians in the next 0.5 and 1 seconds?",
                    next_bounding_box_info,
                ),
            ]

            conversations = [{"from": "user", "value": f"Picture: {image_path}"}]
            for question, answer in exchanges:
                conversations.append({"from": "user", "value": question})
                conversations.append({"from": "assistant", "value": answer})

            prompt = {
                "Video ID": video_id,
                "Ped_id": pedestrian_id,
                "Frame Number": frame_num,
                "conversations": conversations,
            }
            all_video_prompts.append(prompt)

            print("Prompt:")
            print(json.dumps(prompt, indent=4))

    videos_processed += 1

# Persist every collected prompt as one indented JSON array, then report where it went.
with open('output_prompts_Kiran(Type3).json', mode='w') as output_file:
    json.dump(all_video_prompts, output_file, indent=4)

print("Prompts generated and saved to 'output_prompts_Kiran(Type3).json'.")


In [None]:
import json
import pickle
import os

# Input locations: the pickled annotation database and the image directory.
annotations_file = "./data_cache/jaad_database.pkl"
images_dir = "./images"

# Root folder of the per-pedestrian images whose paths are embedded in the prompts.
output_main_dir = "./images_with_boxes(Pedestrians Focused)"

# Deserialize the whole annotation database into memory.
# NOTE: pickle executes code on load; only use it on files you trust.
with open(annotations_file, mode='rb') as database_file:
    database = pickle.load(database_file)

# Run limits: number of videos to visit and sampled frames kept per pedestrian.
num_videos_to_process, videos_processed = 1, 0
max_frames_per_pedestrian = 5

# Integer annotation code -> text label tables. A code is the position of its
# label in the tuple (0, 1, 2, ...); crossing additionally defines the code -1.
occlusion_mapping = dict(enumerate(('none', 'part', 'full')))
action_mapping = dict(enumerate(('standing', 'walking')))
nod_mapping = dict(enumerate(('not-nodding', 'nodding')))
look_mapping = dict(enumerate(('not-looking', 'looking')))
hand_gesture_mapping = dict(enumerate(('undefined', 'greet', 'yield', 'rightofway', 'other')))
reaction_mapping = dict(enumerate(('undefined', 'clear_path', 'speed_up', 'slow_down')))
cross_mapping = {**dict(enumerate(('not-crossing', 'crossing'))), -1: 'irrelevant'}
age_mapping = dict(enumerate(('child', 'young', 'adult', 'senior')))
designated_mapping = dict(enumerate(('ND', 'D')))
gender_mapping = dict(enumerate(('n/a', 'female', 'male')))
intersection_mapping = dict(enumerate(('no', 'yes')))
# NOTE(review): 'LATITUDE'/'LONGITUDE' may be meant as lateral/longitudinal - confirm wording.
motion_direction_mapping = dict(enumerate(('n/a', 'LATITUDE', 'LONGITUDE')))
traffic_direction_mapping = dict(enumerate(('OW', 'TW')))
signalized_mapping = dict(enumerate(('n/a', 'NS', 'S')))
vehicle_mapping = dict(enumerate(('stopped', 'moving_slow', 'moving_fast', 'decelerating', 'accelerating')))
road_type_mapping = dict(enumerate(('street', 'parking_lot', 'garage')))
traffic_light_mapping = dict(enumerate(('n/a', 'red', 'green')))
pedestrian_crossing_mapping = dict(enumerate(('Absent', 'Present')))
pedestrian_sign_mapping = dict(enumerate(('Absent', 'Present')))
stop_sign_mapping = dict(enumerate(('Absent', 'Present')))

# Accumulates one prompt dict per processed (pedestrian, frame) pair.
all_video_prompts = []

# Distance, in annotated frames, between two sampled frames; the same step is
# used when looking ahead for the future bounding boxes.
frame_stride = 10


def _behavior_label(behavior, key, index, mapping):
    """Translate the code stored at ``behavior[key][index]`` into its text label.

    Returns 'Unknown' when the track ``key`` is missing, does not reach
    ``index``, or holds a code that ``mapping`` does not define.
    """
    codes = behavior.get(key, [])
    if index >= len(codes):
        return 'Unknown'
    return mapping.get(codes[index], 'Unknown')


# Build one compact "Type1" conversation per sampled (pedestrian, frame) pair:
# a single combined pedestrian/vehicle/scene answer, then the future boxes.
for video_id, video_data in database.items():
    if videos_processed >= num_videos_to_process:
        break

    output_video_dir = os.path.join(output_main_dir, f"{video_id}")

    vehicle_annotations = video_data['vehicle_annotations']
    traffic_annotations = video_data['traffic_annotations']
    num_frames = video_data['num_frames']

    # Scene attributes are read from frame 0 and reused for every prompt of the video.
    first_frame_traffic_attributes = traffic_annotations[0]
    road_type = road_type_mapping.get(traffic_annotations.get('road_type', 0), 'Unknown')
    pedestrian_crossing = pedestrian_crossing_mapping.get(first_frame_traffic_attributes.get('ped_crossing', 0), 'Unknown')
    pedestrian_sign = pedestrian_sign_mapping.get(first_frame_traffic_attributes.get('ped_sign', 0), 'Unknown')
    stop_sign = stop_sign_mapping.get(first_frame_traffic_attributes.get('stop_sign', 0), 'Unknown')
    traffic_light = traffic_light_mapping.get(first_frame_traffic_attributes.get('traffic_light', 0), 'Unknown')
    scene_summary = (
        f"Moreover, for the scene attributes, the road type is '{road_type}', "
        f"the pedestrian crossing is '{pedestrian_crossing}', pedestrian sign is '{pedestrian_sign}', "
        f"stop sign is '{stop_sign}', and the traffic light is '{traffic_light}'."
    )

    for pedestrian_id, pedestrian_data in video_data['ped_annotations'].items():
        output_pedestrian_dir = os.path.join(output_video_dir, f"Pedestrian_{pedestrian_id}")

        # Attributes are stored once per pedestrian, so resolve them outside the frame loop.
        attributes = pedestrian_data['attributes']
        age = age_mapping.get(attributes.get('age', 0), 'Unknown')
        gender = gender_mapping.get(attributes.get('gender', 0), 'Unknown')
        motion_direction = motion_direction_mapping.get(attributes.get('motion_direction', 0), 'Unknown')

        frames = pedestrian_data['frames']
        behavior = pedestrian_data.get('behavior', {})
        action_codes = behavior.get('action', [])

        # Every frame_stride-th annotated frame, capped at max_frames_per_pedestrian.
        # Walking list positions avoids a linear frames.index() search per frame.
        sampled_indices = range(0, len(frames), frame_stride)[:max_frames_per_pedestrian]
        for frame_index in sampled_indices:
            frame_num = frames[frame_index]

            # Skip frames beyond the video length or without an action annotation.
            if frame_num > num_frames or frame_index >= len(action_codes):
                continue

            bboxes = pedestrian_data['bbox']
            current_bbox = bboxes[frame_index]

            action = _behavior_label(behavior, 'action', frame_index, action_mapping)
            cross = _behavior_label(behavior, 'cross', frame_index, cross_mapping)
            reaction = _behavior_label(behavior, 'reaction', frame_index, reaction_mapping)
            hand_gesture = _behavior_label(behavior, 'hand_gesture', frame_index, hand_gesture_mapping)
            look = _behavior_label(behavior, 'look', frame_index, look_mapping)
            nod = _behavior_label(behavior, 'nod', frame_index, nod_mapping)

            # Use the video frame number here: frame_index is only the position inside
            # this pedestrian's own track and does not line up with the per-video
            # vehicle annotations unless the track starts at frame 0.
            vehicle = vehicle_mapping.get(vehicle_annotations[frame_num], 'Unknown')

            # Ground-truth boxes one and two strides ahead, as far as the track reaches.
            future_bboxes = []
            for step in (1, 2):
                future_index = frame_index + step * frame_stride
                if future_index >= len(frames):
                    break
                future_bboxes.append(bboxes[future_index])

            # Stays empty when the track ends before the next sampled position.
            next_bounding_box_info = ""
            if future_bboxes:
                next_bounding_box_info = (
                    "The predicted trajectory for the pedestrian for the next 1 second is: "
                    + ', '.join(f"[{bbox}]" for bbox in future_bboxes if bbox)
                    + " respectively."
                )

            image_path = os.path.join(output_pedestrian_dir, f"Pedestrian_{pedestrian_id}_Image_{frame_num}.png")
            full_summary = (
                f"Of course. The pedestrian identified as '{pedestrian_id}' is present in the bounding box: {current_bbox}, "
                f"is of the {age} age group, and its gender is {gender}. "
                f"Its motion direction is {motion_direction}, and it is currently {action}. "
                f"The pedestrian is {cross} the road, while its reaction is {reaction}. "
                f"Also, The pedestrian maintains a {hand_gesture} hand gesture throughout the video. "
                f"Moreover, the pedestrian is {look} at the vehicle and {nod}. "
                f"For the vehicle, it is '{vehicle}'. "
                f"{scene_summary}"
            )

            # (user question, assistant answer) pairs that follow the picture turn.
            exchanges = [
                (
                    f"Role: You are tasked with enhancing the pedestrian detection and its immediate trajectory prediction system for an autonomous vehicle. Firstly, can you provide insights on pedestrian {pedestrian_id} in {video_id}, as well as the vehicle movement and the entire traffic scene?",
                    full_summary,
                ),
                (
                    "Thank you for the information. Can you also assist me in predicting the trajectories of pedestrians in the next 0.5 and 1 seconds?",
                    next_bounding_box_info,
                ),
            ]

            conversations = [{"from": "user", "value": f" Picture: {image_path}"}]
            for question, answer in exchanges:
                conversations.append({"from": "user", "value": question})
                conversations.append({"from": "assistant", "value": answer})

            prompt = {
                "Video ID": video_id,
                "Ped_id": pedestrian_id,
                "Frame Number": frame_num,
                "conversations": conversations,
            }
            all_video_prompts.append(prompt)

            print("Prompt:")
            print(json.dumps(prompt, indent=4))

    videos_processed += 1

# Persist every collected prompt as one indented JSON array, then report where it went.
with open('output_prompts_Kiran(Type1).json', mode='w') as output_file:
    json.dump(all_video_prompts, output_file, indent=4)

print("Prompts generated and saved to 'output_prompts_Kiran(Type1).json'.")


In [None]:
import json
import pickle
import os

# Type 4 prompts: one two-turn conversation per sampled pedestrian frame. The
# user turn lists the pedestrian, vehicle and scene labels for that frame and
# asks what the pedestrian does in the next second; the reply is a fixed
# sample answer (see sample_reply below).
#
# Data layout relied on by this cell (verify against the database if it
# changes): a pedestrian's 'bbox' and 'behavior' lists are parallel to its
# 'frames' list, so they are indexed by position in the track, while
# 'vehicle_annotations' and 'traffic_annotations' are looked up by video
# frame number.

annotations_file = "./data_cache/jaad_database.pkl"

images_dir = "./images"

output_main_dir = "./images_with_boxes(Pedestrians Focused)"

num_videos_to_process = 1

max_frames_per_pedestrian = 5

# A prompt is generated for every 10th annotated frame of a pedestrian track.
frame_stride = 10

occlusion_mapping = {0: 'none', 1: 'part', 2: 'full'}
action_mapping = {0: 'standing', 1: 'walking'}
nod_mapping = {0: 'not-nodding', 1: 'nodding'}
look_mapping = {0: 'not-looking', 1: 'looking'}
hand_gesture_mapping = {0: 'undefined', 1: 'greet', 2: 'yield', 3: 'rightofway', 4: 'other'}
reaction_mapping = {0: 'undefined', 1: 'clear_path', 2: 'speed_up', 3: 'slow_down'}
cross_mapping = {0: 'not-crossing', 1: 'crossing', -1: 'irrelevant'}
age_mapping = {0: 'child', 1: 'young', 2: 'adult', 3: 'senior'}
designated_mapping = {0: 'ND', 1: 'D'}
gender_mapping = {0: 'n/a', 1: 'female', 2: 'male'}
intersection_mapping = {0: 'no', 1: 'yes'}
motion_direction_mapping = {0: 'n/a', 1: 'LATITUDE', 2: 'LONGITUDE'}
traffic_direction_mapping = {0: 'OW', 1: 'TW'}
signalized_mapping = {0: 'n/a', 1: 'NS', 2: 'S'}
vehicle_mapping = {0: 'stopped', 1: 'moving_slow', 2: 'moving_fast', 3: 'decelerating', 4: 'accelerating'}
road_type_mapping = {0: 'street', 1: 'parking_lot', 2: 'garage'}
traffic_light_mapping = {0: 'n/a', 1: 'red', 2: 'green'}
pedestrian_crossing_mapping = {0: 'Absent', 1: 'Present'}
pedestrian_sign_mapping = {0: 'Absent', 1: 'Present'}
stop_sign_mapping = {0: 'Absent', 1: 'Present'}

# NOTE(review): this reply is the same for every prompt and is not derived from
# the annotations; it is kept verbatim until real target answers are available.
sample_reply = "The pedestrian will cross the road after 1 second because the combination of their raised hand gesture, indicating a desire to halt any potentially oncoming traffic, the current green traffic light for pedestrians, the absence of vehicles in the immediate vicinity, and their consistent gaze towards the opposite sidewalk suggests they are actively assessing the safety of crossing at this moment. Given these conditions, including their positioning at the edge of the pedestrian crossing zone, it's highly probable they will initiate crossing the road within the next second."


def annotation_at(container, key, default=None):
    """Return ``container[key]``, or *default* when the key or index is missing.

    Works both for dicts keyed by frame number and for per-frame lists.
    """
    try:
        return container[key]
    except (KeyError, IndexError):
        return default


def scene_labels(traffic_annotations, frame_num=0):
    """Return the scene labels (road type, crossing, signs, light) of one frame.

    The road type is read from the top level of *traffic_annotations*; the other
    attributes come from the entry stored under *frame_num*.

    Raises:
        KeyError: if *traffic_annotations* has no entry for *frame_num*.
    """
    frame_attributes = traffic_annotations[frame_num]
    return {
        'road_type': road_type_mapping.get(traffic_annotations.get('road_type', 0), 'Unknown'),
        'pedestrian_crossing': pedestrian_crossing_mapping.get(frame_attributes.get('ped_crossing', 0), 'Unknown'),
        'pedestrian_sign': pedestrian_sign_mapping.get(frame_attributes.get('ped_sign', 0), 'Unknown'),
        'stop_sign': stop_sign_mapping.get(frame_attributes.get('stop_sign', 0), 'Unknown'),
        'traffic_light': traffic_light_mapping.get(frame_attributes.get('traffic_light', 0), 'Unknown'),
    }


def pedestrian_labels(pedestrian_data, position):
    """Return the text labels describing a pedestrian at one track position.

    *position* is an index into the pedestrian's per-frame lists, not a video
    frame number. Codes that are missing or not in a mapping become 'Unknown'.
    """
    attributes = pedestrian_data['attributes']
    behavior = pedestrian_data['behavior']

    def behavior_label(name, mapping):
        return mapping.get(annotation_at(behavior.get(name, []), position), 'Unknown')

    return {
        'age': age_mapping.get(attributes.get('age', 0), 'Unknown'),
        'gender': gender_mapping.get(attributes.get('gender', 0), 'Unknown'),
        'motion_direction': motion_direction_mapping.get(attributes.get('motion_direction', 0), 'Unknown'),
        'action': behavior_label('action', action_mapping),
        'cross': behavior_label('cross', cross_mapping),
        'reaction': behavior_label('reaction', reaction_mapping),
        'hand_gesture': behavior_label('hand_gesture', hand_gesture_mapping),
        'look': behavior_label('look', look_mapping),
        'nod': behavior_label('nod', nod_mapping),
    }


def describe_behavior(occlusion, action, nod, look, hand_gesture, reaction, cross):
    """Return a plain-English paragraph for one frame's annotation codes.

    Every argument is the integer code used as a key of the matching
    ``*_mapping`` table, not the mapped label: comparing the labels (or the
    mapping dict itself) with integers never matches. Pass ``occlusion=None``
    to leave the visibility sentence out.

    NOTE(review): the Type 4 prompt text does not include this paragraph (the
    original cell computed it without using it); the helper is kept so the
    sentences can be wired in once the prompt format is settled.
    """
    sentences = []
    if occlusion is not None:
        sentences.append(
            "The pedestrian in this video is fully visible, without any obstructions." if occlusion == 0 else
            "The pedestrian in this video is partially visible, there's some obstruction in the view." if occlusion == 1 else
            "The pedestrian is fully obstructed in the video angle and cannot be seen clearly."
        )
    sentences.append(
        "The pedestrian is currently standing still." if action == 0 else
        "The pedestrian is currently walking."
    )
    sentences.append(
        "The pedestrian is nodding in agreement." if nod == 1 else
        "The pedestrian is not nodding."
    )
    sentences.append(
        "The pedestrian is actively looking around." if look == 1 else
        "The pedestrian is not actively looking around."
    )
    gesture_label = hand_gesture_mapping.get(hand_gesture)
    sentences.append(
        f"The pedestrian is making a {gesture_label} hand gesture." if hand_gesture != 0 and gesture_label is not None else
        "The pedestrian's hand gesture is not identifiable."
    )
    sentences.append(
        "The pedestrian's reaction suggests a clear path ahead." if reaction == 1 else
        "The pedestrian's reaction suggests they are speeding up." if reaction == 2 else
        "The pedestrian's reaction suggests they are slowing down." if reaction == 3 else
        "The pedestrian's reaction is unclear."
    )
    sentences.append(
        "The pedestrian is currently crossing the road." if cross == 1 else
        "The pedestrian is not currently crossing the road." if cross == 0 else
        "The status of pedestrian crossing is irrelevant."
    )
    return ' '.join(sentences)


def build_type4_prompt(video_id, pedestrian_id, pedestrian_data, position, vehicle, scene):
    """Build the Type 4 conversation for one position of a pedestrian track.

    Args:
        video_id: key of the video in the database.
        pedestrian_id: key of the pedestrian inside the video.
        pedestrian_data: the pedestrian's annotation dict.
        position: index into the pedestrian's per-frame lists.
        vehicle: vehicle-state label for the corresponding video frame.
        scene: dict produced by ``scene_labels``.

    Returns:
        A dict with a single "conversations" list (user turn, fixed reply).
    """
    labels = pedestrian_labels(pedestrian_data, position)
    bbox = annotation_at(pedestrian_data['bbox'], position, '')
    age, gender = labels['age'], labels['gender']
    motion_direction, action = labels['motion_direction'], labels['action']
    cross, reaction = labels['cross'], labels['reaction']
    hand_gesture, look, nod = labels['hand_gesture'], labels['look'], labels['nod']
    road_type = scene['road_type']
    pedestrian_crossing = scene['pedestrian_crossing']
    pedestrian_sign = scene['pedestrian_sign']
    traffic_light = scene['traffic_light']

    # Same wording as the original template; the sample values it contained
    # are replaced by the labels of the frame being described.
    user_text = (
        "Context: Your job is to predict the pedestrian trajectory of pedestrians. "
        "I will give you images that are frames from a video and your role is to predict what the pedestrian will do next. "
        f"Here is some information based on the pedestrian {pedestrian_id} in {video_id}:"
        f"Input: {pedestrian_id} who is bounded by the box {bbox} ,Age: {age}\nGender: {gender}\nDirection of motion: {motion_direction}\nCurrent action: {action}\nType of road: {road_type}\nPedestrian crossing: '{pedestrian_crossing}'\nPedestrian sign: '{pedestrian_sign}'\nState of traffic light: {traffic_light}\nVehicle: {vehicle}\n\nThe pedestrian is {cross} the road, while its reaction is {reaction}. Also, The pedestrian maintains a {hand_gesture} hand gesture throughout. The pedestrian is {look}. The pedestrian nodding state is {nod}.\n\n"
        "Role:\nNow, based on the images, your role is to predict what the pedestrian will do next in the next 1 second. Analyze the pedestrian behaviour and predict their trajectory."
    )
    return {
        "conversations": [
            {"from": "user", "value": user_text},
            {"from": "GPT", "value": sample_reply},
        ]
    }


def generate_type4_prompts(database, max_videos, max_frames, stride, verbose=True):
    """Generate Type 4 prompts for the first *max_videos* videos of *database*.

    For each pedestrian that has behaviour annotations, every *stride*-th
    annotated frame is sampled, up to *max_frames* frames per pedestrian.

    Args:
        database: mapping of video id to that video's annotation dict.
        max_videos: number of videos to process.
        max_frames: maximum number of sampled frames per pedestrian.
        stride: step between sampled positions of a pedestrian track.
        verbose: when true, print each prompt as indented JSON.

    Returns:
        The list of generated prompt dicts.
    """
    prompts = []
    for video_index, (video_id, video_data) in enumerate(database.items()):
        if video_index >= max_videos:
            break

        vehicle_annotations = video_data['vehicle_annotations']
        # Scene labels are taken from frame 0 of the video.
        scene = scene_labels(video_data['traffic_annotations'])
        num_frames = video_data['num_frames']

        for pedestrian_id, pedestrian_data in video_data['ped_annotations'].items():
            actions = pedestrian_data.get('behavior', {}).get('action')
            if actions is None:
                # Pedestrians without behaviour annotations get no prompt.
                continue

            frames = pedestrian_data['frames']
            for position in range(0, len(frames), stride)[:max_frames]:
                frame_num = frames[position]
                if frame_num > num_frames or position >= len(actions):
                    continue

                # The vehicle state belongs to the video frame, so it is looked
                # up by frame number rather than by position in the track.
                vehicle = vehicle_mapping.get(annotation_at(vehicle_annotations, frame_num), 'Unknown')
                prompt = build_type4_prompt(video_id, pedestrian_id, pedestrian_data, position, vehicle, scene)
                prompts.append(prompt)

                if verbose:
                    print("Prompt:")
                    print(json.dumps(prompt, indent=4))

    return prompts


if __name__ == "__main__":
    # NOTE(review): unpickling can execute arbitrary code; only load a trusted
    # annotation cache here.
    with open(annotations_file, 'rb') as f:
        database = pickle.load(f)

    all_video_prompts = generate_type4_prompts(
        database, num_videos_to_process, max_frames_per_pedestrian, frame_stride
    )

    with open('output_prompts_Kiran(Type4).json', 'w') as f:
        json.dump(all_video_prompts, f, indent=4)

    print("Prompts generated and saved to 'output_prompts_Kiran(Type4).json'.")


This code creates a prompt for every annotated frame; each prompt contains the 5 previous bounding boxes, the current bounding box, and the next 5 bounding boxes.

In [None]:
import json
import pickle
import os

# Type 5 prompts: one multi-turn conversation per annotated pedestrian frame,
# covering the current bounding box, up to 5 previous and 5 future boxes, the
# pedestrian's behaviour labels, the vehicle state and the scene attributes.
#
# Data layout relied on by this cell (verify against the database if it
# changes): a pedestrian's 'bbox' and 'behavior' lists are parallel to its
# 'frames' list, so they are indexed by position in the track, while
# 'vehicle_annotations' and 'traffic_annotations' are looked up by video
# frame number.

annotations_file = "./data_cache/jaad_database.pkl"
images_dir = "./images"
output_main_dir = "./images_with_boxes(Pedestrians Focused)"

num_videos_to_process = 1

# Number of neighbouring boxes reported before and after the current frame.
context_size = 5

occlusion_mapping = {0: 'none', 1: 'part', 2: 'full'}
action_mapping = {0: 'standing', 1: 'walking'}
nod_mapping = {0: 'not-nodding', 1: 'nodding'}
look_mapping = {0: 'not-looking', 1: 'looking'}
hand_gesture_mapping = {0: 'undefined', 1: 'greet', 2: 'yield', 3: 'rightofway', 4: 'other'}
reaction_mapping = {0: 'undefined', 1: 'clear_path', 2: 'speed_up', 3: 'slow_down'}
cross_mapping = {0: 'not-crossing', 1: 'crossing', -1: 'irrelevant'}
age_mapping = {0: 'child', 1: 'young', 2: 'adult', 3: 'senior'}
designated_mapping = {0: 'ND', 1: 'D'}
gender_mapping = {0: 'n/a', 1: 'female', 2: 'male'}
intersection_mapping = {0: 'no', 1: 'yes'}
motion_direction_mapping = {0: 'n/a', 1: 'LATITUDE', 2: 'LONGITUDE'}
traffic_direction_mapping = {0: 'OW', 1: 'TW'}
signalized_mapping = {0: 'n/a', 1: 'NS', 2: 'S'}
vehicle_mapping = {0: 'stopped', 1: 'moving_slow', 2: 'moving_fast', 3: 'decelerating', 4: 'accelerating'}
road_type_mapping = {0: 'street', 1: 'parking_lot', 2: 'garage'}
traffic_light_mapping = {0: 'n/a', 1: 'red', 2: 'green'}
pedestrian_crossing_mapping = {0: 'Absent', 1: 'Present'}
pedestrian_sign_mapping = {0: 'Absent', 1: 'Present'}
stop_sign_mapping = {0: 'Absent', 1: 'Present'}


def annotation_at(container, key, default=None):
    """Return ``container[key]``, or *default* when the key or index is missing.

    Works both for dicts keyed by frame number and for per-frame lists.
    """
    try:
        return container[key]
    except (KeyError, IndexError):
        return default


def scene_labels(traffic_annotations, frame_num=0):
    """Return the scene labels (road type, crossing, signs, light) of one frame.

    The road type is read from the top level of *traffic_annotations*; the other
    attributes come from the entry stored under *frame_num*.

    Raises:
        KeyError: if *traffic_annotations* has no entry for *frame_num*.
    """
    frame_attributes = traffic_annotations[frame_num]
    return {
        'road_type': road_type_mapping.get(traffic_annotations.get('road_type', 0), 'Unknown'),
        'pedestrian_crossing': pedestrian_crossing_mapping.get(frame_attributes.get('ped_crossing', 0), 'Unknown'),
        'pedestrian_sign': pedestrian_sign_mapping.get(frame_attributes.get('ped_sign', 0), 'Unknown'),
        'stop_sign': stop_sign_mapping.get(frame_attributes.get('stop_sign', 0), 'Unknown'),
        'traffic_light': traffic_light_mapping.get(frame_attributes.get('traffic_light', 0), 'Unknown'),
    }


def pedestrian_labels(pedestrian_data, position):
    """Return the text labels describing a pedestrian at one track position.

    *position* is an index into the pedestrian's per-frame lists, not a video
    frame number. Codes that are missing or not in a mapping become 'Unknown'.
    """
    attributes = pedestrian_data['attributes']
    behavior = pedestrian_data['behavior']

    def behavior_label(name, mapping):
        return mapping.get(annotation_at(behavior.get(name, []), position), 'Unknown')

    return {
        'age': age_mapping.get(attributes.get('age', 0), 'Unknown'),
        'gender': gender_mapping.get(attributes.get('gender', 0), 'Unknown'),
        'motion_direction': motion_direction_mapping.get(attributes.get('motion_direction', 0), 'Unknown'),
        'action': behavior_label('action', action_mapping),
        'cross': behavior_label('cross', cross_mapping),
        'reaction': behavior_label('reaction', reaction_mapping),
        'hand_gesture': behavior_label('hand_gesture', hand_gesture_mapping),
        'look': behavior_label('look', look_mapping),
        'nod': behavior_label('nod', nod_mapping),
    }


def build_type5_prompt(video_id, pedestrian_id, pedestrian_data, position, vehicle, scene):
    """Build the Type 5 conversation for one position of a pedestrian track.

    Args:
        video_id: key of the video in the database.
        pedestrian_id: key of the pedestrian inside the video.
        pedestrian_data: the pedestrian's annotation dict.
        position: index into the pedestrian's per-frame lists.
        vehicle: vehicle-state label for the corresponding video frame.
        scene: dict produced by ``scene_labels``.

    Returns:
        A dict with the video id, pedestrian id, frame number and the list of
        conversation turns.
    """
    frames = pedestrian_data['frames']
    bboxes = pedestrian_data['bbox']
    frame_num = frames[position]
    labels = pedestrian_labels(pedestrian_data, position)

    # Boxes are addressed by track position: indexing the box list with the
    # video frame number picks the wrong box (or overruns the list) whenever
    # the track does not start at frame 0.
    current_bbox = bboxes[position]
    previous_bboxes = [bboxes[i] for i in range(max(position - context_size, 0), position)]
    future_bboxes = [bboxes[i] for i in range(position + 1, min(position + context_size + 1, len(frames)))]

    picture = os.path.join(
        output_main_dir,
        f"{video_id}",
        f"Pedestrian_{pedestrian_id}",
        f'Pedestrian_{pedestrian_id}_Image_{frame_num}.png',
    )

    cross, motion_direction = labels['cross'], labels['motion_direction']
    action, reaction = labels['action'], labels['reaction']
    hand_gesture, look, nod = labels['hand_gesture'], labels['look'], labels['nod']
    # The yes/no prefix has to agree with the label that follows it.
    look_answer = "Yes" if look == 'looking' else "No"
    nod_answer = "Yes" if nod == 'nodding' else "No"
    road_type = scene['road_type']
    pedestrian_crossing = scene['pedestrian_crossing']
    pedestrian_sign = scene['pedestrian_sign']
    stop_sign = scene['stop_sign']
    traffic_light = scene['traffic_light']

    turns = [
        ("user", f"Picture: {picture}"),
        ("user", f"Role: You are tasked with enhancing the pedestrian detection and its immediate trajectory prediction system for an autonomous vehicle. Firstly, can you provide insights on pedestrian {pedestrian_id} in {video_id}? Specifically, what is the coordinate of the pedestrian at present?"),
        ("assistant", f"The pedestrian's current location is within the bounding box: {current_bbox}."),
        ("assistant", f"The 5 bounding boxes for previous frames are: {previous_bboxes}"),
        ("user", "Is the pedestrian currently engaged in crossing the road?"),
        ("assistant", f"The pedestrian is {cross} the road."),
        ("user", "What is the pedestrian's motion direction?"),
        ("assistant", f"The pedestrian's motion direction is {motion_direction}."),
        ("user", "Can you describe the pedestrian's action?"),
        ("assistant", f"The pedestrian is currently {action}."),
        ("user", "What is the pedestrian's reaction?"),
        ("assistant", f"The pedestrian's reaction is {reaction}."),
        ("user", "Is the pedestrian making any specific hand gestures?"),
        ("assistant", f"The pedestrian maintains a {hand_gesture} hand gesture throughout the video."),
        ("user", "Is the pedestrian looking at the vehicle?"),
        ("assistant", f"{look_answer}, the pedestrian is {look} at the vehicle."),
        ("user", "Is the pedestrian nodding?"),
        ("assistant", f"{nod_answer}, the pedestrian is {nod}."),
        ("user", "Great! Now can you tell me about the vehicle movement?"),
        ("assistant", f"For the vehicle, it is '{vehicle}'."),
        ("user", "Great! Now can you describe the entire traffic scene?"),
        ("assistant", f"Yes, for the scene attributes, the road type is '{road_type}', the pedestrian crossing is '{pedestrian_crossing}', pedestrian sign is '{pedestrian_sign}', stop sign is '{stop_sign}', and the traffic light is '{traffic_light}'."),
        ("user", "Thank you for the information. Can you also assist me in predicting the trajectories of pedestrians in the next 0.5 and 1 seconds?"),
        ("assistant", f"The future 5 bounding boxes are: {future_bboxes}"),
    ]

    return {
        "Video ID": video_id,
        "Ped_id": pedestrian_id,
        "Frame Number": frame_num,
        "conversations": [{"from": speaker, "value": text} for speaker, text in turns],
    }


def generate_type5_prompts(database, max_videos):
    """Generate a Type 5 prompt for every annotated frame of every pedestrian.

    Only the first *max_videos* videos of *database* are processed, and only
    pedestrians that have behaviour annotations produce prompts.

    Returns:
        The list of generated prompt dicts.
    """
    prompts = []
    for video_index, (video_id, video_data) in enumerate(database.items()):
        if video_index >= max_videos:
            break

        vehicle_annotations = video_data['vehicle_annotations']
        # Scene labels are taken from frame 0 of the video.
        scene = scene_labels(video_data['traffic_annotations'])
        num_frames = video_data['num_frames']

        for pedestrian_id, pedestrian_data in video_data['ped_annotations'].items():
            actions = pedestrian_data.get('behavior', {}).get('action')
            if actions is None:
                # Pedestrians without behaviour annotations get no prompt.
                continue

            # enumerate yields the track position directly, avoiding a linear
            # list.index() search for every frame.
            for position, frame_num in enumerate(pedestrian_data['frames']):
                if frame_num > num_frames or position >= len(actions):
                    continue

                # The vehicle state belongs to the video frame, so it is looked
                # up by frame number rather than by position in the track.
                vehicle = vehicle_mapping.get(annotation_at(vehicle_annotations, frame_num), 'Unknown')
                prompts.append(
                    build_type5_prompt(video_id, pedestrian_id, pedestrian_data, position, vehicle, scene)
                )

    return prompts


if __name__ == "__main__":
    # NOTE(review): unpickling can execute arbitrary code; only load a trusted
    # annotation cache here.
    with open(annotations_file, 'rb') as f:
        database = pickle.load(f)

    all_video_prompts = generate_type5_prompts(database, num_videos_to_process)

    with open('output_prompts_Kiran(Type5)_updated(22Apr).json', 'w') as f:
        json.dump(all_video_prompts, f, indent=4)

    print("Prompts generated and saved to 'output_prompts_Kiran(Type5)_updated(22Apr).json'.")


In [None]:
import json
import pickle
import os

# Per-pedestrian prompt files: for every pedestrian with behaviour annotations
# one JSON file is written to <output_main_dir>/<video_id>/, holding a
# multi-turn conversation for each annotated frame (current box, up to 5
# previous and 5 future boxes, behaviour, vehicle and scene labels).
#
# Data layout relied on by this cell (verify against the database if it
# changes): a pedestrian's 'bbox' and 'behavior' lists are parallel to its
# 'frames' list, so they are indexed by position in the track, while
# 'vehicle_annotations' and 'traffic_annotations' are looked up by video
# frame number.

annotations_file = "./data_cache/jaad_database.pkl"
images_dir = "./images"
output_main_dir = "./Prompts_for_Pedestrians"

num_videos_to_process = 346

# Number of neighbouring boxes reported before and after the current frame.
context_size = 5

occlusion_mapping = {0: 'none', 1: 'part', 2: 'full'}
action_mapping = {0: 'standing', 1: 'walking'}
nod_mapping = {0: 'not-nodding', 1: 'nodding'}
look_mapping = {0: 'not-looking', 1: 'looking'}
hand_gesture_mapping = {0: 'undefined', 1: 'greet', 2: 'yield', 3: 'rightofway', 4: 'other'}
reaction_mapping = {0: 'undefined', 1: 'clear_path', 2: 'speed_up', 3: 'slow_down'}
cross_mapping = {0: 'not-crossing', 1: 'crossing', -1: 'irrelevant'}
age_mapping = {0: 'child', 1: 'young', 2: 'adult', 3: 'senior'}
designated_mapping = {0: 'ND', 1: 'D'}
gender_mapping = {0: 'n/a', 1: 'female', 2: 'male'}
intersection_mapping = {0: 'no', 1: 'yes'}
motion_direction_mapping = {0: 'n/a', 1: 'LATITUDE', 2: 'LONGITUDE'}
traffic_direction_mapping = {0: 'OW', 1: 'TW'}
signalized_mapping = {0: 'n/a', 1: 'NS', 2: 'S'}
vehicle_mapping = {0: 'stopped', 1: 'moving_slow', 2: 'moving_fast', 3: 'decelerating', 4: 'accelerating'}
road_type_mapping = {0: 'street', 1: 'parking_lot', 2: 'garage'}
traffic_light_mapping = {0: 'n/a', 1: 'red', 2: 'green'}
pedestrian_crossing_mapping = {0: 'Absent', 1: 'Present'}
pedestrian_sign_mapping = {0: 'Absent', 1: 'Present'}
stop_sign_mapping = {0: 'Absent', 1: 'Present'}


def annotation_at(container, key, default=None):
    """Return ``container[key]``, or *default* when the key or index is missing.

    Works both for dicts keyed by frame number and for per-frame lists.
    """
    try:
        return container[key]
    except (KeyError, IndexError):
        return default


def scene_labels(traffic_annotations, frame_num=0):
    """Return the scene labels (road type, crossing, signs, light) of one frame.

    The road type is read from the top level of *traffic_annotations*; the other
    attributes come from the entry stored under *frame_num*.

    Raises:
        KeyError: if *traffic_annotations* has no entry for *frame_num*.
    """
    frame_attributes = traffic_annotations[frame_num]
    return {
        'road_type': road_type_mapping.get(traffic_annotations.get('road_type', 0), 'Unknown'),
        'pedestrian_crossing': pedestrian_crossing_mapping.get(frame_attributes.get('ped_crossing', 0), 'Unknown'),
        'pedestrian_sign': pedestrian_sign_mapping.get(frame_attributes.get('ped_sign', 0), 'Unknown'),
        'stop_sign': stop_sign_mapping.get(frame_attributes.get('stop_sign', 0), 'Unknown'),
        'traffic_light': traffic_light_mapping.get(frame_attributes.get('traffic_light', 0), 'Unknown'),
    }


def pedestrian_labels(pedestrian_data, position):
    """Return the text labels describing a pedestrian at one track position.

    *position* is an index into the pedestrian's per-frame lists, not a video
    frame number. Codes that are missing or not in a mapping become 'Unknown'.
    """
    attributes = pedestrian_data['attributes']
    behavior = pedestrian_data['behavior']

    def behavior_label(name, mapping):
        return mapping.get(annotation_at(behavior.get(name, []), position), 'Unknown')

    return {
        'age': age_mapping.get(attributes.get('age', 0), 'Unknown'),
        'gender': gender_mapping.get(attributes.get('gender', 0), 'Unknown'),
        'motion_direction': motion_direction_mapping.get(attributes.get('motion_direction', 0), 'Unknown'),
        'action': behavior_label('action', action_mapping),
        'cross': behavior_label('cross', cross_mapping),
        'reaction': behavior_label('reaction', reaction_mapping),
        'hand_gesture': behavior_label('hand_gesture', hand_gesture_mapping),
        'look': behavior_label('look', look_mapping),
        'nod': behavior_label('nod', nod_mapping),
    }


def build_frame_prompt(video_id, pedestrian_id, pedestrian_data, position, vehicle, scene):
    """Build the conversation for one position of a pedestrian track.

    Args:
        video_id: key of the video in the database.
        pedestrian_id: key of the pedestrian inside the video.
        pedestrian_data: the pedestrian's annotation dict.
        position: index into the pedestrian's per-frame lists.
        vehicle: vehicle-state label for the corresponding video frame.
        scene: dict produced by ``scene_labels``.

    Returns:
        A dict with the video id, pedestrian id, frame number and the list of
        conversation turns.
    """
    frames = pedestrian_data['frames']
    bboxes = pedestrian_data['bbox']
    frame_num = frames[position]
    labels = pedestrian_labels(pedestrian_data, position)

    # All boxes are addressed by track position so that the current, previous
    # and future boxes of one prompt come from the same indexing scheme.
    current_bbox = bboxes[position]
    # Previous boxes are listed newest first, as in the original cell.
    previous_bboxes = [bboxes[i] for i in range(position - 1, max(position - context_size - 1, -1), -1)]
    future_bboxes = [bboxes[i] for i in range(position + 1, min(position + context_size + 1, len(frames)))]

    cross, motion_direction = labels['cross'], labels['motion_direction']
    action, reaction = labels['action'], labels['reaction']
    hand_gesture, look, nod = labels['hand_gesture'], labels['look'], labels['nod']
    # The yes/no prefix has to agree with the label that follows it.
    look_answer = "Yes" if look == 'looking' else "No"
    nod_answer = "Yes" if nod == 'nodding' else "No"
    road_type = scene['road_type']
    pedestrian_crossing = scene['pedestrian_crossing']
    pedestrian_sign = scene['pedestrian_sign']
    stop_sign = scene['stop_sign']
    traffic_light = scene['traffic_light']

    turns = [
        ("user", f"Picture: ./images_with_boxes(Pedestrians Focused)/{video_id}/Pedestrian_{pedestrian_id}/Pedestrian_{pedestrian_id}_Image_{frame_num}.png"),
        ("user", f"Role: You are tasked with enhancing the pedestrian detection and its immediate trajectory prediction system for an autonomous vehicle. Firstly, can you provide insights on pedestrian {pedestrian_id} in {video_id}? Specifically, what is the coordinate of the pedestrian at present?"),
        ("assistant", f"The pedestrian's current location is within the bounding box: {current_bbox}."),
        ("assistant", f"The 5 bounding boxes for previous frames are: {previous_bboxes}"),
        ("user", "Is the pedestrian currently engaged in crossing the road?"),
        ("assistant", f"The pedestrian is {cross} the road."),
        ("user", "What is the pedestrian's motion direction?"),
        ("assistant", f"The pedestrian's motion direction is {motion_direction}."),
        ("user", "Can you describe the pedestrian's action?"),
        ("assistant", f"The pedestrian is currently {action}."),
        ("user", "What is the pedestrian's reaction?"),
        ("assistant", f"The pedestrian's reaction is {reaction}."),
        ("user", "Is the pedestrian making any specific hand gestures?"),
        ("assistant", f"The pedestrian maintains a {hand_gesture} hand gesture throughout the video."),
        ("user", "Is the pedestrian looking at the vehicle?"),
        ("assistant", f"{look_answer}, the pedestrian is {look} at the vehicle."),
        ("user", "Is the pedestrian nodding?"),
        ("assistant", f"{nod_answer}, the pedestrian is {nod}."),
        ("user", "Great! Now can you tell me about the vehicle movement?"),
        ("assistant", f"For the vehicle, it is '{vehicle}'."),
        ("user", "Great! Now can you describe the entire traffic scene?"),
        ("assistant", f"Yes, for the scene attributes, the road type is '{road_type}', the pedestrian crossing is '{pedestrian_crossing}', pedestrian sign is '{pedestrian_sign}', stop sign is '{stop_sign}', and the traffic light is '{traffic_light}'."),
        ("user", "Thank you for the information. Can you also assist me in predicting the trajectories of pedestrians in the next 0.5 and 1 seconds?"),
        ("assistant", f"The future 5 bounding boxes are: {future_bboxes}"),
    ]

    return {
        "Video ID": video_id,
        "Ped_id": pedestrian_id,
        "Frame Number": frame_num,
        "conversations": [{"from": speaker, "value": text} for speaker, text in turns],
    }


def build_pedestrian_prompts(video_id, pedestrian_id, pedestrian_data, vehicle_annotations, scene, num_frames):
    """Return the prompts for every annotated frame of one pedestrian.

    Returns an empty list when the pedestrian has no behaviour annotations.
    Frames numbered above *num_frames*, or lying beyond the end of the
    'action' list, are skipped.
    """
    actions = pedestrian_data.get('behavior', {}).get('action')
    if actions is None:
        return []

    prompts = []
    # enumerate yields the track position directly, avoiding a linear
    # list.index() search for every frame.
    for position, frame_num in enumerate(pedestrian_data['frames']):
        if frame_num > num_frames or position >= len(actions):
            continue
        if position >= len(pedestrian_data['bbox']):
            # No box recorded for this position of the track.
            continue

        # The vehicle state belongs to the video frame, so it is looked up by
        # frame number rather than by position in the track.
        vehicle = vehicle_mapping.get(annotation_at(vehicle_annotations, frame_num), 'Unknown')
        prompts.append(build_frame_prompt(video_id, pedestrian_id, pedestrian_data, position, vehicle, scene))

    return prompts


def save_pedestrian_prompts(database, output_dir, max_videos):
    """Write one prompt file per pedestrian for the first *max_videos* videos.

    A directory ``<output_dir>/<video_id>`` is created for each processed
    video; a file ``pedestrian_<id>_prompts.json`` is written into it for
    every pedestrian that produced at least one prompt.
    """
    for video_index, (video_id, video_data) in enumerate(database.items()):
        # Check the limit before touching the file system so that no directory
        # is created for a video that is not processed.
        if video_index >= max_videos:
            break

        output_video_dir = os.path.join(output_dir, f"{video_id}")
        os.makedirs(output_video_dir, exist_ok=True)

        vehicle_annotations = video_data['vehicle_annotations']
        # Scene labels are taken from frame 0 of the video.
        scene = scene_labels(video_data['traffic_annotations'])
        num_frames = video_data['num_frames']

        for pedestrian_id, pedestrian_data in video_data['ped_annotations'].items():
            prompts = build_pedestrian_prompts(
                video_id, pedestrian_id, pedestrian_data, vehicle_annotations, scene, num_frames
            )
            if prompts:
                with open(os.path.join(output_video_dir, f'pedestrian_{pedestrian_id}_prompts.json'), 'w') as f:
                    json.dump(prompts, f, indent=4)


if __name__ == "__main__":
    # NOTE(review): unpickling can execute arbitrary code; only load a trusted
    # annotation cache here.
    with open(annotations_file, 'rb') as f:
        database = pickle.load(f)

    save_pedestrian_prompts(database, output_main_dir, num_videos_to_process)

    print("Prompts generated and saved to respective folders.")


In [None]:
import json
import pickle
import os

# --- Configuration ----------------------------------------------------------
annotations_file = "./data_cache/jaad_database.pkl"  # pickled annotation database
images_dir = "./images"
output_main_dir = "./Prompts_for_Pedestrians"  # root folder for the generated JSON files

# Deserialize the annotation database; the loop below iterates it as
# {video_id: video_data}.
with open(annotations_file, 'rb') as f:
    database = pickle.load(f)

# Upper bound on processed videos and the running counter compared against it.
num_videos_to_process = 346
videos_processed = 0

# --- Integer annotation code -> text label lookup tables ---------------------
# Tables whose codes run 0..n-1 are built positionally; the label at position
# k is the label for code k.
occlusion_mapping = dict(enumerate(('none', 'part', 'full')))
action_mapping = dict(enumerate(('standing', 'walking')))
nod_mapping = dict(enumerate(('not-nodding', 'nodding')))
look_mapping = dict(enumerate(('not-looking', 'looking')))
hand_gesture_mapping = dict(enumerate(('undefined', 'greet', 'yield', 'rightofway', 'other')))
reaction_mapping = dict(enumerate(('undefined', 'clear_path', 'speed_up', 'slow_down')))
# Contains a negative code, so it stays an explicit literal.
cross_mapping = {0: 'not-crossing', 1: 'crossing', -1: 'irrelevant'}
age_mapping = dict(enumerate(('child', 'young', 'adult', 'senior')))
designated_mapping = dict(enumerate(('ND', 'D')))
gender_mapping = dict(enumerate(('n/a', 'female', 'male')))
intersection_mapping = dict(enumerate(('no', 'yes')))
motion_direction_mapping = dict(enumerate(('n/a', 'LATITUDE', 'LONGITUDE')))
traffic_direction_mapping = dict(enumerate(('OW', 'TW')))
signalized_mapping = dict(enumerate(('n/a', 'NS', 'S')))
vehicle_mapping = dict(enumerate(('stopped', 'moving_slow', 'moving_fast', 'decelerating', 'accelerating')))
road_type_mapping = dict(enumerate(('street', 'parking_lot', 'garage')))
traffic_light_mapping = dict(enumerate(('n/a', 'red', 'green')))
pedestrian_crossing_mapping = dict(enumerate(('Absent', 'Present')))
pedestrian_sign_mapping = dict(enumerate(('Absent', 'Present')))
stop_sign_mapping = dict(enumerate(('Absent', 'Present')))

# Build bounding-box-only conversation prompts for every pedestrian that has no
# behavior annotations, and write them to one JSON file per pedestrian inside
# the folder of the video the pedestrian belongs to.
for video_id, video_data in database.items():
    # Check the budget before touching the filesystem so that no empty folder
    # is created for a video that will not be processed.
    if videos_processed >= num_videos_to_process:
        break

    output_video_dir = os.path.join(output_main_dir, f"{video_id}")
    os.makedirs(output_video_dir, exist_ok=True)

    ped_annotations = video_data['ped_annotations']
    num_frames = video_data['num_frames']

    for pedestrian_id, pedestrian_data in ped_annotations.items():
        output_pedestrian_dir = output_video_dir  # prompts are stored directly in the video folder
        simplified_prompts = []

        # Pedestrians that do carry behavior data are not handled by this cell.
        if pedestrian_data.get('behavior'):
            continue

        print(f"No behavior data found for pedestrian {pedestrian_id}. Creating prompts based on bounding boxes only.")

        frames = pedestrian_data['frames']
        bboxes = pedestrian_data['bbox']

        # 'bbox' is read by list position, i.e. bboxes[k] belongs to frames[k].
        # enumerate() yields that position directly instead of searching the
        # list with .index() for every frame.
        for frame_index, frame_num in enumerate(frames):
            if frame_num > num_frames:
                continue
            if frame_index >= len(bboxes):
                continue  # no bounding box recorded for this frame

            current_bbox = bboxes[frame_index]
            # Up to five boxes on either side of the current one, oldest first.
            previous_bboxes = list(bboxes[max(frame_index - 5, 0):frame_index])
            future_bboxes = list(bboxes[frame_index + 1:frame_index + 6])

            simplified_prompts.append({
                "Video ID": video_id,
                "Ped_id": pedestrian_id,
                "Frame Number": frame_num,
                "conversations": [
                    {
                        "from": "user",
                        "value": f"Picture: ./images_with_boxes(Pedestrians Focused)/{video_id}/Pedestrian_{pedestrian_id}/Pedestrian_{pedestrian_id}_Image_{frame_num}.png"
                    },
                    {
                        "from": "user",
                        "value": f"Role: You are tasked with enhancing the pedestrian detection and its immediate trajectory prediction system for an autonomous vehicle. Firstly, can you provide insights on pedestrian {pedestrian_id} in {video_id}? Specifically, what is the coordinate of the pedestrian at present?"
                    },
                    {
                        "from": "assistant",
                        "value": f"The pedestrian's current location is within the bounding box: {current_bbox}."
                    },
                    {
                        "from": "assistant",
                        "value": f"The 5 bounding boxes for previous frames are: {previous_bboxes}"
                    },
                    {
                        "from": "user",
                        "value": f"Thank you for the information. Can you also assist me in predicting the trajectories of pedestrians in the next 0.5 and 1 seconds?"
                    },
                    {
                        "from": "assistant",
                        "value": f"The future 5 bounding boxes are: {future_bboxes}"
                    },
                ]
            })

        # Save simplified prompts
        with open(os.path.join(output_pedestrian_dir, f'pedestrian_{pedestrian_id}_prompts.json'), 'w') as f:
            json.dump(simplified_prompts, f, indent=4)

    # Count once per video. Counting per pedestrian made the cap above trigger
    # long before num_videos_to_process videos had been visited.
    videos_processed += 1

print("Simplified prompts generated and saved to respective folders.")


In [None]:
import json
import pickle
import os

# --- Configuration ----------------------------------------------------------
annotations_file = "./data_cache/jaad_database.pkl"  # pickled annotation database
images_dir = "./images"
output_main_dir = "./Prompts_for_Pedestrians"  # root folder for the generated JSON files

# Deserialize the annotation database; the loop below iterates it as
# {video_id: video_data}.
with open(annotations_file, 'rb') as f:
    database = pickle.load(f)

# Upper bound on processed videos and the running counter compared against it.
num_videos_to_process = 346
videos_processed = 0

# --- Integer annotation code -> text label lookup tables ---------------------
# Tables whose codes run 0..n-1 are built positionally; the label at position
# k is the label for code k.
occlusion_mapping = dict(enumerate(('none', 'part', 'full')))
action_mapping = dict(enumerate(('standing', 'walking')))
nod_mapping = dict(enumerate(('not-nodding', 'nodding')))
look_mapping = dict(enumerate(('not-looking', 'looking')))
hand_gesture_mapping = dict(enumerate(('undefined', 'greet', 'yield', 'rightofway', 'other')))
reaction_mapping = dict(enumerate(('undefined', 'clear_path', 'speed_up', 'slow_down')))
# Contains a negative code, so it stays an explicit literal.
cross_mapping = {0: 'not-crossing', 1: 'crossing', -1: 'irrelevant'}
age_mapping = dict(enumerate(('child', 'young', 'adult', 'senior')))
designated_mapping = dict(enumerate(('ND', 'D')))
gender_mapping = dict(enumerate(('n/a', 'female', 'male')))
intersection_mapping = dict(enumerate(('no', 'yes')))
motion_direction_mapping = dict(enumerate(('n/a', 'LATITUDE', 'LONGITUDE')))
traffic_direction_mapping = dict(enumerate(('OW', 'TW')))
signalized_mapping = dict(enumerate(('n/a', 'NS', 'S')))
vehicle_mapping = dict(enumerate(('stopped', 'moving_slow', 'moving_fast', 'decelerating', 'accelerating')))
road_type_mapping = dict(enumerate(('street', 'parking_lot', 'garage')))
traffic_light_mapping = dict(enumerate(('n/a', 'red', 'green')))
pedestrian_crossing_mapping = dict(enumerate(('Absent', 'Present')))
pedestrian_sign_mapping = dict(enumerate(('Absent', 'Present')))
stop_sign_mapping = dict(enumerate(('Absent', 'Present')))

def _label_at(values, index, mapping):
    """Translate the code stored at ``values[index]`` into its text label.

    ``values`` may be a sequence or a mapping. 'Unknown' is returned when
    ``index`` is not present in ``values`` or when the code has no entry in
    ``mapping``, so a missing annotation track never aborts the whole run.
    """
    try:
        code = values[index]
    except (IndexError, KeyError):
        return 'Unknown'
    return mapping.get(code, 'Unknown')


# Build one single-turn conversation prompt per annotated frame for every
# pedestrian that has behavior annotations, and write them to one JSON file per
# pedestrian inside the folder of the video the pedestrian belongs to.
for video_id, video_data in database.items():
    # Check the budget before touching the filesystem so that no empty folder
    # is created for a video that will not be processed.
    if videos_processed >= num_videos_to_process:
        break

    output_video_dir = os.path.join(output_main_dir, f"{video_id}")
    os.makedirs(output_video_dir, exist_ok=True)

    vehicle_annotations = video_data['vehicle_annotations']
    traffic_annotations = video_data['traffic_annotations']
    ped_annotations = video_data['ped_annotations']
    num_frames = video_data['num_frames']

    # Scene attributes are taken once per video, from the entry stored under
    # key 0 of the traffic annotations; 'road_type' is stored at the top level.
    first_frame_index = 0
    first_frame_traffic_attributes = traffic_annotations[first_frame_index]

    vehicle = vehicle_mapping.get(vehicle_annotations[first_frame_index], 'Unknown')
    road_type = road_type_mapping.get(traffic_annotations.get('road_type', 0), 'Unknown')
    pedestrian_crossing = pedestrian_crossing_mapping.get(first_frame_traffic_attributes.get('ped_crossing', 0), 'Unknown')
    pedestrian_sign = pedestrian_sign_mapping.get(first_frame_traffic_attributes.get('ped_sign', 0), 'Unknown')
    stop_sign = stop_sign_mapping.get(first_frame_traffic_attributes.get('stop_sign', 0), 'Unknown')
    traffic_light = traffic_light_mapping.get(first_frame_traffic_attributes.get('traffic_light', 0), 'Unknown')

    for pedestrian_id, pedestrian_data in ped_annotations.items():
        output_pedestrian_dir = output_video_dir  # prompts are stored directly in the video folder
        all_video_prompts = []

        behavior = pedestrian_data.get('behavior') or {}
        actions = behavior.get('action', [])
        attributes = pedestrian_data.get('attributes', {})
        frames = pedestrian_data['frames']
        bboxes = pedestrian_data['bbox']

        # Per-pedestrian attributes do not change from frame to frame.
        age = age_mapping.get(attributes.get('age', 0), 'Unknown')
        gender = gender_mapping.get(attributes.get('gender', 0), 'Unknown')
        motion_direction = motion_direction_mapping.get(attributes.get('motion_direction', 0), 'Unknown')

        # frames, bbox and the behavior tracks are all read by list position:
        # entry k of each list describes frames[k]. enumerate() yields that
        # position directly.
        for frame_index, frame_num in enumerate(frames):
            if frame_num > num_frames:
                continue
            if frame_index >= len(actions):
                continue  # no behavior recorded for this frame
            if frame_index >= len(bboxes):
                continue  # no bounding box recorded for this frame

            action = action_mapping.get(actions[frame_index], 'Unknown')
            cross = _label_at(behavior.get('cross', []), frame_index, cross_mapping)
            reaction = _label_at(behavior.get('reaction', []), frame_index, reaction_mapping)
            hand_gesture = _label_at(behavior.get('hand_gesture', []), frame_index, hand_gesture_mapping)
            look = _label_at(behavior.get('look', []), frame_index, look_mapping)
            nod = _label_at(behavior.get('nod', []), frame_index, nod_mapping)

            # NOTE(review): assumes vehicle_annotations is indexed by video
            # frame number (it is read with key 0 for the first frame above),
            # so the lookup uses frame_num rather than the pedestrian-local
            # list position — confirm against the database layout.
            vehicle = _label_at(vehicle_annotations, frame_num, vehicle_mapping)

            # Boxes are addressed by list position, the same way the current
            # box is. Indexing them by frame number dropped or mismatched boxes
            # for pedestrians that enter the video late.
            current_bbox = bboxes[frame_index]
            # Up to five boxes on either side of the current one, oldest first.
            previous_bboxes = list(bboxes[max(frame_index - 5, 0):frame_index])
            future_bboxes = list(bboxes[frame_index + 1:frame_index + 6])

            prompt = {
                "id": pedestrian_id,
                "image": f"./images_with_boxes_Pedestrians Focused_/{video_id}/Pedestrian_{pedestrian_id}/Pedestrian_{pedestrian_id}_Image_{frame_num}.png",
                "conversations": [
                    {
                        "from": "user",
                        "value": f"<image> Can you provide insights on pedestrian {pedestrian_id} in {video_id}? Specifically, what is the coordinate of the pedestrian at present and the past 5 frames? Is the pedestrian currently engaged in crossing the road? What is the pedestrian's motion direction? Can you tell the pedestrian's action? What is the pedestrian's reaction? Is the pedestrian making any specific hand gestures? Is the pedestrian looking at the vehicle? Is the pedestrian nodding? Also, can you tell me about the vehicle movement? Moreover can you describe the entire traffic scene? Can you also assist me in predicting the trajectories of pedestrians in the next 5 frames?"
                    },
                    {
                        "from": "assistant",
                        "value": f"The pedestrian's current location is within the bounding box: {current_bbox}. The 5 bounding boxes for previous frames are: {previous_bboxes}. The pedestrian is {cross} the road. The pedestrian's motion direction is {motion_direction}. The pedestrian is currently {action}. The pedestrian's reaction is {reaction}. The pedestrian maintains a {hand_gesture} hand gesture throughout the video. The pedestrian is {look} at the vehicle and the pedestrian is {nod}. For the vehicle attributes, it is '{vehicle}'. While, for the scene attributes, the road type is '{road_type}', the pedestrian crossing is '{pedestrian_crossing}', pedestrian sign is '{pedestrian_sign}', stop sign is '{stop_sign}', and the traffic light is '{traffic_light}'. The future 5 bounding boxes are: {future_bboxes}."
                    }
                ]
            }

            all_video_prompts.append(prompt)

        # Pedestrians that produced no prompt get no file.
        if all_video_prompts:
            with open(os.path.join(output_video_dir, f'pedestrian_{pedestrian_id}_prompts.json'), 'w') as f:
                json.dump(all_video_prompts, f, indent=4)

    videos_processed += 1

print("Prompts generated and saved to respective folders.")


In [None]:
import json
import pickle
import os

# --- Configuration ----------------------------------------------------------
annotations_file = "./data_cache/jaad_database.pkl"  # pickled annotation database
images_dir = "./images"
output_main_dir = "./Prompts_for_Pedestrians"  # root folder for the generated JSON files

# Deserialize the annotation database; the loop below iterates it as
# {video_id: video_data}.
with open(annotations_file, 'rb') as f:
    database = pickle.load(f)

# Upper bound on processed videos and the running counter compared against it.
num_videos_to_process = 346
videos_processed = 0

# --- Integer annotation code -> text label lookup tables ---------------------
# Tables whose codes run 0..n-1 are built positionally; the label at position
# k is the label for code k.
occlusion_mapping = dict(enumerate(('none', 'part', 'full')))
action_mapping = dict(enumerate(('standing', 'walking')))
nod_mapping = dict(enumerate(('not-nodding', 'nodding')))
look_mapping = dict(enumerate(('not-looking', 'looking')))
hand_gesture_mapping = dict(enumerate(('undefined', 'greet', 'yield', 'rightofway', 'other')))
reaction_mapping = dict(enumerate(('undefined', 'clear_path', 'speed_up', 'slow_down')))
# Contains a negative code, so it stays an explicit literal.
cross_mapping = {0: 'not-crossing', 1: 'crossing', -1: 'irrelevant'}
age_mapping = dict(enumerate(('child', 'young', 'adult', 'senior')))
designated_mapping = dict(enumerate(('ND', 'D')))
gender_mapping = dict(enumerate(('n/a', 'female', 'male')))
intersection_mapping = dict(enumerate(('no', 'yes')))
motion_direction_mapping = dict(enumerate(('n/a', 'LATITUDE', 'LONGITUDE')))
traffic_direction_mapping = dict(enumerate(('OW', 'TW')))
signalized_mapping = dict(enumerate(('n/a', 'NS', 'S')))
vehicle_mapping = dict(enumerate(('stopped', 'moving_slow', 'moving_fast', 'decelerating', 'accelerating')))
road_type_mapping = dict(enumerate(('street', 'parking_lot', 'garage')))
traffic_light_mapping = dict(enumerate(('n/a', 'red', 'green')))
pedestrian_crossing_mapping = dict(enumerate(('Absent', 'Present')))
pedestrian_sign_mapping = dict(enumerate(('Absent', 'Present')))
stop_sign_mapping = dict(enumerate(('Absent', 'Present')))

def _label_at(values, index, mapping):
    """Translate the code stored at ``values[index]`` into its text label.

    ``values`` may be a sequence or a mapping. 'Unknown' is returned when
    ``index`` is not present in ``values`` or when the code has no entry in
    ``mapping``.
    """
    try:
        code = values[index]
    except (IndexError, KeyError):
        return 'Unknown'
    return mapping.get(code, 'Unknown')


# Build single-turn prompts (bounding boxes plus vehicle and scene description)
# for every pedestrian that has no behavior annotations, and write them to one
# JSON file per pedestrian inside the folder of the video it belongs to.
for video_id, video_data in database.items():
    # Check the budget before touching the filesystem so that no empty folder
    # is created for a video that will not be processed.
    if videos_processed >= num_videos_to_process:
        break

    output_video_dir = os.path.join(output_main_dir, f"{video_id}")
    os.makedirs(output_video_dir, exist_ok=True)

    ped_annotations = video_data['ped_annotations']
    vehicle_annotations = video_data['vehicle_annotations']
    traffic_annotations = video_data['traffic_annotations']
    num_frames = video_data['num_frames']

    # The prompt text interpolates vehicle and scene labels, so they must be
    # derived here for the current video. Without this the cell either raised
    # NameError or silently reused values left behind by an earlier cell.
    # Scene attributes come from the entry stored under key 0 of the traffic
    # annotations; 'road_type' is stored at the top level.
    first_frame_index = 0
    first_frame_traffic_attributes = traffic_annotations[first_frame_index]
    road_type = road_type_mapping.get(traffic_annotations.get('road_type', 0), 'Unknown')
    pedestrian_crossing = pedestrian_crossing_mapping.get(first_frame_traffic_attributes.get('ped_crossing', 0), 'Unknown')
    pedestrian_sign = pedestrian_sign_mapping.get(first_frame_traffic_attributes.get('ped_sign', 0), 'Unknown')
    stop_sign = stop_sign_mapping.get(first_frame_traffic_attributes.get('stop_sign', 0), 'Unknown')
    traffic_light = traffic_light_mapping.get(first_frame_traffic_attributes.get('traffic_light', 0), 'Unknown')

    for pedestrian_id, pedestrian_data in ped_annotations.items():
        output_pedestrian_dir = output_video_dir  # prompts are stored directly in the video folder
        simplified_prompts = []

        # Pedestrians that do carry behavior data are not handled by this cell.
        if pedestrian_data.get('behavior'):
            continue

        print(f"No behavior data found for pedestrian {pedestrian_id}. Creating prompts based on bounding boxes only.")

        frames = pedestrian_data['frames']
        bboxes = pedestrian_data['bbox']

        # 'bbox' is read by list position, i.e. bboxes[k] belongs to frames[k].
        for frame_index, frame_num in enumerate(frames):
            if frame_num > num_frames:
                continue
            if frame_index >= len(bboxes):
                continue  # no bounding box recorded for this frame

            current_bbox = bboxes[frame_index]
            # Up to five boxes on either side of the current one, oldest first.
            previous_bboxes = list(bboxes[max(frame_index - 5, 0):frame_index])
            future_bboxes = list(bboxes[frame_index + 1:frame_index + 6])

            # NOTE(review): assumes vehicle_annotations is indexed by video
            # frame number — confirm against the database layout.
            vehicle = _label_at(vehicle_annotations, frame_num, vehicle_mapping)

            # NOTE(review): the user turn starts with ": <image>"; the leading
            # colon looks like a paste artifact but is kept as-is — confirm.
            simplified_prompts.append({
                "id": pedestrian_id,
                "image": f"./images_with_boxes_Pedestrians Focused_/{video_id}/Pedestrian_{pedestrian_id}/Pedestrian_{pedestrian_id}_Image_{frame_num}.png",
                "conversations": [
                    {
                        "from": "user",
                        "value": f": <image> Can you provide insights on pedestrian {pedestrian_id} in {video_id}? Specifically, what is the coordinate of the pedestrian at present and for 5 previous frames?  Also, can you tell me about the vehicle movement? Moreover can you describe the entire traffic scene? Can you also assist me in predicting the trajectories of pedestrians in the next 5 frames?"
                    },
                    {
                        "from": "assistant",
                        "value": f"The pedestrian's current location is within the bounding box: {current_bbox}. The 5 bounding boxes for previous frames are: {previous_bboxes}. For the vehicle attributes, it is '{vehicle}'. While, for the scene attributes, the road type is '{road_type}', the pedestrian crossing is '{pedestrian_crossing}', pedestrian sign is '{pedestrian_sign}', stop sign is '{stop_sign}', and the traffic light is '{traffic_light}'. The future 5 bounding boxes are: {future_bboxes}."
                    }
                ]
            })

        # Save simplified prompts
        with open(os.path.join(output_pedestrian_dir, f'pedestrian_{pedestrian_id}_prompts.json'), 'w') as f:
            json.dump(simplified_prompts, f, indent=4)

    # Count once per video. Counting per pedestrian made the cap above trigger
    # long before num_videos_to_process videos had been visited.
    videos_processed += 1

print("Simplified prompts generated and saved to respective folders.")


# Prompt Generation for CogVLM

In [None]:
import json
import pickle
import os

annotations_file = "./data_cache/jaad_database.pkl"  # pickled annotation database

# The loop in this cell joins image paths onto output_main_dir, so it has to be
# defined here for the cell to run on its own. The value matches the one used
# by the other prompt-generation cells.
# NOTE(review): the paths built from it follow a
# <root>/<video>/Pedestrian_<id>/Pedestrian_<id>_Image_<frame>.png layout —
# confirm this root is where those images actually live.
output_main_dir = "./Prompts_for_Pedestrians"

# Deserialize the annotation database; the loop below iterates it as
# {video_id: video_data}.
with open(annotations_file, 'rb') as f:
    database = pickle.load(f)

# Value Mapping to words (integer annotation code -> text label)
occlusion_mapping = {0: 'none', 1: 'part', 2: 'full'}
action_mapping = {0: 'standing', 1: 'walking'}
nod_mapping = {0: 'undefined', 1: 'nodding'}
look_mapping = {0: 'not-looking', 1: 'looking'}
hand_gesture_mapping = {0: 'undefined', 1: 'greet', 2: 'yield', 3: 'rightofway', 4: 'other'}
reaction_mapping = {0: 'undefined', 1: 'clear_path', 2: 'speed_up', 3: 'slow_down'}
cross_mapping = {0: 'not-crossing', 1: 'crossing', -1: 'irrelevant'}
age_mapping = {0: 'child', 1: 'young', 2: 'adult', 3: 'senior'}
designated_mapping = {0: 'ND', 1: 'D'}
gender_mapping = {0: 'n/a', 1: 'female', 2: 'male'}
intersection_mapping = {0: 'no', 1: 'yes'}
motion_direction_mapping = {0: 'n/a', 1: 'LATITUDE', 2: 'LONGITUDE'}
traffic_direction_mapping = {0: 'OW', 1: 'TW'}
signalized_mapping = {0: 'n/a', 1: 'NS', 2: 'S'}
vehicle_mapping = {0: 'stopped', 1: 'moving_slow', 2: 'moving_fast', 3: 'decelerating', 4: 'accelerating'}
road_type_mapping = {0: 'street', 1: 'parking_lot', 2: 'garage'}
traffic_light_mapping = {0: 'n/a', 1: 'red', 2: 'green'}
pedestrian_crossing_mapping = {0: 'Absent', 1: 'Present'}
pedestrian_sign_mapping = {0: 'Absent', 1: 'Present'}
stop_sign_mapping = {0: 'Absent', 1: 'Present'}

# Define the number of videos to process
num_videos_to_process = 1

# Prompts from all processed videos are collected here and written out once.
all_video_prompts = []

# Loop to iterate through each video
def _label_at(values, index, mapping):
    """Translate the code stored at ``values[index]`` into its text label.

    'Unknown' is returned when ``index`` is not present in ``values`` or when
    the code has no entry in ``mapping``.
    """
    try:
        code = values[index]
    except (IndexError, KeyError):
        return 'Unknown'
    return mapping.get(code, 'Unknown')


def _format_bboxes(boxes):
    """Render the non-empty boxes as '[a, b, c, d]' items joined by ', '."""
    return ', '.join(f"[{bbox[0]}, {bbox[1]}, {bbox[2]}, {bbox[3]}]" for bbox in boxes if bbox)


# Build one two-question conversation prompt per pedestrian: five input images
# with their bounding boxes, the next five boxes as the trajectory answer, and
# a scene/behavior description.
for idx, (video_id, video_data) in enumerate(database.items()):
    if idx >= num_videos_to_process:
        break

    output_video_dir = os.path.join(output_main_dir, f"{video_id}")

    vehicle_annotations = video_data['vehicle_annotations']
    traffic_annotations = video_data['traffic_annotations']
    ped_annotations = video_data['ped_annotations']

    # Vehicle and scene attributes are taken once per video, from the entries
    # stored under key 0; 'road_type' is stored at the top level.
    first_frame_index = 0
    first_frame_traffic_attributes = traffic_annotations[first_frame_index]

    vehicle = vehicle_mapping.get(vehicle_annotations[first_frame_index], 'Unknown')
    road_type = road_type_mapping.get(traffic_annotations.get('road_type', 0), 'Unknown')
    pedestrian_crossing = pedestrian_crossing_mapping.get(first_frame_traffic_attributes.get('ped_crossing', 0), 'Unknown')
    pedestrian_sign = pedestrian_sign_mapping.get(first_frame_traffic_attributes.get('ped_sign', 0), 'Unknown')
    stop_sign = stop_sign_mapping.get(first_frame_traffic_attributes.get('stop_sign', 0), 'Unknown')
    traffic_light = traffic_light_mapping.get(first_frame_traffic_attributes.get('traffic_light', 0), 'Unknown')

    # Loop to iterate through each pedestrian in the video
    for pedestrian_id, pedestrian_data in ped_annotations.items():
        output_pedestrian_dir = os.path.join(output_video_dir, f"Pedestrian_{pedestrian_id}")

        # Behavior labels are read at the pedestrian's first annotated frame
        # (list position 0). They are recomputed for every pedestrian and fall
        # back to 'Unknown', so a pedestrian without behavior data no longer
        # inherits the previous pedestrian's labels or hits an undefined name.
        # Only 'cross' and 'look' are used by the prompt text.
        behavior = pedestrian_data.get('behavior') or {}
        cross = _label_at(behavior.get('cross', []), 0, cross_mapping)
        look = _label_at(behavior.get('look', []), 0, look_mapping)

        # Input images: every 10th annotated frame, at most five of them.
        frames_to_process = pedestrian_data['frames'][::10][:5]
        input_image_paths = [
            "image{}: <img>{}</img>".format(
                i + 1,
                os.path.join(output_pedestrian_dir, f"Pedestrian_{pedestrian_id}_Image_{frame_num}.png"),
            )
            for i, frame_num in enumerate(frames_to_process)
        ]

        # Combine the input image paths into one line
        input_images_line = ' '.join(input_image_paths)

        bboxes = pedestrian_data['bbox']

        # Boxes at list positions 0, 10, ..., 40 (matching the input images);
        # an empty list stands in for a position without data.
        bounding_box_coordinates = [
            bboxes[i] if i < len(bboxes) else [] for i in range(0, 50, 10)
        ]

        # Prompting for first 5 bounding boxes
        bounding_box_info = ""
        if any(bounding_box_coordinates):
            bounding_box_info = "The pedestrian of interest is marked by a green bounding box, the coordinates of the bounding boxes in each of the images above are "
            bounding_box_info += _format_bboxes(bounding_box_coordinates)
            bounding_box_info += " respectively."

        # Boxes at list positions 50, 60, ..., 90 form the prediction target.
        next_bounding_box_coordinates = [
            bboxes[i] if i < len(bboxes) else [] for i in range(50, 100, 10)
        ]

        # Prompting for prediction of the next 5 bounding boxes
        next_bounding_box_info = ""
        if any(next_bounding_box_coordinates):
            next_bounding_box_info = "The predicted trajectory for the pedestrian for the next 1 second is: "
            next_bounding_box_info += _format_bboxes(next_bounding_box_coordinates)
            next_bounding_box_info += " respectively."

        # Final Prompt
        prompt = {
            "Video ID": video_id,
            "Ped_id": pedestrian_id,
            "conversations": [
                {
                    "from": "user",
                    "value": f"Role: You are an autonomous vehicle that uses front-camera images to interact with pedestrians. Input: {input_images_line}. Above are 5 sequential ego-vehicle front-camera view images extraced from a 2 second video that you can see behind the wheel.{bounding_box_info}\nTask: predict the trajectory of the pedestrian of interest for the next 1 second.\nExpected output: coordinates of 6 bounding box indicating the trajectory of pedestrain for the next 1 second(in the form of [((al1,bl1), (ar1,br1))],[((al6, b6), (ar6, br6))])."
                },
                {
                    "from": "assistant",
                    "value": f"{next_bounding_box_info}"
                },
                {
                    "from": "user",
                    "value": "According to the pictures and trajectory above, answer the following questions:\n1. Describe the traffic situation (eg. environment, road condition, traffic sign, obstacles, other parties on the road) (optional)\n2. Describe the behavior of the pedestrian of interest by considering the following aspects:\n   - Is the pedestrian looking at the direction of the ego-vehicle?\n   - Is the pedestrian crossing in front of the car?\n3. Does the pedestrian interfere with ego-vehicle? What action should ego-vehicle take (stop, slow down or continue drive forward)"
                },
                {
                    "from": "assistant",
                    "value": f"The video is from a '{road_type}', the pedestrian crossing is '{pedestrian_crossing}', pedestrian sign is '{pedestrian_sign}', stop sign is '{stop_sign}', and the traffic light is '{traffic_light}'. The pedestrian is {cross} the road, while its also {look} at the car. For the vehicle, it is '{vehicle}'."
                }
            ]
        }

        all_video_prompts.append(prompt)
        print("Prompt:")
        print(json.dumps(prompt, indent=4))

# Write prompts to a JSON file
with open('output_prompts_liang.json', 'w') as f:
    json.dump(all_video_prompts, f, indent=4)

print("Prompts generated and saved to 'output_prompts_liang.json'.")
