In [6]:
import os
import pandas as pd

def translate_to_natural_language(df):
    """Render tracking rows as one English sentence per detection.

    Args:
        df: pandas DataFrame with the columns 'Frame', 'Object_ID', 'BB_Left',
            'BB_Top', 'BB_Width', 'BB_Height', 'Class_ID' and 'Confidence'.

    Returns:
        str: newline-terminated sentences concatenated in ascending frame
        order; rows that share a frame keep their original relative order.
        An empty DataFrame yields an empty string.
    """
    # An empty frame may not even have a 'Frame' column, which would make
    # groupby raise; there is nothing to describe in that case.
    if df.empty:
        return ""

    # NOTE(review): 2/3/5/7 look like COCO class indices, where "person" is
    # usually 0 rather than -1 -- kept as in the original; confirm with the
    # tracker that produced the files.
    class_id_to_name = {-1: "person", 2: "car", 3: "motorcycle", 5: "bus", 7: "truck"}

    natural_language_sentences = []
    # groupby sorts by frame number and preserves row order inside each group.
    for _, group in df.groupby('Frame'):
        for _, row in group.iterrows():
            class_id = int(row['Class_ID'])
            # Unknown classes get a generic label instead of raising KeyError.
            object_name = class_id_to_name.get(class_id, f"class_{class_id}")
            sentence = f"At frame {int(row['Frame'])}, {object_name} {int(row['Object_ID'])} stands at coordinates [{row['BB_Left']:.2f}, {row['BB_Top']:.2f}] with a bbox of width {row['BB_Width']:.2f} and height {row['BB_Height']:.2f},conf is {row['Confidence']:.2f}\n"
            natural_language_sentences.append(sentence)

    return "".join(natural_language_sentences)


def read_and_translate_mot_files(input_folder, output_folder):
    """Translate every ``.txt`` tracking file in ``input_folder`` into sentences.

    Each non-blank line must hold ten comma-separated numeric fields. In
    order they are read as: frame, object id, box left, box top, box width,
    box height, confidence, class id, and two trailing fields that are
    ignored. The translated text for ``<name>.txt`` is written to
    ``output_folder/mot_<name>.txt``.

    Args:
        input_folder: directory scanned (non-recursively) for ``.txt`` files.
        output_folder: directory for the translated files; created if missing.

    Raises:
        ValueError: if a non-blank line does not contain exactly ten numeric
            fields.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Sorted so files are processed (and logged) in a reproducible order.
    for filename in sorted(os.listdir(input_folder)):
        if not filename.endswith('.txt'):
            continue

        print(filename)
        file_path = os.path.join(input_folder, filename)
        with open(file_path, 'r') as file:
            # Drop blank lines (e.g. a trailing newline) so they are not
            # fed to float().
            lines = [line.strip() for line in file if line.strip()]
        if not lines:
            print(f"no detected obj in current video:{file_path}")
            # Skip only this file; the remaining files must still be processed.
            continue

        parsed_data = []
        for line in lines:
            frame, object_id, bb_left, bb_top, bb_width, bb_height, conf, class_id, _, _ = map(float, line.split(","))
            parsed_data.append({
                'Frame': int(frame),
                'Object_ID': int(object_id),
                'BB_Left': bb_left,
                'BB_Top': bb_top,
                'BB_Width': bb_width,
                'BB_Height': bb_height,
                'Class_ID': int(class_id),
                'Confidence': conf,
            })

        df_mot = pd.DataFrame(parsed_data)
        translated_text = translate_to_natural_language(df_mot)

        output_file_path = os.path.join(output_folder, f"mot_{filename}")
        with open(output_file_path, 'w') as file:
            file.write(translated_text)

# Translate the .txt tracking files found in the current directory; each
# result is written to mot_archive/ as mot_<original filename>.
read_and_translate_mot_files("./","mot_archive/")


video_0001.txt
video_0003.txt
video_0055.txt
video_0056.txt
video_0057.txt
video_0194.txt
video_0310.txt
video_0313.txt
video_0333.txt
video_0343.txt
no detected obj in current video:./video_0343.txt


Split the processing into smaller functions for better readability.

In [11]:

def parse_mot_into_dict(input_folder, output_folder):
    """Parse every ``.txt`` tracking file in ``input_folder`` into dictionaries.

    Each non-blank line must hold ten comma-separated numeric fields. In
    order they are read as: frame, object id, box left, box top, box width,
    box height, confidence, class id, and two trailing fields that are
    ignored.

    Args:
        input_folder: directory scanned (non-recursively) for ``.txt`` files.
        output_folder: directory created if missing; nothing is written to it
            by this function.

    Returns:
        dict: maps each ``.txt`` filename to a list of record dicts with the
        keys 'Frame', 'Object_ID', 'BB_Left', 'BB_Top', 'BB_Width',
        'BB_Height', 'Class_ID' and 'Confidence'. A file without detections
        maps to an empty list.

    Raises:
        ValueError: if a non-blank line does not contain exactly ten numeric
            fields.
    """
    os.makedirs(output_folder, exist_ok=True)

    parsed_by_file = {}
    # Sorted so the resulting dict has a reproducible key order.
    for filename in sorted(os.listdir(input_folder)):
        if not filename.endswith('.txt'):
            continue

        file_path = os.path.join(input_folder, filename)
        with open(file_path, 'r') as file:
            # Drop blank lines (e.g. a trailing newline) so they are not
            # fed to float().
            lines = [line.strip() for line in file if line.strip()]
        if not lines:
            print(f"no detected obj in current video:{file_path}")

        parsed_data = []
        for line in lines:
            frame, object_id, bb_left, bb_top, bb_width, bb_height, conf, class_id, _, _ = map(float, line.split(","))
            parsed_data.append({
                'Frame': int(frame),
                'Object_ID': int(object_id),
                'BB_Left': bb_left,
                'BB_Top': bb_top,
                'BB_Width': bb_width,
                'BB_Height': bb_height,
                'Class_ID': int(class_id),
                'Confidence': conf,
            })

        # Keep the records instead of discarding them, and keep going so
        # every file in the folder is parsed rather than only the first one.
        parsed_by_file[filename] = parsed_data

    return parsed_by_file

# NOTE(review): this cell defines parse_mot_into_dict but still calls the
# earlier read_and_translate_mot_files -- confirm which call was intended.
read_and_translate_mot_files("./BDD","mot_archive/")