In [None]:
# Cell 1: Import necessary libraries
import pandas as pd
import json
import math
import os

In [None]:
# Cell 2: Load the Label Studio export and reduce it to the columns used later
labelstudio_df = (
    pd.read_csv('/content/labelstudio.csv')
    .dropna(how='all')      # a row is dropped only when every one of its fields is empty
    [['image', 'label']]    # the image path and its annotation JSON are all we need
)

In [None]:
# Cell 3: Helper that derives image_id from an image path
def extract_image_id(path):
    """Return the image identifier embedded in an image path.

    The directory portion of ``path`` is discarded. If the remaining file
    name contains a ``-``, only the text after the last ``-`` is returned
    (e.g. ``/data/upload/1/ab12cd34-img001.jpg`` -> ``img001.jpg``);
    otherwise the file name is returned unchanged.
    """
    # rpartition yields the text after the final '-', or the whole string
    # when no '-' is present, so no explicit membership check is required.
    return os.path.basename(path).rpartition('-')[2]

In [None]:
# Cell 4: Process Label Studio annotations
def _annotation_to_bbox(anno):
    """Convert one Label Studio rectangle annotation into a pixel-space box.

    Args:
        anno: dict with 'x', 'y', 'width' and 'height' expressed as
            percentages of 'original_width' / 'original_height', plus an
            optional 'rotation' in degrees. Missing keys default to 0.

    Returns:
        Tuple ``(x_min, y_min, x_max, y_max)`` of ints: the axis-aligned box
        that encloses the (possibly rotated) rectangle, truncated with int().
    """
    orig_w = anno.get('original_width', 0)
    orig_h = anno.get('original_height', 0)

    # Convert percentages to pixels first: the rotation has to be applied in
    # pixel space because percent units are scaled differently on each axis.
    x = anno.get('x', 0) * orig_w / 100
    y = anno.get('y', 0) * orig_h / 100
    w = anno.get('width', 0) * orig_w / 100
    h = anno.get('height', 0) * orig_h / 100
    rot = anno.get('rotation', 0) or 0  # an explicit null means "not rotated"

    if rot == 0:
        return int(x), int(y), int(x + w), int(y + h)

    # Label Studio anchors the rotation at the rectangle's top-left corner
    # (x, y), not at its centre (see the Label Studio export-format docs), so
    # that corner stays fixed and the other three swing around it.
    theta = math.radians(rot)
    cos_t = math.cos(theta)
    sin_t = math.sin(theta)
    corner_xs = (x, x + w * cos_t, x + w * cos_t - h * sin_t, x - h * sin_t)
    corner_ys = (y, y + w * sin_t, y + w * sin_t + h * cos_t, y + h * cos_t)

    return (int(min(corner_xs)), int(min(corner_ys)),
            int(max(corner_xs)), int(max(corner_ys)))


ls_records = []

for _, row in labelstudio_df.iterrows():
    image_path = row['image']
    label_json = row['label']

    # A partially blank row reaches us as NaN (a float). Neither
    # os.path.basename nor json.loads accepts that, and the resulting
    # TypeError is not a JSONDecodeError, so skip such rows explicitly.
    if not isinstance(image_path, str) or not isinstance(label_json, str):
        continue

    image_id = extract_image_id(image_path)
    try:
        annotations = json.loads(label_json)
    except json.JSONDecodeError:
        continue  # Skip invalid JSON

    if isinstance(annotations, dict):
        annotations = [annotations]  # tolerate a single bare annotation object
    elif not isinstance(annotations, list):
        continue  # JSON null / scalar / string: nothing to iterate over

    for anno in annotations:
        if not isinstance(anno, dict):
            continue  # only objects can carry rectangle fields

        x_min, y_min, x_max, y_max = _annotation_to_bbox(anno)
        ls_records.append({
            'image_id': image_id,
            'x_min': x_min,
            'y_min': y_min,
            'x_max': x_max,
            'y_max': y_max
        })

# Naming the columns keeps the schema intact even when no record was produced,
# so the later sort / drop_duplicates steps still find the columns they need.
ls_combined_df = pd.DataFrame(
    ls_records,
    columns=['image_id', 'x_min', 'y_min', 'x_max', 'y_max'],
)

In [None]:
# Cell 5: Load the OpenCV detections and clean them
# image_ids whose OpenCV rows contain false data and must be excluded
false_images = ['img093.jpg', 'img057.jpg', 'img103.jpg']
opencv_df = (
    pd.read_csv('/content/opencv.csv')
    # keep only rows whose image_id is not on the exclusion list
    .loc[lambda frame: ~frame['image_id'].isin(false_images)]
    # remove the 'value' column when it exists; do nothing when it does not
    .drop(columns=['value'], errors='ignore')
)

In [None]:
# Cell 6: Combine both DataFrames
# Stack the Label Studio and OpenCV boxes, order them by image_id for
# readability, and drop rows that repeat the exact same box for an image.
# The index is renumbered last: resetting it before drop_duplicates would
# leave gaps in the index wherever a duplicate row had been removed.
combined_df = (
    pd.concat([ls_combined_df, opencv_df], ignore_index=True)
    .sort_values(by='image_id')
    .drop_duplicates(subset=['image_id', 'x_min', 'y_min', 'x_max', 'y_max'])
    .reset_index(drop=True)
)

In [None]:
# Cell 7: Write the merged bounding boxes to disk
# index=False keeps the DataFrame's row index out of the file.
combined_df.to_csv(path_or_buf='/content/combined.csv', index=False)
print("Combined CSV saved to /content/combined.csv")

Combined CSV saved to /content/combined.csv


In [None]:
# Cell 8: Display the combined DataFrame (optional)
# Leaving the frame as the cell's last expression lets the notebook render it
# as a table; this cell only inspects the result and changes nothing.
combined_df

Unnamed: 0,image_id,x_min,y_min,x_max,y_max
0,img001.jpg,397,1700,794,2071
1,img002.jpg,1275,1833,1784,2336
2,img003.jpg,727,1338,1055,1662
3,img004.jpg,1067,1738,1256,1930
4,img005.jpg,558,1727,1016,2195
...,...,...,...,...,...
1017,img249.jpg,1766,975,1943,1164
1018,img250.jpg,2585,916,2782,1118
1019,img250.jpg,1352,1054,1499,1237
1020,img250.jpg,1686,1611,1835,1763


In [None]:
# Sanity check: report how many distinct image_id values the combined frame holds
print("Unique image IDs in combined_df:")
unique_image_count = combined_df['image_id'].nunique()
display(unique_image_count)

Unique image IDs in combined_df:


250