In [1]:
# Data Preparation Notebook
# Process all labeler folders, save images to single folder with labels CSV

import json
import re
from pathlib import Path
from PIL import Image
import pandas as pd

# Paths
ANKLEALIGN_PATH = Path("../anklealign")
OUTPUT_PATH = Path("../data/prepared_images")
LABEL_PATH = Path("../data/labels")

# Create output directory
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
LABEL_PATH.mkdir(parents=True, exist_ok=True)


# Label mapping from Hungarian to English
LABEL_MAP = {
    '1_Pronacio': 'Pronation',
    '1_Pronáló': 'Pronation',
    'pronation': 'Pronation',
    '2_Neutralis': 'Neutral',
    'neutral': 'Neutral',
    '2_Neutral': 'Neutral',
    '3_Szupinacio': 'Supination',
    '3_Szupináló': 'Supination',
    'supination': 'Supination',
}

print(f"Output directory: {OUTPUT_PATH.absolute()}")


Output directory: d:\src\repos\ankle-align\notebook\..\data\prepared_images


## 1. Find Labeler Folders

Scan for folders with 6 alphanumeric character names (labeler IDs).


In [2]:
# Pattern for 6 alphanumeric character labeler IDs
LABELER_ID_PATTERN = re.compile(r'^[A-Za-z0-9]{6}$')

# Find all labeler folders (excluding 'consensus' and 'sample')
labeler_folders = []
for folder in ANKLEALIGN_PATH.iterdir():
    if folder.is_dir() and LABELER_ID_PATTERN.match(folder.name):
        labeler_folders.append(folder)

labeler_folders = sorted(labeler_folders, key=lambda x: x.name.upper())
labeler_folders = [f for f in labeler_folders if f.name not in {'consensus', 'sample'}]

print(f"Found {len(labeler_folders)} labeler folders:")
for folder in labeler_folders:
    # Count images in folder
    image_exts = {'.jpg', '.jpeg', '.png', '.webp', '.gif', '.bmp'}
    n_images = sum(1 for f in folder.iterdir() if f.suffix.lower() in image_exts)
    # Check for JSON file
    json_files = list(folder.glob('*.json'))
    json_status = f"✓ {json_files[0].name}" if json_files else "✗ No JSON"
    print(f"  {folder.name}: {n_images} images, {json_status}")


Found 15 labeler folders:
  B8V41Y: 20 images, ✓ b8v41y.json
  C6037J: 34 images, ✓ C6037J.json
  D6AE9F: 22 images, ✓ D6AE9F.json
  ECSGGY: 37 images, ✗ No JSON
  FGWUFP: 20 images, ✓ FGWUFP.json
  FO6K58: 32 images, ✓ FO6K58_labels.json
  GI9Y8B: 51 images, ✗ No JSON
  GK1XQ4: 52 images, ✓ project-1-at-2025-10-15-23-46-9d203653.json
  H51B9J: 23 images, ✓ H51B9J.json
  ITWQ3V: 23 images, ✓ ITWQ3V.json
  NC1O2T: 0 images, ✓ hf_labels_export.json
  NX9GA4: 20 images, ✓ NX9GA4_ankles_labeled.json
  ODZF0M: 20 images, ✓ project-2-at-2025-10-16-02-08-8ee4fdfa.json
  OJHGS8: 20 images, ✓ OJHGS8.json
  XV0M8Z: 20 images, ✓ AnkleAlign_Cimkezes_XV0M8Z.json


## 2. Load Labels from JSON

Parse Label Studio JSON files and create image name to label mappings.


In [3]:
def extract_image_name_from_upload(file_upload):
    """
    Extract original image name from Label Studio file_upload path.
    Format: "uuid-original_name.ext" -> "original_name.ext"
    """
    filename = Path(file_upload).name
    # Remove UUID prefix (8 hex chars + hyphen)
    uuid_pattern = r'^[a-f0-9]{8}-(.+)$'
    match = re.match(uuid_pattern, filename, re.IGNORECASE)
    if match:
        return match.group(1)
    return filename

def parse_labeler_json(json_path):
    """
    Parse Label Studio JSON and return {image_name: label} mapping.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    labels = {}
    for task in data:
        file_upload = task.get('file_upload', '')
        image_name = extract_image_name_from_upload(file_upload)
        
        # Extract label
        annotations = task.get('annotations', [])
        if annotations:
            results = annotations[0].get('result', [])
            if results:
                choices = results[0].get('value', {}).get('choices', [])
                if choices:
                    raw_label = choices[0]
                    labels[image_name] = LABEL_MAP.get(raw_label, raw_label)
    
    return labels

# Test on one folder
test_folder = labeler_folders[0]
test_json = list(test_folder.glob('*.json'))[0]
test_labels = parse_labeler_json(test_json)
print(f"Test: {test_folder.name}")
print(f"  Loaded {len(test_labels)} labels from {test_json.name}")
print(f"  Sample: {list(test_labels.items())[:3]}")


Test: B8V41Y
  Loaded 20 labels from b8v41y.json
  Sample: [('internet_actualne_01.jpg', 'Pronation'), ('internet_algeos_01.jpg', 'Neutral'), ('internet_everypixel_01.jpg', 'Pronation')]


## 3. Process All Labeler Folders

Iterate through all images, save to single folder as PNG, and create per-labeler label CSV files.


In [4]:
def find_image_label(image_name, labels_dict, labeler_id):
    """
    Try to find a label for an image, handling various naming issues.
    """
    # Direct match
    if image_name in labels_dict:
        return labels_dict[image_name]
    
    # Case-insensitive match
    image_lower = image_name.lower()
    for name, label in labels_dict.items():
        if name.lower() == image_lower:
            return label
    
    # Try without labeler prefix if it was added
    if image_name.lower().startswith(labeler_id.lower() + '_'):
        stripped = image_name[len(labeler_id) + 1:]
        if stripped in labels_dict:
            return labels_dict[stripped]
        for name, label in labels_dict.items():
            if name.lower() == stripped.lower():
                return label
    
    # Try matching by finding the image name in labels keys
    stem = Path(image_name).stem
    for name, label in labels_dict.items():
        if stem.lower() in name.lower() or name.lower() in stem.lower():
            return label
    
    return None

def convert_and_save_as_png(source_path, dest_path):
    """
    Open image and save as PNG.
    """
    try:
        with Image.open(source_path) as img:
            # Convert to RGB if necessary (for images with alpha channel or palette)
            if img.mode in ('RGBA', 'LA', 'P'):
                img = img.convert('RGB')
            img.save(dest_path, 'PNG')
        return True
    except Exception as e:
        print(f"    Error converting {source_path}: {e}")
        return False

# Image extensions to process
IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.webp', '.gif', '.bmp', '.tiff'}


In [5]:
# Process all labeler folders
stats = {
    'total_images': 0,
    'copied': {'Pronation': 0, 'Neutral': 0, 'Supination': 0, 'Unlabeled': 0},
    'errors': 0,
    'no_json': 0,
    'per_labeler': {}
}

print("Processing labeler folders...")
print("=" * 70)

for folder in labeler_folders:
    labeler_id = folder.name
    print(f"\n{labeler_id}:")
    
    # Find JSON file
    json_files = list(folder.glob('*.json'))
    if not json_files:
        print(f"  ⚠️ No JSON file found, skipping...")
        stats['no_json'] += 1
        continue
    
    # Load labels
    labels = parse_labeler_json(json_files[0])
    print(f"  Loaded {len(labels)} labels from {json_files[0].name}")
    
    # Process images and collect label data for CSV
    labeler_stats = {'Pronation': 0, 'Neutral': 0, 'Supination': 0, 'Unlabeled': 0, 'errors': 0}
    labeler_label_rows = []  # For CSV export
    
    for image_file in folder.iterdir():
        if image_file.suffix.lower() not in IMAGE_EXTENSIONS:
            continue
        
        stats['total_images'] += 1
        
        # Find label for this image
        label = find_image_label(image_file.name, labels, labeler_id)
        
        if label is None:
            label = 'Unlabeled'
        
        # Determine output filename: [labeler_id]_[original_filename].png
        original_stem = image_file.stem
        new_filename = f"{labeler_id}_{original_stem}.png"
        dest_path = OUTPUT_PATH / new_filename  # All images in one folder
        
        # Convert and save as PNG
        if convert_and_save_as_png(image_file, dest_path):
            labeler_stats[label] += 1
            stats['copied'][label] += 1
            # Add to label rows for CSV
            labeler_label_rows.append({
                'filename': new_filename,
                'original_filename': image_file.name,
                'label': label
            })
        else:
            labeler_stats['errors'] += 1
            stats['errors'] += 1
    
    # Save labeler's labels to CSV
    if labeler_label_rows:
        df_labeler_labels = pd.DataFrame(labeler_label_rows)
        csv_path = LABEL_PATH / f"{labeler_id}_labels.csv"
        df_labeler_labels.to_csv(csv_path, index=False)
        print(f"  Saved labels to {csv_path.name}")
    
    stats['per_labeler'][labeler_id] = labeler_stats
    print(f"  Copied: P={labeler_stats['Pronation']}, N={labeler_stats['Neutral']}, "
          f"S={labeler_stats['Supination']}, U={labeler_stats['Unlabeled']}")


Processing labeler folders...

B8V41Y:
  Loaded 20 labels from b8v41y.json
  Saved labels to B8V41Y_labels.csv
  Copied: P=7, N=13, S=0, U=0

C6037J:
  Loaded 34 labels from C6037J.json
  Saved labels to C6037J_labels.csv
  Copied: P=16, N=13, S=5, U=0

D6AE9F:
  Loaded 22 labels from D6AE9F.json
  Saved labels to D6AE9F_labels.csv
  Copied: P=7, N=11, S=4, U=0

ECSGGY:
  ⚠️ No JSON file found, skipping...

FGWUFP:
  Loaded 20 labels from FGWUFP.json
  Saved labels to FGWUFP_labels.csv
  Copied: P=8, N=12, S=0, U=0

FO6K58:
  Loaded 32 labels from FO6K58_labels.json
  Saved labels to FO6K58_labels.csv
  Copied: P=7, N=17, S=8, U=0

GI9Y8B:
  ⚠️ No JSON file found, skipping...

GK1XQ4:
  Loaded 52 labels from project-1-at-2025-10-15-23-46-9d203653.json
  Saved labels to GK1XQ4_labels.csv
  Copied: P=28, N=17, S=7, U=0

H51B9J:
  Loaded 23 labels from H51B9J.json
  Saved labels to H51B9J_labels.csv
  Copied: P=8, N=15, S=0, U=0

ITWQ3V:
  Loaded 22 labels from ITWQ3V.json
  Saved labels 

## 4. Summary Statistics


In [6]:
# Print summary
print("\n" + "=" * 70)
print("PROCESSING COMPLETE")
print("=" * 70)
print(f"\nLabeler folders processed: {len(labeler_folders) - stats['no_json']}")
print(f"Folders without JSON: {stats['no_json']}")
print(f"\nTotal images processed: {stats['total_images']}")
print(f"\nImages by label:")
for label, count in stats['copied'].items():
    pct = count / stats['total_images'] * 100 if stats['total_images'] > 0 else 0
    print(f"  {label}: {count} ({pct:.1f}%)")
print(f"\nConversion errors: {stats['errors']}")

# Verify output directory contents
n_pngs = len(list(OUTPUT_PATH.glob('*.png')))
n_csvs = len(list(OUTPUT_PATH.glob('*_labels.csv')))
print(f"\nOutput directory contents:")
print(f"  PNG images: {n_pngs}")
print(f"  Label CSV files: {n_csvs}")



PROCESSING COMPLETE

Labeler folders processed: 13
Folders without JSON: 2

Total images processed: 306

Images by label:
  Pronation: 125 (40.8%)
  Neutral: 133 (43.5%)
  Supination: 45 (14.7%)
  Unlabeled: 3 (1.0%)

Conversion errors: 0

Output directory contents:
  PNG images: 326
  Label CSV files: 0


In [7]:
# Per-labeler breakdown
df_stats = pd.DataFrame(stats['per_labeler']).T
df_stats.index.name = 'labeler_id'
df_stats['total'] = df_stats[['Pronation', 'Neutral', 'Supination', 'Unlabeled']].sum(axis=1)
df_stats = df_stats[['Pronation', 'Neutral', 'Supination', 'Unlabeled', 'errors', 'total']]
df_stats = df_stats.sort_values('total', ascending=False)

print("\nPer-labeler breakdown:")
display(df_stats)



Per-labeler breakdown:


Unnamed: 0_level_0,Pronation,Neutral,Supination,Unlabeled,errors,total
labeler_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GK1XQ4,28,17,7,0,0,52
C6037J,16,13,5,0,0,34
FO6K58,7,17,8,0,0,32
H51B9J,8,15,0,0,0,23
ITWQ3V,11,4,7,1,0,23
D6AE9F,7,11,4,0,0,22
B8V41Y,7,13,0,0,0,20
FGWUFP,8,12,0,0,0,20
NX9GA4,10,10,0,0,0,20
OJHGS8,9,4,7,0,0,20


### Special cases

NC1O2T - images sorted in directories

In [8]:
labeler_id = "NC1O2T"
folder = ANKLEALIGN_PATH / labeler_id
labeler_label_rows = []
folder_map = {
    'normal': 'Neutral',
    'supination': 'Supination',
    'pronation': 'Pronation'
}
for folder_name, label in folder_map.items():
    for image_file in (folder / folder_name).iterdir():
        if image_file.suffix.lower() not in IMAGE_EXTENSIONS:
            continue
        print(image_file.name)

        original_stem = image_file.stem
        new_filename = f"{labeler_id}_{original_stem}.png"
        dest_path = OUTPUT_PATH / new_filename  # All images in one folder
        
        # Convert and save as PNG
        if convert_and_save_as_png(image_file, dest_path):
            labeler_stats[label] += 1
            stats['copied'][label] += 1
            # Add to label rows for CSV
            labeler_label_rows.append({
                'filename': new_filename,
                'original_filename': image_file.name,
                'label': label
            })
        else:
            print(f"Error converting {image_file}")

if labeler_label_rows:
    df_labeler_labels = pd.DataFrame(labeler_label_rows)
    csv_path = LABEL_PATH / f"{labeler_id}_labels.csv"
    df_labeler_labels.to_csv(csv_path, index=False)
    print(f"  Saved labels to {csv_path.name}")

internet_google_01.jpg
internet_google_02.jpg
internet_google_03.webp
internet_google_04.jpg
internet_google_05.jpg
internet_google_06.png
internet_google_07.webp
internet_google_16.webp
internet_google_17.jpg
internet_google_18.png
internet_google_19.png
internet_google_20.png
internet_google_08.webp
internet_google_09.jpg
internet_google_10.jpg
internet_google_11.jpg
internet_google_12.jpg
internet_google_13.png
internet_google_14.png
internet_google_15.jpg
  Saved labels to NC1O2T_labels.csv


# Sanity check

In [9]:
# create a merged labels CSV file
all_label_rows = []
for labeler_id in stats['per_labeler'].keys():
    csv_path = LABEL_PATH / f"{labeler_id}_labels.csv"
    if csv_path.exists():
        df_labels = pd.read_csv(csv_path)
        all_label_rows.append(df_labels)
merged_labels_df = pd.concat(all_label_rows, ignore_index=True)
merged_csv_path = LABEL_PATH / "merged_labels.csv"

In [10]:
merged_labels_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 326 entries, 0 to 325
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   filename           326 non-null    object
 1   original_filename  326 non-null    object
 2   label              326 non-null    object
dtypes: object(3)
memory usage: 7.8+ KB


In [11]:
# count files in output path
n_pngs = len(list(OUTPUT_PATH.glob('*.png')))
print(f"Total PNG images in output path: {n_pngs}")

Total PNG images in output path: 326


we have 326 images and 326 labels. Looks good!

In [12]:
# save merged CSV
merged_labels_df.to_csv(merged_csv_path, index=False)

Some images were rotated by 90 degrees, this was fixed manually.