In [6]:
import json
import uuid
import time
from datetime import datetime, timedelta, timezone
import os

def generate_id():
    """Return the first ten characters of a freshly generated UUID4 string."""
    full_identifier = f"{uuid.uuid4()}"
    # Truncated for readability; the slice still contains the hyphen that
    # the canonical UUID text places at index 8.
    return full_identifier[:10]

def get_now():
    """Return the current UTC time as an ISO 8601 string with a "Z" suffix."""
    current_utc = datetime.now(tz=timezone.utc)
    iso_text = current_utc.isoformat()
    # Swap the numeric UTC offset for the shorter "Z" designator.
    return iso_text.replace("+00:00", "Z")

def get_hourbefore():
    """Return the UTC time one hour ago as an ISO 8601 string with a "Z" suffix."""
    one_hour = timedelta(hours=1)
    earlier_utc = datetime.now(tz=timezone.utc) - one_hour
    iso_text = earlier_utc.isoformat()
    # Swap the numeric UTC offset for the shorter "Z" designator.
    return iso_text.replace("+00:00", "Z")

def prodigy_to_labelstudio(prodigy_data):
    """Convert parsed Prodigy JSONL records into a Label Studio task list.

    The texts of all records are joined with newlines into one document.
    Each span is shifted by the position of its record inside that document,
    so the resulting offsets index into the combined text. The output is a
    single task carrying a single annotation that holds every span.

    Args:
        prodigy_data: Sequence of dicts (it is iterated more than once). Each
            dict must have a "text" string and may have a "spans" list whose
            items provide "start", "end" and "label". A missing or null
            "spans" value is treated as "no spans".

    Returns:
        A one-element list containing the task dict, ready for json.dump.

    Raises:
        KeyError: If a record lacks "text" or a span lacks "start", "end"
            or "label".
    """
    all_text = "\n".join(entry["text"] for entry in prodigy_data)

    result = []
    offset = 0  # Position of the current record's text inside all_text.
    for entry in prodigy_data:
        # `or []` also covers records that carry an explicit null for spans.
        for span in entry.get("spans") or []:
            start = span["start"] + offset
            end = span["end"] + offset
            result.append({
                "value": {
                    "start": start,
                    "end": end,
                    "text": all_text[start:end],
                    "labels": [span["label"]],
                },
                "id": generate_id(),
                "from_name": "label",
                "to_name": "text",
                "type": "labels",
                "origin": "manual",
            })
        offset += len(entry["text"]) + 1  # +1 for the joining newline

    # Random identifiers for the annotation and the pseudo upload file name.
    unique_id = str(uuid.uuid4())
    file_id = f"{str(uuid.uuid4())[:8]}_document.json"

    # Take each timestamp once so every field that shares it agrees exactly.
    now = get_now()
    hour_ago = get_hourbefore()

    # The numeric ids, user/project references and lead_time below are fixed
    # placeholder values; only the result, text, ids above and timestamps vary.
    label_studio_data = [{
        "id": 1,
        "annotations": [{
            "id": 2,
            "completed_by": 1,
            "result": result,
            "was_cancelled": False,
            "ground_truth": False,
            "created_at": now,
            "updated_at": now,
            "draft_created_at": hour_ago,
            "lead_time": 61.722,
            "prediction": {},
            "result_count": len(result),
            "unique_id": unique_id,
            "import_id": None,
            "last_action": None,
            "bulk_created": False,
            "task": 1,
            "project": 1,
            "updated_by": 1,
            "parent_prediction": None,
            "parent_annotation": None,
            "last_created_by": None,
        }],
        "file_upload": file_id,
        "drafts": [],
        "predictions": [],
        "data": {"text": all_text},
        "meta": {},
        "created_at": hour_ago,
        "updated_at": now,
        "inner_id": 1,
        "total_annotations": 1,
        "cancelled_annotations": 0,
        "total_predictions": 0,
        "comment_count": 0,
        "unresolved_comment_count": 0,
        "last_comment_updated_at": None,
        "project": 1,
        "updated_by": 1,
        "comment_authors": [],
    }]

    # Return the Python structure itself; serialisation is left to the caller.
    return label_studio_data

def convert_files(input_dir, output_dir):
    """Convert every Prodigy ``.jsonl`` file under input_dir to Label Studio JSON.

    The directory tree below input_dir is walked recursively. Each
    ``.jsonl`` file is written to the same relative location below
    output_dir with a ``.json`` extension; missing output directories are
    created. Blank lines are ignored and lines that are not valid JSON are
    reported and skipped.

    Conversion is best-effort per file: if one file fails, the error is
    printed together with the file path and processing continues with the
    next file.

    Args:
        input_dir: Root directory that is searched for ``.jsonl`` files.
        output_dir: Root directory that receives the converted files.

    Returns:
        None. Progress and errors are reported via print.
    """
    for root, _, files in os.walk(input_dir):
        for filename in files:
            if not filename.endswith(".jsonl"):
                continue

            input_path = os.path.join(root, filename)
            relative_stem = os.path.splitext(os.path.relpath(input_path, input_dir))[0]
            output_path = os.path.join(output_dir, relative_stem + ".json")

            try:
                # dirname is empty when output_dir is "" and the file sits at
                # the top level; makedirs("") would raise in that case.
                output_parent = os.path.dirname(output_path)
                if output_parent:
                    os.makedirs(output_parent, exist_ok=True)

                prodigy_data = []
                with open(input_path, "r", encoding="utf-8") as source:
                    for line in source:
                        if not line.strip():
                            continue
                        try:
                            prodigy_data.append(json.loads(line))
                        except json.JSONDecodeError:
                            print(f"Skipping invalid line in {input_path}: {line.strip()}")

                # Convert Prodigy JSON to Label Studio JSON
                ls_json = prodigy_to_labelstudio(prodigy_data)

                with open(output_path, "w", encoding="utf-8") as target:
                    json.dump(ls_json, target, ensure_ascii=False, indent=4)
            except Exception as exc:
                # Batch boundary: report the real cause for this file and move
                # on, instead of aborting the whole walk with a generic message.
                print(f"Error converting {input_path}: {exc!r}")
                continue

            print(f"Conversion complete! File saved to {output_path}")

def main():
    """Run the conversion on the hard-coded Prodigy and Label Studio folders."""
    # Both locations are relative to the current working directory.
    source_root = "../Prodigy_Annotations/JSON_Reports_Annotated"
    target_root = "../Label-Studio_Annotations/Prodigy_to_Label-Studio/JSON_Reports_Annotated"
    convert_files(source_root, target_root)

# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()


Conversion complete! File saved to ../Label-Studio_Annotations/Prodigy_to_Label-Studio/JSON_Reports_Annotated\9012467589\401489.json
Conversion complete! File saved to ../Label-Studio_Annotations/Prodigy_to_Label-Studio/JSON_Reports_Annotated\9012467589\424024.json
Conversion complete! File saved to ../Label-Studio_Annotations/Prodigy_to_Label-Studio/JSON_Reports_Annotated\9012467589\437690.json
Conversion complete! File saved to ../Label-Studio_Annotations/Prodigy_to_Label-Studio/JSON_Reports_Annotated\9012630932\1257718.json
Conversion complete! File saved to ../Label-Studio_Annotations/Prodigy_to_Label-Studio/JSON_Reports_Annotated\9012630932\882937.json
Conversion complete! File saved to ../Label-Studio_Annotations/Prodigy_to_Label-Studio/JSON_Reports_Annotated\9012630932\895557.json
Conversion complete! File saved to ../Label-Studio_Annotations/Prodigy_to_Label-Studio/JSON_Reports_Annotated\9012630932\907883.json
Conversion complete! File saved to ../Label-Studio_Annotations/Prodi