In [12]:
import json
import uuid
import time
from datetime import datetime, timedelta, timezone
import os

def generate_id():
    """Generate a short random identifier.

    Returns:
        str: The first 10 hexadecimal characters of a random UUID4.
    """
    # Slice the hex form instead of str(uuid): the canonical string has a
    # hyphen at index 8, which would land inside a 10-character prefix and
    # leave only 9 random characters.
    return uuid.uuid4().hex[:10]

def get_now():
    """Give the present UTC time as an ISO-8601 string ending in "Z"."""
    utc_now = datetime.now(timezone.utc)
    iso_text = utc_now.isoformat()
    # Swap the numeric UTC offset for the "Z" designator.
    return iso_text.replace("+00:00", "Z")
    

def get_hourbefore():
    """Give the UTC time sixty minutes in the past as an ISO-8601 string ending in "Z"."""
    one_hour = timedelta(hours=1)
    earlier = datetime.now(timezone.utc) - one_hour
    iso_text = earlier.isoformat()
    # Swap the numeric UTC offset for the "Z" designator.
    return iso_text.replace("+00:00", "Z")

def doccano_to_labelstudio(doccano_jsonl):
    """Convert Doccano JSONL to Label Studio JSON.

    Args:
        doccano_jsonl: A string with one Doccano JSON object per line. Each
            object must have a "text" key and a "label" key holding
            ``[start, end, label]`` triples; an "id" key is optional and a
            random one is generated when it is absent. Blank lines are
            ignored, so an empty string yields an empty task list.

    Returns:
        str: JSON text (indented by 4 spaces) encoding a list with one task
        per non-blank input line, each carrying a single annotation.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If an entry lacks "text" or "label".
    """
    label_studio_data = []

    for line in doccano_jsonl.split('\n'):
        # Skip blank lines (including the empty-input case) instead of
        # handing an empty string to json.loads, which would raise.
        if not line.strip():
            continue

        doccano_entry = json.loads(line)
        text = doccano_entry["text"]
        labels = doccano_entry["label"]

        result = []
        for start, end, label in labels:
            result.append({
                "value": {
                    "start": start,
                    "end": end,  # Doccano's end offset is passed through unchanged
                    "text": text[start:end],  # The labelled span of the source text
                    "labels": [label]
                },
                "id": generate_id(),
                "from_name": "label",
                "to_name": "text",
                "type": "labels",
                "origin": "manual"
            })

        annotation_id = doccano_entry.get("id", generate_id())

        # Read the clock once per entry so the task and its annotation carry
        # identical created/updated timestamps.
        created_at = get_hourbefore()
        updated_at = get_now()

        label_studio_data.append({
            "id": annotation_id,
            "annotations": [{
                "id": annotation_id,
                "completed_by": 1,
                "result": result,
                "was_cancelled": False,
                "ground_truth": False,
                "created_at": created_at,
                "updated_at": updated_at,
                "draft_created_at": created_at,
                "lead_time": 60.15,  # Placeholder value; doesn't matter
                "prediction": {},
                "result_count": len(labels),
                "unique_id": str(uuid.uuid4()),
                "import_id": None,
                "last_action": None,
                "bulk_created": False,
                "task": annotation_id,
                "project": 1,
                "updated_by": 1,
                "parent_prediction": None,
                "parent_annotation": None,
                "last_created_by": None
            }],
            "file_upload": f"{str(uuid.uuid4())[:8]}_document.json",
            "drafts": [],
            "predictions": [],
            "data": {"text": text},
            "meta": {},
            "created_at": created_at,  # One hour before now; the exact value doesn't really matter
            "updated_at": updated_at,
            "inner_id": annotation_id,
            "total_annotations": 1,
            "cancelled_annotations": 0,
            "total_predictions": 0,
            "comment_count": 0,
            "unresolved_comment_count": 0,
            "last_comment_updated_at": None,
            "project": 1,
            "updated_by": 1,
            "comment_authors": []
        })

    return json.dumps(label_studio_data, indent=4)


def convert_files(input_dir, output_dir):
    """Convert Doccano JSONL files to Label-Studio JSON format.

    Walks ``input_dir`` recursively and, for every file whose name ends in
    ".jsonl", writes the converted result to the same relative path under
    ``output_dir`` with a ".json" extension.

    Args:
        input_dir: Root directory searched for ".jsonl" files.
        output_dir: Root directory that receives the converted files;
            sub-directories are created only when there is a file to write.

    Returns:
        None. Progress and problems are reported with ``print``. Lines that
        are not valid JSON are skipped, files with no valid lines are
        skipped, and a failure on one file does not stop the remaining files
        from being converted.
    """
    for root, _, filenames in os.walk(input_dir):
        for filename in filenames:
            if not filename.endswith(".jsonl"):
                continue

            input_path = os.path.join(root, filename)
            relative_stem = os.path.splitext(os.path.relpath(input_path, input_dir))[0]
            output_path = os.path.join(output_dir, relative_stem + ".json")

            # Handle errors per file so one bad input cannot abort the batch.
            try:
                doccano_data = []
                with open(input_path, "r", encoding="utf-8") as source:
                    for line in source:
                        if not line.strip():
                            continue
                        try:
                            doccano_data.append(json.loads(line))
                        except json.JSONDecodeError:
                            print(f"Skipping invalid line in {input_path}: {line.strip()}")

                # Nothing usable in this file: do not call the converter with
                # an empty string and do not create an empty output.
                if not doccano_data:
                    print(f"No valid entries in {input_path}; skipping")
                    continue

                label_studio_json = doccano_to_labelstudio(
                    '\n'.join(json.dumps(obj) for obj in doccano_data)
                )

                parent_dir = os.path.dirname(output_path)
                if parent_dir:
                    os.makedirs(parent_dir, exist_ok=True)
                with open(output_path, "w", encoding="utf-8") as target:
                    target.write(label_studio_json)
            except (OSError, KeyError, TypeError, ValueError) as exc:
                # OSError: read/write problems; the others: entries that do
                # not have the expected Doccano shape.
                print(f"Error converting {input_path}: {exc!r}")
                continue

            print(f"Conversion complete! File saved to {output_path}")

def main():
    """Run the Doccano-to-Label-Studio conversion on the fixed project paths."""
    source_root = "../Doccano_Annotations/JSON_Reports_Annotated"
    target_root = "../Label-Studio_Annotations/Doccano_to_Label-Studio/JSON_Reports_Annotated"
    convert_files(source_root, target_root)

# Run the conversion only when this module is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


Conversion complete! File saved to ../Label-Studio_Annotations/Doccano_to_Label-Studio/JSON_Reports_Annotated\9013540851\414653.json
Conversion complete! File saved to ../Label-Studio_Annotations/Doccano_to_Label-Studio/JSON_Reports_Annotated\9013540851\433023.json
Conversion complete! File saved to ../Label-Studio_Annotations/Doccano_to_Label-Studio/JSON_Reports_Annotated\9013612611\311711.json
Conversion complete! File saved to ../Label-Studio_Annotations/Doccano_to_Label-Studio/JSON_Reports_Annotated\9013612611\537699.json
Conversion complete! File saved to ../Label-Studio_Annotations/Doccano_to_Label-Studio/JSON_Reports_Annotated\9013612611\537705.json
Conversion complete! File saved to ../Label-Studio_Annotations/Doccano_to_Label-Studio/JSON_Reports_Annotated\9013612611\569928.json
