In [12]:
import json
import os

def prodigy_to_doccano(prodigy_data):
    """Merge a sequence of Prodigy entries into one Doccano record.

    The ``"text"`` of every entry is joined with newlines into a single
    document. Each span found under an entry's optional ``"spans"`` key is
    emitted as ``[start, end, label]`` with ``start``/``end`` shifted by the
    position at which that entry's text begins in the joined document.

    Returns a dict with the keys ``"id"`` (always 1), ``"text"``, ``"label"``
    and ``"Comments"`` (always an empty list), in that order.
    """
    texts = [record["text"] for record in prodigy_data]
    document = "\n".join(texts)

    # Starting position of each entry inside the joined document. The extra
    # 1 per entry accounts for the newline separator that follows it.
    starts = []
    cursor = 0
    for text in texts:
        starts.append(cursor)
        cursor += len(text) + 1

    shifted_labels = [
        [span["start"] + shift, span["end"] + shift, span["label"]]
        for record, shift in zip(prodigy_data, starts)
        if "spans" in record
        for span in record["spans"]
    ]

    return {
        "id": 1,
        "text": document,
        "label": shifted_labels,
        "Comments": [],  # Prodigy data carries no comments
    }

def convert_files(input_dir, output_dir):
    """Convert Prodigy JSONL files to Doccano JSONL format.

    Walks ``input_dir`` recursively. Every file whose name ends in ``.jsonl``
    is read line by line, the parsed entries are passed to
    ``prodigy_to_doccano`` as one list, and the result is written as a single
    compact JSON object to the same relative path under ``output_dir``
    (intermediate directories are created as needed).

    Blank lines are ignored, and lines that are not valid JSON are reported
    and skipped. If a file cannot be read, converted, or written, the failure
    is printed together with the file's path and the underlying error, and
    processing continues with the next file instead of aborting the whole run.
    """
    for root, _, files in os.walk(input_dir):
        for filename in files:
            if not filename.endswith(".jsonl"):  # Only JSONL files are converted
                continue

            input_path = os.path.join(root, filename)
            relative_path = os.path.relpath(input_path, input_dir)
            relative_stem = os.path.splitext(relative_path)[0]
            output_path = os.path.join(output_dir, relative_stem + ".jsonl")

            try:
                os.makedirs(os.path.dirname(output_path), exist_ok=True)

                prodigy_data = []
                with open(input_path, "r", encoding="utf-8") as source:
                    for line in source:
                        if not line.strip():  # Ignore empty lines
                            continue
                        try:
                            prodigy_data.append(json.loads(line))
                        except json.JSONDecodeError:
                            print(f"Skipping invalid line in {input_path}: {line.strip()}")

                # Convert before opening the output so that a failed
                # conversion does not leave an empty output file behind.
                doccano_json = prodigy_to_doccano(prodigy_data)

                with open(output_path, "w", encoding="utf-8") as target:
                    json.dump(
                        doccano_json,
                        target,
                        ensure_ascii=False,
                        indent=None,
                        separators=(',', ':'),
                    )
            except (OSError, ValueError, KeyError, TypeError) as exc:
                # One bad file must not stop the remaining files; say which
                # file failed and why rather than a generic message.
                print(f"Error processing {input_path}: {exc!r}")
                continue

            print(f"Conversion complete! File saved to {output_path}")

def main():
    """Run the Prodigy-to-Doccano conversion on the hard-coded directories.

    Both paths are relative, so they resolve against the current working
    directory at the time of the call.
    """
    convert_files(
        "../Prodigy_Annotations/_Matching_Reports/JSON_Reports_Annotated",
        "../Doccano_Annotations/Prodigy_to_Doccano/_Matching_Reports",
    )

# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Conversion complete! File saved to ../Doccano_Annotations/Prodigy_to_Doccano/_Matching_Reports\9013084239\423576.jsonl
Conversion complete! File saved to ../Doccano_Annotations/Prodigy_to_Doccano/_Matching_Reports\9013084239\439236.jsonl
Conversion complete! File saved to ../Doccano_Annotations/Prodigy_to_Doccano/_Matching_Reports\9013236031\1186628.jsonl
Conversion complete! File saved to ../Doccano_Annotations/Prodigy_to_Doccano/_Matching_Reports\9013236031\1237713.jsonl
Conversion complete! File saved to ../Doccano_Annotations/Prodigy_to_Doccano/_Matching_Reports\9013236031\1255834.jsonl
