In [1]:

import json
import spacy
from spacy.tokens import DocBin

def convert_json_to_spacy(json_file_path, spacy_file_path, model="en_core_web_sm"):
    """
    Convert JSON NER annotations into spaCy's binary training format (.spacy).

    The JSON file must contain a list of ``[text, {"entities": [[start, end, label], ...]}]``
    pairs, where ``start`` and ``end`` are character offsets into ``text``.

    Entities whose offsets cannot be mapped onto whole tokens are skipped with a
    printed notice. Overlapping or duplicate entities are reduced to a
    non-overlapping set (longest span wins) instead of aborting the conversion.

    Args:
        json_file_path (str): Path to the JSON file.
        spacy_file_path (str): Path to save the spaCy output file.
        model (str): Accepted for backward compatibility but currently unused;
            texts are always tokenized with a blank Turkish ("tr") pipeline.
    """
    # Only a tokenizer is needed to build the Doc objects, so a blank pipeline
    # is sufficient. NOTE: the language is fixed to Turkish regardless of `model`.
    nlp = spacy.blank("tr")

    db = DocBin()
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    for text, annotations in data:
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in annotations["entities"]:
            # "contract" shrinks the character range to the tokens fully inside
            # it; char_span returns None when no whole token fits, i.e. the
            # offsets do not line up with token boundaries.
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                print(f"Skipping entity: Text '{text[start:end]}' not found in document.")
            else:
                ents.append(span)

        # Assigning overlapping spans to doc.ents raises ValueError, so resolve
        # overlaps/duplicates first (filter_spans prefers the longest span).
        kept = spacy.util.filter_spans(ents)
        if len(kept) < len(ents):
            print(f"Dropped {len(ents) - len(kept)} overlapping entity span(s) in text: '{text[:50]}'")
        doc.ents = kept
        db.add(doc)

    db.to_disk(spacy_file_path)
    print(f"Successfully converted '{json_file_path}' to spaCy format and saved to '{spacy_file_path}'.")


# Example usage: point these two paths at your own input and output files.
json_file = "json/train_data_5th_150.json"  # annotated JSON input
spacy_file = "train.spacy"  # binary .spacy output
convert_json_to_spacy(json_file_path=json_file, spacy_file_path=spacy_file)

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


Skipping entity: Text 'Turkcell' not found in document.
Successfully converted 'json/train_data_5th_150.json' to spaCy format and saved to 'train.spacy'.
