In [1]:
import spacy
import json
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the annotated training samples. The file is read explicitly as UTF-8
# so that non-ASCII (Turkish) characters decode the same way on every
# platform instead of depending on the locale's default encoding.
with open('./json/train_data_1-7.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

In [7]:
# Create a new, blank Turkish spaCy pipeline.
nlp = spacy.blank("tr")

# Add an (untrained) NER component to the pipeline.
ner = nlp.add_pipe("ner")

# Register every entity label found in the training annotations.
# Each entity is indexed as (start, end, label); a sample whose "entities"
# entry is missing or None simply contributes no labels.
for _, annotations in train_data:
    for ent in annotations.get("entities") or []:
        ner.add_label(ent[2])

# Initialize the pipeline and keep the optimizer it returns for training.
# `initialize` replaces the deprecated `begin_training` alias; a separate
# `create_optimizer()` call would only build an optimizer that is thrown away.
optimizer = nlp.initialize()

save_path = 'spacy_trained_model'

# Write the initialized (not yet trained) pipeline to disk.
nlp.to_disk(save_path)

In [4]:
import random
from spacy.training.example import Example
# Seed Python's RNG so that later random.shuffle calls are repeatable.
random.seed(42)
def has_overlapping_entities(annotations):
    """Return True if any two entity spans in ``annotations`` overlap.

    Args:
        annotations: Mapping with an optional ``"entities"`` entry holding a
            sequence of ``(start, end, label)`` triples.

    Returns:
        True when at least one pair of spans satisfies
        ``start1 < end2 and start2 < end1`` (spans that merely touch, e.g.
        ``(0, 5)`` and ``(5, 8)``, do not overlap); False otherwise,
        including when ``"entities"`` is missing, ``None`` or empty.
    """
    entities = annotations.get("entities") or []
    for i, (start1, end1, _) in enumerate(entities):
        # The overlap test is symmetric, so each unordered pair is checked once.
        for start2, end2, _ in entities[i + 1:]:
            if start1 < end2 and start2 < end1:
                return True
    return False

In [None]:
from spacy.util import minibatch

# Number of passes over the training set
n_iter = 20

# Build the Example objects once, up front: neither the overlap check nor the
# conversion depends on the epoch. Samples with overlapping entity spans are
# skipped.
train_examples = [
    Example.from_dict(nlp.make_doc(text), annotations)
    for text, annotations in train_data
    if not has_overlapping_entities(annotations)
]

# Number of distinct samples used for training (counted once, not per epoch)
total_data = len(train_examples)

# Disable other pipes to only train NER
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.select_pipes(disable=other_pipes):
    for itn in range(n_iter):
        # Shuffle the prepared examples rather than mutating train_data in place
        random.shuffle(train_examples)
        losses = {}
        for example in train_examples:
            nlp.update(
                [example],  # Batch of Example objects
                drop=0.1,  # Dropout - make it harder to memorize data
                losses=losses,
                sgd=optimizer,
            )
        print(f"Iteration {itn} - Losses: {losses}")

# Save the trained model
nlp.to_disk(save_path)

print(f"Total data used for training (excluding skipped ones): {total_data}")

In [None]:
# Reload the pipeline from the directory it was saved to.
nlp = spacy.load(save_path)

# Sample text to run the loaded pipeline on.
text = """
Türk Telekom'dan internet bağlattım. 20 gündür internet bağlantım yok. Kayıt açıyorum. 2 gün sonra cevap geliyor. Bina içi tesisat arızası diye. Tüm bina içi tesisat yenilenmesine rağmen her açtığım kayıt otomatik kapatılıyor. Hiçbir kişi yardımcı olmuyor. Kullanamadığım interneti iptal etmek istediğimde 6.000 TL cayma bedeli talep ediliyor. Veremedikleri hizmet için bir de ceza çıkarıyorlar.
"""

doc = nlp(text)

# Print each recognized entity as "<text> <label>", one per line.
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")