In [None]:
!pip install spacy



In [None]:
import pandas as pd



# Read the train and test CSV files into separate frames (train first, then test).
df_train, df_test = map(
    pd.read_csv,
    ("train_new_dataset.csv", "test_new_dataset.csv"),
)



def convert_df_to_spacy_format(df):
    """Convert a token-per-row frame into spaCy-style character-offset annotations.

    Rows are grouped by ``sentence_id``; within each group the ``word`` values
    are joined with single spaces to form the sentence text, and every token
    whose ``label`` is not ``'O'`` yields one ``(start, end, entity_type)``
    tuple, where ``entity_type`` is the part of the label after its last ``-``.

    Args:
        df: DataFrame with ``sentence_id``, ``word`` and ``label`` columns.

    Returns:
        A list of ``(sentence_text, {"entities": [(start, end, type), ...]})``
        tuples, one per sentence, in the group order produced by ``groupby``.

    Notes:
        * Tokens are coerced with ``str`` so numeric tokens (e.g. ``11``) do not
          break the join or the offset arithmetic.
        * Rows whose ``word`` is missing (NaN/None) are skipped; sentences left
          with no tokens are dropped.
        * Rows whose ``label`` is missing are treated as outside any entity.
    """
    sentences = []

    for _, group in df.groupby('sentence_id'):
        tokens = []
        entities = []
        start = 0

        for word, label in zip(group['word'], group['label']):
            if pd.isna(word):
                # A missing token has no text to place in the sentence.
                continue

            token = str(word)
            end = start + len(token)
            if not pd.isna(label) and label != 'O':
                # (start, end, entity_type)
                entities.append((start, end, str(label).split('-')[-1]))
            tokens.append(token)
            start = end + 1  # Add 1 for the joining space

        if tokens:
            sentences.append((' '.join(tokens), {"entities": entities}))

    return sentences

# Build (text, {"entities": [...]}) examples for each split.
training_data = convert_df_to_spacy_format(df=df_train)
test_data = convert_df_to_spacy_format(df=df_test)

In [None]:
import spacy
from spacy.tokens import DocBin

def create_training_spacy(training_data, output_path, lang='tr'):
    """Serialize character-offset NER annotations to a spaCy ``DocBin`` file.

    Each text is tokenized with a blank pipeline for ``lang``; every
    ``(start, end, label)`` annotation is turned into a span with
    ``Doc.char_span`` and the resulting spans are set as the doc's entities.

    Args:
        training_data: Iterable of ``(text, {"entities": [(start, end, label), ...]})``.
        output_path: Destination passed to ``DocBin.to_disk``.
        lang: Language code for the blank tokenizer (defaults to ``'tr'``).

    Notes:
        ``Doc.char_span`` returns ``None`` when the offsets do not line up with
        token boundaries. Such annotations are skipped, because putting ``None``
        into ``doc.ents`` raises, and a single warning reports how many were
        dropped.
    """
    import warnings

    nlp = spacy.blank(lang)
    doc_bin = DocBin()
    skipped = 0

    for text, annotations in training_data:
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in annotations["entities"]:
            span = doc.char_span(start, end, label=label)
            if span is None:
                # Offsets do not align with the tokenizer's boundaries.
                skipped += 1
                continue
            ents.append(span)
        doc.ents = ents
        doc_bin.add(doc)

    doc_bin.to_disk(output_path)

    if skipped:
        warnings.warn(
            f"Skipped {skipped} entity annotation(s) that did not align with "
            f"token boundaries while writing {output_path}"
        )

# Write both splits to disk as .spacy files (train first, then test)
create_training_spacy(training_data=training_data, output_path="train.spacy")
create_training_spacy(training_data=test_data, output_path="test.spacy")



In [None]:
!python -m spacy init config config.cfg --lang tr --pipeline ner


⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.
ℹ Generated config template specific for your use case
- Language: tr
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
✔ Auto-filled config with all values
✔ Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./test.spacy


ℹ Saving to output directory: output
ℹ Using CPU

✔ Initialized pipeline

ℹ Pipeline: ['tok2vec', 'ner']
ℹ Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     49.22   12.92    9.90   18.59    0.13
  0     200        412.82   8885.57   64.32   64.97   63.69    0.64
  0     400        293.18   8905.57   69.17   66.79   71.73    0.69
  0     600        361.54  11266.87   70.46   72.07   68.91    0.70
  1     800        597.31  12861.44   68.79   72.66   65.31    0.69
  2    1000       1350.82  16235.89   68.32   75.43   62.44    0.68
  2    1200       1070.54  19388.56   71.55   73.45   69.75    0.72
  3    1400       1494.15  23295.28   70.44   75.07   66.35    0.70
  5    1600       2017.61  27968.13   71.39   70.79   72.01    0.71
  6    1800       3053.50  33224.56   69.60   

In [None]:
import spacy

# Load the trained model. The path is relative so it matches the
# `--output ./output` directory used by the training command instead of
# assuming the notebook runs from Colab's /content directory.
nlp = spacy.load('output/model-best')


# Test the model: print each predicted entity with its label
doc = nlp(" GİVE YOUR TEST SENTENCES")
for ent in doc.ents:
    print(ent.text, ent.label_)


Tip PRESENT
C PRESENT
paternde PRESENT
duyarlılık PRESENT
; ANAT
fokal PRESENT
dansite PRESENT
kadranlarda ANAT
parankimle PRESENT
örtülü PRESENT
izodens PRESENT
seçilebilen ABSENT
kitle ABSENT
lezyonu ABSENT
ve ABSENT
görünümde ABSENT
büyümüş ABSENT
lenf ABSENT
; ANAT
alt ANAT
iç ANAT
kadranda ANAT
şekilli PRESENT
konturu PRESENT
olarak PRESENT
izlenen PRESENT
11 PRESENT
mm PRESENT
boyutunda PRESENT
lezyon PRESENT
parankimle PRESENT
örtülü PRESENT
izodens PRESENT
şüpheli PRESENT
lezyonlar PRESENT
ve ABSENT
görünümde ABSENT
büyümüş ABSENT
lenf ABSENT


In [None]:
# Show the predicted entities of `doc` inline with displaCy's entity visualizer.
spacy.displacy.render(
    docs=doc,
    style="ent",
    jupyter=True,
)