In [1]:
# Import required libraries
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
import json

# Load the annotated data from a JSON file
cv_data = json.load(open('training_data.json', 'r'))

# Display the number of items in the dataset
print(len(cv_data))

# Display the first item in the dataset
#print(cv_data['annotations'][4])


1


In [2]:
# Define a function to create spaCy DocBin objects from the annotated data
def get_spacy_doc(file, data):
  # Create a blank spaCy pipeline
  nlp = spacy.blank('en')
  db = DocBin()

  # Iterate through the data
  for text, annot in tqdm(data):
    doc = nlp.make_doc(text)
    annot = annot['entities']

    ents = []
    entity_indices = []

    # Extract entities from the annotations
    for start, end, label in annot:
      skip_entity = False
      for idx in range(start, end):
        if idx in entity_indices:
          skip_entity = True
          break
      if skip_entity:
        continue

      entity_indices = entity_indices + list(range(start, end))
      try:
        span = doc.char_span(start, end, label=label, alignment_mode='strict')
      except:
        continue

      if span is None:
        # Log errors for annotations that couldn't be processed
        err_data = str([start, end]) + "    " + str(text) + "\n"
        file.write(err_data)
      else:
        ents.append(span)

    try:
      doc.ents = ents
      db.add(doc)
    except:
      pass

  return db

In [3]:
# Split the annotated data into training and testing sets
from sklearn.model_selection import train_test_split

# Assuming cv_data is a dictionary containing annotations under the key 'annotations'
annotations = cv_data['annotations']

# Split the annotations into training and testing sets
train_ann, test_ann = train_test_split(annotations, test_size=0.2)

# Display the number of items in the training and testing sets
len(train_ann), len(test_ann)


(5, 2)

In [4]:
# Open a file to log errors during annotation processing
file = open('train_file.txt','w')

# Create spaCy DocBin objects for training and testing data
db = get_spacy_doc(file, train_ann)
db.to_disk('train_data.spacy')

db = get_spacy_doc(file, test_ann)
db.to_disk('test_data.spacy')

# Close the error log file
file.close()

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 20.05it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 26.98it/s]


In [5]:
!python -m spacy train config.cfg --output /output --paths.train train_data.spacy --paths.dev test_data.spacy --gpu-id -1


[38;5;2m[+] Created output directory: \output[0m
[38;5;4m[i] Saving to output directory: \output[0m
[38;5;4m[i] Using CPU[0m
[1m
[38;5;2m[+] Initialized pipeline[0m
[1m
[38;5;4m[i] Pipeline: ['tok2vec', 'ner'][0m
[38;5;4m[i] Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00    439.93    0.00    0.00    0.00    0.00
 40     200        309.04  33622.50   75.04   80.00   70.66    0.75
 80     400        128.79   1849.91   72.98   80.30   66.88    0.73
120     600        131.16    789.97   71.83   75.17   68.77    0.72
160     800        148.37    566.80   73.45   80.99   67.19    0.73
200    1000        490.17   1470.57   69.76   76.60   64.04    0.70
240    1200        171.68    405.55   73.99   79.64   69.09    0.74
280    1400        185.99    429.35   75.08   77.08   73.19    0.75
320    1600        173.95    404.32   71.58   80.63 

In [2]:
# Import the spaCy library
import spacy

# Load the trained spaCy NER model from the specified path
nlp = spacy.load('C:/output/model-best')

# Import necessary libraries for PDF processing
import sys
import fitz

# Specify the path to the PDF file containing the resume
fname = 'C:/Users/USER/Desktop/Test files/test1.pdf'

# Open the PDF document using PyMuPDF (fitz)
doc = fitz.open(fname)

# Initialize an empty string to store the extracted text from the PDF
text = " "

# Iterate through each page in the PDF and concatenate the text
for page in doc:
  text = text + str(page.get_text())

# Display the extracted text
print(text)

 Breast cancer is a type of cancer that originates in the cells of the breast tissue. It can manifest with various 
symptoms, including the presence of breast lump, changes in breast size or breast shape, nipple inversion 
or nipple discharge, skin redness or skin dimpling, breast pain, and swollen lymph nodes . Treatment for 
breast cancer typically involves a combination of therapies. Chemotherapy, hormonal therapy using drugs 
like Tamoxifen or aromatase inhibitors, targeted therapy with medications such as Herceptin, and 
radiation therapy are commonly employed to combat the disease. These treatments aim to destroy cancer 
cells, shrink tumors, and prevent the cancer from spreading or recurring. Early detection and prompt 
treatment significantly increase the chances of successful outcomes for individuals diagnosed with breast 
cancer .Skin cancer, particularly melanoma, is characterized by abnormal growth of skin cells. Symptoms 
often include changes in mole color, mole size, mol

In [3]:
# Process the extracted text using the loaded spaCy NER model
doc = nlp(text)

# Iterate through the named entities (entities) recognized by the model
for ent in doc.ents:
  # Print the recognized text and its corresponding label
  print(ent.text, "  ->>>>  ", ent.label_)

Breast cancer   ->>>>   DISEASE
nipple discharge   ->>>>   SYMPTOMS
skin redness   ->>>>   SYMPTOMS
breast pain   ->>>>   SYMPTOMS
swollen lymph nodes .   ->>>>   SYMPTOMS
Chemotherapy   ->>>>   MEDICATION
hormonal therapy   ->>>>   MEDICATION
Tamoxifen   ->>>>   MEDICATION
aromatase inhibitors   ->>>>   MEDICATION
Herceptin   ->>>>   SYMPTOMS
therapy   ->>>>   MEDICATION
changes in mole color   ->>>>   SYMPTOMS
mole size   ->>>>   SYMPTOMS
mole shape   ->>>>   SYMPTOMS
mole itching   ->>>>   SYMPTOMS
mole pain   ->>>>   SYMPTOMS
melanoma   ->>>>   SYMPTOMS
surgery   ->>>>   MEDICATION
Targeted therapy   ->>>>   MEDICATION
BRAF inhibitors   ->>>>   MEDICATION
radiation therapy   ->>>>   MEDICATION
high-energy beams   ->>>>   MEDICATION
Leukemia   ->>>>   DISEASE
fatigue   ->>>>   SYMPTOMS
weakness   ->>>>   SYMPTOMS
pale skin   ->>>>   SYMPTOMS
frequent infections   ->>>>   SYMPTOMS
fever   ->>>>   SYMPTOMS
bleeding or bruising   ->>>>   SYMPTOMS
Chemotherapy   ->>>>   MEDICATION
Targe

NameError: name 'test_ann' is not defined