1. Import Libraries

In [None]:
# Import Libraries
import PyPDF2
import pytesseract
from PIL import Image
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import json

2. Load the Fine-Tuned Model:

In [None]:
# Step 2: Load the fine-tuned model
# The tokenizer and the token-classification model are both loaded from the
# same "ner_model" identifier and then wrapped in a single "ner" pipeline
# object that the rest of the notebook calls directly on text.
tokenizer = AutoTokenizer.from_pretrained("ner_model")
model = AutoModelForTokenClassification.from_pretrained("ner_model")
ner_pipeline = pipeline(task="ner", tokenizer=tokenizer, model=model)

3. Extract text:

In [None]:
# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        The extracted text of all pages in document order, with a newline
        between consecutive pages. A page that yields no text contributes
        an empty string.
    """
    with open(pdf_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        # Pages are joined with a newline so the last word of one page is
        # not fused with the first word of the next. ``or ""`` keeps a page
        # with no extractable text from breaking the join.
        return "\n".join(page.extract_text() or "" for page in reader.pages)

# Function to extract text from an image (OCR)
def extract_text_from_image(image_path):
    """Return the text that OCR recognises in the image at ``image_path``.

    Args:
        image_path: Path of the image file to open.

    Returns:
        The string returned by ``pytesseract.image_to_string`` for the image.
    """
    # The context manager closes the underlying file handle once OCR is
    # done, instead of leaving it open until garbage collection.
    with Image.open(image_path) as image:
        return pytesseract.image_to_string(image)

4. Process the Text with the NER Model:

In [None]:
# Function to process text with the NER model
def process_text_with_ner(text):
    """Run the NER pipeline on ``text`` and post-process its predictions.

    Args:
        text: The string handed to ``ner_pipeline``.

    Returns:
        Whatever ``post_process_predictions`` returns for the pipeline's
        raw output.
    """
    return post_process_predictions(ner_pipeline(text))

# Post-process predictions
def post_process_predictions(entities):
    """Strip the "##" marker from every entity's ``"word"`` value.

    The entity dicts are modified in place; no entity is dropped or merged.
    "##" is presumably the tokenizer's subword-continuation prefix (to be
    confirmed against the model's tokenizer); every occurrence in the word
    is removed, not only a leading one.

    Args:
        entities: List of dicts, each with a string under the ``"word"`` key.

    Returns:
        The same list object that was passed in.
    """
    marker = "##"
    for record in entities:
        raw_word = record["word"]
        record["word"] = raw_word.replace(marker, "")
    return entities


5. Main Workflow:

In [None]:
# Input document for this run; the file name below is a placeholder.
pdf_path = "Resume_DM.pdf"  # Replace with your PDF file path
# Raw text of the PDF, exactly as returned by extract_text_from_pdf.
text = extract_text_from_pdf(pdf_path)

# Clean the extracted text
def clean_text(text):
    """Normalise whitespace in ``text``.

    Every run of whitespace (spaces, tabs, newlines) becomes a single space,
    and leading and trailing whitespace is removed.

    Args:
        text: The string to normalise.

    Returns:
        The whitespace-normalised string.
    """
    # split() with no arguments already treats newlines as separators and
    # ignores leading/trailing whitespace, so one split/join covers it all.
    words = text.split()
    return " ".join(words)

# Replace the raw text with its whitespace-normalised form
text = clean_text(text)

# Process the extracted text with the NER model
entities = process_text_with_ner(text)

# Filter entities by confidence score
def filter_entities(entities, threshold=0.5):
    """Keep only the entities whose score reaches ``threshold``.

    Args:
        entities: Iterable of dicts, each with a numeric ``"score"`` value.
        threshold: Minimum score (inclusive) an entity needs to be kept.

    Returns:
        A new list with the kept entity dicts, in their original order.
    """
    kept = []
    for candidate in entities:
        if candidate["score"] >= threshold:
            kept.append(candidate)
    return kept

# Drop predictions whose confidence score is below 0.5
entities = filter_entities(entities, threshold=0.5)

# Display the results: the cleaned text first, then one record per entity
print("=== Extracted Text ===", text, "\n=== Identified Entities ===", sep="\n")
for entity in entities:
    # Each record lists the word, its label, the score rounded to two
    # decimals and the start/end offsets, followed by a 40-dash rule.
    print(
        f"Entity: {entity['word']}",
        f"  Type: {entity['entity']}",
        f"  Confidence: {entity['score']:.2f}",
        f"  Start: {entity['start']}, End: {entity['end']}",
        "-" * 40,
        sep="\n",
    )