In [70]:
import os
import pickle
from pathlib import Path
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel

from utils.preprocess import clean_extracted_text
from utils.extract_fields import extract_all_fields
from models.nlp_model import analyze_claim_description

print("✓ NLP Imports Loaded")

✓ NLP Imports Loaded


In [71]:
# Project root (insurance-claim-checker): taken as the directory two levels
# above the current working directory, so the result depends on where the
# kernel was started.
BASE = Path().resolve().parents[1]

# Pickled artefacts live in <BASE>/data.
DATA_DIR = BASE / "data"
OCR_PATH = DATA_DIR / "ocr_output.pkl"
NLP_PATH = DATA_DIR / "nlp_output.pkl"

# Echo the resolved locations so a wrong working directory is spotted early.
for _label, _location in (
    ("BASE:", BASE),
    ("DATA:", DATA_DIR),
    ("OCR_PATH:", OCR_PATH),
    ("NLP_PATH:", NLP_PATH),
):
    print(_label, _location)

BASE: D:\Desktop\insurance-claim-checker
DATA: D:\Desktop\insurance-claim-checker\data
OCR_PATH: D:\Desktop\insurance-claim-checker\data\ocr_output.pkl
NLP_PATH: D:\Desktop\insurance-claim-checker\data\nlp_output.pkl


In [72]:
# Load the pickled OCR results; fail with a clear message when the file is
# absent instead of letting open() raise a bare error.
# NOTE(review): pickle.load can execute arbitrary code, so only load files
# this project produced itself.
if OCR_PATH.exists():
    with OCR_PATH.open("rb") as ocr_file:
        ocr_data = pickle.load(ocr_file)
else:
    raise FileNotFoundError(f"❌ OCR file not found at: {OCR_PATH}\nRun OCR notebook first!")

print(f"Loaded OCR entries: {len(ocr_data)}")

Loaded OCR entries: 2


In [73]:
# Hugging Face checkpoint shared by the tokenizer and the encoder.
MODEL_NAME = "distilbert-base-uncased"

# Tokenizer and encoder are loaded from the same checkpoint name so that
# token ids match what the model expects.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

print("✓ DistilBERT loaded")

✓ DistilBERT loaded


In [74]:
def get_embedding(text):
    """Embed ``text`` with the notebook-level ``tokenizer`` and ``model``.

    The text is passed through ``clean_extracted_text``, tokenized (with
    truncation enabled) and run through the model without gradient
    tracking. The model's ``last_hidden_state`` is averaged over dimension 1
    and squeezed into a NumPy array.

    Returns:
        A NumPy array whose first dimension has length 768. If the pooled
        vector has a different length, it is copied into a zero-filled
        array of length 768 (truncated or zero-padded as needed).
    """
    encoded = tokenizer(
        clean_extracted_text(text),
        return_tensors="pt",
        truncation=True,
        padding=True,
    )

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
        pooled = hidden_states.mean(dim=1)

    embedding = pooled.squeeze().numpy()

    # Common case: the pooled vector already has the expected length.
    if embedding.shape[0] == 768:
        return embedding

    # Fallback: force length 768 by truncating or zero-padding.
    # np.zeros defaults to float64, so this branch returns float64.
    fixed = np.zeros(768)
    keep = min(768, len(embedding))
    fixed[:keep] = embedding[:768]
    return fixed

In [75]:
# Run the NLP stage over every OCR entry and collect the results by filename.
nlp_output = {}
processed = 0

for filename, ocr_entry in ocr_data.items():

    # Work out whether this entry is usable; malformed entries are reported
    # and skipped rather than aborting the whole run.
    skip_message = None
    if not isinstance(ocr_entry, dict):
        skip_message = f"Skipping: {filename} (not a dict)"
    elif "clean_text" not in ocr_entry:
        skip_message = f"Skipping: {filename} (missing clean_text)"
    else:
        raw_text = ocr_entry["clean_text"]
        if not isinstance(raw_text, str):
            skip_message = f"Skipping: {filename} (clean_text not string)"

    if skip_message is not None:
        print(skip_message)
        continue

    # Clean once here; note that get_embedding also runs
    # clean_extracted_text on whatever it receives.
    cleaned = clean_extracted_text(raw_text)
    embedding = get_embedding(cleaned)
    fields = extract_all_fields(cleaned)
    analysis = analyze_claim_description(cleaned)

    # severity_score is copied to the top level in addition to being kept
    # inside the full analysis result.
    record = {
        "clean_text": cleaned,
        "embedding": embedding,
        "fields": fields,
        "analysis": analysis,
        "severity_score": analysis["severity_score"]
    }
    nlp_output[filename] = record

    processed += 1

print(f"\n✓ NLP Completed for {processed} files")


✓ NLP Completed for 2 files


In [76]:
# Persist the NLP results next to the OCR output, creating the data
# directory (and any missing parents) if it does not exist yet.
DATA_DIR.mkdir(parents=True, exist_ok=True)

with NLP_PATH.open("wb") as nlp_file:
    pickle.dump(nlp_output, nlp_file)

print("NLP saved to:", NLP_PATH)

NLP saved to: D:\Desktop\insurance-claim-checker\data\nlp_output.pkl
