converting .txt policy documents to chunks

In [28]:
import nltk
nltk.download("punkt")
from nltk.tokenize import sent_tokenize

def sentence_chunk_policy_docs(folder_path, chunk_size=3):
    """Split every ``*.txt`` file in a folder into fixed-size sentence chunks.

    Each file is read as UTF-8, tokenized into sentences with
    ``sent_tokenize``, and consecutive sentences are joined with a single
    space in groups of ``chunk_size``. The last group of a file may be
    shorter than ``chunk_size``.

    Args:
        folder_path: Directory (str or path-like) searched non-recursively
            for ``*.txt`` files.
        chunk_size: Number of sentences per chunk; must be >= 1.

    Returns:
        A list of ``{"content": <chunk text>, "source": <file name>}`` dicts,
        ordered by file name and then by position within the file.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    # Imported here so the function does not rely on an import made in
    # some other (possibly deleted or not-yet-run) notebook cell.
    from pathlib import Path

    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    chunks = []
    # glob() yields entries in arbitrary, filesystem-dependent order; sorting
    # keeps chunk positions reproducible between runs and machines.
    for file in sorted(Path(folder_path).glob("*.txt")):
        text = file.read_text(encoding="utf-8")
        sentences = sent_tokenize(text)
        for start in range(0, len(sentences), chunk_size):
            chunk = " ".join(sentences[start:start + chunk_size])
            chunks.append({"content": chunk, "source": file.name})
    return chunks


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aimte\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


embed chunks using a sentence embedding model

In [29]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model; the same instance is used later to embed queries.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Chunk the policy folder, pull out each chunk's text, and embed them in one call.
chunks = sentence_chunk_policy_docs("policy_doc")
texts = [record["content"] for record in chunks]
embeddings = embedder.encode(
    texts,
    convert_to_tensor=True,
)


saving the chunks

In [52]:
import pickle
import numpy as np

# Serialize the chunk list (text + source file name) to disk with pickle.
with open("policy_chunks.pkl", "wb") as chunk_file:
    pickle.dump(chunks, chunk_file)


store and query using faiss

In [30]:
import faiss
import numpy as np

# Convert embeddings to a NumPy matrix.
# - .detach().cpu() first: Tensor.numpy() raises for tensors that live on a GPU,
#   which is where the encoder puts them when CUDA is available.
# - float32 + C-contiguous: the layout FAISS expects for vectors passed to add().
embedding_matrix = np.ascontiguousarray(
    embeddings.detach().cpu().numpy(), dtype="float32"
)

# Build FAISS index: exact (brute-force) L2 search over vectors of this width.
index = faiss.IndexFlatL2(embedding_matrix.shape[1])
index.add(embedding_matrix)


saving FAISS index

In [50]:
# Persist the in-memory FAISS index to "policy_index.faiss" (path is relative to the working directory).
faiss.write_index(index, "policy_index.faiss")


loading faiss index

In [51]:
# Reload the index from disk; this rebinds the module-level `index` that retrieve_relevant_chunks searches.
index = faiss.read_index("policy_index.faiss")


to query relevant policy chunks for a claim

In [31]:
def retrieve_relevant_chunks(claim_text, k=3):
    """Return the policy chunks whose embeddings are nearest to a claim.

    The claim is embedded with the module-level ``embedder`` and searched
    against the module-level FAISS ``index``; hits are mapped back to the
    module-level ``chunks`` list by position.

    Args:
        claim_text: Free-text claim description to search with.
        k: Maximum number of chunks to return.

    Returns:
        Up to ``k`` chunk dicts from ``chunks``, in the order returned by the
        index search. Fewer than ``k`` are returned when the index holds
        fewer than ``k`` vectors.
    """
    # encode() on a one-element list already yields a (1, dim) batch; FAISS
    # requires float32, so cast explicitly rather than trust the encoder dtype.
    query = np.asarray(embedder.encode([claim_text]), dtype="float32")
    distances, indices = index.search(query, k)
    # FAISS pads missing neighbours with label -1. Without this filter,
    # chunks[-1] would silently return the LAST chunk as if it were a hit.
    return [chunks[int(i)] for i in indices[0] if i >= 0]


combine retrieved context + claim -> feed into the model

In [22]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.preprocessing import LabelEncoder
import pandas as pd
# Encode target labels: fit a LabelEncoder on the "SeverityLabel" column of
# claims_final.csv and store the integer codes in a new "SeverityEncoded" column.
# The fitted encoder is kept so predicted class ids can be mapped back to label strings.
# NOTE(review): this assumes the mapping fitted here matches the one the saved
# model was trained with (same CSV / same label set) — confirm.
label_encoder = LabelEncoder()
df = pd.read_csv("claims_final.csv", quotechar='"')
df["SeverityEncoded"] = label_encoder.fit_transform(df["SeverityLabel"])


# Local directory where model was saved (relative to the working directory)
model_save_path = "roberta_severity_model"

# Load tokenizer and model from local folder
tokenizer = RobertaTokenizer.from_pretrained(model_save_path)
classifer = RobertaForSequenceClassification.from_pretrained(model_save_path)
# Put the classifier in evaluation mode before it is used for prediction.
classifer.eval()
print("✅ Model and tokenizer loaded from local directory.")


✅ Model and tokenizer loaded from local directory.


In [48]:
import torch

def predict_with_context(claim_text, k=3):
    """Predict a severity label for a claim and fetch related policy chunks.

    The classifier receives only ``claim_text`` (truncated to 512 tokens).
    The retrieved policy chunks are NOT part of the model input; they are
    returned alongside the prediction so the caller can display them.

    Args:
        claim_text: Free-text claim description.
        k: Number of policy chunks to retrieve (forwarded to
            ``retrieve_relevant_chunks``).

    Returns:
        A ``(predicted_label, retrieved)`` tuple: the label string decoded by
        ``label_encoder`` and the list of retrieved chunk dicts.
    """
    retrieved = retrieve_relevant_chunks(claim_text, k=k)

    inputs = tokenizer(claim_text, return_tensors="pt", truncation=True, max_length=512)
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        logits = classifer(**inputs).logits
    # Highest-scoring class id for the single input row.
    predicted_class = torch.argmax(logits, dim=1).item()
    predicted_label = label_encoder.inverse_transform([predicted_class])[0]

    return predicted_label, retrieved


Test the full RAG pipeline

In [49]:
# Run one sample claim through prediction + retrieval.
claim = "my eye got stabbed"
label, retrieved_chunks = predict_with_context(claim)

# Show the input and the predicted severity.
print("claim:", claim)
print("Predicted Severity Label:", label)

# List the retrieved policy chunks, numbered from 1, with their source file.
print("\nTop Retrieved Policy Chunks:")
for i, chunk in enumerate(retrieved_chunks, start=1):
    print(
        f"\n--- Chunk {i} from {chunk['source']} ---\n{chunk['content']}"
    )


claim: my eye got stabbed
Predicted Severity Label: High

Top Retrieved Policy Chunks:

--- Chunk 1 from POL002_Laceration_Injuries.txt ---
Policy Name: Laceration and Cut Injuries (POL002)

This policy covers accidental cuts, punctures, or lacerations sustained in the workplace while using tools, equipment, or materials. Commonly covered scenarios include injuries from utility knives, glass, or sharp machine edges. To be eligible, the injury must be promptly reported (within 24 hours) and documented with a supervisor's report and a clinical evaluation.

--- Chunk 2 from POL006_Repetitive_Stress.txt ---
Policy Name: Repetitive Stress Injuries (POL006)

Coverage includes injuries caused by repetitive movements such as typing, scanning, lifting, or tool use. These include carpal tunnel syndrome, tendonitis, and joint inflammation. A medical history review and work activity log must accompany the claim.

--- Chunk 3 from POL004_Falling_Object_Injuries.txt ---
Policy Name: Falling Object I