<a href="https://colab.research.google.com/github/Boreddyakshithareddy3k/predii-vehicle-spec-extraction/blob/main/Untitled8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [42]:
!pip install pymupdf sentence-transformers faiss-cpu openai




In [43]:
import fitz  # PyMuPDF
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import json
import os
from openai import OpenAI



In [44]:
# Colab-only: prompts for a file upload into the runtime.
from google.colab import files
# The keys of `uploaded` are used by the next cell as the path of the PDF to open.
uploaded = files.upload()


Saving sample-service-manual 1.pdf to sample-service-manual 1 (2).pdf


In [45]:
# The first uploaded file name is treated as the PDF to process.
pdf_path = list(uploaded)[0]

def extract_pdf_text(path):
    """Return the plain text of every non-blank page of a PDF.

    Args:
        path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        list[str]: One string per page, in document order. Pages whose
        extracted text is empty or whitespace-only are skipped, so positions
        in the list do not correspond to PDF page numbers.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction fails part-way through.
    with fitz.open(path) as doc:
        page_texts = (page.get_text("text") for page in doc)
        # Consume the generator while the document is still open.
        return [text for text in page_texts if text.strip()]

# Extract the text of each non-blank page of the uploaded PDF.
pages = extract_pdf_text(pdf_path)
print(f"Total pages extracted: {len(pages)}")



Total pages extracted: 852


In [46]:
def chunk_text(text, max_chars=2000):
    """Split text into whitespace-normalized chunks of at most ``max_chars``.

    Words are never split. A chunk is closed *before* adding a word that
    would push it past ``max_chars``, so the limit is an upper bound; the only
    exception is a single word that is itself longer than ``max_chars``, which
    is emitted as its own chunk.

    Args:
        text: Input text. It is split on any whitespace, so newlines and
            repeated spaces are collapsed to single spaces in the output.
        max_chars: Maximum chunk length in characters.

    Returns:
        list[str]: The chunks in order; empty if ``text`` contains no words.
    """
    chunks = []
    current_words = []
    current_len = 0  # always equals len(" ".join(current_words))
    for word in text.split():
        # Length of the chunk if this word were appended (plus one separating
        # space when the chunk already holds a word).
        projected_len = current_len + len(word) + (1 if current_words else 0)
        if current_words and projected_len > max_chars:
            chunks.append(" ".join(current_words))
            current_words = [word]
            current_len = len(word)
        else:
            current_words.append(word)
            current_len = projected_len
    if current_words:
        chunks.append(" ".join(current_words))
    return chunks

# Flatten the per-page chunks into one list; chunks never span two pages.
all_chunks = [chunk for page in pages for chunk in chunk_text(page)]

print(f"Total chunks: {len(all_chunks)}")
print("Sample chunk:\n", all_chunks[0][:300])



Total chunks: 950
Sample chunk:
 Suspension System Inspection and Verification 1. Road test. z Verify the customer concern by carrying out a road test on a smooth road. If any vibrations are apparent, refer to Section 100-04 . 2. Inspect tires. z Check the tire pressure with all normal loads in the vehicle and the tires cold. Refer


In [47]:
# Sentence-embedding model shared by indexing (here) and querying (retrieve).
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode every chunk as a normalized vector and cast to float32 in one step.
embeddings = model.encode(
    all_chunks,
    batch_size=32,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True,
).astype("float32")

print(f"Embeddings shape: {embeddings.shape}")


Batches:   0%|          | 0/30 [00:00<?, ?it/s]

Embeddings shape: (950, 384)


In [48]:
# Build a flat L2 index over the chunk embeddings; row i of the index
# corresponds to all_chunks[i]. The vectors were normalized when encoded.
dim = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(embeddings)

print(f"FAISS index size: {index.ntotal}")


FAISS index size: 950


In [49]:
def retrieve(query, k=5):
    """Return the text chunks most similar to ``query``.

    The query is embedded with the same model and normalization used for the
    chunks, then looked up in the FAISS ``index``.

    Args:
        query: Natural-language query string.
        k: Maximum number of chunks to return.

    Returns:
        list[str]: Up to ``k`` chunks from ``all_chunks``, nearest first.
        Empty when ``k`` is not positive or the index holds no vectors.
    """
    # Never ask for more neighbours than the index contains: FAISS pads the
    # missing results with id -1, which would silently map to all_chunks[-1].
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    q_emb = model.encode(
        [query],
        convert_to_numpy=True,
        normalize_embeddings=True
    ).astype("float32")

    _, neighbor_ids = index.search(q_emb, k)
    # Defensive filter in case the index still reports "no result" (-1).
    return [all_chunks[i] for i in neighbor_ids[0] if i >= 0]


In [50]:
import re

def extract_specs(query, k=5):
    """
    Heuristic spec extractor (no external LLM).

    Retrieves the ``k`` most relevant chunks for ``query``, keeps the lines
    that mention "tighten to" or "torque" (preferring lines that also contain
    a keyword from the query) and returns the first torque value found.

    Args:
        query: Question such as "Torque for brake caliper bolts".
        k: Number of chunks to retrieve as context.

    Returns:
        list[dict]: A single-item list with the keys "component",
        "spec_type", "value" and "unit". "value" and "unit" are empty strings
        when no torque value could be found.

    Note:
        A later cell defines another ``extract_specs``; once that cell has
        run, it replaces this function.
    """
    # Derive the component name once so that every return path agrees on it
    # (prefix removed case-insensitively, trailing "?" dropped).
    component_name = (
        re.sub(r"torque for", "", query, flags=re.IGNORECASE)
        .strip()
        .rstrip("?")
        .strip()
    )

    def _result(value, unit):
        # Build the one-item result list in the required JSON shape.
        return [{
            "component": component_name,
            "spec_type": "Torque",
            "value": value,
            "unit": unit
        }]

    retrieved_chunks = retrieve(query, k=k)
    context = "\n".join(retrieved_chunks)

    # Break into lines
    lines = [ln.strip() for ln in context.splitlines() if ln.strip()]

    # Keywords from the query to help filter lines (except very short/common words)
    ignored_words = {"torque", "specification", "specifications"}
    query_words = [
        w.lower()
        for w in re.findall(r"\w+", query)
        if len(w) > 3 and w.lower() not in ignored_words
    ]

    torque_lines = [
        ln for ln in lines
        if "tighten to" in ln.lower() or "torque" in ln.lower()
    ]
    candidate_lines = [
        ln for ln in torque_lines
        if any(qw in ln.lower() for qw in query_words)
    ]

    # Fallback: if nothing matched by keywords, just take any torque-like line
    if not candidate_lines:
        candidate_lines = torque_lines

    # Number + unit like "175 Nm", "47.5 Nm" or "90 Nm (66 lb-ft)". The
    # optional fractional part keeps "47.5 Nm" from being read as "5 Nm".
    value_pattern = re.compile(
        r"(\d+(?:\.\d+)?)\s*(Nm|N·m|N-m|lb-ft|lb\s*ft|lb-in|lb\s*in)",
        re.IGNORECASE,
    )

    # Use the first candidate that actually contains a value instead of
    # giving up when the very first candidate line has none.
    for ln in candidate_lines:
        m = value_pattern.search(ln)
        if m:
            return _result(m.group(1), m.group(2))

    # Nothing found at all – return empty-ish JSON
    return _result("", "")


In [51]:
import requests
import os
# Read the Hugging Face token from the environment so that it is never saved
# in the notebook; falls back to an empty string when neither variable is set.
HF_API_KEY = os.environ.get("HF_API_KEY") or os.environ.get("HF_TOKEN", "")


In [39]:
import requests
import json
import re

import os

# Do not paste the key into the notebook: set the HF_API_KEY (or HF_TOKEN)
# environment variable instead. Falls back to an empty string when unset.
HF_API_KEY = os.environ.get("HF_API_KEY") or os.environ.get("HF_TOKEN", "")

def _try_extract_json_from_text(text):
    text = re.sub(r"```(?:json)?", "", text, flags=re.IGNORECASE).strip()
    m = re.search(r"(\[.*\]|\{.*\})", text, flags=re.DOTALL)
    if not m:
        raise ValueError("No JSON found.")
    candidate = m.group(1)
    return json.loads(candidate)

def extract_specs(query, k=5):
    """Extract structured specs for ``query`` with a hosted LLM.

    Retrieves the ``k`` most relevant chunks, sends them together with the
    query to the Hugging Face inference endpoint and parses the JSON found in
    the model's reply.

    Args:
        query: Question such as "Torque for brake caliper bolts".
        k: Number of chunks to retrieve as context.

    Returns:
        The decoded JSON from the model (expected: a list of dicts with the
        keys "component", "spec_type", "value" and "unit"), or ``None`` when
        the key is missing, the request fails, the API reports an error, or
        the reply contains no parseable JSON. A diagnostic is printed in each
        of those cases.

    Note:
        This definition shares its name with the heuristic ``extract_specs``
        defined in an earlier cell and replaces it once this cell has run.
    """
    if not HF_API_KEY:
        # Avoid a guaranteed-to-fail request with an empty bearer token.
        print("HF API Error: HF_API_KEY is empty; set it before calling extract_specs.")
        return None

    retrieved_chunks = retrieve(query, k=k)
    context = "\n\n---\n\n".join(retrieved_chunks)

    prompt = f"""
Extract structured vehicle specifications.

Return ONLY valid JSON:
[
  {{
    "component": "",
    "spec_type": "",
    "value": "",
    "unit": ""
  }}
]

Context:
{context}

Query:
{query}
"""

    # NOTE(review): the model id carries no version suffix — confirm that it
    # resolves on the router.
    url = "https://router.huggingface.co/hf-inference/models/mistralai/Mistral-7B-Instruct"
    headers = {"Authorization": f"Bearer {HF_API_KEY}"}

    payload = {
        "inputs": prompt,
        # return_full_text=False asks for the completion only; otherwise the
        # echoed prompt (with its JSON template) precedes the real answer.
        "parameters": {
            "max_new_tokens": 300,
            "temperature": 0.1,
            "return_full_text": False,
        },
        "options": {"wait_for_model": True}
    }

    try:
        # Generous timeout because wait_for_model may block while the model
        # loads, but never hang the notebook indefinitely.
        response = requests.post(url, headers=headers, json=payload, timeout=120)
    except requests.RequestException as exc:
        print("HF API Error:", exc)
        return None

    try:
        output = response.json()
    except ValueError:
        # Gateways may answer with HTML or plain text instead of JSON.
        print("HF API Error: non-JSON response", response.status_code, response.text[:200])
        return None

    if isinstance(output, dict) and "error" in output:
        print("HF API Error:", output)
        return None

    try:
        text = output[0]["generated_text"]
    except (KeyError, IndexError, TypeError):
        print("HF API Error: unexpected response shape:", output)
        return None

    # Some backends echo the prompt regardless of return_full_text.
    if text.startswith(prompt):
        text = text[len(prompt):]

    try:
        # The helper strips code fences and tolerates prose around the JSON.
        return _try_extract_json_from_text(text)
    except ValueError as exc:
        print("Could not parse JSON from model output:", exc)
        return None



In [41]:
import json

# Run all three queries first so that any diagnostics printed by
# extract_specs appear before the formatted results.
res1 = extract_specs("Torque for brake caliper bolts")
res2 = extract_specs("Torque for upper ball joint nut")
res3 = extract_specs("Torque for shock absorber upper mount nut")

# Headings carry their own leading blank line, exactly as printed before.
for heading, result in (
    ("Brake caliper bolts:", res1),
    ("\nUpper ball joint nut:", res2),
    ("\nShock absorber upper mount nut:", res3),
):
    print(heading)
    print(json.dumps(result, indent=2))




HF API Error: {'error': 'This authentication method does not have sufficient permissions to call Inference Providers on behalf of user akshithaaaaab'}
HF API Error: {'error': 'This authentication method does not have sufficient permissions to call Inference Providers on behalf of user akshithaaaaab'}
HF API Error: {'error': 'This authentication method does not have sufficient permissions to call Inference Providers on behalf of user akshithaaaaab'}
Brake caliper bolts:
null

Upper ball joint nut:
null

Shock absorber upper mount nut:
null
