In [9]:
import os
import pytesseract
from pdf2image import convert_from_path
import requests
import json
from sentence_transformers import SentenceTransformer
import re
import weaviate
import weaviate.classes as wvc
from weaviate.classes.config import Property, DataType,ReferenceProperty
import ollama
from weaviate.util import generate_uuid5
import subprocess

In [20]:

# Initialize SentenceTransformer model
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

# Set directory containing PDFs
pdf_directory = r"C:\Users\AbhinavKasubojula\OneDrive - Kenall Inc\Desktop\resumes\2"

url = "http://localhost:11434/api/generate"
headers = {"Content-Type": "application/json"}


In [21]:


# Compute embeddings for documents
def compute_embeddings(documents):
    embeddings = model.encode(documents, convert_to_tensor=False)
    return embeddings


In [22]:

# Extract text from a PDF using OCR
def extract_text_from_pdf(pdf_path):
    try:
        images = convert_from_path(pdf_path)  # Convert PDF to images
        text = ""
        for page_num, image in enumerate(images):
            text += pytesseract.image_to_string(image)  # OCR on the image
            print(f"Extracted text from page {page_num + 1} of {os.path.basename(pdf_path)}...")
        return text
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return None


In [23]:
import re

def text_to_dict(text):
    result = {}
    current_key = None
    
    # Split text into lines
    lines = text.strip().split("\n")
    
    # Check if the first line looks like an unwanted heading
    if re.match(r"^(Here is a summary|Summary of)", lines[0], re.IGNORECASE):
        lines = lines[1:]  # Skip the first line if it's an unwanted heading

    for line in lines:
        if ":" in line:  # Line contains a key-value pair
            key, value = line.split(":", 1)
            current_key = key.strip()
            result[current_key] = value.strip()
        else:
            if current_key:  # Append to the previous key's value
                result[current_key] += " " + line.strip()
    
    return result


In [25]:
def summarize_text_with_llm(text):
    prompt = f"""
    You are an expert in summarizing and structuring resumes. Your task is to extract information and organize it into the exact structure below. The output format must not change across different resumes, regardless of the content.

    Rules:
    Use the exact headings listed below without any modification or omission.
    Leave headings blank if no information is available for that section.
    Do not rearrange, rename, or skip any headings.
    Licenses must appear only under the "Licenses" heading.
    Extract company names consistently, ensuring accuracy.
    Output Format (retain these exact headings):

     Name: [Name here]  
     Education Qualification: [Qualifications here, e.g., PE, BA, etc.]  
     Role: [Role here]  
     Licenses: [Licenses here]  
     Company: [Company names here]  
     Qualifications: [Additional qualifications here]  
     Experience: [Experience details here]  
     Projects he/she worked on: [Projects listed here]  
     Roles and Responsibilities: [Responsibilities here]  

    Resume Text:
    {text}
    Output Instructions:
    Follow the exact structure provided above.
    Include all information under the correct headings.
    Do not modify or skip any section, even if a section is blank.

    """
    data = {"model": "llama3.2:latest", "prompt": prompt, "stream": False}
    response = requests.post(url, headers=headers, data=json.dumps(data))
    
    response_json = response.json()

    text = response_json.get("response", "No response found")
    unwanted_prefix = "Here is a summary of the resume:"
    if text.startswith(unwanted_prefix):
        text = text[len(unwanted_prefix):].strip()
    return text


In [26]:
def remove_asterisks(text):
    # Remove all occurrences of single and double asterisks
    clean_text = re.sub(r"\*{1,2}", "", text)
    return clean_text

In [None]:

# Process each PDF, extract text, summarize, and store in Weaviate
def process(sum_dict, emb, summary_text):
    pdf_files = [f for f in os.listdir(pdf_directory) if f.endswith(".pdf")]
    for pdf_file in pdf_files:
        pdf_path = os.path.join(pdf_directory, pdf_file)
        print(f"Processing: {pdf_file}")
        
        # Extract text from the PDF
        text = extract_text_from_pdf(pdf_path)
        if text:
            # Summarize the extracted text
            summary1 = summarize_text_with_llm(text)
            #print(f"Summary of {file_name}: {summary[:100]}...")  # Print first 100 characters of summary
            summary = remove_asterisks(summary1)
            summary_dict= text_to_dict(summary)
            print(summary_dict)
            

            # Generate vector embedding
            embedding = compute_embeddings([summary1])[0]           
            # Store data in Weaviate
            #store_in_weaviate(summary, embedding,collection_name)
        sum_dict.append(summary_dict)
        emb.append(embedding)
        summary_text.append(summary)

    return(sum_dict, emb, summary_text)

# Run the process
sum_dict=[]
embeddings =[]
summary_text =[]
sum_dict, embeddings, summary_text = process(sum_dict, embeddings, summary_text)
for i in range(len(sum_dict)):
    sum_dict[i]["vector"] = embeddings[i].tolist()
    sum_dict[i]["summary"] = summary_text[i]

Processing: page_1.pdf
Extracted text from page 1 of page_1.pdf...
{'Here is the extracted and organized resume in the required format': ' ', 'Name': 'Kris D. Prasad', 'Education Qualification': 'MS, Civil Engineering, 1992', 'Role': 'Program Manager/ Senior Structural Engineer', 'Licenses': 'PE (Civil): TX #91952 (2003), LA #34186, NM #20737, PR# 28316', 'Company': 'SAT-Kenall JV | Houston, TX', 'Qualifications': 'Additional qualifications not mentioned in the provided text.', 'Experience': 'Over 29 years of professional engineering and management experience with various federal, state, municipal and local programs. Currently serving as Program manager for USACE Fort Worth and Tulsa Districts on various civil works and military projects.', 'Projects he/she worked on': '', '1. USACE Fort Worth District - Renovation of Buildings 16, 44, 615, 2002, and Progress Services': '2022', '2. USACE Fort Worth District - Repair Fire Protection, Install Security Fences PROFESSIONAL services': '2021

In [128]:
sum_dict

[{'Name': 'Kris D. Prasad ',
  'Education Qualification': 'MS, Civil Engineering, 1992; PE (Civil): TX #91952 (2003), LA #34186, NM #20737, PR# 28316 ',
  'Role': 'Program Manager/ Senior Structural Engineer ',
  'Licenses': ' ',
  'Company': 'SAT-Kenall JV | Houston, TX ',
  'Qualifications': 'SAME, ASCE, ACEC, College of Engineers and Surveyors of Puerto Rico ',
  'Experience': ' - USACE Fort Worth District - Renovation of Buildings 16, 44, 615, 2002',
  'Program Manager/ Senior Structural v Performed contract management, coordinated with USACE and Engineer for the preparation of full con-struction plans and specs to renovate five historical buildings (total 100,400 SF renovation area) that range from 100-130 years old to provide functional administrative office space with a 25-year usable life. Cost': '$31M',
  '- USACE Fort Worth District - Repair Fire Protection, Install Security Fences PROFESSIONAL sERvices': '2021 Program Manager/ Senior Structural VV Responsible for contract ma

In [129]:

client = weaviate.connect_to_local(port=8082)
# Delete a collection
employee = "employee"
experience = "experience"

if client.collections.exists(name="experience"):
    client.collections.delete(name="experience")
    print("Existing", experience ,"collection deleted.")

client.collections.create(
    name=experience,  # Name of the data collection
    properties=[
        Property(name="Name", data_type=DataType.TEXT),
        Property(name="emp_id", data_type=DataType.TEXT),
        Property(name="role", data_type=DataType.TEXT),
        Property(name="company", data_type=DataType.TEXT),
        Property(name="qualification", data_type=DataType.TEXT),
        Property(name="experience", data_type=DataType.TEXT),
        Property(name="projects", data_type=DataType.TEXT),
        Property(name="roles_responsibilities", data_type=DataType.TEXT),
        Property(name="summary", data_type = DataType.TEXT)
    ])
print("Collection",experience," created successfully.")

if client.collections.exists(name=employee):
    client.collections.delete(name=employee)
    print("Existing",employee,"collection deleted.")


client.collections.create(
    name=employee,  # Name of the data collection
    properties=[
        Property(name="Name", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
        Property(name="role", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
        Property(name="company", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
        Property(name="qualification", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
        Property(name="experience", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
        Property(name="projects", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
        Property(name="roles_responsibilities", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
        Property(name="summary", data_type = DataType.TEXT,index_searchable=True,index_filterable=True)
    ],
    references=[
        ReferenceProperty(
            name="exp_id",
            target_collection=experience
        )
    ]

)
print("Collection",employee," created successfully.")

if client.collections.exists(name="dup_coll"):
    client.collections.delete(name="dup_coll")
    print("Existing collection deleted.")


client.collections.create(
    name="dup_coll",  # Name of the data collection
    properties=[
        Property(name="Name", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
        Property(name="role", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
        Property(name="company", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
        Property(name="qualification", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
        Property(name="experience", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
        Property(name="projects", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
        Property(name="roles_responsibilities", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
        Property(name="summary", data_type = DataType.TEXT,index_searchable=True,index_filterable=True)
    ]

)
print("collection dup_coll created")
client.close()

Existing experience collection deleted.
Collection experience  created successfully.
Existing employee collection deleted.
Collection employee  created successfully.
Existing collection deleted.
collection dup_coll created


In [130]:
client = weaviate.connect_to_local(port=8082)

# Get the collections
emp = client.collections.get("employee")
exp = client.collections.get("experience")
dup = client.collections.get("dup_coll")

# Batch process for the 'employee' collection
employee_names = []
employee_uuids ={}
dup_emp = {}
with emp.batch.dynamic() as batch:

    for d in sum_dict:
        if d["Name"] not in employee_names:
            employee_uuid = batch.add_object({

                "Name": d["Name"],
                "license":next(
                        (d.get(key) for key in ["Licenses/Certifications", "Licenses", "PE (Civil)"] if key in d),
                        "Not provided"),
                "role": next(
                        (d.get(key) for key in ["Role", "Roles and Responsibilities"] if key in d),
                        "Not provided"),
                "company": d["Company"],
                "qualification": d["Qualifications"],
                "experience": d["Experience"],
                "summary": d["summary"],
            }, vector=d["vector"])
            employee_names.append(d["Name"])
            employee_uuids[d["Name"]] = employee_uuid  # Map Name to UUID for later reference
        else:
            dup_emp[d["Name"]] = d["summary"]
for item in emp.iterator():
    if item.properties["name"] in dup_emp:
        print("updating ", item.properties["name"])
        emp.data.update(
            uuid=employee_uuids[item.properties["name"]],
            properties={
                "summary": dup_emp[item.properties["name"]],
            }
        )

client.close()


KeyError: 'Experience'

clear till here(data loaded in Db)

In [None]:
client = weaviate.connect_to_local(port=8082)
emp = client.collections.get("employee")
query_vector = compute_embeddings("get who worked as civil engineer")

response = emp.query.near_vector(
    near_vector=query_vector,
    limit=2,
    return_metadata=wvc.query.MetadataQuery(certainty=True)
)

print(response.objects[0].properties["summary"])

In [None]:
client = weaviate.connect_to_local(port=8082)
collection = client.collections.get("employee")

for item in collection.iterator():
    print(item.uuid, item.properties["summary"])