In [8]:
import os
import pytesseract
from pdf2image import convert_from_path
import requests
import json
from sentence_transformers import SentenceTransformer
import re
import weaviate
from weaviate import client
import weaviate.classes as wvc
from weaviate.classes.config import Property, DataType
import ollama
from weaviate.util import generate_uuid5
import Levenshtein

In [9]:

# Initialize SentenceTransformer model
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

# Set directory containing PDFs
pdf_directory = "C:/Users/AbhinavKasubojula/OneDrive - Kenall Inc/Desktop/code/docs/"

# Ollama API details (if needed for later use)
url = "http://localhost:11434/api/generate"
headers = {"Content-Type": "application/json"}


In [10]:


# Compute embeddings for documents
def compute_embeddings(documents):
    embeddings = model.encode(documents, convert_to_tensor=False)
    return embeddings


In [11]:

# Extract text from a PDF using OCR
def extract_text_from_pdf(pdf_path):
    try:
        images = convert_from_path(pdf_path)  # Convert PDF to images
        text = ""
        for page_num, image in enumerate(images):
            text += pytesseract.image_to_string(image)  # OCR on the image
            print(f"Extracted text from page {page_num + 1} of {os.path.basename(pdf_path)}...")
        return text
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return None


In [12]:
import re

def text_to_dict(text):
    result = {}
    current_key = None
    
    # Split text into lines
    lines = text.strip().split("\n")
    
    # Check if the first line looks like an unwanted heading
    if re.match(r"^(Here is a summary|Summary of)", lines[0], re.IGNORECASE):
        lines = lines[1:]  # Skip the first line if it's an unwanted heading

    for line in lines:
        if ":" in line:  # Line contains a key-value pair
            key, value = line.split(":", 1)
            current_key = key.strip()
            result[current_key] = value.strip()
        else:
            if current_key:  # Append to the previous key's value
                result[current_key] += " " + line.strip()
    
    return result


In [13]:
def summarize_text_with_ollama(text):
    prompt = f"""
    you are expert in summarizing and sagrigating things. licenses need to be clearly mentioned in licenses category itself. you should get company's name very clearly and get every time in same way.
    structure to get Summarized resume and highlight main points specifically.
    - Name
    - education qualification(you are able to find at name ending (like PE, BA, etc..,))
    - Role
    - Licenses
    - company
    - Qualifications
    - Experience
    - Projects he/she worked before
    - Roles and Responsibilities

    keep headings as it is. for my future reference
    Resume Text:
    {text}
    """
    data = {"model": "llama3.2-vision:latest", "prompt": prompt, "stream": False}
    response = requests.post(url, headers=headers, data=json.dumps(data))
    
    response_json = response.json()

    text = response_json.get("response", "No response found")
    unwanted_prefix = "Here is a summary of the resume:"
    if text.startswith(unwanted_prefix):
        text = text[len(unwanted_prefix):].strip()
    return text


In [14]:

# Function to clean text
def clean_text(text):
    """Remove asterisks and trim whitespace from the given text."""
    return re.sub(r"\*{1,}", "", text).strip()

# Normalize the dictionary
def normalize_data(data):
    """
    Normalize dictionary keys and values by removing unwanted characters and whitespace.
    """
    normalized_data = []
    for record in data:
        if isinstance(record, dict):
            normalized_record = {clean_text(key): clean_text(value) for key, value in record.items()}
            normalized_data.append(normalized_record)
        else:
            print(f"Warning: Expected a dictionary but got {type(record)}")
    return normalized_data


In [15]:

# Process each PDF, extract text, summarize, and store in Weaviate
def process(sum_dict, emb, summary_text):
    pdf_files = [f for f in os.listdir(pdf_directory) if f.endswith(".pdf")]
    for pdf_file in pdf_files:
        pdf_path = os.path.join(pdf_directory, pdf_file)
        print(f"Processing: {pdf_file}")
        
        # Extract text from the PDF
        text = extract_text_from_pdf(pdf_path)
        if text:
            # Summarize the extracted text
            summary1 = clean_text(text)
            summary = summarize_text_with_ollama(summary1)
            summary = clean_text(summary)
            summary_dict= text_to_dict(summary)
            print(summary_dict)
            

            # Generate vector embedding
            embedding = compute_embeddings([summary1])[0]           
            # Store data in Weaviate
            #store_in_weaviate(summary, embedding,collection_name)
        sum_dict.append(summary_dict)
        emb.append(embedding)
        summary_text.append(summary)

    return(sum_dict, emb, summary_text)




process starts here

In [16]:
# Run the process
sum_dict=[]
embeddings =[]
summary_text =[]
sum_dict, embeddings, summary_text = process(sum_dict, embeddings, summary_text)
#
for i in range(len(sum_dict)):
    sum_dict[i]["vector"] = embeddings[i].tolist()
    sum_dict[i]["summary"] = summary_text[i]

Processing: page_1 (2).pdf
Extracted text from page 1 of page_1 (2).pdf...
{'Here is the summarized version of the resume': ' ', 'Name': 'Kris D. Prasad', 'Role': 'Program/Project Manager ', 'Company': 'Kenall-Hana JV, LLC', 'Location': 'Meridian, MS', 'Education': ' - M.S., Civil Engineering (1992) ', 'Licenses': '', '- P.E. (Civil)': 'TX #91952, LA #34186, NM #20737, PR #28316 ', 'Qualifications': ' - Over 29 years of professional engineering and management experience - Currently serving as task order manager for USACE Fort Worth and Tulsa Districts - Experience includes over 1,000 projects with a value of over $100M - Member of SAME, ASCE, ACEC, College of Engineers and Surveyors of Puerto Rico ', 'Experience': ' - Program/Project Manager at Kenall-Hana JV, LLC for various federal, state, municipal, and local programs - Task order manager for USACE Fort Worth and Tulsa Districts on civil works and military projects - Managed contracts, coordinated with USACE and subcontractors, and 

In [17]:
sum_dict

[{'Here is the summarized version of the resume': ' ',
  'Name': 'Kris D. Prasad',
  'Role': 'Program/Project Manager ',
  'Company': 'Kenall-Hana JV, LLC',
  'Location': 'Meridian, MS',
  'Education': ' - M.S., Civil Engineering (1992) ',
  'Licenses': '',
  '- P.E. (Civil)': 'TX #91952, LA #34186, NM #20737, PR #28316 ',
  'Qualifications': ' - Over 29 years of professional engineering and management experience - Currently serving as task order manager for USACE Fort Worth and Tulsa Districts - Experience includes over 1,000 projects with a value of over $100M - Member of SAME, ASCE, ACEC, College of Engineers and Surveyors of Puerto Rico ',
  'Experience': ' - Program/Project Manager at Kenall-Hana JV, LLC for various federal, state, municipal, and local programs - Task order manager for USACE Fort Worth and Tulsa Districts on civil works and military projects - Managed contracts, coordinated with USACE and subcontractors, and prepared full construction plans ',
  'Projects': '  1. 

In [18]:

def Create_collection4resume():
    from weaviate.classes.config import ReferenceProperty
    client = weaviate.connect_to_local(port=8082)
    # Delete a collection
    employee = "testing_employee"
    companies = "testing_Companies"

    if client.collections.exists(name=companies):
        client.collections.delete(name=companies)
        print("Existing", companies ,"collection deleted.")

    client.collections.create(
        name=companies,  # Name of the data collection for collecting company names
        properties=[
            Property(name="company_name", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
            Property(name="emp_names", data_type = DataType.TEXT,index_searchable=True,index_filterable=True)
        ])
    print("Collection",companies," created successfully.")

    if client.collections.exists(name=employee):
        client.collections.delete(name=employee)
        print("Existing",employee,"collection deleted.")


    client.collections.create(
        name=employee,  # Name of the data collection for collecting employee details
        properties=[
            Property(name="Name", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
            Property(name="role", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
            Property(name="company", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
            Property(name="qualification", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
            Property(name="experience", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
            Property(name="projects", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
            Property(name="roles_responsibilities", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
            Property(name="summary", data_type = DataType.TEXT,index_searchable=True,index_filterable=True)
        ],
        references=[
            ReferenceProperty(
                name="company_name",
                target_collection = companies
            )
        ]

    )
    print("Collection",employee," created successfully.")

    #making connection from "companies" to "employee"
    company = client.collections.get(companies)
    company.config.add_reference(
        ReferenceProperty(
            name="Name",
            target_collection=employee
        )
    )






    client.close()

In [19]:
def load_ResumeInCollection():
    
    client = weaviate.connect_to_local(port = 8082)

    # Get the collections
    emp = client.collections.get("testing_employee")
    comp = client.collections.get("testing_Companies")

    # Batch process for the 'employee' collection
    employee_names = []
    employee_uuids ={}
    dup_emp = {}
    company_names = {}
    with emp.batch.dynamic() as batch:

        for d in sum_dict:
            if d["Name"] in employee_names:
                dup_emp.setdefault(d["Name"],[])
                dup_emp[d["Name"]].append(d["summary"])

            else:
                employee_uuid = batch.add_object({

                    "Name": d["Name"],
                    "role": d["Role"],
                    "company": d["Company"],
                    "license": next(
                        (d.get(key) for key in ["Licenses/Certifications", "Licenses", "PE (Civil)"] if key in d),
                        "Not provided"  # Default value if none of the keys exist
                    ),
                    "qualification": next(
                        (d.get(key) for key in ["Qualifications", "Summary of Qualifications"] if key in d),
                        "Not provided"  # Default value
                    ),
                    "experience": d["Experience"],
                    "projects": d.get("Projects", "not found"),
                    "summary": d["summary"],
                }, vector=d["vector"])

                employee_names.append(d["Name"])

                employee_uuids[d["Name"]] = employee_uuid  # Map Name to UUID for later reference

                company_names.setdefault(d["Company"],[])
                company_names[d["Company"]].append(d["Name"])
                




    for i in dup_emp:
        for item in emp.iterator():
            if (i == item.properties["name"]):
                temp = item.properties["summary"] + " "+ dup_emp[i][0]
                print("updating ", i)
                
                emp.data.update(
                    uuid=employee_uuids[i],
                    properties={
                        "summary": temp,
                        
                    }
                )

    with comp.batch.dynamic() as batch:

        for d in company_names:
            temp = ""
            for i in company_names[d]:
                temp= i +" | "+ temp
            batch.add_object({
                "company_name" : d,
                "emp_names" : temp
            })





    client.close()


Create collection and load resumes here

load resumes here

In [23]:
client = weaviate.connect_to_local(port="8082")
obj = client.collections.get("testing_Companies")
for i in obj.iterator():
    print(i.properties)
client.close()

{'emp_names': 'Robert Morris, AIA, RID DQC Manager | ', 'company_name': 'SAI-Kenall JV'}
{'emp_names': 'Robert Gaylord | Brett Witte, PE  | ', 'company_name': 'SAI-Kenall JV, Orlando, FL '}
{'emp_names': 'Kris D. Prasad | ', 'company_name': 'Kenall-Hana JV, LLC'}
{'emp_names': 'Kris D. Prasad, PE  | ', 'company_name': 'SAT-Kenall JV, Houston, TX '}
{'emp_names': 'Srujan Chikyala | ', 'company_name': 'SAT-Kenall JV'}


In [24]:
client = weaviate.connect_to_local(port="8082")
emp = client.collections.get("testing_employee")
for item in emp.iterator():
    print(item.properties)
client.close()

{'name': 'Brett Witte, PE ', 'company': 'SAI-Kenall JV, Orlando, FL ', 'qualification': '  Over 18 years of experience in conducting engineering design, analysis, calculations, and developing technical specifications for civil infrastructure design projects Skilled at H&H modeling using numerous programs, civil infrastructure design, construction plans production using MicroStation and AutoCAD, quantity take-offs, cost estimates, construction phase support and as-built certifications ', 'summary': 'Here is the summarized resume in the required format:\n\nName: Brett Witte, PE\n\nEducation Qualification: BS, Civil Engineering (2005)\n\nRole: Project Manager / Civil Engineer\n\nLicenses:\n\n PE (Civil): FL #70720 (2010)\n Current Professional Registration (State and Discipline): Not specified\n\nCompany: SAI-Kenall JV, Orlando, FL\n\nQualifications:\n\n Over 18 years of experience in conducting engineering design, analysis, calculations, and developing technical specifications for civil 

clear till here(data loaded in Db)

In [25]:
client = weaviate.connect_to_local(port="8082")
emp = client.collections.get("testing_employee")
query_vector = compute_embeddings("""the following engineers may be required for this project:

1. Civil engineers with experience in construction or infrastructure development.
2. Structural engineers to design and ensure structural integrity of buildings, bridges, or other structures.
3. Environmental engineers to assess and mitigate environmental impacts.
4. Geotechnical engineers to evaluate soil and rock conditions.""")

response = emp.query.near_vector(
    near_vector=query_vector,
    limit=4,
    return_metadata=wvc.query.MetadataQuery(certainty=True)
)

print(response.objects[0].properties["summary"])
client.close()

Here is a summarized version of the resume text:

Name: Robert Gaylord
Education Qualification: PE (Civil), BS in Civil Engineering
Role: Sr. Civil Engineer
Licenses:
 PE (Civil): FL #51373 (1997)
Company: SAI-Kenall JV, Orlando, FL

Qualifications:

 35 years of experience conducting engineering design, analysis, and developing technical specifications for civil infrastructure design projects
 Expertise in complex H&H modeling and flood control design as well as administering construction & implementation of project designs

Experience:

 Sr. Civil Engineer for various USACE Districts (including Jacksonville), US Air Force, and other federal agencies including military installations and federal facilities in Florida and Puerto Rico
 Led and supported projects with expertise in civil infrastructure design including site design, utilities, roads, parking, airfields, parks, pump stations, and stormwater management

Projects:

1. Paint and Sandblast Facility, USCG Miami Station, FL (2021)

In [22]:
#step-1 run only once, that too for the very first time
#creates two collections 1.employee, 2.companies 
Create_collection4resume()
#loads data in both employee and companies collections
load_ResumeInCollection()

Collection testing_Companies  created successfully.
Collection testing_employee  created successfully.


In [29]:


client = weaviate.connect_to_local(port="8082")

questions = client.collections.get("testing_employee")
q = """the following engineers may be required for this project:

1. Civil engineers with experience in construction or infrastructure development.
2. Structural engineers to design and ensure structural integrity of buildings, bridges, or other structures.
3. Environmental engineers to assess and mitigate environmental impacts.
4. Geotechnical engineers to evaluate soil and rock conditions."""

response = questions.query.near_text(
    query=q,
    limit=4
)
t=""
for obj in response.objects:
    t += obj.properties["summary"]
    print(obj.properties["summary"])

client.close()  # Free up resources

WeaviateQueryError: Query call with protocol GRPC search failed with message <AioRpcError of RPC that terminated with:
	status = StatusCode.UNKNOWN
	details = "explorer: get class: vectorize params: could not vectorize input for collection Testing_employee with search-type nearText. Make sure a vectorizer module is configured for this collection"
	debug_error_string = "UNKNOWN:Error received from peer  {grpc_message:"explorer: get class: vectorize params: could not vectorize input for collection Testing_employee with search-type nearText. Make sure a vectorizer module is configured for this collection", grpc_status:2, created_time:"2025-01-03T18:15:30.7229335+00:00"}"
>.

In [286]:
prompt_template = f"""Using this data: {t}. Respond to this prompt: the following engineers may be required for this project:

1. Civil engineers with experience in construction or infrastructure development.
2. Structural engineers to design and ensure structural integrity of buildings, bridges, or other structures.
3. Environmental engineers to assess and mitigate environmental impacts.
4. Geotechnical engineers to evaluate soil and rock conditions.
your task is to select 3 candidates from the given data/resumes of employee. give reason for the selection
"""

In [None]:
# Generate a response combining the prompt and data we retrieved in step 2
output = ollama.generate(
  model = "llama3",
  prompt = prompt_template,
)

print(output['response'])