In [None]:
import os
import pytesseract
from pdf2image import convert_from_path
import requests
import json
from sentence_transformers import SentenceTransformer
import re
import weaviate
from weaviate import client
import weaviate.classes as wvc
from weaviate.classes.config import Property, DataType
import ollama
from weaviate.util import generate_uuid5

In [4]:

# Initialize SentenceTransformer model
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

# Set directory containing PDFs
pdf_directory = "C:/Users/AbhinavKasubojula/OneDrive - Kenall Inc/Desktop/code/docs/"

# Ollama API details (if needed for later use)
url = "http://localhost:11434/api/generate"
headers = {"Content-Type": "application/json"}


In [5]:


# Compute embeddings for documents
def compute_embeddings(documents):
    embeddings = model.encode(documents, convert_to_tensor=False)
    return embeddings


In [6]:

# Extract text from a PDF using OCR
def extract_text_from_pdf(pdf_path):
    try:
        images = convert_from_path(pdf_path)  # Convert PDF to images
        text = ""
        for page_num, image in enumerate(images):
            text += pytesseract.image_to_string(image)  # OCR on the image
            print(f"Extracted text from page {page_num + 1} of {os.path.basename(pdf_path)}...")
        return text
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return None


In [7]:
import re

def text_to_dict(text):
    result = {}
    current_key = None
    
    # Split text into lines
    lines = text.strip().split("\n")
    
    # Check if the first line looks like an unwanted heading
    if re.match(r"^(Here is a summary|Summary of)", lines[0], re.IGNORECASE):
        lines = lines[1:]  # Skip the first line if it's an unwanted heading

    for line in lines:
        if ":" in line:  # Line contains a key-value pair
            key, value = line.split(":", 1)
            current_key = key.strip()
            result[current_key] = value.strip()
        else:
            if current_key:  # Append to the previous key's value
                result[current_key] += " " + line.strip()
    
    return result


In [108]:


def summarize_text_with_ollama(text):
    prompt = f"""
    You are an expert in summarizing and structuring resumes. Your task is to extract information and organize it into the exact structure below. The output format must not change across different resumes, regardless of the content.

    Rules:
    - Use the exact headings listed below without any modification or omission.
    - Leave headings blank if no information is available for that section.
    - Do not rearrange, rename, or skip any headings.
    - Licenses must appear only under the "Licenses" heading.
    - Extract company names consistently, ensuring accuracy.
    - Output Format (retain these exact headings) to form dictionaries later:

     Name: [Name here]  
     Education Qualification: [Qualifications here, e.g., PE, BA, etc.]  
     Role: [Role here]  
     Licenses: [Licenses here]  
     Company: [Company names here]  
     Qualifications: [Additional qualifications here]  
     Experience: [Experience details here]  
     Projects he/she worked on: [Projects listed here]  
     Roles and Responsibilities: [Responsibilities here]  

    Resume Text:
    {text}
    
    Output Instructions:
    Follow the exact structure provided above.
    Include all information under the correct headings.
    Do not modify or skip any section, even if a section is blank.
    """

    data = {"model": "llama3.2:latest", "prompt": prompt, "stream": False}
    try:
        response = requests.post(url, headers=headers, data=json.dumps(data))
        response.raise_for_status()
        response_json = response.json()
        summarized_text = response_json.get("response", "").strip()

        # Validate the output structure
        expected_headings = [
            "Name:", 
            "Education Qualification:", 
            "Role:", 
            "Licenses:", 
            "Company:", 
            "Qualifications:", 
            "Experience:", 
            "Projects he/she worked on:", 
            "Roles and Responsibilities:"
        ]
        
        lines = summarized_text.split("\n")
        headings_present = [line.split(":")[0].strip() + ":" for line in lines if ":" in line]

        if not all(heading in headings_present for heading in expected_headings):
            raise ValueError("The output format does not match the expected structure.")

        # Clean up text if there's any unwanted prefix
        unwanted_prefix = "Here is a summary of the resume:"
        if summarized_text.startswith(unwanted_prefix):
            summarized_text = summarized_text[len(unwanted_prefix):].strip()

        return summarized_text

    except requests.exceptions.RequestException as e:
        return f"API request failed: {e}"
    except ValueError as ve:
        return f"Validation error: {ve}"
    except Exception as e:
        return f"An unexpected error occurred: {e}"


In [85]:
def dict(text):
    prompt = f"""
    fetch these info from given text
    Name: 
     Education Qualification: 
     Role: 
     Licenses: 
     Company:   
     Qualifications: 
     Experience:  
     Projects he/she worked on:  
     Roles and Responsibilities: 

     given text = {text}
    """
    data = {"model": "llama3.2:latest", "prompt": prompt, "stream": False}
    response = requests.post(url, headers=headers, data=json.dumps(data))
    
    response_json = response.json()

    text = response_json.get("response", "No response found")
    unwanted_prefix = "Here is a summary of the resume:"
    if text.startswith(unwanted_prefix):
        text = text[len(unwanted_prefix):].strip()
    return text

In [111]:

# Function to clean text
def clean_text(text):
    """Remove asterisks and trim whitespace from the given text."""
    return re.sub(r"\*{1,}", "", text).strip()

# Normalize the dictionary
def normalize_data(data):
    """
    Normalize dictionary keys and values by removing unwanted characters and whitespace.
    """
    normalized_data = []
    for record in data:
        if isinstance(record, dict):
            normalized_record = {clean_text(key): clean_text(value) for key, value in record.items()}
            normalized_data.append(normalized_record)
        else:
            print(f"Warning: Expected a dictionary but got {type(record)}")
    return normalized_data


In [112]:

# Process each PDF, extract text, summarize, and store in Weaviate
def process(sum_dict, emb, summary_text):
    pdf_files = [f for f in os.listdir(pdf_directory) if f.endswith(".pdf")]
    for pdf_file in pdf_files:
        pdf_path = os.path.join(pdf_directory, pdf_file)
        print(f"Processing: {pdf_file}")
        
        # Extract text from the PDF
        text = extract_text_from_pdf(pdf_path)
        if text:
            # Summarize the extracted text
            summary1 = clean_text(text)
            summary = summarize_text_with_ollama(summary1)
            #summary = dict(summary)
            print(summary)
            summary = clean_text(summary)
            summary_dict= text_to_dict(summary)
            print(summary_dict)
            

            # Generate vector embedding
            embedding = compute_embeddings([summary1])[0]           
            # Store data in Weaviate
            #store_in_weaviate(summary, embedding,collection_name)
        sum_dict.append(summary_dict)
        emb.append(embedding)
        summary_text.append(summary)

    return(sum_dict, emb, summary_text)




process starts here

In [113]:
# Run the process
sum_dict=[]
embeddings =[]
summary_text =[]
sum_dict, embeddings, summary_text = process(sum_dict, embeddings, summary_text)
#


Processing: page_1 (2).pdf
Extracted text from page 1 of page_1 (2).pdf...
Here is the extracted and organized resume in the required format:

 Name: Kris D. Prasad  
Education Qualification: M.S., Civil Engineering, 1992; P.E. (Civil): TX #91952, LA #34186, NM #20737, PR #28316  
Role: Program/Project Manager  
Licenses: Secret Clearance  
Company: Kenall-Hana JV, LLC | Houston, TX  
Qualifications: Organizations: SAME, ASCE, ACEC, College of Engineers and Surveyors of Puerto Rico  
Experience: 29 years of professional engineering and management experience  
Projects he/she worked on: 
(1) USACE Fort Worth District - Renovation of Buildings 16, 44, 615, 2002, and 2006, Fort Sam Houston, TX construction: 2022
    Brief scope: Renovation of five historical buildings with a total 100,400 SF renovation area. Cost: $31M.
(2) USACE Fort Worth District - Repair Fire Protection, Install Security Fences and Repairs of Buildings at Red River Army Depot, Texarkana, TX construction: 2022
    Brie

In [91]:
for i in range(len(sum_dict)):
    sum_dict[i]["vector"] = embeddings[i].tolist()
    sum_dict[i]["summary"] = summary_text[i]

In [96]:
sum_dict

[{'Here is the extracted and organized resume in the specified format': ' ',
  'Name': 'Kris D. Prasad',
  'Role': 'Program/Project Manager',
  'Licenses': ' ',
  'Education Qualification': 'M.S., Civil Engineering, 1992',
  'Qualifications': ' ',
  'Company': 'Kenall-Hana JV, LLC | Houston, TX ',
  'Experience': ' ',
  'Years Experience': '',
  'a. Total': '29',
  'b. With Current Firm': '20  ',
  'Other Professional Qualifications': ' ',
  'Organizations': 'SAME, ASCE, ACEC, College of Engineers and Surveyors of Puerto Rico.  ',
  'Roles and Responsibilities': ' Program/Project Manager for the v Managed contract, coordinated with USACE and subcontractors, Program/Project Manager for prep of VY Responsible for contract management, client interaction, and QC. Program/Project Manager for the v Performed contract management, coordinated with USACE, Fort Hood Program/Project Manager for the v Oversaw site investigation and reviewed seismic site evaluation as per Program/Project Manager fo

In [97]:
client = weaviate.connect_to_local()
print(f"Client: {weaviate.__version__}, Server: client.get_meta().get('version')")
companies = "testing_Companies"
if client.collections.exists(name=companies):
    client.collections.delete(name=companies)
    print("Existing", companies ,"collection deleted.")
collection = client.collections.create(
        name=companies,
        vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_ollama(
            api_endpoint="http://host.docker.internal:11434",
            model="snowflake-arctic-embed"
        ),
        generative_config=wvc.config.Configure.Generative.ollama(
            api_endpoint="http://host.docker.internal:11434",  
            model="llama3.2"
        ),
        properties=[
            Property(name="Name", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
            Property(name="role", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
            Property(name="company", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
            Property(name="qualification", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
            Property(name="experience", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
            Property(name="projects", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
            Property(name="roles_responsibilities", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
            Property(name="summary", data_type = DataType.TEXT,index_searchable=True,index_filterable=True)
        ]
            
    )
with collection.batch.dynamic() as batch:

    for d in sum_dict:
        employee_uuid = batch.add_object({

            "Name": d["Name"],
            "role":  next(
                (d.get(key) for key in ["Role","Roles and Responsibilities"] if key in d),
                "Not provided"  # Default value if none of the keys exist
            ),
            "company":  next(
                (d.get(key) for key in ["Company"] if key in d),
                "Not provided"  # Default value if none of the keys exist
            ),
            "license": next(
                (d.get(key) for key in ["Licenses/Certifications", "Licenses", "PE (Civil)"] if key in d),
                "Not provided"  # Default value if none of the keys exist
            ),
            "qualification": next(
                (d.get(key) for key in ["Qualifications", "Summary of Qualifications"] if key in d),
                "Not provided"  # Default value
            ),
            "experience": d["Experience"],
            "projects": d.get("Projects", "not found"),
            #"summary": d["summary"],
        })

collection.data.insert(sum_dict[0])
print(collection.query.fetch_objects(include_vector=True).objects[0].properties)
print(collection.generate.fetch_objects(single_prompt="answer: {text}").objects[0].generated)
client.close()


Client: 4.10.4, Server: client.get_meta().get('version')
Existing testing_Companies collection deleted.


KeyError: 'Name'

In [102]:


collection_name = "Resume"

try:
    # Delete the collection if it exists
    existing_collection = client.collections.get(collection_name)
    if existing_collection:
        client.collections.delete(collection_name)
        print(f"Existing collection '{collection_name}' deleted.")

    # Create a new collection (schema definition)
    client.collections.create(
        name=collection_name,
        properties=[
            {"name": "Name", "data_type": DataType.TEXT, "index_searchable": True},
            {"name": "Role", "data_type": DataType.TEXT, "index_searchable": True},
            {"name": "Company", "data_type": DataType.TEXT, "index_searchable": True},
            {"name": "Licenses", "data_type": DataType.TEXT, "index_searchable": True},
            {"name": "Qualification", "data_type": DataType.TEXT, "index_searchable": True},
            {"name": "Experience", "data_type": DataType.TEXT, "index_searchable": True},
            {"name": "Projects", "data_type": DataType.TEXT, "index_searchable": True},
            {"name": "RolesAndResponsibilities", "data_type": DataType.TEXT, "index_searchable": True},
        ],
    )

    print(f"Collection '{collection_name}' created successfully.")

except Exception as e:
    print(f"Error: {e}")


# Insert objects directly into the collection
for record in sum_dict:
    client.objects.create(
        collection_name=collection_name,
        data=record,
    )

print(f"Inserted {len(data)} records into the collection '{collection_name}'.")

# Query data from the collection
query_response = client.query.get(
    collection_name=collection_name,
    properties=["Name", "Role", "Company", "Qualification"],
    limit=10,
)

print("Query Response:")
for obj in query_response.objects:
    print(obj.properties)

# Close the client connection
client.close()


Error: Query call with protocol GQL Aggregate failed with message Error in GraphQL response: [
  {
    "locations": [
      {
        "column": 12,
        "line": 1
      }
    ],
    "message": "Cannot query field \"Resume\" on type \"AggregateObjectsObj\".",
    "path": null
  }
], for the following query: {Aggregate{Resume{meta{count}}}}.


AttributeError: 'WeaviateClient' object has no attribute 'objects'

In [104]:

def Create_collection4resume():
    from weaviate.classes.config import ReferenceProperty
    client = weaviate.connect_to_local()
    # Delete a collection
    employee = "testing_employee"
    companies = "testing_Companies"

    if client.collections.exists(name=companies):
        client.collections.delete(name=companies)
        print("Existing", companies ,"collection deleted.")

    client.collections.create(
        name=companies,  # Name of the data collection for collecting company names
        properties=[
            Property(name="company_name", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
            Property(name="emp_names", data_type = DataType.TEXT,index_searchable=True,index_filterable=True)
        ])
    print("Collection",companies," created successfully.")

    if client.collections.exists(name=employee):
        client.collections.delete(name=employee)
        print("Existing",employee,"collection deleted.")


    client.collections.create(
        name=employee,  # Name of the data collection for collecting employee details
        properties=[
            Property(name="Name", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
            Property(name="role", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
            Property(name="company", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
            Property(name="qualification", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
            Property(name="experience", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
            Property(name="projects", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
            Property(name="roles_responsibilities", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
            Property(name="summary", data_type = DataType.TEXT,index_searchable=True,index_filterable=True)
        ],
        references=[
            ReferenceProperty(
                name="company_name",
                target_collection = companies
            )
        ]

    )
    print("Collection",employee," created successfully.")

    #making connection from "companies" to "employee"
    company = client.collections.get(companies)
    company.config.add_reference(
        ReferenceProperty(
            name="Name",
            target_collection=employee
        )
    )






    client.close()

In [105]:
def load_ResumeInCollection():
    
    client = weaviate.connect_to_local()

    # Get the collections
    emp = client.collections.get("testing_employee")
    comp = client.collections.get("testing_Companies")

    # Batch process for the 'employee' collection
    employee_names = []
    employee_uuids ={}
    dup_emp = {}
    company_names = {}
    with emp.batch.dynamic() as batch:

        for d in sum_dict:
            if d["Name"] in employee_names:
                dup_emp.setdefault(d["Name"],[])
                dup_emp[d["Name"]].append(d["summary"])

            else:
                employee_uuid = batch.add_object({

                    "Name": d["Name"],
                    "role":  next(
                        (d.get(key) for key in ["Role","Roles and Responsibilities"] if key in d),
                        "Not provided"  # Default value if none of the keys exist
                    ),
                    "company":  next(
                        (d.get(key) for key in ["Company"] if key in d),
                        "Not provided"  # Default value if none of the keys exist
                    ),
                    "license": next(
                        (d.get(key) for key in ["Licenses/Certifications", "Licenses", "PE (Civil)"] if key in d),
                        "Not provided"  # Default value if none of the keys exist
                    ),
                    "qualification": next(
                        (d.get(key) for key in ["Qualifications", "Summary of Qualifications"] if key in d),
                        "Not provided"  # Default value
                    ),
                    "experience": d["Experience"],
                    "projects": d.get("Projects", "not found"),
                    "summary": d["summary"],
                }, vector=d["vector"])

                employee_names.append(d["Name"])

                employee_uuids[d["Name"]] = employee_uuid  # Map Name to UUID for later reference

                company_names.setdefault(d["Company"],[])
                company_names[d["Company"]].append(d["Name"])
                




    for i in dup_emp:
        for item in emp.iterator():
            if (i == item.properties["Name"]):
                temp = item.properties["summary"] + " "+ dup_emp[i][0]
                print("updating ", i)
                
                emp.data.update(
                    uuid=employee_uuids[i],
                    properties={
                        "summary": temp,
                        
                    }
                )

    with comp.batch.dynamic() as batch:

        for d in company_names:
            temp = ""
            for i in company_names[d]:
                temp= i +" | "+ temp
            batch.add_object({
                "company_name" : d,
                "emp_names" : temp
            })





    client.close()


Create collection and load resumes here

In [106]:
Create_collection4resume()

Existing testing_Companies collection deleted.
Collection testing_Companies  created successfully.
Existing testing_employee collection deleted.
Collection testing_employee  created successfully.


load resumes here

In [107]:

load_ResumeInCollection()

KeyError: 'summary'

In [46]:
client = weaviate.connect_to_local()
obj = client.collections.get("testing_Companies") 
for i in obj.iterator():
    print(i.properties)
client.close()

In [None]:
client = weaviate.connect_to_local()
emp = client.collections.get("testing_employee")
for item in emp.iterator():
    print(item.properties)
client.close()

clear till here(data loaded in Db)

In [None]:
client = weaviate.connect_to_local()
emp = client.collections.get("testing_employee")
query_vector = compute_embeddings("""the following engineers may be required for this project:

1. Civil engineers with experience in construction or infrastructure development.
2. Structural engineers to design and ensure structural integrity of buildings, bridges, or other structures.
3. Environmental engineers to assess and mitigate environmental impacts.
4. Geotechnical engineers to evaluate soil and rock conditions. 
                                  select 3 engineers""")

response = emp.query.near_vector(
    near_vector=query_vector,
    limit=4,
    return_metadata=wvc.query.MetadataQuery(certainty=True)
)

print(response.objects[0].properties["summary"])
client.close()

clear till here... 

===============================================================

In [None]:
#step-1 run only once, that too for the very first time
#creates two collections 1.employee, 2.companies 
Create_collection4resume()
#loads data in both employee and companies collections
load_ResumeInCollection()

In [None]:


client = weaviate.connect_to_local(port="8082")

questions = client.collections.get("testing_employee")
q = """the following engineers may be required for this project:

1. Civil engineers with experience in construction or infrastructure development.
2. Structural engineers to design and ensure structural integrity of buildings, bridges, or other structures.
3. Environmental engineers to assess and mitigate environmental impacts.
4. Geotechnical engineers to evaluate soil and rock conditions."""

response = questions.query.near_text(
    query=q,
    limit=4
)
t=""
for obj in response.objects:
    t += obj.properties["summary"]
    print(obj.properties["summary"])

client.close()  # Free up resources

In [286]:
prompt_template = f"""Using this data: {t}. Respond to this prompt: the following engineers may be required for this project:

1. Civil engineers with experience in construction or infrastructure development.
2. Structural engineers to design and ensure structural integrity of buildings, bridges, or other structures.
3. Environmental engineers to assess and mitigate environmental impacts.
4. Geotechnical engineers to evaluate soil and rock conditions.
your task is to select 3 candidates from the given data/resumes of employee. give reason for the selection
"""

In [None]:
client = weaviate.connect_to_local()
c = client.collections.get("Test")
for i in c.iterator():
    print(i.properties)