In [70]:
import os
import pytesseract
from pdf2image import convert_from_path
import requests
import json
from sentence_transformers import SentenceTransformer
import re
import weaviate
import weaviate.classes as wvc
from weaviate.classes.config import Property, DataType
import ollama
from weaviate.util import generate_uuid5

In [72]:

# Initialize SentenceTransformer model
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

# Set directory containing PDFs
pdf_directory = "C:/Users/AbhinavKasubojula/OneDrive - Kenall Inc/Desktop/code/docs/"

# Ollama API details (if needed for later use)
url = "http://localhost:11434/api/generate"
headers = {"Content-Type": "application/json"}


In [73]:


# Compute embeddings for documents
def compute_embeddings(documents):
    embeddings = model.encode(documents, convert_to_tensor=False)
    return embeddings


In [74]:

# Extract text from a PDF using OCR
def extract_text_from_pdf(pdf_path):
    try:
        images = convert_from_path(pdf_path)  # Convert PDF to images
        text = ""
        for page_num, image in enumerate(images):
            text += pytesseract.image_to_string(image)  # OCR on the image
            print(f"Extracted text from page {page_num + 1} of {os.path.basename(pdf_path)}...")
        return text
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return None


In [75]:
import re

def text_to_dict(text):
    result = {}
    current_key = None
    
    # Split text into lines
    lines = text.strip().split("\n")
    
    # Check if the first line looks like an unwanted heading
    if re.match(r"^(Here is a summary|Summary of)", lines[0], re.IGNORECASE):
        lines = lines[1:]  # Skip the first line if it's an unwanted heading

    for line in lines:
        if ":" in line:  # Line contains a key-value pair
            key, value = line.split(":", 1)
            current_key = key.strip()
            result[current_key] = value.strip()
        else:
            if current_key:  # Append to the previous key's value
                result[current_key] += " " + line.strip()
    
    return result


In [76]:
def summarize_text_with_ollama(text):
    prompt = f"""
    you are expert in summarizing and sagrigating things. licenses need to be clearly mentioned in licenses category itself
    structure to get Summarized resume
    - Name
    - education qualification(you are able to find at name ending (like PE, BA, etc..,))
    - Role
    - Licenses
    - company
    - Qualifications
    - Experience
    - Projects he/she worked before
    - Roles and Responsibilities

    Resume Text:
    {text}
    """
    data = {"model": "llama3.2-vision:latest", "prompt": prompt, "stream": False}
    response = requests.post(url, headers=headers, data=json.dumps(data))
    
    response_json = response.json()

    text = response_json.get("response", "No response found")
    unwanted_prefix = "Here is a summary of the resume:"
    if text.startswith(unwanted_prefix):
        text = text[len(unwanted_prefix):].strip()
    return text


In [77]:
def remove_asterisks(text):
    # Remove all occurrences of single and double asterisks
    clean_text = re.sub(r"\*{1,2}", "", text)
    return clean_text

In [None]:

# Process each PDF, extract text, summarize, and store in Weaviate
def process(sum_dict, emb, summary_text):
    pdf_files = [f for f in os.listdir(pdf_directory) if f.endswith(".pdf")]
    for pdf_file in pdf_files:
        pdf_path = os.path.join(pdf_directory, pdf_file)
        print(f"Processing: {pdf_file}")
        
        # Extract text from the PDF
        text = extract_text_from_pdf(pdf_path)
        if text:
            # Summarize the extracted text
            summary1 = summarize_text_with_ollama(text)
            #print(f"Summary of {file_name}: {summary[:100]}...")  # Print first 100 characters of summary
            summary = remove_asterisks(summary1)
            summary_dict= text_to_dict(summary)
            print(summary_dict)
            

            # Generate vector embedding
            embedding = compute_embeddings([summary1])[0]           
            # Store data in Weaviate
            #store_in_weaviate(summary, embedding,collection_name)
        sum_dict.append(summary_dict)
        emb.append(embedding)
        summary_text.append(summary)

    return(sum_dict, emb, summary_text)

# Run the process
sum_dict=[]
embeddings =[]
summary_text =[]
sum_dict, embeddings, summary_text = process(sum_dict, embeddings, summary_text)


In [79]:
#
for i in range(len(sum_dict)):
    sum_dict[i]["vector"] = embeddings[i].tolist()
    sum_dict[i]["summary"] = summary_text[i]

In [None]:
from weaviate.classes.config import ReferenceProperty
client = weaviate.connect_to_local()
# Delete a collection
employee = "employee"
experience = "experience"
#client.collections.delete(name=collection_name)
#print(f"Collection '{collection_name}' deleted successfully.")

if client.collections.exists(name="experience"):
    client.collections.delete(name="experience")
    print("Existing", experience ,"collection deleted.")

client.collections.create(
    name=experience,  # Name of the data collection
    properties=[
        Property(name="Name", data_type=DataType.TEXT),
        Property(name="emp_id", data_type=DataType.TEXT),
        Property(name="role", data_type=DataType.TEXT),
        Property(name="company", data_type=DataType.TEXT),
        Property(name="qualification", data_type=DataType.TEXT),
        Property(name="experience", data_type=DataType.TEXT),
        Property(name="projects", data_type=DataType.TEXT),
        Property(name="roles_responsibilities", data_type=DataType.TEXT),
        Property(name="summary", data_type = DataType.TEXT)
    ])
print("Collection",experience," created successfully.")

if client.collections.exists(name=employee):
    client.collections.delete(name=employee)
    print("Existing",employee,"collection deleted.")


client.collections.create(
    name=employee,  # Name of the data collection
    properties=[
        Property(name="Name", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
        Property(name="role", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
        Property(name="company", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
        Property(name="qualification", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
        Property(name="experience", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
        Property(name="projects", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
        Property(name="roles_responsibilities", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
        Property(name="summary", data_type = DataType.TEXT,index_searchable=True,index_filterable=True)
    ],
    references=[
        ReferenceProperty(
            name="exp_id",
            target_collection=experience
        )
    ]

)
print("Collection",employee," created successfully.")

if client.collections.exists(name="dup_coll"):
    client.collections.delete(name="dup_coll")
    print("Existing collection deleted.")


client.collections.create(
    name="dup_coll",  # Name of the data collection
    properties=[
        Property(name="Name", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
        Property(name="role", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
        Property(name="company", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
        Property(name="qualification", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
        Property(name="experience", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
        Property(name="projects", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
        Property(name="roles_responsibilities", data_type=DataType.TEXT,index_searchable=True,index_filterable=True),
        Property(name="summary", data_type = DataType.TEXT,index_searchable=True,index_filterable=True)
    ]

)
print("collection dup_coll created")
client.close()

In [None]:
#if you want to store ids
client = weaviate.connect_to_local()
collection = client.collections.get("resume")

for item in collection.iterator():
    print(item.uuid)
client.close()

In [None]:
sum_dict

In [None]:
client = weaviate.connect_to_local()

# Get the collections
emp = client.collections.get("employee")
exp = client.collections.get("experience")
dup = client.collections.get("dup_coll")

# Batch process for the 'employee' collection
employee_names = []
employee_uuids ={}
dup_emp = {}
with emp.batch.dynamic() as batch:

    for d in sum_dict:
        if d["Name"] not in employee_names:
            employee_uuid = batch.add_object({

                "Name": d["Name"],
                "role": d["Role"],
                "company": d["Company"],
                "qualification": d["Qualifications"],
                "experience": d["Experience"],
                "projects": d.get("Projects", "not found"),
                "summary": d["summary"],
            }, vector=d["vector"])
            employee_names.append(d["Name"])
            employee_uuids[d["Name"]] = employee_uuid  # Map Name to UUID for later reference
        else:
            dup_emp[d["Name"]] = d["summary"]
for item in emp.iterator():
    if item.properties["name"] in dup_emp:
        print("updating ", item.properties["name"])
        emp.data.update(
            uuid=employee_uuids[item.properties["name"]],
            properties={
                "summary": dup_emp[item.properties["name"]],
            }
        )

client.close()


clear till here(data loaded in Db)

In [160]:
client = weaviate.connect_to_local()
emp = client.collections.get("employee")
query_vector = compute_embeddings("get who worked as civil engineer")

response = emp.query.near_vector(
    near_vector=query_vector,
    limit=2,
    return_metadata=wvc.query.MetadataQuery(certainty=True)
)

print(response.objects[0].properties["summary"])

Here is the summarized resume in the desired format:

Name: Robert Gaylord
Role: Sr. Civil Engineer

Education Qualification: PE (Civil), BS, Civil Engineering

Licenses:

 PE (Civil): FL #51373 (1997)

Company: SAI-Kenall JV, Orlando, FL

Qualifications:

 35 years of experience conducting engineering design, analysis, and developing technical specifications for civil infrastructure design projects
 Expertise in complex H&H modeling and flood control design
 Administering construction & implementation of project designs

Experience:

 Sr. Civil Engineer, SAI-Kenall JV (34 years)
	+ Led and supported projects for various USACE Districts, US Air Force, and other federal agencies
	+ Conducted engineering design, analysis, and developed technical specifications for civil infrastructure design projects
	+ Administered construction & implementation of project designs

Projects:

1. Paint and Sandblast Facility, USCG Miami Station, FL (2021)
	 Sr. Civil Engineer for the construction of a new

            Please make sure to close the connection using `client.close()`.


In [144]:


client = weaviate.connect_to_local()

questions = client.collections.get("experience")

response = questions.query.near_text(
    query="civil",
    limit=2
)

for obj in response.objects:
    print(obj.properties[''])

client.close()  # Free up resources

In [147]:
client = weaviate.connect_to_local()
collection = client.collections.get("employee")

for item in collection.iterator():
    print(item.uuid, item.properties["summary"])

019d6d28-1a63-461c-8114-5b3ee7c3ba11 Based on the provided text, I will create a summarized resume for Srujan Chikyala in the specified format:

Name: Srujan Chikyala
Education Qualification: PE (Civil), MS, Civil Engineering
Role: Project Manager
Licenses:
	 PE (Civil): TX #122374 (2015)
Company: SAT-Kenall JV
Location: Houston, TX

Qualifications:

 13 years of experience leading DB and DBB RFP packages for MILCON and SRM projects at USACE and NAVFAC military facilities and civil works projects for USACE, USDA-NRCS, IBWC, and US Fish & Wildlife Service.
 Served as a Project Manager on about 75+ engineering projects for federal, public works and public entities that include military facilities, utilities, drainage structures, roadways, bridges, dams, levees, facilities, and other flood control structures for various federal clients.
 Member of Organizations: SAME, ASCE, ACEC

Experience:

 Project Manager at SAT-Kenall JV
	+ Led multi-disciplines teams on Fort Worth/ Tulsa District pr

            Please make sure to close the connection using `client.close()`.


In [66]:
prompt_template = f"Using this data: {data}. Respond to this prompt: {prompt}"

In [None]:
# Generate a response combining the prompt and data we retrieved in step 2
output = ollama.generate(
  model = "llama3",
  prompt = prompt_template,
)

print(output['response'])