In [266]:
import os
import pytesseract
from pdf2image import convert_from_path
import requests
import json
from sentence_transformers import SentenceTransformer
import re
import weaviate
import weaviate.classes as wvc
from weaviate.classes.config import Property, DataType
import ollama

In [267]:

# Initialize SentenceTransformer model
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

# Set directory containing PDFs
pdf_directory = "C:/Users/AbhinavKasubojula/OneDrive - Kenall Inc/Desktop/code/docs/"

# Ollama API details (if needed for later use)
url = "http://localhost:11434/api/generate"
headers = {"Content-Type": "application/json"}




In [269]:


# Compute embeddings for documents
def compute_embeddings(documents):
    embeddings = model.encode(documents, convert_to_tensor=False)
    return embeddings


In [270]:

# Extract text from a PDF using OCR
def extract_text_from_pdf(pdf_path):
    try:
        images = convert_from_path(pdf_path)  # Convert PDF to images
        text = ""
        for page_num, image in enumerate(images):
            text += pytesseract.image_to_string(image)  # OCR on the image
            print(f"Extracted text from page {page_num + 1} of {os.path.basename(pdf_path)}...")
        return text
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return None


In [271]:
import re

def text_to_dict(text):
    result = {}
    current_key = None
    
    # Split text into lines
    lines = text.strip().split("\n")
    
    # Check if the first line looks like an unwanted heading
    if re.match(r"^(Here is a summary|Summary of)", lines[0], re.IGNORECASE):
        lines = lines[1:]  # Skip the first line if it's an unwanted heading

    for line in lines:
        if ":" in line:  # Line contains a key-value pair
            key, value = line.split(":", 1)
            current_key = key.strip()
            result[current_key] = value.strip()
        else:
            if current_key:  # Append to the previous key's value
                result[current_key] += " " + line.strip()
    
    return result


In [299]:
def summarize_text_with_ollama(text):
    prompt = f"""
    you are expert in summarizing and sagrigating things. licenses need to be clearly mentioned in licenses category itself
    Summarize the following resume
    - Name
    - education qualification(you are able to find at name ending (like PE, BA, etc..,))
    - Role
    - Licenses
    - company
    - Qualifications
    - Experience
    - Projects he/she worked before
    - Roles and Responsibilities

    Resume Text:
    {text}
    """
    data = {"model": "llama3.2-vision:latest", "prompt": prompt, "stream": False}
    response = requests.post(url, headers=headers, data=json.dumps(data))
    
    response_json = response.json()

    text = response_json.get("response", "No response found")
    unwanted_prefix = "Here is a summary of the resume:"
    if text.startswith(unwanted_prefix):
        text = text[len(unwanted_prefix):].strip()
    return text


In [300]:
def remove_asterisks(text):
    # Remove all occurrences of single and double asterisks
    clean_text = re.sub(r"\*{1,2}", "", text)
    return clean_text

In [301]:

# Process each PDF, extract text, summarize, and store in Weaviate
def process(sum_dict, emb, summary_text):
    pdf_files = [f for f in os.listdir(pdf_directory) if f.endswith(".pdf")]
    for pdf_file in pdf_files:
        pdf_path = os.path.join(pdf_directory, pdf_file)
        print(f"Processing: {pdf_file}")
        
        # Extract text from the PDF
        text = extract_text_from_pdf(pdf_path)
        if text:
            # Summarize the extracted text
            summary1 = summarize_text_with_ollama(text)
            #print(f"Summary of {file_name}: {summary[:100]}...")  # Print first 100 characters of summary
            summary = remove_asterisks(summary1)
            print(summary)
            summary_dict= text_to_dict(summary)
            print(summary_dict)
            

            # Generate vector embedding
            embedding = compute_embeddings([summary1])[0]           
            # Store data in Weaviate
            #store_in_weaviate(summary, embedding,collection_name)
        sum_dict.append(summary_dict)
        emb.append(embedding)
        summary_text.append(summary)

    return(sum_dict, emb, summary_text)

# Run the process
sum_dict=[]
embeddings =[]
summary_text =[]
sum_dict, embeddings, summary_text = process(sum_dict, embeddings, summary_text)




Processing: page_1.pdf
Extracted text from page 1 of page_1.pdf...
Here is a summarized version of Kris D. Prasad's resume:

Name: Kris D. Prasad, PE
Role: Program Manager/Senior Structural Engineer
Company: SAT-Kenall JV, Houston, TX
Education:
	 MS, Civil Engineering (1992)

Licenses:
	 PE (Civil): TX #91952, LA #34186, NM #20737, PR# 28316

Qualifications:
	 Over 29 years of professional engineering and management experience
	 Member of organizations such as SAME, ASCE, ACEC, College of Engineers and Surveyors of Puerto Rico
	 Expertise in contract management, client interaction, QA reviews, structural design, and project scheduling

Experience:

1. Program Manager/Senior Structural Engineer, USACE Fort Worth District (various projects)
	 Managed contracts, coordinated with USACE and subcontractors, ensured timely submission of deliverables
	 Performed QA reviews, monitored project schedules, and reviewed structural designs
2. Task Order Manager, NAVFAC Southeast (Meridian Base)
	 R

In [302]:
for i in range(len(sum_dict)):
    sum_dict[i]["vector"] = embeddings[i].tolist()
    sum_dict[i]["summary"] = summary_text[i]

In [303]:
client = weaviate.connect_to_local()
# Delete a collection
collection_name = "resume"
client.collections.delete(name=collection_name)
print(f"Collection '{collection_name}' deleted successfully.")


collection = client.collections.create(
    name="resume",  # Name of the data collection
    properties=[
        Property(name="Name", data_type=DataType.TEXT),
        Property(name="role", data_type=DataType.TEXT),
        Property(name="licenses", data_type=DataType.TEXT),
        Property(name="company", data_type=DataType.TEXT),
        Property(name="qualification", data_type=DataType.TEXT),
        Property(name="experience", data_type=DataType.TEXT),
        Property(name="projects", data_type=DataType.TEXT),
        Property(name="roles_responsibilities", data_type=DataType.TEXT),
        Property(name="summary", data_type = DataType.TEXT)
    ],
)
print("Collection 'resume' created successfully.")
client.close()

Collection 'resume' deleted successfully.
Collection 'resume' created successfully.


In [304]:
sum_dict

[{"Here is a summarized version of Kris D. Prasad's resume": ' ',
  'Name': 'Kris D. Prasad, PE',
  'Role': 'Program Manager/Senior Structural Engineer',
  'Company': 'SAT-Kenall JV, Houston, TX',
  'Education': ' MS, Civil Engineering (1992) ',
  'Licenses': '',
  'PE (Civil)': 'TX #91952, LA #34186, NM #20737, PR# 28316 ',
  'Qualifications': ' Over 29 years of professional engineering and management experience Member of organizations such as SAME, ASCE, ACEC, College of Engineers and Surveyors of Puerto Rico Expertise in contract management, client interaction, QA reviews, structural design, and project scheduling ',
  'Experience': '  1. Program Manager/Senior Structural Engineer, USACE Fort Worth District (various projects) Managed contracts, coordinated with USACE and subcontractors, ensured timely submission of deliverables Performed QA reviews, monitored project schedules, and reviewed structural designs 2. Task Order Manager, NAVFAC Southeast (Meridian Base) Received excellent

In [305]:
client = weaviate.connect_to_local()

resume = client.collections.get("resume")

with resume.batch.dynamic() as batch:
    for d in sum_dict:
        print(d["Name"])
        batch.add_object({
            "Name": d["Name"],
            "role": d["Roles and Responsibilities"],
            "licenses":d["Licenses"],
            "company":d["company"],
            "qualification":d["Qualifications"],
            "experience":d["Experience"],
            "projects": d.get("Projects","not found"),
            "summary" : d["summary"]

        }, vector = d["vector"] )


client.close()

Kris D. Prasad, PE


KeyError: 'Roles and Responsibilities'

In [286]:
client = weaviate.connect_to_local()
resume = client.collections.get("resume")
query_vector = compute_embeddings("srujan")

response = resume.query.near_vector(
    near_vector=query_vector,
    limit=2,
    return_metadata=wvc.query.MetadataQuery(certainty=True)
)

print(response)

QueryReturn(objects=[Object(uuid=_WeaviateUUIDInt('957d13f5-5a5e-4f53-b122-a4b1e8b56f11'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=None, certainty=0.620631217956543, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'projects': '  1. USACE Fort Worth District - Renovation of Buildings 16, 44, 615, and B2002 (2022) Program Manager/Senior Structural Engineer Coordinated with USACE and subcontractors, ensured timely submission of deliverables 2. USACE Fort Worth District - Repair Fire Protection, Install Security Fences and Repairs of Buildings at Red River Army Depot (2021-2022) Responsible for contract management, client interaction, and QC 3. USACE Fort Worth District - Design of Tactical Equipment Maintenance Facility Renovations at Fort Hood (2019-2022) Program Manager/Senior Structural Engineer Monitored project schedule and performed QA reviews ', 'company': 'SAT-Kenall JV, Houston, TX ', 'qualification': '  Over

In [287]:


client = weaviate.connect_to_local()

questions = client.collections.get("resume")

response = questions.query.near_text(
    query="civil",
    limit=2
)

for obj in response.objects:
    print(json.dumps(obj.properties, indent=2))

client.close()  # Free up resources

{
  "projects": "",
  "company": "SAI-Kenall JV, Orlando, FL ",
  "qualification": "  35 years of experience conducting engineering design, analysis, and developing technical specifications for civil infrastructure design projects. Expertise in complex H&H modeling and flood control design. ",
  "roles_responsibilities": null,
  "summary": "Here is a summary of Robert Gaylord's resume:\n\nName: Robert Gaylord\nEducation Qualification: BS in Civil Engineering (1987)\nRole: Sr. Civil Engineer\nLicenses:\n PE (Civil): FL #51373 (1997)\n\nWorking for: SAI-Kenall JV, Orlando, FL\n\nQualifications:\n\n 35 years of experience conducting engineering design, analysis, and developing technical specifications for civil infrastructure design projects.\n Expertise in complex H&H modeling and flood control design.\n\nExperience:\nRobert has extensive experience working on various USACE Districts, US Air Force, and other federal agencies projects, including:\n Paint and Sandblast Facility, USCG Miami

In [288]:
sum_dict

[{'Name': 'Kris D. Prasad',
  'Education Qualification': 'MS, Civil Engineering (1992)',
  'Role': 'Program Manager/Senior Structural Engineer',
  'Licenses': '',
  'PE (Civil)': 'TX #91952 (2003), LA #34186, NM #20737, PR# 28316 ',
  'Working for': 'SAT-Kenall JV, Houston, TX ',
  'Qualifications': '  Over 29 years of professional engineering and management experience Skilled in contract management, client interaction, and quality control Member of various organizations including SAME, ASCE, ACEC, and College of Engineers and Surveyors of Puerto Rico ',
  'Experience': '  Program Manager/Senior Structural Engineer for USACE Fort Worth District on various civil works and military projects Task order manager for two NAVFAC Southeast projects in Meridian Base Experienced with over 1000 projects worth over $100M ',
  'Projects': '  1. USACE Fort Worth District - Renovation of Buildings 16, 44, 615, and B2002 (2022) Program Manager/Senior Structural Engineer Coordinated with USACE and subc

In [None]:
# Delete a collection
collection_name = "Resume"
client.collections.delete(name=collection_name)
print(f"Collection '{collection_name}' deleted successfully.")


In [55]:
from weaviate.classes.config import Property, DataType

# Create a new data collection
collection = client.collections.create(
    name = "resume", # Name of the data collection
    properties=[
        Property(name="text", data_type=DataType.TEXT), # Name and data type of the property
    ],
)
with collection.batch.dynamic() as batch:
    for i, d in enumerate(summary_text):  # Use i as the index and d as the data
        response = compute_embeddings([d])  # Assuming it returns a list of embeddings
        embedding = response[0]  # Get the first embedding
        batch.add_object(
            properties={"text": d},
            vector=embedding,  # Use the embedding vector directly
        )


In [66]:
prompt_template = f"Using this data: {data}. Respond to this prompt: {prompt}"

In [None]:
# Generate a response combining the prompt and data we retrieved in step 2
output = ollama.generate(
  model = "llama3",
  prompt = prompt_template,
)

print(output['response'])