In [68]:
import os
import pytesseract
from pdf2image import convert_from_path
import requests
import json
from sentence_transformers import SentenceTransformer
import re
import weaviate
import weaviate.classes as wvc
from weaviate.classes.config import Property, DataType
import ollama

In [69]:

# Initialize SentenceTransformer model
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

# Set directory containing PDFs
pdf_directory = "C:/Users/AbhinavKasubojula/OneDrive - Kenall Inc/Desktop/code/docs/"

# Ollama API details (if needed for later use)
url = "http://localhost:11434/api/generate"
headers = {"Content-Type": "application/json"}


In [None]:
client = weaviate.connect_to_local()
print(client.is_ready()) 

In [None]:
# Delete a collection
collection_name = "resume"
client.collections.delete(name=collection_name)
print(f"Collection '{collection_name}' deleted successfully.")



collection = client.collections.create(
    name="resume",  # Name of the data collection
    properties=[
        Property(name="Name", data_type=DataType.TEXT),
        Property(name="role", data_type=DataType.TEXT),
        Property(name="licenses", data_type=DataType.TEXT),
        Property(name="working_for", data_type=DataType.TEXT),
        Property(name="qualification", data_type=DataType.TEXT),
        Property(name="experience", data_type=DataType.TEXT),
        Property(name="projects", data_type=DataType.TEXT),
        Property(name="roles_responsibilities", data_type=DataType.TEXT),
    ],
)
print("Collection 'resume' created successfully.")


In [73]:


# Compute embeddings for documents
def compute_embeddings(documents):
    embeddings = model.encode(documents, convert_to_tensor=False)
    return embeddings


In [74]:

# Extract text from a PDF using OCR
def extract_text_from_pdf(pdf_path):
    try:
        images = convert_from_path(pdf_path)  # Convert PDF to images
        text = ""
        for page_num, image in enumerate(images):
            text += pytesseract.image_to_string(image)  # OCR on the image
            print(f"Extracted text from page {page_num + 1} of {os.path.basename(pdf_path)}...")
        return text
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return None


In [75]:
import re

def text_to_dict(text):
    result = {}
    current_key = None
    
    # Split text into lines
    lines = text.strip().split("\n")
    
    # Check if the first line looks like an unwanted heading
    if re.match(r"^(Here is a summary|Summary of)", lines[0], re.IGNORECASE):
        lines = lines[1:]  # Skip the first line if it's an unwanted heading

    for line in lines:
        if ":" in line:  # Line contains a key-value pair
            key, value = line.split(":", 1)
            current_key = key.strip()
            result[current_key] = value.strip()
        else:
            if current_key:  # Append to the previous key's value
                result[current_key] += " " + line.strip()
    
    return result


In [76]:
def summarize_text_with_ollama(text):
    prompt = f"""
    Summarize the following resume
    - Name
    - education qualification(you are able to find at name ending (like PE, BA, etc..,))
    - Role
    - Licenses
    - working for (company name)
    - Qualifications
    - Experience
    - Projects he/she worked before
    - Roles and Responsibilities

    Resume Text:
    {text}
    """
    data = {"model": "llama3.2-vision:latest", "prompt": prompt, "stream": False}
    response = requests.post(url, headers=headers, data=json.dumps(data))
    
    response_json = response.json()

    text = response_json.get("response", "No response found")
    unwanted_prefix = "Here is a summary of the resume:"
    if text.startswith(unwanted_prefix):
        text = text[len(unwanted_prefix):].strip()
    return text


In [77]:
def remove_asterisks(text):
    # Remove all occurrences of single and double asterisks
    clean_text = re.sub(r"\*{1,2}", "", text)
    return clean_text

In [14]:
#xxxxxxxxxx
# Extract key points from the summary
def extract_key_point(summary, key):
    start = summary.find(key)
    end = summary.find("\n", start)
    return summary[start:end].strip() if start != -1 else ""


In [101]:
#XXXXXXXXXX
def store_in_weaviate(summary, embedding, collecton_name):
with collection.batch.dynamic() as batch:
  for i, d in enumerate(documents):
    # Generate embeddings
    response = ollama.embeddings(model = "llama3",
                                 prompt = d)

    # Add data object with text and embedding
    batch.add_object(
        properties = {"text" : d},
        vector = response["embedding"],
    )



In [None]:

# Process each PDF, extract text, summarize, and store in Weaviate
def process(sum_dict, emb, summary_text):
    pdf_files = [f for f in os.listdir(pdf_directory) if f.endswith(".pdf")]
    for pdf_file in pdf_files:
        pdf_path = os.path.join(pdf_directory, pdf_file)
        print(f"Processing: {pdf_file}")
        
        # Extract text from the PDF
        text = extract_text_from_pdf(pdf_path)
        if text:
            # Summarize the extracted text
            summary1 = summarize_text_with_ollama(text)
            #print(f"Summary of {file_name}: {summary[:100]}...")  # Print first 100 characters of summary
            summary = remove_asterisks(summary1)
            print(summary)
            summary_dict= text_to_dict(summary)
            print(summary_dict)
            

            # Generate vector embedding
            embedding = compute_embeddings([summary1])[0]           
            # Store data in Weaviate
            #store_in_weaviate(summary, embedding,collection_name)
        sum_dict.append(summary_dict)
        emb.append(embedding)
        summary_text.append(summary)

    return(sum_dict, emb, summary_text)

# Run the process
sum_dict=[]
embeddings =[]
summary_text =[]
sum_dict, embeddings, summary_text = process(sum_dict, embeddings, summary_text)


In [115]:
sum_dict

[{'Name': 'Kris D. Prasad, PE',
  'Education Qualification': 'MS, Civil Engineering (1992) ',
  'Role': 'Program Manager/Senior Structural Engineer ',
  'Licenses': '',
  'PE (Civil)': 'TX #91952 (2003), LA #34186, NM #20737, PR# 28316 ',
  'Working for': 'SAT-Kenall JV, Houston, TX ',
  'Qualifications': '  Over 29 years of professional engineering and management experience Current member of various organizations such as SAME, ASCE, ACEC, and College of Engineers and Surveyors of Puerto Rico Experience with federal, state, municipal, and local programs ',
  'Experience': ' Program Manager/Senior Structural Engineer for USACE Fort Worth District on various civil works and military projects Task Order Manager for NAVFAC Southeast projects in Meridian Base Managed over 1000 projects worth over $100M ',
  'Projects': '  1. USACE Fort Worth District - Renovation of Buildings 16, 44, 615, 2002, and _ renovation services (2022) Program Manager/Senior Structural Engineer Coordinated with USAC

In [None]:
# Delete a collection
collection_name = "resume"
client.collections.delete(name=collection_name)
print(f"Collection '{collection_name}' deleted successfully.")


collection = client.collections.create(
    name="resume",  # Name of the data collection
    properties=[
        Property(name="Name", data_type=DataType.TEXT),
        Property(name="role", data_type=DataType.TEXT),
        Property(name="licenses", data_type=DataType.TEXT),
        Property(name="working_for", data_type=DataType.TEXT),
        Property(name="qualification", data_type=DataType.TEXT),
        Property(name="experience", data_type=DataType.TEXT),
        Property(name="projects", data_type=DataType.TEXT),
        Property(name="roles_responsibilities", data_type=DataType.TEXT),
    ],
)
print("Collection 'resume' created successfully.")


In [126]:
client = weaviate.connect_to_local()


data = sum_dict

resume = client.collections.get("resume")

with resume.batch.dynamic() as batch:
    for d in data:
        batch.add_object({
            "name": d["Name"],
            "education_qualification": d["Education Qualification"],
            "Role": d["Role"],
            "licenses": d.get("PE (Civil)", "Not available"),
            "Working for":d["Working for"],
            "Qualification":d["Qualifications"],
            "Experience":d["Experience"],
            "Roles and Responsibilities":d["Roles and Responsibilities"]
        })

client.close()

In [127]:
client = weaviate.connect_to_local()

resume = client.collections.get("resume")

response = resume.query.near_text(
    query="get me who work as civil engineer",
    limit=2
)

for obj in response.objects:
    print(json.dumps(obj.properties, indent=2))

client.close()

{
  "name": "Robert Gaylord",
  "qualification": null,
  "roles_responsibilities": null,
  "education_qualification": "BS in Civil Engineering, 1987",
  "role": "Sr. Civil Engineer",
  "experience": null,
  "working_for": null,
  "licenses": null,
  "projects": null
}
{
  "projects": null,
  "qualification": null,
  "roles_responsibilities": null,
  "education_qualification": "BS in Civil Engineering (2005)",
  "role": "Project Manager / Civil Engineer",
  "experience": null,
  "working_for": null,
  "licenses": null,
  "name": "Brett Witte, PE"
}




In [None]:
# Delete a collection
collection_name = "Resume"
client.collections.delete(name=collection_name)
print(f"Collection '{collection_name}' deleted successfully.")


In [55]:
from weaviate.classes.config import Property, DataType

# Create a new data collection
collection = client.collections.create(
    name = "resume", # Name of the data collection
    properties=[
        Property(name="text", data_type=DataType.TEXT), # Name and data type of the property
    ],
)
with collection.batch.dynamic() as batch:
    for i, d in enumerate(summary_text):  # Use i as the index and d as the data
        response = compute_embeddings([d])  # Assuming it returns a list of embeddings
        embedding = response[0]  # Get the first embedding
        batch.add_object(
            properties={"text": d},
            vector=embedding,  # Use the embedding vector directly
        )


In [None]:
# Example prompt
prompt = "civil engineers more than 10 years of experience"

# Generate an embedding for the prompt
response = compute_embeddings([prompt])  # response might be a list of embeddings



# If response is a list of embeddings, access the first item (embedding) from the list
embedding = response[0]  # Assuming the response is a list of embeddings

# Query the collection for the most relevant document
results = collection.query.near_vector(
    near_vector=embedding,  # Provide the embedding vector
    limit=4
)

# Extract the text of the most relevant document
data = results.objects[0].properties['text']

# Print the retrieved data
print("Most relevant document:", data)


In [66]:
prompt_template = f"Using this data: {data}. Respond to this prompt: {prompt}"

In [None]:
# Generate a response combining the prompt and data we retrieved in step 2
output = ollama.generate(
  model = "llama3",
  prompt = prompt_template,
)

print(output['response'])