In [None]:
import os
import pytesseract
from pdf2image import convert_from_path
import requests
import json
import numpy as np
from sentence_transformers import SentenceTransformer


# Folder that is scanned for resume PDFs
pdf_directory = "C:/Users/AbhinavKasubojula/OneDrive - Kenall Inc/Desktop/code/docs/"

# Ollama generate endpoint and the headers sent with every request to it
url = "http://localhost:11434/api/generate"
headers = {"Content-Type": "application/json"}

# Sentence-embedding model instance
model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")


def compute_embeddings(documents):
    """Encode ``documents`` with the notebook's shared SentenceTransformer.

    Reuses the module-level ``model`` rather than constructing a new
    ``SentenceTransformer`` on every call; both refer to the same
    'multi-qa-MiniLM-L6-cos-v1' checkpoint, so the embeddings are unchanged
    while the repeated model load is avoided.

    Args:
        documents: Text (or list of texts) passed straight to ``model.encode``.

    Returns:
        The result of ``model.encode(documents, convert_to_tensor=False)``.
    """
    return model.encode(documents, convert_to_tensor=False)



def extract_text_from_pdf(pdf_path):
    """Run OCR over every page of the PDF at ``pdf_path``.

    The PDF is rendered to images with ``convert_from_path`` and each image is
    passed to ``pytesseract.image_to_string``; the per-page results are
    concatenated in page order.

    Returns:
        The concatenated text, or None if any step raised (the error is
        printed rather than propagated).
    """
    try:
        pages = convert_from_path(pdf_path)

        # Collect the OCR output page by page, reporting progress as we go
        page_texts = []
        for page_no, page_image in enumerate(pages, start=1):
            page_texts.append(pytesseract.image_to_string(page_image))
            print(f"Extracted text from page {page_no} of {os.path.basename(pdf_path)}...")

        return "".join(page_texts)
    except Exception as err:
        print(f"Error extracting text from {pdf_path}: {err}")
        return None

# Ask the Ollama API for a structured summary of a resume
def summarize_text_with_ollama(text):
    """Send ``text`` to the Ollama endpoint and return the generated summary.

    A non-streaming request is posted to the module-level ``url`` using
    ``headers``.

    Returns:
        The ``"response"`` field of the JSON reply, or ``"No response found"``
        when the reply has no such field.
    """
    prompt = f"""
    Summarize the following resume:
    - Name
    - Role
    - Licenses
    - working for (company name)
    - Qualifications
    - Experience
    - Projects
    - Roles and Responsibilities

    Resume Text:
    {text}
    """
    payload = json.dumps(
        dict(model="llama3.2-vision:latest", prompt=prompt, stream=False)
    )

    reply = requests.post(url, headers=headers, data=payload)
    return reply.json().get("response", "No response found")



# Function to extract key points (simple placeholder function for demo)
def extract_key_point(summary, key):
    """Return the line of ``summary`` beginning at the first occurrence of ``key``.

    The result spans from ``key`` up to (not including) the next newline, with
    surrounding whitespace stripped. The match is a plain, case-sensitive
    substring search, so refine it if the summary format becomes stricter.

    Args:
        summary: Text to search.
        key: Substring marking the start of the wanted line.

    Returns:
        The stripped text from ``key`` to the end of its line, or "" when
        ``key`` does not occur in ``summary``.
    """
    start = summary.find(key)
    if start == -1:
        return ""

    end = summary.find("\n", start)
    if end == -1:
        # The key sits on the last line and no newline follows it. Slicing
        # with -1 would silently drop the final character, so take the rest.
        end = len(summary)
    return summary[start:end].strip()

# Process each PDF file: extract text via OCR, summarize it, and collect the results
def process(sum, file_names):
    """Summarize every PDF in ``pdf_directory`` and collect the results.

    For each ``*.pdf`` file the text is extracted with
    ``extract_text_from_pdf`` and summarized with
    ``summarize_text_with_ollama``; the "Name" line of the summary is used as
    the entry's label. Files that yield no text are skipped, so the two lists
    always stay the same length and never receive a stale summary from a
    previous file.

    Args:
        sum: List that receives one summary per successfully processed PDF.
            (The name shadows the built-in ``sum`` inside this function; it is
            kept for compatibility with existing callers.)
        file_names: List that receives the matching "Name" line per summary.

    Returns:
        The tuple ``(sum, file_names)`` — the same list objects that were
        passed in, extended in place.
    """
    pdf_files = [f for f in os.listdir(pdf_directory) if f.endswith(".pdf")]

    for pdf_file in pdf_files:
        pdf_path = os.path.join(pdf_directory, pdf_file)
        print(f"Processing: {pdf_file}")

        # Extract text from the PDF using OCR
        text = extract_text_from_pdf(pdf_path)
        if not text:
            # Extraction failed (None) or produced no text (""). Nothing to
            # summarize, and appending here would reuse unbound or stale values.
            print(f"Skipping {pdf_file}: no text extracted.")
            continue

        # Summarize the extracted text
        summary = summarize_text_with_ollama(text)
        filename = extract_key_point(summary, "Name")
        print(f"Summary of {filename}: {summary[:100]}...")  # Limit printed summary length

        sum.append(summary)
        file_names.append(filename)
    return (sum, file_names)


# Start from empty accumulators and let process() fill them.
# NOTE: the module-level name `sum` shadows the built-in sum() in this namespace.
sum, file_names = [], []
sum, file_names = process(sum, file_names)



In [35]:
import os
import pytesseract
from pdf2image import convert_from_path
import requests
import json
from sentence_transformers import SentenceTransformer
from weaviate import Client


In [36]:

# Sentence-embedding model used by compute_embeddings
model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")

# Folder that holds the resume PDFs
pdf_directory = "C:/Users/AbhinavKasubojula/OneDrive - Kenall Inc/Desktop/code/docs/"

# Ollama generate endpoint and the headers sent with each request
url = "http://localhost:11434/api/generate"
headers = {"Content-Type": "application/json"}

# Weaviate client for the local instance
client = Client("http://localhost:8080")  # Adjust URL if needed


# Resume class definition. Vectors are supplied by this notebook, so the
# class is declared without a server-side vectorizer.
resume_schema = {
    "class": "Resume",
    "vectorizer": "none",
    "properties": [
        {"name": prop_name, "dataType": [prop_type]}
        for prop_name, prop_type in (
            ("name", "text"),
            ("role", "text"),
            ("licenses", "text[]"),
            ("working_for", "text"),
            ("qualifications", "text"),
            ("experience", "text"),
            ("projects", "text[]"),
            ("roles_responsibilities", "text[]"),
        )
    ],
}


# Register the class; any failure (including "already exists") is printed, not raised
try:
    client.schema.create_class(resume_schema)
    print("Weaviate schema created successfully!")
except Exception as err:
    print(f"Schema already exists or error: {err}")

Python client v3 `weaviate.Client(...)` connections and methods are deprecated and will
            be removed by 2024-11-30.

            Upgrade your code to use Python client v4 `weaviate.WeaviateClient` connections and methods.
                - For Python Client v4 usage, see: https://weaviate.io/developers/weaviate/client-libraries/python
                - For code migration, see: https://weaviate.io/developers/weaviate/client-libraries/python/v3_v4_migration

            If you have to use v3 code, install the v3 client and pin the v3 dependency in your requirements file: `weaviate-client>=3.26.7;<4.0.0`
  client = Client("http://localhost:8080")  # Adjust URL if needed


Schema already exists or error: Create class! Unexpected status code: 422, with response body: {'error': [{'message': 'class name Resume already exists'}]}.


In [37]:


# Turn documents into embedding vectors
def compute_embeddings(documents):
    """Encode ``documents`` with the module-level ``model``.

    Returns whatever ``model.encode`` produces when called with
    ``convert_to_tensor=False``.
    """
    return model.encode(documents, convert_to_tensor=False)


In [38]:

# OCR a PDF page by page and return the combined text
def extract_text_from_pdf(pdf_path):
    """Return the OCR text of the PDF at ``pdf_path``, or None on failure.

    Each page is rendered to an image and read with Tesseract; page texts are
    joined in order with no separator. Any exception is printed and turned
    into a None return value.
    """
    try:
        rendered_pages = convert_from_path(pdf_path)  # one image per PDF page
        ocr_chunks = []
        for number, rendered in enumerate(rendered_pages, start=1):
            ocr_chunks.append(pytesseract.image_to_string(rendered))
            print(f"Extracted text from page {number} of {os.path.basename(pdf_path)}...")
        return "".join(ocr_chunks)
    except Exception as exc:
        print(f"Error extracting text from {pdf_path}: {exc}")
        return None


In [45]:

# Request a resume summary from the Ollama API
def summarize_text_with_ollama(text):
    """Post ``text`` inside the summary prompt to Ollama and return its answer.

    Uses the module-level ``url`` and ``headers`` for a non-streaming request.

    Returns:
        The ``"response"`` value of the JSON reply, falling back to
        ``"No response found"`` if the key is absent.
    """
    prompt = f"""
    Summarize the following resume:
    - Name
    - education qualification(you are able to find at name ending (like PE, BA, etc..,))
    Variable parameters: 
    - Role
    - Licenses
    - working for (company name)
    - Qualifications
    - Experience
    - Projects he/she worked before
    - Roles and Responsibilities

    Resume Text:
    {text}
    """
    request_body = {
        "model": "llama3.2-vision:latest",
        "prompt": prompt,
        "stream": False,
    }
    reply = requests.post(url, headers=headers, data=json.dumps(request_body))
    parsed = reply.json()
    return parsed.get("response", "No response found")


In [40]:

# Extract key points from the summary
def extract_key_point(summary, key):
    """Return the line of ``summary`` beginning at the first occurrence of ``key``.

    The result runs from ``key`` to the end of that line (newline excluded)
    and is stripped of surrounding whitespace. Matching is a case-sensitive
    substring search.

    Args:
        summary: Text to search.
        key: Substring marking the start of the wanted line.

    Returns:
        The stripped line fragment, or "" if ``key`` is not found.
    """
    start = summary.find(key)
    if start == -1:
        return ""

    end = summary.find("\n", start)
    if end == -1:
        # No newline after the key (it is on the final line). Using -1 as the
        # slice end would cut off the last character, so read to the end.
        end = len(summary)
    return summary[start:end].strip()


In [42]:

# Store data and embeddings in Weaviate
def store_in_weaviate(summary, embedding, file_name):
    """Insert one summarized resume and its vector into the ``Resume`` class.

    ``summary`` is the plain-text summary produced by
    ``summarize_text_with_ollama``; individual fields are pulled out of it with
    ``extract_key_point`` using the bullet labels from the summary prompt.
    Only properties declared in ``resume_schema`` are written, and labels that
    are not found in the summary are simply omitted.

    Duplicate detection uses a deterministic UUID derived from the resume's
    name line, so re-running the pipeline does not insert the same resume
    twice as long as the model reports the name identically. This is
    best-effort because the summary text is model-generated.

    Args:
        summary: Summary text for the resume.
        embedding: Vector to store alongside the object.
        file_name: The "Name" line extracted from the summary; used as the
            stored ``name`` and as the de-duplication key.

    Returns:
        None. Prints whether the resume was stored or skipped.
    """
    import uuid  # stdlib; imported locally so this cell is self-contained

    # (schema property, label searched for in the summary, stored as a list?)
    # List-typed entries correspond to the "text[]" properties of the schema.
    field_specs = (
        ("role", "Role", False),
        ("licenses", "Licenses", True),
        ("working_for", "working for", False),
        ("qualifications", "Qualifications", False),
        ("experience", "Experience", False),
        ("projects", "Projects", True),
        ("roles_responsibilities", "Roles and Responsibilities", True),
    )

    name = file_name or extract_key_point(summary, "Name")

    # Fall back to the whole summary when no name line was found, so unrelated
    # nameless resumes do not all collapse onto one id.
    identity = name or summary
    object_id = str(uuid.uuid5(uuid.NAMESPACE_URL, identity))

    # Check if the resume already exists
    if client.data_object.exists(object_id, class_name="Resume"):
        print("Resume already exists. Skipping...")
        return

    properties = {"name": name}
    for prop_name, label, as_list in field_specs:
        value = extract_key_point(summary, label)
        if value:
            properties[prop_name] = [value] if as_list else value

    # Store metadata and embedding
    client.data_object.create(
        properties,
        class_name="Resume",
        uuid=object_id,
        vector=embedding,
    )
    print(f"Stored {file_name} in Weaviate!")



In [43]:

# Walk the PDF folder: OCR, summarize, embed, and store each resume
def process():
    """Run the full pipeline for every ``.pdf`` file in ``pdf_directory``.

    Each file is OCR'd, summarized, embedded, and handed to
    ``store_in_weaviate``. Files for which no text could be extracted are
    skipped.
    """
    for pdf_file in os.listdir(pdf_directory):
        if not pdf_file.endswith(".pdf"):
            continue

        pdf_path = os.path.join(pdf_directory, pdf_file)
        print(f"Processing: {pdf_file}")

        # OCR step; a falsy result means there is nothing to summarize
        text = extract_text_from_pdf(pdf_path)
        if not text:
            continue

        # Summary text and the "Name" line used to label the resume
        summary = summarize_text_with_ollama(text)
        file_name = extract_key_point(summary, "Name")

        # A single-document batch goes in, so take the first embedding back out
        embedding = compute_embeddings([summary])[0]

        store_in_weaviate(summary, embedding, file_name)

# Run the pipeline over every PDF found in pdf_directory
process()


Processing: page_1.pdf
Extracted text from page 1 of page_1.pdf...
Stored Name:** Kris D. Prasad, PE in Weaviate!
Processing: page_2.pdf
Extracted text from page 1 of page_2.pdf...
Stored Name:** Srujan Chikyala, PE (Project Manager) in Weaviate!
Processing: page_3.pdf
Extracted text from page 1 of page_3.pdf...
Stored Name:** Brett Witte, PE in Weaviate!
Processing: page_4.pdf
Extracted text from page 1 of page_4.pdf...
Stored Name:** Robert Morris, AIA, RID DQC Manager in Weaviate!
Processing: page_5.pdf
Extracted text from page 1 of page_5.pdf...
Stored Name:** Robert Gaylord in Weaviate!


In [44]:

# Vector search over stored resumes for a free-text query
def search_resumes(query, limit=2):
    """Return up to ``limit`` ``Resume`` objects nearest to ``query``.

    The query text is embedded with ``compute_embeddings`` and used for a
    near-vector search. Each hit carries the ``name``, ``role``,
    ``experience`` and ``projects`` properties.

    Returns:
        The list found under ``data -> Get -> Resume`` in the response, or an
        empty list when any of those keys is missing.
    """
    # Embed the query as a one-item batch and take its vector
    query_vector = compute_embeddings([query])[0]

    # Build and execute the near-vector query
    response = (
        client.query.get("Resume", ["name", "role", "experience", "projects"])
        .with_near_vector({"vector": query_vector})
        .with_limit(limit)
        .do()
    )

    # Drill into the response, tolerating missing levels
    data_section = response.get("data", {})
    get_section = data_section.get("Get", {})
    return get_section.get("Resume", [])


In [None]:

# Example: User query to search for resumes with experience in civil engineering
user_query = "Resumes with experience in civil engineering"
resumes = search_resumes(user_query)


def _as_text(value, fallback):
    """Render a Weaviate property value as display text.

    Properties come back as None when unset and as lists for ``text[]``
    fields, so calling ``.strip()`` on them directly raises AttributeError.

    Args:
        value: The raw property value (str, list/tuple, or None).
        fallback: Text to show when the value is missing or an empty list.

    Returns:
        A stripped string; list items are joined with ", ".
    """
    if value is None:
        return fallback
    if isinstance(value, (list, tuple)):
        joined = ", ".join(str(item).strip() for item in value if item)
        return joined or fallback
    return str(value).strip()


# Display results
if resumes:
    for resume in resumes:
        name = _as_text(resume.get("name"), "Name not available")
        role = _as_text(resume.get("role"), "Role not available")
        experience = _as_text(resume.get("experience"), "Experience not available")
        projects = _as_text(resume.get("projects"), "projects not available")
        # Projects were fetched by the query but never shown; include them
        print(f"Name: {name}\nRole: {role}\nExperience: {experience}\nProjects: {projects}\n")
else:
    print("No matching resumes found.")
