In [1]:
import os
import pytesseract
from pdf2image import convert_from_path
import requests
import json
from sentence_transformers import SentenceTransformer
import re
import weaviate
from weaviate import client
import weaviate.classes as wvc
from weaviate.classes.config import Property, DataType
import ollama
from weaviate.util import generate_uuid5
import Levenshtein

  from tqdm.autonotebook import tqdm, trange


In [15]:

# Initialize SentenceTransformer model
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

# Set directory containing PDFs
pdf_directory = "C:/Users/AbhinavKasubojula/OneDrive - Kenall Inc/Desktop/code/docs/"

# Ollama API details (if needed for later use)
url = "http://localhost:11434/api/generate"
headers = {"Content-Type": "application/json"}




In [16]:


# Compute embeddings for documents
def compute_embeddings(documents):
    embeddings = model.encode(documents, convert_to_tensor=False)
    return embeddings


In [17]:

# Extract text from a PDF using OCR
def extract_text_from_pdf(pdf_path):
    try:
        images = convert_from_path(pdf_path)  # Convert PDF to images
        text = ""
        for page_num, image in enumerate(images):
            text += pytesseract.image_to_string(image)  # OCR on the image
            print(f"Extracted text from page {page_num + 1} of {os.path.basename(pdf_path)}...")
        return text
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return None


In [18]:
import re

def text_to_dict(text):
    result = {}
    current_key = None
    
    # Split text into lines
    lines = text.strip().split("\n")
    
    # Check if the first line looks like an unwanted heading
    if re.match(r"^(Here is a summary|Summary of)", lines[0], re.IGNORECASE):
        lines = lines[1:]  # Skip the first line if it's an unwanted heading

    for line in lines:
        if ":" in line:  # Line contains a key-value pair
            key, value = line.split(":", 1)
            current_key = key.strip()
            result[current_key] = value.strip()
        else:
            if current_key:  # Append to the previous key's value
                result[current_key] += " " + line.strip()
    
    return result


In [19]:
def summarize_text_with_ollama(text):
    prompt = f"""
    you are expert in summarizing and sagrigating things. licenses need to be clearly mentioned in licenses category itself. you should get company's name very clearly and get every time in same way.
    structure to get Summarized resume and highlight main points specifically.
    - Name
    - education qualification(you are able to find at name ending (like PE, BA, etc..,))
    - Role
    - Licenses
    - company
    - Qualifications
    - Experience
    - Projects he/she worked before
    - Roles and Responsibilities

    keep headings as it is. for my future reference
    Resume Text:
    {text}
    """
    data = {"model": "llama3.2-vision:latest", "prompt": prompt, "stream": False}
    response = requests.post(url, headers=headers, data=json.dumps(data))
    
    response_json = response.json()

    text = response_json.get("response", "No response found")
    unwanted_prefix = "Here is a summary of the resume:"
    if text.startswith(unwanted_prefix):
        text = text[len(unwanted_prefix):].strip()
    return text


In [20]:

# Function to clean text
def clean_text(text):
    """Remove asterisks and trim whitespace from the given text."""
    return re.sub(r"\*{1,}", "", text).strip()

# Normalize the dictionary
def normalize_data(data):
    """
    Normalize dictionary keys and values by removing unwanted characters and whitespace.
    """
    normalized_data = []
    for record in data:
        if isinstance(record, dict):
            normalized_record = {clean_text(key): clean_text(value) for key, value in record.items()}
            normalized_data.append(normalized_record)
        else:
            print(f"Warning: Expected a dictionary but got {type(record)}")
    return normalized_data


In [21]:

# Process each PDF, extract text, summarize, and store in Weaviate
def process(sum_dict, emb, summary_text):
    pdf_files = [f for f in os.listdir(pdf_directory) if f.endswith(".pdf")]
    for pdf_file in pdf_files:
        pdf_path = os.path.join(pdf_directory, pdf_file)
        print(f"Processing: {pdf_file}")
        
        # Extract text from the PDF
        text = extract_text_from_pdf(pdf_path)
        if text:
            # Summarize the extracted text
            summary1 = clean_text(text)
            summary = summarize_text_with_ollama(summary1)
            summary = clean_text(summary)
            summary_dict= text_to_dict(summary)
            print(summary_dict)
            

            # Generate vector embedding
            embedding = compute_embeddings([summary1])[0]           
            # Store data in Weaviate
            #store_in_weaviate(summary, embedding,collection_name)
        sum_dict.append(summary_dict)
        emb.append(embedding)
        summary_text.append(summary)

    return(sum_dict, emb, summary_text)




In [25]:
import requests

url = "http://localhost:11434/api/models"
headers = {"Content-Type": "application/json"}

try:
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        models = response.json()
        print("Available models:")
        for model in models:
            print(f"- {model}")
    else:
        print(f"Failed to retrieve models. Status code: {response.status_code}")
        print("Response:", response.text)
except Exception as e:
    print(f"An error occurred: {e}")


An error occurred: HTTPConnectionPool(host='localhost', port=11434): Max retries exceeded with url: /api/models (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001FFCF1A3FB0>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))


In [25]:
# Run the process
sum_dict=[]
embeddings =[]
summary_text =[]
sum_dict, embeddings, summary_text = process(sum_dict, embeddings, summary_text)
#
for i in range(len(sum_dict)):
    sum_dict[i]["vector"] = embeddings[i].tolist()
    sum_dict[i]["summary"] = summary_text[i]

Processing: page_1 (2).pdf
Extracted text from page 1 of page_1 (2).pdf...
{}
Processing: page_1.pdf
Extracted text from page 1 of page_1.pdf...
{}
Processing: page_2.pdf


KeyboardInterrupt: 

In [13]:
import weaviate
from weaviate.classes.config import Configure
import weaviate
import weaviate.classes as wvc
from weaviate.classes.config import Property, DataType
import ollama


# Work with Weaviate


In [11]:
import weaviate

client = weaviate.connect_to_local(port=8082)

try:
    meta_info = client.get_meta()
    print(meta_info)

finally:
    client.close()

{'grpcMaxMessageSize': 10485760, 'hostname': 'http://[::]:8080', 'modules': {}, 'version': '1.28.1'}


In [14]:
client = weaviate.connect_to_local(port=8082)
companies = "DemoCollection"
if client.collections.exists(name=companies):
    client.collections.delete(name=companies)
    print("Existing", companies ,"collection deleted.")
client.collections.create(
    name="DemoCollection",
    generative_config=wvc.config.Configure.Generative.ollama(
        api_endpoint="http://host.docker.internal:11434",  # If using Docker, use this to contact your local Ollama instance
        model="llama3.2-vision:latest"  
    ),
    vectorizer_config=Configure.Vectorizer.text2vec_ollama(
    api_endpoint="http://host.docker.internal:11434",
    model="snowflake-arctic-embed"
    )


    # Additional parameters not shown
)

client.close()


            Please make sure to close the connection using `client.close()`.


UnexpectedStatusCodeError: Collection may not have been created properly.! Unexpected status code: 422, with response body: {'error': [{'message': 'vectorizer: no module with name "text2vec-ollama" present'}]}.

In [93]:
client = weaviate.connect_to_local(port=8080)
demo = client.collections.get("DemoCollection")
with demo.batch.dynamic() as batch:
    for d in sum_dict:
        employee_uuid = batch.add_object({
            
            "Name": d["Name"],
            "role": d["Role"],
            "company": d["Company"],
            "experience": d["Experience"],
            "projects": d.get("Projects", "not found"),
            "summary": d["summary"],
        })
client.close()

In [94]:
client = weaviate.connect_to_local(port=8080)
demo = client.collections.get("DemoCollection")
for i in demo.iterator():
    print(i.properties)


{'projects': '', 'company': 'SAI-Kenall JV', 'qualification': None, 'roles_responsibilities': None, 'summary': 'Company: SAI-Kenall JV\nName: Brett Witte, PE\nSummary of Qualifications:\n\n Education: BS in Civil Engineering, 2005\n Role: Project Manager / Civil Engineer with 18 years of experience\n Licenses: PE (Civil), FL #70720\n Qualifications: Skilled at H&H modeling, civil infrastructure design, construction plans production using MicroStation and AutoCAD, quantity take-offs, cost estimates, construction phase support, and as-built certifications.\n Experience:\n\t+ Project Manager / Civil Engineer for various projects including:\n\t\t- Paint and Sandblast Facility, USCG Miami Station, FL (2021)\n\t\t- Relocate AGE Refueling Station, MacDill AFB, Tampa, FL (2019)\n\t\t- CES and FES Warehouses, MacDill AFB, Tampa, FL (2020)\n\t\t- USACE Fort Worth District - A-E Services for Red River Army Depot — Install Security Fences and Repair Buildings, Texarkana, TX (2019)\n Projects:\n\t+

In [97]:
collection = client.collections.get("DemoCollection")

response = collection.generate.near_text(
    query="srujan" ,  # The model provider integration will automatically vectorize the query
    single_prompt="Translate this into French: {projects}",
    limit=2
)

for obj in response.objects:
    print(obj.properties["title"])
    print(f"Generated output: {obj.generated}")  # Note that the generated output is per object

WeaviateQueryError: Query call with protocol GRPC search failed with message <AioRpcError of RPC that terminated with:
	status = StatusCode.UNKNOWN
	details = "explorer: get class: vectorize params: could not vectorize input for collection DemoCollection with search-type nearText. Make sure a vectorizer module is configured for this collection"
	debug_error_string = "UNKNOWN:Error received from peer  {created_time:"2025-01-15T05:54:36.168911+00:00", grpc_status:2, grpc_message:"explorer: get class: vectorize params: could not vectorize input for collection DemoCollection with search-type nearText. Make sure a vectorizer module is configured for this collection"}"
>.