In [18]:
import os
import re
import pytesseract
from pdf2image import convert_from_path
import ollama
import requests


In [4]:
# Directory that is scanned for solicitation PDFs.
# Set the KENALL_PDF_DIR environment variable to point the notebook at another
# folder without editing this cell; the original workstation path stays the
# default so existing runs behave exactly as before.
pdf_directory = os.environ.get(
    "KENALL_PDF_DIR",
    "C:/Users/AbhinavKasubojula/OneDrive - Kenall Inc/Desktop/code/kenall_docs/",
)

# OCR-based text extraction for a single PDF file
def extract_text_from_pdf(pdf_path):
    """Render the PDF at ``pdf_path`` to page images and OCR each page.

    Returns the text of all pages joined into one string (no separator is
    inserted between pages). If anything raises along the way, the error is
    printed and ``None`` is returned instead of propagating the exception.
    """
    try:
        page_texts = []
        # One image per page; pages are numbered from 1 in the progress output.
        for page_number, page_image in enumerate(convert_from_path(pdf_path), start=1):
            page_texts.append(pytesseract.image_to_string(page_image))
            print(f"Extracted text from page {page_number} of {os.path.basename(pdf_path)}...")
        return "".join(page_texts)
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return None
def remove_asterisks(text):
    """Return ``text`` with every asterisk removed.

    The pattern matches runs of one or two ``*`` characters, so longer runs
    are consumed by consecutive matches and disappear as well.
    """
    asterisk_run = re.compile(r"\*{1,2}")
    return asterisk_run.sub("", text)
def normalize_data(data):
    """
    Build a new list in which every dict from ``data`` has had ``clean_text``
    applied to each of its keys and values.

    Items that are not dictionaries are reported with a printed warning and
    left out of the result.

    NOTE(review): ``clean_text`` is not defined in the visible part of this
    notebook; confirm it exists before calling this with non-empty dicts.
    """
    cleaned_records = []
    for entry in data:
        if not isinstance(entry, dict):
            print(f"Warning: Expected a dictionary but got {type(entry)}")
            continue
        cleaned_records.append(
            {clean_text(field): clean_text(content) for field, content in entry.items()}
        )
    return cleaned_records

def process(directory=None):
    """OCR every PDF in a directory and return the cleaned text per file.

    Args:
        directory: Folder to scan. Defaults to the module-level
            ``pdf_directory`` when omitted, which is what the original
            zero-argument call did.

    Returns:
        A list with one ``{"file": <file name>, "text": <cleaned text>}`` dict
        for each PDF that was extracted successfully, in sorted file-name
        order. The list is empty when the directory holds no PDFs.

    Changes from the previous version:
      * an empty directory no longer raises ``UnboundLocalError``;
      * a failed extraction (``None``) is skipped instead of crashing
        ``remove_asterisks``;
      * results from every PDF are kept rather than only the last one;
      * the text is no longer passed to ``normalize_data``, which expects a
        list of dicts and, given a string, warned once per character and
        returned an empty list;
      * the ``.pdf`` extension check is case-insensitive.
    """
    source_dir = pdf_directory if directory is None else directory
    pdf_files = sorted(
        name for name in os.listdir(source_dir) if name.lower().endswith(".pdf")
    )

    records = []
    for pdf_file in pdf_files:
        pdf_path = os.path.join(source_dir, pdf_file)
        print(f"Processing: {pdf_file}")

        # Extract text from the PDF
        text = extract_text_from_pdf(pdf_path)
        if text is None:
            # The extractor has already printed the error; move on to the next file.
            continue
        records.append({"file": pdf_file, "text": remove_asterisks(text)})

    return records

In [None]:
# OCR each PDF in pdf_directory in turn and print its text.
# After the loop, `text` holds the result for the last file only (None if that
# extraction failed); later cells read this variable.
pdf_files = list(filter(lambda f: f.endswith(".pdf"), os.listdir(pdf_directory)))
for pdf_file in pdf_files:
    pdf_path = os.path.join(pdf_directory, pdf_file)
    print(f"Processing: {pdf_path}")
    text = extract_text_from_pdf(pdf_path)
    print(text)

In [None]:
# Example analysis prompt. It is combined with the OCR'd solicitation text and
# sent to the language model further down in this cell.
# The opening sentences tell the model its role and the four items to pull out
# of the solicitation; the outline that follows lists company-profile headings.
# NOTE(review): the string contains spelling mistakes ("analizing",
# "solicitaions", "exalent", ...). It is runtime text seen by the model, so it
# is intentionally left untouched here.


prompt = """ 
you are expert in sorting things out with analizing solicitaions bringing up new projects, with exalent proposal writing.
here is a solicitation text you need to analyse the 1.purpose of the solicitation. 2.resources need to complete that project most effectivly. 3.engineers required. 4.importtant dates.
 **Company Information**

* Company Overview
	+ Company Name
	+ Contact Information (Address, Phone, Email)
	+ Brief Description of Services
* Professional Experience and Qualifications
	+ Certifications/Licenses
	+ Education Background
	+ Work Experience
* Past Performance
	+ Relevant Projects
	+ Contracts
	+ References
* Expertise and Capabilities
	+ Service Offerings
	+ Technical Expertise
	+ Business Areas
* Staffing and Organizational Structure
"""

# `text` comes from the OCR cell and is None when extraction failed. Fail fast
# here: interpolating None below would send the literal string "None" to the
# model as if it were the solicitation.
if not text or not text.strip():
    raise ValueError(
        "No solicitation text available: run the OCR extraction cell and make sure it succeeded."
    )

# Generate an embedding for the prompt.
# NOTE(review): nothing in this cell reads `response`, and no document
# retrieval happens here; the call is kept so the cell's side effects and the
# `response` variable are unchanged.
response = ollama.embeddings(
  model = "llama3.2",
  prompt = prompt,
)

# Combine the extracted solicitation text with the instructions and ask the
# model for its analysis.
prompt_template = f"Using this data: {text}. Respond to this prompt: {prompt}"
output = ollama.generate(
  model = "llama3.2",
  prompt = prompt_template,
)
summary = output['response']
print(summary)

In [None]:
import weaviate
import weaviate.classes as wvc
from weaviate.classes.config import Property, DataType
from sentence_transformers import SentenceTransformer

In [10]:
# Sentence-embedding model shared by every compute_embeddings call.
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')


def compute_embeddings(documents):
    """Encode ``documents`` with the module-level ``model``.

    Returns the result of ``model.encode`` called with
    ``convert_to_tensor=False``.
    """
    return model.encode(documents, convert_to_tensor=False)

In [None]:
# Look up the ten stored "Notices" objects closest to the extracted text.
client = weaviate.connect_to_local(port=8080)
try:
    collection = client.collections.get("Notices")
    # Embed the solicitation text held in `text` to use as the query vector.
    query_vector = compute_embeddings(text)
    response = collection.query.near_vector(
        near_vector=query_vector,
        limit=10,
        return_metadata=wvc.query.MetadataQuery(certainty=True)
    )

    print(len(response.objects))
    for i in response.objects:
        print(i.properties)
finally:
    # Always release the connection, even when the embedding or the query
    # raises; previously an exception left the client open.
    client.close()