In [2]:
import os
import re
import pytesseract
from pdf2image import convert_from_path
import ollama
import requests


In [4]:
# Directory scanned for PDF files. Set the PDF_DIRECTORY environment variable
# to point the notebook at a different folder; when it is unset, the original
# hard-coded location is used, so existing runs behave exactly as before.
pdf_directory = os.environ.get(
    "PDF_DIRECTORY",
    "C:/Users/AbhinavKasubojula/OneDrive - Kenall Inc/Desktop/code/kenall_docs/",
)

# Extract text from a PDF using OCR
def extract_text_from_pdf(pdf_path):
    """Run OCR over every page of a PDF and return the combined text.

    The PDF is rendered to images with ``convert_from_path`` and each image is
    passed to ``pytesseract.image_to_string``. A progress line is printed after
    each page.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The per-page OCR results concatenated in page order with no separator,
        or ``None`` if any step raised an exception (the error is printed).
    """
    try:
        page_texts = []
        for page_num, page_image in enumerate(convert_from_path(pdf_path)):
            page_texts.append(pytesseract.image_to_string(page_image))
            print(f"Extracted text from page {page_num + 1} of {os.path.basename(pdf_path)}...")
        return "".join(page_texts)
    except Exception as e:
        # Best-effort: report the failure and signal it to the caller with None.
        print(f"Error extracting text from {pdf_path}: {e}")
        return None
def remove_asterisks(text):
    """Return ``text`` with every single or double asterisk removed.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` in which each match of ``\\*{1,2}`` has been
        replaced by the empty string.
    """
    asterisk_pattern = re.compile(r"\*{1,2}")
    return asterisk_pattern.sub("", text)
def _clean_text(value):
    """Strip asterisks and collapse whitespace in a string; pass other types through."""
    if not isinstance(value, str):
        return value
    without_asterisks = re.sub(r"\*+", "", value)
    return re.sub(r"\s+", " ", without_asterisks).strip()


def normalize_data(data):
    """
    Normalize dictionary keys and values by removing unwanted characters and whitespace.

    Args:
        data: An iterable of records. Each record is expected to be a dict.

    Returns:
        A list with one cleaned dict per dict record in ``data``. String keys
        and values have asterisks removed, runs of whitespace collapsed to a
        single space, and leading/trailing whitespace stripped; non-string
        keys and values are kept unchanged. Non-dict records are skipped with
        a printed warning.
    """
    # A bare string is iterable, but iterating it would emit one warning per
    # character; reject it once, up front.
    if isinstance(data, (str, bytes)):
        print(f"Warning: Expected an iterable of dictionaries but got {type(data)}")
        return []

    normalized_data = []
    for record in data:
        if isinstance(record, dict):
            normalized_record = {_clean_text(key): _clean_text(value) for key, value in record.items()}
            normalized_data.append(normalized_record)
        else:
            print(f"Warning: Expected a dictionary but got {type(record)}")
    return normalized_data

def process(directory=None):
    """OCR every PDF in a directory and return the cleaned text of the last one.

    Args:
        directory: Folder to scan. Defaults to the notebook-level
            ``pdf_directory`` when omitted, so ``process()`` keeps working.

    Returns:
        The asterisk-free OCR text of the last PDF (in sorted filename order)
        that was extracted successfully, or ``None`` when the directory holds
        no PDFs or every extraction failed.

    Raises:
        OSError: If ``directory`` cannot be listed.
    """
    if directory is None:
        directory = pdf_directory

    # Match the extension case-insensitively (".PDF" is common on Windows) and
    # sort, because os.listdir returns entries in arbitrary order.
    pdf_files = sorted(f for f in os.listdir(directory) if f.lower().endswith(".pdf"))

    # Start from None so an empty directory returns cleanly instead of raising
    # UnboundLocalError at the return statement.
    text = None
    for pdf_file in pdf_files:
        pdf_path = os.path.join(directory, pdf_file)
        print(f"Processing: {pdf_file}")

        # Extract text from the PDF; None signals that OCR failed for this file.
        extracted = extract_text_from_pdf(pdf_path)
        if extracted is None:
            continue

        # normalize_data expects an iterable of dict records, not a string, so
        # it is intentionally not applied to the raw OCR text here.
        text = remove_asterisks(extracted)

    return text

In [7]:
# OCR each PDF found in `pdf_directory` and echo the extracted text.
pdf_files = [name for name in os.listdir(pdf_directory) if name.endswith(".pdf")]
for pdf_file in pdf_files:
    pdf_path = os.path.join(pdf_directory, pdf_file)
    print(f"Processing: {pdf_path}")
    # `text` is rebound on every pass, so after the loop it holds the result
    # for the last file only (None if that extraction failed).
    text = extract_text_from_pdf(pdf_path)
    print(text)

Processing: C:/Users/AbhinavKasubojula/OneDrive - Kenall Inc/Desktop/code/kenall_docs/Solicitation Vicksburg.pdf
Extracted text from page 1 of Solicitation Vicksburg.pdf...
Extracted text from page 2 of Solicitation Vicksburg.pdf...
Extracted text from page 3 of Solicitation Vicksburg.pdf...
Extracted text from page 4 of Solicitation Vicksburg.pdf...
Extracted text from page 5 of Solicitation Vicksburg.pdf...
Extracted text from page 6 of Solicitation Vicksburg.pdf...
Extracted text from page 7 of Solicitation Vicksburg.pdf...
Extracted text from page 8 of Solicitation Vicksburg.pdf...
Extracted text from page 9 of Solicitation Vicksburg.pdf...
Extracted text from page 10 of Solicitation Vicksburg.pdf...
= An official website of the United States government Here’s how you know

f) Important Reps and Certs Update Show Details () See All Alerts

Mar 1, 2024

Planned Maintenance Schedule Show Details (x)
May 21, 2024

+L
+f |

_am_ SAM.GOV’

Home Search Data Bank Data Services Help

TEnit

In [None]:


# `text` is the OCR result produced earlier in the notebook. The extraction
# helper returns None on failure, and an f-string would silently interpolate
# that as the literal "None", so stop here with a clear message instead.
if not text:
    raise ValueError("No OCR text available to clean; run the PDF extraction cell first.")

# Define the prompt template
prompt_template = f"""
Read this text and remove any unwanted elements such as symbols, numbers, brackets, parentheses, and special characters. 
Only keep the clean, readable text.

Here is the text:
{text}
"""

# Generate the cleaned output
output = ollama.generate(
    model="llama3.2",
    prompt=prompt_template,
)

# Extract the response
cleaned_text = output['response']

# Print the cleaned text (header followed by the model's answer)
print("Cleaned Text:")
print(cleaned_text)

Cleaned Text:
The text appears to be a presolicitation notice for a federal government contract, specifically the USACE (United States Army Corps of Engineers) contracting office in Vicksburg, MS. Here's a summary of the key points:

**Contract Opportunity:**

* Presolicitation Number: W912EE2400003
* Contracting Office: KO Contracting Division, 4155 Clay Street, Vicksburg, MS 39183-3435

**Contact Information:**

* Primary Point of Contact: Lawren Boolos (M lawren.boolos@usace.army.mil and 6016315211)
* Secondary Point of Contact: Robert Ellis Screws (M Ellis.Screws@usace.army.mil and 6016317527)

**Pre-Solicitation Notice:**

* The presolicitation notice is available on the USACE website and through the ProSolve system.
* A bidder inquiry key is provided to access the system.

**Inquiries:**

* Bidders are invited to submit inquiries via email or phone to the contracting office.
* Inquiries will be responded to in a timely manner.

**Note:**

* This contract opportunity is subject to

In [None]:
# Define the prompt: the role given to the model plus the four points the
# analysis should cover.
prompt = """ 
You are an expert in analyzing solicitations, bringing up new projects, and writing excellent proposals.
Here is a solicitation text you need to analyze:
1. Purpose of the solicitation.
2. Resources needed to complete the project most effectively.
3. Engineers required.
4. Important dates.
"""

# Define the solicitation data explicitly
# NOTE: this rebinds the notebook-level `text` (set by the OCR cell earlier) to
# a hard-coded sample, so any cell run after this one sees the sample, not the
# OCR output.
text = """
**Solicitation Text**
- Purpose: To develop a new software platform for managing logistics.
- Resources Needed: Cloud infrastructure, development tools, project management software.
- Engineers Required: 5 Software Engineers, 2 DevOps Engineers, 1 Project Manager.
- Important Dates: Proposal Submission - 2023-10-15, Project Start - 2023-11-01, Project End - 2024-05-31.
"""

# Create a prompt template combining the prompt and the data
# (this is an f-string, so `prompt` and `text` are substituted immediately).
prompt_template = f"""
{prompt}

Here is the solicitation data to analyze:
{text}

Please provide a structured response based on the data above.
"""

# Generate a response combining the prompt and data
output = ollama.generate(
    model="llama3.2",
    prompt=prompt_template,
)

# Extract the response text from the result returned by ollama.generate
summary = output['response']

# Print the structured output
print("Structured Output:")
print(summary)

In [None]:
import weaviate
import weaviate.classes as wvc
from weaviate.classes.config import Property, DataType
from sentence_transformers import SentenceTransformer

In [10]:
# Notebook-level SentenceTransformer instance ('multi-qa-MiniLM-L6-cos-v1'),
# created once here and read as a global by compute_embeddings.
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

def compute_embeddings(documents):
    """Encode ``documents`` with the notebook-level ``model``.

    Args:
        documents: Passed straight through to ``model.encode``.

    Returns:
        Whatever ``model.encode`` returns when called with
        ``convert_to_tensor=False``.
    """
    return model.encode(documents, convert_to_tensor=False)

In [None]:
# Query the local Weaviate "Notices" collection for the objects nearest to the
# embedding of `text`, and print their properties.
client = weaviate.connect_to_local(port=8080)
try:
    collection = client.collections.get("Notices")
    query_vector = compute_embeddings(text)
    response = collection.query.near_vector(
        near_vector=query_vector,
        limit=10,
        return_metadata=wvc.query.MetadataQuery(certainty=True)
    )

    print(len(response.objects))
    for i in response.objects:
        print(i.properties)
finally:
    # Always release the connection, even if the lookup or query above raises.
    client.close()