In [1]:
import pinecone
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
import pandas as pd
import time
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import os

# Initialize Pinecone


# Load embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

  from tqdm.autonotebook import tqdm


In [2]:
import pytesseract
from pdf2image import convert_from_path
from PIL import Image

def read_pdf_file(file_path):
    try:
        pages = convert_from_path(file_path, dpi=300)
        extracted_text = []
        
        for page in pages:
            text = pytesseract.image_to_string(page, lang='eng')
            extracted_text.append(text)
        
        # Combine text from all pages
        full_text = '\n'.join(extracted_text)
        
        return full_text
    except Exception as e:
        print(f"Error reading PDF file {file_path}: {e}")
        return ""


In [3]:
import os

def read_doc(directory):
    docs = []
    try:
        for filename in os.listdir(directory):
            if filename.lower().endswith('.pdf'):
                file_path = os.path.join(directory, filename)
                text = read_pdf_file(file_path)
                if text:  # Only add non-empty texts
                    docs.append(text)
    except Exception as e:
        print(f"Error processing directory {directory}: {e}")
    return docs


In [4]:
def compute_embeddings(documents):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(documents, convert_to_tensor=False)
    return embeddings



In [5]:
directory_path = 'docs/'
documents = read_doc(directory_path)

# Compute embeddings
v = compute_embeddings(documents)



d = len(v[0])


In [6]:
len(v[0])

384

In [7]:
api_key = "b452bfeb-70f3-4f8a-8c14-3bf3b50405ce"
pc = Pinecone(api_key=api_key)

index_name = "t1"
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)
else:
    print(index_name, "doesn't exist")
    
pc.create_index(
    name = index_name,
    dimension = d,
    metric = "cosine",
    spec = ServerlessSpec(
        region = "us-east-1",
        cloud = "aws"
    )
)


while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)

index = pc.Index(index_name)



In [8]:
# Prepare data for upsert
ids = [f'doc_{i}' for i in range(len(v))]
metadata = [{'text': doc} for doc in documents]

# Upsert vectors into Pinecone
index.upsert(vectors=zip(ids, v,metadata))
print("Initial files successfully uploaded to Pinecone.")
index.describe_index_stats()

Initial files successfully uploaded to Pinecone.


{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [9]:
print(len(v))

5


In [71]:
q = input("enter query")
qv = compute_embeddings(q).tolist()

results = index.query(
    vector=qv,
    top_k=2,
)

print(results)

{'matches': [{'id': 'doc_4', 'score': 0.488640845, 'values': []},
             {'id': 'doc_0', 'score': 0.380975217, 'values': []}],
 'namespace': '',
 'usage': {'read_units': 5}}


In [72]:
q

'What are the professional Qualifications required for this Architect-Engineer Solicitation?'

In [73]:
ids = [match['id'] for match in results['matches']]
texts = index.fetch(ids).vectors
texts


cx = ''.join(text['metadata'].get('text', '').strip() for key, text in texts.items())


print(cx)


C -- ARCHITECT AND ENGINEERING SERVICES
W912DQ24R4002

SYNOPSIS Architect-Engineer (A-E) SERVICES
INDEFINITE DELIVERY CONTRACTS (IDC)
FOR MILITARY SERVICES
FOR THE KANSAS CITY DISTRICT

1. CONTRACT INFORMATION: The U.S. Army Corps of Engineers (USACE) Kansas
City District (NWK) intends solicit, negotiate, and award a total small business Multiple
Award Task Order Contract (MATOC) with a target of five (5) indefinite delivery
contracts (IDCs) in accordance with 40 U.S.C. Chap 11, Selection of Architects and
Engineers (formerly known as the Brooks Architect Engineer Act) as implemented in
FAR Subpart 36.6 and DFARS Subpart 236.6. Firms will be selected for negotiation
based on demonstrated competence and qualifications for the required work. The A-E
IDCs are required to support Military projects assigned to NWK and throughout the
geographical boundaries of the Northwestern Division (NWD). Selection of the IDCs will
be based on planning and design criteria described below. Selection of ta

In [80]:
import requests
import json

chat_history = []

def generate_response(cx, q):
    
    global chat_history

    retrieved_text = cx

    # Combine the chat history and the current context to form the prompt
    conversation_history = "\n".join(chat_history)
    prompt = f"{conversation_history} Here is some context:\n{retrieved_text}\n\nBased on this, can you answer the following question: {q}"

    # Ollama API call to generate a response
    url = "http://localhost:11434/api/generate"
    headers = {"Content-Type": "application/json"}
    data = {
        "model": "llama3.1:latest",
        "prompt": prompt,
        "stream": False  
    }

    response = requests.post(url, headers=headers, data=json.dumps(data))

    # Check the response from the model
    if response.status_code == 200:
        result = response.json()
        ans = result['response']
        print("Model response:", ans)

        # Append the current question and answer to the chat history
        chat_history.append(f"Question: {q}\nAnswer: {ans}")

        # Keep only the last 5 responses (last 5 question-answer pairs)
        if len(chat_history) > 5:
            chat_history = chat_history[-5:]  

        return ans
    else:
        print("Error:", response.text)
        return None

    

    


In [15]:
def prompting(cx,q):

    answer = generate_response(cx,q)
    a = int(input("enter input: '1' for ok, '2' for correcting, '3' for skipping to next qstn"))
    if(a==1):
        pass
    elif(a==2):
        q = input("enter prompt to give correct answer")
        answer = correct_response(cx+answer,q)



In [16]:
def correct_response(cx, q):
    
    global chat_history

    retrieved_text = cx

    # Combine the chat history and the current context to form the prompt
    conversation_history = "\n".join(chat_history)
    prompt = f"{conversation_history}\n\nHere is context and your answer you gave :\n{retrieved_text}\n\nBut i need answer in some specific way: {q}"

    # Ollama API call to generate a response
    url = "http://localhost:11434/api/generate"
    headers = {"Content-Type": "application/json"}
    data = {
        "model": "llama3.1:latest",
        "prompt": prompt,
        "stream": False  
    }

    response = requests.post(url, headers=headers, data=json.dumps(data))

    # Check the response from the model
    if response.status_code == 200:
        result = response.json()
        ans = result['response']
        print("Model response:", ans)

        # Append the current question and answer to the chat history
        chat_history.append(f"Question: {q}\nAnswer: {ans}")

        # Keep only the last 5 responses (last 5 question-answer pairs)
        if len(chat_history) > 5:
            chat_history = chat_history[-5:]  

        return ans
    else:
        print("Error:", response.text)
        return None

In [81]:
generate_response(cx , q)

Model response: Unfortunately, the provided text does not mention the professional qualifications required for this Architect-Engineer Solicitation. The text appears to be a set of instructions and guidelines for accessing the ProjNet system and submitting an electronic SF 330 proposal, but it does not provide information on the qualifications or requirements for the A-E solicitation itself.

If you need to find out what the professional qualifications are required for this Architect-Engineer Solicitation, I would recommend checking other publicly available documents or searching for the specific solicitation number (if provided) in a government procurement database.


'Unfortunately, the provided text does not mention the professional qualifications required for this Architect-Engineer Solicitation. The text appears to be a set of instructions and guidelines for accessing the ProjNet system and submitting an electronic SF 330 proposal, but it does not provide information on the qualifications or requirements for the A-E solicitation itself.\n\nIf you need to find out what the professional qualifications are required for this Architect-Engineer Solicitation, I would recommend checking other publicly available documents or searching for the specific solicitation number (if provided) in a government procurement database.'

In [None]:
prompting(cx,q)

In [16]:
import os
import logging
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
from sentence_transformers import SentenceTransformer
import pytesseract
from pdf2image import convert_from_path



In [17]:
# OCR Processor class
class OCRProcessor:
    def __init__(self, tesseract_path):
        pytesseract.pytesseract.tesseract_cmd = tesseract_path
    
    def process_pdf(self, pdf_path):
        try:
            print(f"Processing PDF for OCR: {pdf_path}")
            pages = convert_from_path(pdf_path, dpi=300)
            extracted_text = []
            
            for page_num, page in enumerate(pages):
                text = pytesseract.image_to_string(page, lang='eng')
                extracted_text.append(text)
            
            return '\n'.join(extracted_text)
        except Exception as e:
            logging.error(f"Error processing OCR for {pdf_path}: {str(e)}")
            return None



In [18]:
# File Monitor class
class FileMonitor(FileSystemEventHandler):
    def __init__(self, directory_to_watch, callback):
        self.directory_to_watch = directory_to_watch
        self.callback = callback
    
    def on_created(self, event):
        if event.is_directory or not event.src_path.endswith('.pdf'):
            return
        
        print(f"New PDF detected: {event.src_path}")
        logging.info(f"New PDF detected: {event.src_path}")
        try:
            self.callback(event.src_path)
        except Exception as e:
            logging.error(f"Error processing {event.src_path}: {str(e)}")
    
    def start(self):
        observer = Observer()
        observer.schedule(self, self.directory_to_watch, recursive=False)
        observer.start()
        print(f"Monitoring directory: {self.directory_to_watch}")
        logging.info(f"Started monitoring directory: {self.directory_to_watch}")
        
        try:
            while True:
                time.sleep(1)
        except KeyboardInterrupt:
            observer.stop()
            logging.info("File monitoring stopped.")
        
        observer.join()



In [None]:
# Function to process new files and update Pinecone
def process_new_file(file_path):
    text = ocr_processor.process_pdf(file_path)
    if text:
        vector = embedding_model.encode([text])[0]
        vector_id = os.path.basename(file_path)
        inderectory: docs/x.upsert(vectors=[(vector_id, vector)])
        print(f"New file '{file_path}' successfully uploaded to Pinecone.")

# Initialize components
tesseract_path = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
ocr_processor = OCRProcessor(tesseract_path)
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Start monitoring
directory_to_watch = 'docs/'
file_monitor = FileMonitor(directory_to_watch, process_new_file)
file_monitor.start()
