In [None]:
!pip install html2text pymupdf4llm torch transformers pandas chromadb 

In [1]:
import json
import os
import re
from io import StringIO

import chromadb
import html2text
import pandas as pd
import pymupdf4llm
import torch
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def html_to_markdown(html_path):
    """Read an HTML file and return its content converted to Markdown.

    Args:
        html_path: Path of the HTML file; it is opened as UTF-8 text.

    Returns:
        The string returned by ``html2text.HTML2Text.handle`` for the file's
        content, with link handling left enabled.
    """
    converter = html2text.HTML2Text()
    # Keep hyperlinks in the converted output; switch to True to drop them.
    converter.ignore_links = False

    with open(html_path, 'r', encoding='utf-8') as source:
        raw_html = source.read()

    return converter.handle(raw_html)


# Pick device index 0 when CUDA is available, otherwise fall back to -1.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# Shared question-answering pipeline used by extract_fields_from_text.
qa_pipeline = pipeline(
    "question-answering",
    model="distilbert-base-uncased-distilled-squad",
    device=device,
)

# Field name -> natural-language question passed to the QA pipeline for that field.
# The question text is sent to the model verbatim, so stray characters matter:
# the trailing "n" / "a" after the question mark and the "multple" misspelling
# that used to be in three of these prompts have been removed.
fields = {
    "Bid Number": "What is the Bid Number or identifier mentioned in the contract or proposal?",
    "Title": "What is the title or name of the contract or proposal?(mentioned in the cover page)",
    "Due Date": "What is the due date for the submission of bids or responses?",
    "Bid Submission Type": "How is the bid or proposal submission expected (e.g., online, by mail)?",
    "Term of Bid": "What is the duration or term of the bid or contract? For example, how long is the contract valid?",
    "Pre Bid Meeting": "Does the contract mention any pre-bid meeting or event before submission?",
    "Installation": "Does the contract mention installation or deployment requirements?",
    "Bid Bond Requirement": "Does the contract require a bid bond, deposit, or guarantee?",
    "Delivery Date": "What is the delivery date or schedule mentioned in the contract or proposal?in format dd/mm/yyyy",
    "Payment Terms": "What are the payment terms outlined in the contract? (e.g., installments, milestones, quarterly)",
    "Any Additional Documentation Required": "Are there any additional documents or requirements mentioned for the contract?list them",
    "MFG for Registration": "What is the manufacturer or the entity required for registration or certification?",
    "Contract or Cooperative to use": "Is there any specific contract or cooperative that must be used for this agreement?",
    "Model No": "What is the model number or specific product identifier mentioned in the contract?if multiple list each of them in newlines.",
    "Part No": "What is the part number or component ID mentioned in the contract?if multiple list each in a new line.",
    "Product": "What product or type of products is being referenced or requested in the contract?.if multiple list each in newline.",
    "Contact Info": "What contact information is provided for inquiries or support regarding the contract?mention both suppliers and clients",
    "Company Name": "What is the name of the company or organization associated with the contract?",
    "Bid Summary": "What is the summary or general description of the bid or contract provided in the document?",
    "Product Specification": "What are the product specifications, features, or requirements mentioned in the contract?.list for each product."
}
# Markdown pipe-table detection, parsing, and table/text separation.

# One pipe table: a header row, a delimiter row such as |---|:--:|, then one or
# more body rows. Matching is line-based (MULTILINE, no DOTALL) so a row can
# never spill across newlines into surrounding prose, and the delimiter row may
# have any number of columns. Group 1 is the whole table, group 2 the delimiter
# row and group 3 the body rows.
_MARKDOWN_TABLE_RE = re.compile(
    r"(^\|[^\n]*\|[ \t]*\n"
    r"(\|(?:[ \t]*:?-+:?[ \t]*\|)+[ \t]*\n)"
    r"((?:\|[^\n]*\|[ \t]*(?:\n|\Z))+))",
    re.MULTILINE,
)

# A single cell of a delimiter row: dashes with optional alignment colons.
_DELIMITER_CELL_RE = re.compile(r":?-+:?")


def _split_table_row(line):
    """Split one ``| a | b |`` line into a list of stripped cell strings."""
    stripped = line.strip()
    if stripped.startswith("|"):
        stripped = stripped[1:]
    if stripped.endswith("|"):
        stripped = stripped[:-1]
    return [cell.strip() for cell in stripped.split("|")]


def _fit_row(cells, width):
    """Pad or fold ``cells`` so the row has exactly ``width`` entries."""
    if len(cells) > width:
        # Surplus cells are folded back into the last column instead of being
        # dropped (they probably come from a literal "|" inside that cell).
        return cells[:width - 1] + ["|".join(cells[width - 1:])]
    return cells + [""] * (width - len(cells))


# Function to detect markdown tables
def extract_markdown_tables(text):
    """Find every markdown pipe table in ``text``.

    Args:
        text: Markdown source to scan.

    Returns:
        A list with one 3-tuple per table, in document order. Element ``[0]``
        is the full table text; ``[1]`` is the delimiter row and ``[2]`` the
        body rows.
    """
    return _MARKDOWN_TABLE_RE.findall(text)


# Function to parse markdown tables into DataFrames
def parse_markdown_table(markdown_table):
    """Convert the text of one markdown pipe table into a DataFrame.

    The first non-blank line supplies the column names. Delimiter rows directly
    under the header (``|---|---|``) are skipped rather than kept as data, and
    surrounding whitespace is stripped from names and cells. Short rows are
    padded with empty strings; surplus cells are folded into the last column.

    Args:
        markdown_table: Table text as returned in element ``[0]`` of
            ``extract_markdown_tables``.

    Returns:
        A ``pandas.DataFrame`` of strings; an empty DataFrame for blank input.
    """
    rows = [
        _split_table_row(line)
        for line in markdown_table.splitlines()
        if line.strip()
    ]
    if not rows:
        return pd.DataFrame()

    header, body = rows[0], rows[1:]
    # Only delimiter rows directly below the header are structural; a dashes-only
    # row further down is kept as ordinary data.
    while body and all(_DELIMITER_CELL_RE.fullmatch(cell) for cell in body[0]):
        body = body[1:]

    width = len(header)
    return pd.DataFrame([_fit_row(row, width) for row in body], columns=header)


# Function to process mixed content
def process_mixed_content(file_content):
    """Separate a markdown document into parsed tables and free-text sections.

    Tables and text are cut from the same set of regex matches, so a text
    section is never discarded merely because it starts with a ``|``.

    Args:
        file_content: Full markdown text of the document(s).

    Returns:
        ``{"tables": [...], "text": [...]}`` where ``tables`` holds one
        DataFrame per detected table and ``text`` holds the stripped, non-empty
        text found before, between and after the tables, in document order.
    """
    processed_tables = []
    text_parts = []
    cursor = 0

    for match in _MARKDOWN_TABLE_RE.finditer(file_content):
        leading_text = file_content[cursor:match.start()].strip()
        if leading_text:
            text_parts.append(leading_text)
        processed_tables.append(parse_markdown_table(match.group(1)))
        cursor = match.end()

    trailing_text = file_content[cursor:].strip()
    if trailing_text:
        text_parts.append(trailing_text)

    return {"tables": processed_tables, "text": text_parts}

# Ask every field question against every free-text section.
def extract_fields_from_text(text_parts, fields):
    """Query the QA pipeline once per (text section, field) pair.

    Args:
        text_parts: Iterable of text sections used as QA context.
        fields: Mapping of field name to the question asked for that field.

    Returns:
        A dict mapping each field name to a list with one entry per text
        section, in section order: the pipeline's ``"answer"`` value, or an
        ``"Error: ..."`` string when the call raised.
    """
    collected = {}
    for passage in text_parts:
        for name, prompt in fields.items():
            try:
                outcome = qa_pipeline(question=prompt, context=passage)["answer"]
            except Exception as e:
                # Best effort: record the failure in place of an answer and keep going.
                outcome = f"Error: {str(e)}"
            collected.setdefault(name, []).append(outcome)
    return collected


# End-to-end entry point: split the document, then query only its text sections.
def mixed_content_pipeline(file_content, fields):
    """Run field extraction over the non-table text of one markdown document.

    Args:
        file_content: Markdown text, possibly containing tables.
        fields: Mapping of field name to the question asked for it.

    Returns:
        ``{"extracted_fields": ...}`` wrapping whatever
        ``extract_fields_from_text`` returned for the text sections. The tables
        produced by ``process_mixed_content`` are not part of the output.
    """
    prose_sections = process_mixed_content(file_content)["text"]
    answers = extract_fields_from_text(prose_sections, fields)
    return {"extracted_fields": answers}


In [4]:
# Bid 1: convert each source document to markdown.
doc1 = pymupdf4llm.to_markdown('Bid1/Addendum 1 RFP JA-207652 Student and Staff Computing Devices.pdf')
doc2 = pymupdf4llm.to_markdown('Bid1/Addendum 2 RFP JA-207652 Student and Staff Computing Devices.pdf')
doc3 = pymupdf4llm.to_markdown('Bid1/JA-207652 Student and Staff Computing Devices FINAL.pdf')
html_path = 'Bid1/Student and Staff Computing Devices __SOURCING #168884__ - Bid Information - {3} _ BidNet Direct.html'  # Replace with your HTML file path
doc4 = html_to_markdown(html_path)

# Concatenate the documents and run the extraction over the combined text.
data1 = doc1 + doc2 + doc3 + doc4

output1 = mixed_content_pipeline(data1, fields)

output_filename = "output/DallasOutput.json"
# Create the output folder on first run; open(..., "w") raises
# FileNotFoundError when the parent directory does not exist.
os.makedirs(os.path.dirname(output_filename), exist_ok=True)
with open(output_filename, "w") as f:
    json.dump(output1, f, indent=4)

print(f"Pipeline results saved to {output_filename}")

Processing Bid1/Addendum 1 RFP JA-207652 Student and Staff Computing Devices.pdf...
Processing Bid1/Addendum 2 RFP JA-207652 Student and Staff Computing Devices.pdf...
Processing Bid1/JA-207652 Student and Staff Computing Devices FINAL.pdf...
Pipeline results saved to output/DallasOutput.json


In [5]:
# Bid 2: convert each source document to markdown.
doc1 = pymupdf4llm.to_markdown('Bid2/PORFP_-_Dell_Laptop_Final.pdf')
doc2 = pymupdf4llm.to_markdown('Bid2/Contract_Affidavit.pdf')
doc3 = pymupdf4llm.to_markdown('Bid2/Dell_Laptop_Specs.pdf')
doc4 = pymupdf4llm.to_markdown('Bid2/Mercury_Affidavit.pdf')
html_path = 'Bid2/Dell Laptops w_Extended Warranty - Bid Information - {3} _ BidNet Direct.html'
doc5 = html_to_markdown(html_path)

# Concatenate the documents and run the extraction over the combined text.
data2 = doc1 + doc2 + doc3 + doc4 + doc5

output2 = mixed_content_pipeline(data2, fields)

output_filename = "output/DellOutput.json"
# Create the output folder on first run; open(..., "w") raises
# FileNotFoundError when the parent directory does not exist.
os.makedirs(os.path.dirname(output_filename), exist_ok=True)
with open(output_filename, "w") as f:
    json.dump(output2, f, indent=4)

print(f"Pipeline results saved to {output_filename}")

Processing Bid2/PORFP_-_Dell_Laptop_Final.pdf...
Processing Bid2/Contract_Affidavit.pdf...
Processing Bid2/Dell_Laptop_Specs.pdf...
Processing Bid2/Mercury_Affidavit.pdf...
Pipeline results saved to output/DellOutput.json
