# In [21]:
import os
import re
import json
from PyPDF2 import PdfReader
from bs4 import BeautifulSoup
import spacy

# Load NLP model
# The spaCy pipeline is loaded once, at import time; extract_information()
# calls it as nlp(text) and reads the resulting entities.
nlp = spacy.load("en_core_web_sm")

# Define paths for the input files
# NOTE(review): INPUT_FOLDER is an absolute, machine-specific Windows path;
# it presumably needs to be changed before running on another machine.
INPUT_FOLDER = r"C:\Users\SHAMEER.K\OneDrive\Desktop\New folder\Campus hiring-2024-2025 assignment\Bid1"
# Relative path: resolved against the current working directory at run time.
OUTPUT_FOLDER = "./output_files"

# Ensure output folder exists
# exist_ok=True makes this a no-op when the folder is already present.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Function to parse HTML files
def parse_html(file_path):
    """Return the text content of an HTML file with all markup removed.

    Args:
        file_path: Path of the HTML file; it is read as UTF-8 text.

    Returns:
        The string produced by BeautifulSoup's ``get_text()`` for the
        parsed document.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        # Hand the open file object to the parser so it does the reading.
        document = BeautifulSoup(handle, "html.parser")
        return document.get_text()

# Function to parse PDF files
def parse_pdf(file_path):
    """Return the text of a PDF file, with pages separated by newlines.

    Args:
        file_path: Path of the PDF file, passed straight to ``PdfReader``.

    Returns:
        str: The extracted text of every page that yields non-empty text,
        joined with ``"\\n"``. An empty string when no page yields text.
    """
    reader = PdfReader(file_path)
    # extract_text() does the actual page extraction, so call it exactly
    # once per page instead of once to filter and a second time to join.
    page_texts = (page.extract_text() for page in reader.pages)
    return "\n".join(chunk for chunk in page_texts if chunk)

# Refined patterns for fields
# Contract: every pattern exposes the field's VALUE as capture group 1.
# - Label alternatives use a non-capturing group (?:...) so they do not
#   shift the value out of group 1.
# - Free-text values use (.+): "." never matches a newline, so the capture
#   runs to the end of the line and also works when the field is on the
#   last line of the text without a trailing newline.
PATTERNS = {
    "bid_number": r"(?i)Bid Number:\s*(\S+)",
    "title": r"(?i)Title:\s*(.+)",
    "due_date": r"(?i)(?:Due Date|Deadline):\s*(\d{2}/\d{2}/\d{4}|\d{4}-\d{2}-\d{2})",
    "bid_submission_type": r"(?i)Bid Submission Type:\s*(\w+)",
    "pre_bid_meeting": r"(?i)Pre[-\s]*Bid Meeting:\s*(.+)",
    "bid_bond_requirement": r"(?i)Bid Bond Requirement:\s*(.+)",
    "delivery_date": r"(?i)Delivery Date:\s*(\d{2}/\d{2}/\d{4}|\d{4}-\d{2}-\d{2})",
    "payment_terms": r"(?i)Payment Terms:\s*(.+)",
    "additional_documentation_required": r"(?i)Additional Documentation Required:\s*(.+)",
    "manufacturer_for_registration": r"(?i)Manufacturer for Registration:\s*(.+)",
    "contract_or_cooperative_to_use": r"(?i)Contract or Cooperative to Use:\s*(.+)",
    "model_number": r"(?i)Model[-\s]*Number:\s*(.+)",
    "part_number": r"(?i)Part[-\s]*Number:\s*(.+)",
    "product": r"(?i)Product:\s*(.+)",
    "contact_info": r"(?i)Contact Info:\s*(.+)",
    "company_name": r"(?i)Company[-\s]*Name:\s*(.+)",
    "bid_summary": r"(?i)Bid Summary:\s*(.+)",
    "product_specification": r"(?i)Product Specification:\s*(.+)",
}

# Extract structured information using regex and NLP
def extract_information(text):
    """Extract bid fields from plain document text.

    Each regex in ``PATTERNS`` is searched once against ``text`` and the
    first match fills the field of the same name. Fields that are still
    empty afterwards are back-filled from spaCy named entities:
    ``DATE`` -> ``due_date``, ``ORG`` -> ``company_name``,
    ``MONEY`` -> ``bid_bond_requirement`` (first entity of each kind wins).

    Args:
        text: Plain text of one document.

    Returns:
        dict: One entry per known field. Unmatched fields are ``None``,
        except ``bid_submission_type`` (defaults to ``"Online"``) and
        ``installation_required`` (always ``True``; no pattern sets it).
    """
    structured_data = {
        "bid_number": None,
        "title": None,
        "due_date": None,
        "bid_submission_type": "Online",  # Default value
        "pre_bid_meeting": None,
        "installation_required": True,  # Default value
        "bid_bond_requirement": None,
        "delivery_date": None,
        "payment_terms": None,
        "additional_documentation_required": None,
        "manufacturer_for_registration": None,
        "contract_or_cooperative_to_use": None,
        "model_number": None,
        "part_number": None,
        "product": None,
        "contact_info": None,
        "company_name": None,
        "bid_summary": None,
        "product_specification": None
    }

    # Apply regex patterns
    for field, pattern in PATTERNS.items():
        match = re.search(pattern, text)
        if match:
            # Read the LAST capturing group rather than group 1: a pattern
            # may start with a label-alternation group, in which case
            # group 1 would be the label and not the value.
            value = match.group(match.lastindex or 0)
            structured_data[field] = value.strip()

    # Use NLP for fallback extraction (e.g., dates, organization names)
    fallback_fields = {
        "DATE": "due_date",
        "ORG": "company_name",
        "MONEY": "bid_bond_requirement",
    }
    # Only run the NLP pipeline when at least one fallback field is still
    # empty; otherwise its entities could not change the result.
    if any(not structured_data[name] for name in fallback_fields.values()):
        for ent in nlp(text).ents:
            target = fallback_fields.get(ent.label_)
            if target and not structured_data[target]:
                structured_data[target] = ent.text

    return structured_data

# Main function to process files
def process_files(input_folder, output_folder):
    """Extract structured data from every HTML/PDF file in a folder.

    For each supported file directly inside ``input_folder`` the text is
    parsed, passed to ``extract_information`` and the result is written to
    ``<output_folder>/<file_name>.json``.

    Args:
        input_folder: Folder whose entries are listed with ``os.listdir``.
        output_folder: Folder that receives one JSON file per processed
            input; it is created if it does not exist.

    Returns:
        dict: Maps each successfully processed file name to its extracted
        field dictionary. Unsupported and failing files are absent.

    Raises:
        OSError: If ``input_folder`` cannot be listed or ``output_folder``
            cannot be created. Errors for individual files are printed and
            do not stop the run.
    """
    # Create the destination here so the function also works for output
    # folders other than the one prepared at module level.
    os.makedirs(output_folder, exist_ok=True)

    results = {}

    for file_name in os.listdir(input_folder):
        file_path = os.path.join(input_folder, file_name)
        # Compare extensions case-insensitively so "REPORT.PDF" is accepted.
        lowered_name = file_name.lower()

        try:
            if lowered_name.endswith((".html", ".htm")):
                text = parse_html(file_path)
            elif lowered_name.endswith(".pdf"):
                text = parse_pdf(file_path)
            else:
                print(f"Unsupported file format: {file_name}")
                continue

            # Extract structured data
            structured_data = extract_information(text)
            results[file_name] = structured_data

            # Save each result as JSON
            output_file = os.path.join(output_folder, f"{file_name}.json")
            with open(output_file, "w", encoding="utf-8") as json_file:
                json.dump(structured_data, json_file, indent=4)

        except Exception as e:
            # Deliberate best-effort boundary: one unreadable or malformed
            # file is reported and skipped so the rest of the batch runs.
            print(f"Error processing {file_name}: {e}")

    return results

# Run the script
# Executes only when the file is run directly, not when it is imported.
if __name__ == "__main__":
    # process_files returns a dict keyed by input file name; it is kept in a
    # module-level name so it stays available after the run.
    structured_results = process_files(INPUT_FOLDER, OUTPUT_FOLDER)
    print("Processing complete. Structured data saved in:", OUTPUT_FOLDER)


# Output: Processing complete. Structured data saved in: ./output_files
