<a href="https://colab.research.google.com/github/Ali-mohammadi-design/RFP_TASKS_EXTRACTOR/blob/main/RFP_TASKS_EXTRACTOR_Version3_AFFORDABlE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# STEP 1: Install required libraries
!pip install PyMuPDF openai

Collecting PyMuPDF
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.25.5


In [None]:
import fitz  # PyMuPDF
import openai
from typing import List
from google.colab import files
import os

In [None]:
# Ask the user to upload the RFP PDF(s). Later cells iterate the keys of
# `uploaded` and pass each one to extract_text_from_pdf() as a file path.
uploaded = files.upload()

Saving PART 3 - Requirements for Deliverables.pdf to PART 3 - Requirements for Deliverables.pdf


In [None]:
import os
from getpass import getpass

from openai import OpenAI

# Do not hardcode the key in the notebook: take it from the OPENAI_API_KEY
# environment variable, or prompt for it (without echoing) when it is unset.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: "))


In [None]:
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        The page texts joined without any separator ("" for a document
        with no pages).
    """
    # The context manager closes the document even if a page fails to parse.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)

In [None]:
def chunk_text(text: str, chunk_size: int = 1000) -> List[str]:
    """Cut `text` into consecutive pieces of at most `chunk_size` characters.

    The pieces are plain character slices (no attempt to respect word or
    sentence boundaries); only the last piece may be shorter.
    """
    pieces: List[str] = []
    for start in range(0, len(text), chunk_size):
        end = start + chunk_size
        pieces.append(text[start:end])
    return pieces

In [None]:
def extract_tasks_from_text_chunk(chunk: str) -> List[str]:
    """Ask the chat model for the proposal tasks contained in one text chunk.

    Args:
        chunk: A slice of the RFP text.

    Returns:
        One string per non-empty line of the model's reply, with leading and
        trailing bullet characters ("-", "•") and whitespace removed. Lines
        that are empty after stripping are dropped.

    Raises:
        Whatever the module-level `client` raises for a failed request; the
        caller decides how to handle it.
    """
    # Each sentence ends with ". " so the adjacent literals do not fuse into
    # "proposal.Please ..." / "... to be doneReturn ..." when concatenated.
    prompt = (
        "You are analyzing a section of an engineering RFP document. "
        "From the following text, extract any clearly defined tasks or activities that "
        "should be included in a proposal. "
        "Please do not return tasks that takes less than 1 hour to be done. "
        "Return the tasks as a bullet point list:\n\n"
        f"{chunk}\n\nTasks:"
    )
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.3,
    )

    # `content` can be None; treat that as "no tasks".
    tasks_text = response.choices[0].message.content or ""
    tasks = []
    for line in tasks_text.split("\n"):
        task = line.strip("-• \n")
        if task:  # a bare "-" or "•" line would otherwise yield an empty task
            tasks.append(task)
    return tasks

In [None]:
def analyze_uploaded_pdfs(uploaded_files) -> List[str]:
    """Collect the unique tasks found in every uploaded PDF.

    Each key of `uploaded_files` is used as a file path: its text is
    extracted, chunked, and every chunk is sent to
    extract_tasks_from_text_chunk(). A chunk whose extraction raises is
    reported on stdout and skipped.

    Returns:
        The de-duplicated task strings in sorted order.
    """
    unique_tasks = set()
    for pdf_name in uploaded_files.keys():
        print(f"\nProcessing: {pdf_name}")
        document_text = extract_text_from_pdf(pdf_name)
        for piece in chunk_text(document_text):
            try:
                unique_tasks.update(extract_tasks_from_text_chunk(piece))
            except Exception as err:
                # Best effort: one failed chunk must not abort the whole run.
                print(f"Error processing chunk: {err}")
    return sorted(unique_tasks)


In [None]:
# First pass: extract tasks from every uploaded file. The result is a sorted
# list of unique task strings.
tasks = analyze_uploaded_pdfs(uploaded)


Processing: PART 3 - Requirements for Deliverables.pdf


In [None]:
# Show every extracted task, one bullet per line.
print("\n--- ✅ Extracted Tasks ---")
for task in tasks:
    print(f"- {task}")


--- ✅ Extracted Tasks ---
- Access and review available record drawings and/or document libraries of the Transfer Stations and Collection Yards.
- Add a new Section A.2.4 (Confined Space List) as specified.
- Additional engineering design efforts for the implementation of recommendations from technical memorandums developed by the Successful Supplier.
- Address all emergency lighting deficiencies related to emergency lighting levels, equipment hardware, power sources, conduits, and roofing, etc., as necessary to maintain a state of good repair and aesthetics.
- Address all lighting deficiencies related to lighting levels, equipment hardware, power sources, conduits, and roofing to maintain a state of good repair and aesthetics.
- Address social procurement requirements
- Address the interior lighting standardized protocol to meet the criteria specified in Section 1.2.1.3.1 and include the components outlined in Section 1.2.1.3.2.
- Adhere to the City's guidelines for Detailed Design E

In [None]:
# Number of unique tasks found by the first pass (shown as the cell result).
len(tasks)

455

In [None]:
# Second name for the extracted task list. This is an alias, not a copy:
# both names refer to the same list object.
tasks_r=tasks

In [None]:
def chunk_tasks_for_filtering(tasks: List[str], max_chunk_chars: int = 1000) -> List[List[str]]:
    """Split tasks into smaller groups based on a character limit.

    Every task is rendered as "- <task>" and appended to the current group
    until adding the next one would push the group's total length (sum of
    the rendered task lengths) above `max_chunk_chars`; a new group is then
    started.

    Args:
        tasks: Task strings, kept in their given order.
        max_chunk_chars: Soft limit on the characters per group. A single
            task longer than the limit still gets a group of its own.

    Returns:
        A list of non-empty groups of rendered "- <task>" strings; an empty
        list when `tasks` is empty.
    """
    chunks: List[List[str]] = []
    current_chunk: List[str] = []
    current_length = 0

    for task in tasks:
        task_str = f"- {task}"
        # Flush only a non-empty group: without the `current_chunk` guard an
        # over-long first task would emit an empty group.
        if current_chunk and current_length + len(task_str) > max_chunk_chars:
            chunks.append(current_chunk)
            current_chunk = []
            current_length = 0
        current_chunk.append(task_str)
        current_length += len(task_str)

    if current_chunk:
        chunks.append(current_chunk)

    return chunks


In [None]:
def filter_proposal_tasks_chunked(tasks: List[str]) -> List[str]:
    """Ask the chat model, group by group, which tasks belong in the proposal.

    The tasks are split with chunk_tasks_for_filtering() and each group is
    sent in its own request. Kept tasks are printed as they arrive.

    Args:
        tasks: Candidate task strings to filter.

    Returns:
        The tasks the model kept, in the order received, without duplicates
        and without empty entries. A group whose request fails is reported on
        stdout and contributes nothing.
    """
    # Use the argument, not a notebook global, so the function filters what
    # the caller actually passed in.
    task_chunks = chunk_tasks_for_filtering(tasks)
    all_filtered_tasks = []
    seen = set()

    for i, chunk in enumerate(task_chunks):
        print(f"\n🔍 Processing chunk {i+1}/{len(task_chunks)}...")
        # Each instruction ends with ". " so adjacent literals do not fuse
        # ("... project ownerdo not ...", "... 1 hour.Ignore ...").
        prompt = (
            "You are a proposal engineer reviewing a list of tasks extracted from an RFP. "
            "Return only the tasks that should be included in the proposal while we are preparing the TTM. "
            "The returned tasks must be either some actions that we should do or specific report or document that we should prepare for the project owner. "
            "Do not return tasks that could be done in less than 1 hour. "
            "Ignore vague or non-actionable items.\n\n"
            "Tasks:\n" + "\n".join(chunk) + "\n\nReturn only valid tasks as a bullet list."
        )

        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.2,
            )

            # `content` can be None; treat that as an empty reply.
            filtered = (response.choices[0].message.content or "").strip()
            filtered_list = [line.strip("-• \n") for line in filtered.split("\n") if line.strip()]

            # Print each result as it's processed
            for task in filtered_list:
                # Skip bare bullets (empty after stripping) and repeats.
                if task and task not in seen:
                    seen.add(task)
                    all_filtered_tasks.append(task)
                    print(f"✅ {task}")

        except Exception as e:
            # Best effort: report the failed group and continue with the next.
            print(f"❌ Error in chunk {i+1}: {e}")

    return all_filtered_tasks


In [None]:
# Second pass: let the model keep only the proposal-relevant tasks
# (one request per task group; progress is printed as it runs).
filtered_tasks = filter_proposal_tasks_chunked(tasks)

# Recap of everything that survived the filter.
print("\n--- 📌 Final Filtered Tasks for Proposal ---")
for task in filtered_tasks:
    print(f"- {task}")



🔍 Processing chunk 1/56...
✅ Access and review available record drawings and/or document libraries of the Transfer Stations and Collection Yards.
✅ Additional engineering design efforts for the implementation of recommendations from technical memorandums developed by the Successful Supplier.
✅ Address all emergency lighting deficiencies related to emergency lighting levels, equipment hardware, power sources, conduits, and roofing, etc., as necessary to maintain a state of good repair and aesthetics.
✅ Address all lighting deficiencies related to lighting levels, equipment hardware, power sources, conduits, and roofing to maintain a state of good repair and aesthetics.
✅ Address the interior lighting standardized protocol to meet the criteria specified in Section 1.2.1.3.1 and include the components outlined in Section 1.2.1.3.2.

🔍 Processing chunk 2/56...
✅ Administer contracts during site construction
✅ Allocate necessary labor hours for construction contract administration services

In [None]:
# Number of tasks kept by the second pass (shown as the cell result).
len(filtered_tasks)

355

In [None]:
!pip install -q sentence-transformers

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m94.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m73.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m42.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from sentence_transformers import SentenceTransformer, util

# Load sentence embedding model
# `embedder` is a notebook-level global: analyze_task_with_rag() reads it
# directly instead of receiving it as an argument.
embedder = SentenceTransformer('all-MiniLM-L6-v2')  # Small + efficient

# Split PDF text into fixed-size pieces for retrieval
def chunk_text_for_rag(text, chunk_size=500):
    """Slice `text` into back-to-back pieces of up to `chunk_size` characters.

    Only the final piece may be shorter; an empty `text` gives an empty list.
    """
    offsets = range(0, len(text), chunk_size)
    return [text[begin:begin + chunk_size] for begin in offsets]

# Chunk and embed the PDF text
# NOTE(review): only the first uploaded file is indexed here
# (`next(iter(uploaded))`), while analyze_uploaded_pdfs() walks every uploaded
# file — confirm that single-file retrieval is intended.
pdf_filename = next(iter(uploaded))
text = extract_text_from_pdf(pdf_filename)  # reads the PDF again from disk
pdf_chunks = chunk_text_for_rag(text)
# Embed all chunks in one call; convert_to_tensor=True requests a tensor result.
chunk_embeddings = embedder.encode(pdf_chunks, convert_to_tensor=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
def analyze_task_with_rag(task: str, chunks: List[str], chunk_embeddings, top_k=3) -> dict:
    """Analyze one task with the chat model, using retrieved RFP text as context.

    The task is embedded with the module-level `embedder`, the `top_k` best
    matches are looked up in `chunk_embeddings`, and the corresponding entries
    of `chunks` are sent to the model together with the task.

    Args:
        task: The task description to analyze.
        chunks: Text chunks of the RFP.
        chunk_embeddings: Embeddings searched for the task; each hit's
            `corpus_id` is used as an index into `chunks`, so this is assumed
            to hold one embedding per chunk in the same order.
        top_k: Number of chunks to retrieve as context.

    Returns:
        {"task": task, "analysis": <model reply, stripped>}. If the chat
        request fails, "analysis" is "Error: <exception text>" instead.
        Errors raised by the embedding/search step are not caught.
    """
    # Embed the task
    task_embedding = embedder.encode(task, convert_to_tensor=True)

    # Find most relevant chunks
    hits = util.semantic_search(task_embedding, chunk_embeddings, top_k=top_k)
    relevant_chunks = "\n\n".join([chunks[hit['corpus_id']] for hit in hits[0]])

    # GPT system & user prompt
    # Every numbered item ends with "\n": without it items 4, 5 and 6 fuse
    # into a single line ("... this task is5. in a short ... done.6. What ...").
    system_prompt = (
        "You are an expert project planner analyzing engineering RFPs. "
        "Given a task and related RFP context, determine:\n"
        "1. If it's a deliverable or an activity\n"
        "2. Who should do it (role and expertise). start the answer whith we need a (name of the expert):\n"
        "3. The expertise level (junior/mid/senior). Start the answer with the expertise level is \n"
        "4. In a short sentence why this tasks has been defined in the project. Start your response with the reason of having this task is\n"
        "5. in a short sentence mention how it should be done.\n"
        "6. What is your time estimation? (in hours)"
    )

    user_prompt = f"Task:\n{task}\n\nRFP Context:\n{relevant_chunks}\n\nAnalyze the task based on the context."

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            temperature=0.4,
        )

        analysis = response.choices[0].message.content.strip()
        return {"task": task, "analysis": analysis}

    except Exception as e:
        # Keep the batch going: the failure is recorded in place of the analysis.
        return {"task": task, "analysis": f"Error: {e}"}


In [None]:
# Analyze every filtered task (one chat request each) and print the result
# as soon as it is available. Each entry is the dict returned by
# analyze_task_with_rag(); a failed request is stored too, with an analysis
# text that starts with "Error:".
rag_based_analyses = []

for task in filtered_tasks:
    result = analyze_task_with_rag(task, pdf_chunks, chunk_embeddings)
    rag_based_analyses.append(result)
    print(f"\n📝 Task: {result['task']}")
    print(f"🔍 Analysis:\n{result['analysis']}")



📝 Task: Access and review available record drawings and/or document libraries of the Transfer Stations and Collection Yards.
🔍 Analysis:
1. This is an activity.
2. We need a project engineer.
3. The expertise level is mid.
4. The reason of having this task is to ensure that the engineering team has access to accurate and up-to-date information for effective project planning and execution.
5. This should be done by systematically accessing the provided records and libraries, reviewing the documents, and summarizing the findings for the project team.
6. My time estimation is 20 hours.

📝 Task: Additional engineering design efforts for the implementation of recommendations from technical memorandums developed by the Successful Supplier.
🔍 Analysis:
1. This is a deliverable.
2. We need a Senior Electrical Engineer.
3. The expertise level is senior.
4. The reason of having this task is to ensure that all recommendations from the technical memorandums are effectively implemented in complian

In [None]:
def parse_analysis(analysis: str) -> dict:
    """Split the model's six-point analysis into named fields.

    Lines are matched on a lower-cased copy of `analysis`, so the stored
    values are lower-cased and stripped lines of the reply.

    A line that starts with an item number ("1." … "6.", optionally preceded
    by markdown "*"/"#") is assigned to the field for that item. This matters
    because the wording of a reply varies (e.g. "This should be done by …")
    and because words such as "activity" can appear in any item. Lines
    without an item number fall back to keyword matching and only fill a
    field that is still empty.

    Args:
        analysis: Raw reply text, one answer per line.

    Returns:
        Dict with the keys "deliverable_or_activity", "expert",
        "expert_level", "task_reason", "how_to_do" and "time_estimate";
        a value is None when no matching line was found.
    """
    import re  # only needed here; keeps the cell self-contained

    # Field for each numbered item of the analysis prompt, in prompt order.
    numbered_fields = (
        "deliverable_or_activity",
        "expert",
        "expert_level",
        "task_reason",
        "how_to_do",
        "time_estimate",
    )
    parts = dict.fromkeys(numbered_fields)

    # Item number followed by ".", ")" or ":" and whitespace, so that a value
    # like "1.5 hours" at the start of a line is not mistaken for item 1.
    item_number = re.compile(r"^[*#\s]*([1-6])[.):]\s")

    # Keyword fallback, tried in this order; the generic deliverable/activity
    # test comes last because those words can appear in any answer.
    keyword_rules = (
        ("expert", ("we need a",)),
        ("expert_level", ("the expertise level is",)),
        ("task_reason", ("the reason of having this task is",)),
        ("how_to_do", ("should be done",)),
        ("time_estimate", ("time estimation", "estimated time", "in hours")),
        ("deliverable_or_activity", ("deliverable", "activity")),
    )

    filled_by_number = set()
    for raw_line in analysis.lower().split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        numbered = item_number.match(line)
        if numbered:
            field = numbered_fields[int(numbered.group(1)) - 1]
            # First numbered occurrence wins, so a numbered sub-list inside a
            # later answer cannot overwrite an earlier item.
            if field not in filled_by_number:
                parts[field] = line
                filled_by_number.add(field)
            continue

        for field, phrases in keyword_rules:
            if any(phrase in line for phrase in phrases):
                if parts[field] is None:
                    parts[field] = line
                break

    return parts

# ✅ Now generate parsed_rows
# One dict per analyzed task: the fields returned by parse_analysis() plus the
# original task text under the "task" key.
parsed_rows = []
for item in rag_based_analyses:
    parsed = parse_analysis(item["analysis"])
    parsed["task"] = item["task"]
    parsed_rows.append(parsed)


In [None]:
import pandas as pd
from openpyxl.utils import get_column_letter
from openpyxl.styles import Alignment, Font
from openpyxl import load_workbook

# --- Create and reorder DataFrame ---
desired_order = ['task', 'deliverable_or_activity', 'expert', 'expert_level', 'task_reason', 'how_to_do', 'time_estimate']
# `columns=` selects and orders the columns in one step, and still produces
# the expected (empty) columns when `parsed_rows` is empty.
df = pd.DataFrame(parsed_rows, columns=desired_order)

# --- Export to Excel ---
excel_path = "rag_task_analysis_detailed.xlsx"
df.to_excel(excel_path, index=False)

# --- Load workbook and format ---
wb = load_workbook(excel_path)
ws = wb.active

# Style headers
header_font = Font(bold=True)
header_alignment = Alignment(horizontal='center', vertical='center', wrap_text=True)
for col_num in range(1, len(df.columns) + 1):
    cell = ws.cell(row=1, column=col_num)
    cell.font = header_font
    cell.alignment = header_alignment

# Auto-adjust column widths & wrap text for all data cells
body_alignment = Alignment(wrap_text=True, vertical='top')
for col_num, column_cells in enumerate(ws.columns, 1):
    max_length = 0
    col_letter = get_column_letter(col_num)

    for cell in column_cells:
        # Row 1 is the header: leave its centred alignment in place, but
        # still let its text count towards the column width.
        if cell.row > 1:
            cell.alignment = body_alignment
        if cell.value is not None:
            max_length = max(max_length, len(str(cell.value)))

    adjusted_width = min(max_length + 4, 60)  # Cap width to keep it tidy
    ws.column_dimensions[col_letter].width = adjusted_width

# Save styled workbook
wb.save(excel_path)

print("✅ Professionally formatted Excel saved as:", excel_path)


✅ Professionally formatted Excel saved as: rag_task_analysis_detailed.xlsx


In [None]:
from google.colab import files
# Send the workbook written by the export cell to the browser. The file name
# is repeated here as a literal and must match the name used when saving.
files.download("rag_task_analysis_detailed.xlsx")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>