<a href="https://colab.research.google.com/github/Bint7bara/EnsembleLearningProject/blob/master/OllamaFaster_Pipe.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

#We are processing 9000 PDFs stored in /home/shared/SAMPLEFILES with quotes and spaces in filenames. The goal is to:

#Extract text from PDFs efficiently using pymupdf (fast, handles corrupt PDFs).
#Run all PDFs against multiple prompts using Ollama (LLaMA 3.2 3B).
#Use GPU acceleration for super-fast processing.
#Categorize responses into StrongMatch, PartialMatch, or NoMatch.
#Save results into structured folders (/home/shared/CategorizedPDFs/).
#Export a CSV file containing the filename, prompt number, prompt used, match category, confidence score, and response text.

In [None]:
# Why This is Faster
# Uses pymupdf (Fitz) instead of pdfplumber – 3-5x faster
# Ray for parallelization – utilizes all available CPU cores
# Handles corrupt PDFs gracefully (logs errors, skips bad files)
# Expected runtime: 20-40 minutes (instead of 3 hours)

In [None]:
# Step 1: Extract Text from PDFs with pymupdf (Handles Errors; OCR fallback for failed files runs in Step 2)

import os
import json
import re
import fitz  # pymupdf - Fast PDF processing
import pytesseract  # OCR for scanned PDFs
from pdf2image import convert_from_path  # Converts PDF to images
import ray  # Distributed computing

# Initialize Ray for Parallel Execution
# ignore_reinit_error=True lets this cell be re-run in a live kernel without
# raising; num_cpus is set to whatever os.cpu_count() reports on this machine.
ray.init(ignore_reinit_error=True, num_cpus=os.cpu_count())

# Define Paths
# Directory that is scanned for input PDFs.
pdf_dir = "/home/shared/SAMPLEFILES_100files"
# JSON file that receives the successful text extractions.
output_file = "extracted_text.json"
# Log file listing failed PDFs as "<filename>: <error>" entries.
failed_pdfs_file = "failed_pdfs.log"
# JSON file that receives the OCR results for the previously failed PDFs.
reprocessed_pdfs_file = "reprocessed_pdfs.json"

# Function to clean filenames (strip quotes and spaces)
def clean_filename(filename):
    """
    Normalise a file name.

    Surrounding whitespace is trimmed first, then any double-quote characters
    left at either end are trimmed. Characters inside the name are untouched.
    """
    without_outer_whitespace = filename.strip()
    return without_outer_whitespace.strip('"')

# Function to extract text from PDFs
@ray.remote
def extract_text_from_pdf(filename):
    """
    Extracts text from a PDF file using pymupdf (fitz) for faster processing.

    Args:
        filename: Name of a PDF inside ``pdf_dir``. It is normalised with
            ``clean_filename`` before the path is built.

    Returns:
        dict: On success, a dict with the keys ``filename``, ``text``,
        ``proposal_number`` and ``award_number`` (each number is ``None`` when
        its pattern does not occur in the text). On failure, a dict with the
        keys ``filename`` and ``error``. Exceptions are never propagated; they
        are reported through the ``error`` key.
    """
    filename = clean_filename(filename)
    pdf_path = os.path.join(pdf_dir, filename)

    if not os.path.isfile(pdf_path):
        return {"filename": filename, "error": "File not found"}

    try:
        # The context manager closes the document (and its file handle) even
        # when reading a page raises, so long runs do not leak open files.
        with fitz.open(pdf_path) as doc:
            text = "\n".join(page.get_text("text") for page in doc)

        if not text.strip():
            return {"filename": filename, "error": "No text extracted"}

        # Extract proposal and award numbers using regex (first occurrence only)
        proposal_match = re.search(r"Proposal\s*#?\s*(\d+)", text)
        award_match = re.search(r"Award\s*#?\s*(\d+)", text)

        return {
            "filename": filename,
            "text": text,
            "proposal_number": proposal_match.group(1) if proposal_match else None,
            "award_number": award_match.group(1) if award_match else None
        }
    except Exception as e:
        # Corrupt/unreadable PDFs are reported to the caller instead of
        # aborting the whole parallel batch.
        return {"filename": filename, "error": str(e)}

# Get list of all PDFs, cleaning filenames.
# - sorted(): os.listdir returns entries in arbitrary order; sorting makes the
#   processing order and the saved JSON/log reproducible between runs.
# - lower(): the extension check is case-insensitive so "*.PDF" is not skipped.
pdf_files = sorted(
    clean_filename(f) for f in os.listdir(pdf_dir) if f.lower().endswith(".pdf")
)

# Run text extraction in parallel using Ray (one task per PDF)
results = ray.get([extract_text_from_pdf.remote(f) for f in pdf_files])

# Separate successful and failed extractions (failures carry an "error" key)
successful_extractions = [r for r in results if "error" not in r]
failed_extractions = [r for r in results if "error" in r]

# Save extracted text to JSON
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(successful_extractions, f, ensure_ascii=False, indent=4)

# Log failed PDFs, one "<filename>: <error>" entry per line.
with open(failed_pdfs_file, "w", encoding="utf-8") as log:
    for failure in failed_extractions:
        # Collapse newlines/whitespace runs: the log is read back line by line,
        # so a multi-line error message would otherwise be parsed as extra entries.
        error_message = " ".join(str(failure["error"]).split())
        log.write(f"{failure['filename']}: {error_message}\n")

print(f"✅ Extracted text from {len(successful_extractions)} PDFs and saved to {output_file}")
print(f"⚠️ Logged {len(failed_extractions)} failed PDFs in {failed_pdfs_file}")


2025-01-31 21:26:44,448	INFO worker.py:1832 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8266 [39m[22m


✅ Extracted text from 93 PDFs and saved to extracted_text.json
⚠️ Logged 7 failed PDFs in failed_pdfs.log


In [None]:
#  Step 2: Process Failed PDFs Using OCR
# Load Failed PDFs
# Each log line has the form "<filename>: <error>".
# - Split once on the first ": " (colon + space) instead of on every ":" so a
#   filename containing a bare colon is not truncated and colons inside the
#   error message are ignored. A filename that itself contains ": " would
#   still be cut short.
# - Read with the same UTF-8 encoding the log was written with.
# - Skip blank lines so they do not become empty filenames.
failed_pdfs = []
with open(failed_pdfs_file, "r", encoding="utf-8") as f:
    failed_pdfs = [
        clean_filename(line.split(": ", 1)[0])
        for line in f
        if line.strip()
    ]

# Function to extract text using OCR
@ray.remote
def ocr_extract_text(pdf_filename):
    """
    OCR fallback for PDFs that yielded no usable text.

    Every page of the PDF is rendered to an image at 300 dpi, Tesseract is run
    on each image, and the per-page strings are joined with newlines.

    Returns a dict with "filename" and "text" on success, or "filename" and
    "error" when the file is missing, OCR produced only whitespace, or any
    exception was raised during conversion/recognition.
    """
    source_path = os.path.join(pdf_dir, pdf_filename)

    # Guard: nothing to OCR if the file is not there.
    if not os.path.isfile(source_path):
        return {"filename": pdf_filename, "error": "File not found"}

    try:
        page_images = convert_from_path(source_path, dpi=300)

        page_texts = []
        for page_image in page_images:
            page_texts.append(pytesseract.image_to_string(page_image))
        combined_text = "\n".join(page_texts)

        if combined_text.strip():
            return {"filename": pdf_filename, "text": combined_text}

        return {"filename": pdf_filename, "error": "OCR failed - No text found"}

    except Exception as ocr_error:
        return {"filename": pdf_filename, "error": str(ocr_error)}

# Submit one OCR task per failed PDF, then wait for all of them to finish
ocr_tasks = [ocr_extract_text.remote(failed_name) for failed_name in failed_pdfs]
ocr_results = ray.get(ocr_tasks)

# Persist every OCR outcome (successes and errors alike) as JSON
with open(reprocessed_pdfs_file, "w", encoding="utf-8") as f:
    json.dump(
        ocr_results,
        f,
        ensure_ascii=False,
        indent=4,
    )


KeyboardInterrupt: 

In [None]:
# Step 3: Load Prompts for Querying

# Directory holding one prompt per ".txt" file
prompts_dir = "/home/shared/Prompts"
# Maps prompt file name -> prompt text (insertion order = sorted file name order)
prompts = {}

# sorted(): os.listdir returns entries in arbitrary order. Later cells number
# the prompts by their position in this dict, so a stable order is needed for
# the prompt numbers to mean the same thing on every run.
for prompt_file in sorted(os.listdir(prompts_dir)):
    prompt_path = os.path.join(prompts_dir, prompt_file)

    # Only regular ".txt" files are treated as prompts
    if os.path.isfile(prompt_path) and prompt_file.endswith(".txt"):
        with open(prompt_path, "r", encoding="utf-8") as f:
            prompts[prompt_file] = f.read().strip()

print(f"✅ Loaded {len(prompts)} prompts from {prompts_dir}")


✅ Loaded 8 prompts from /home/shared/Prompts


In [None]:
# Step 4: Query Ollama (LLaMA 3.2 3B) with every document/prompt pair and extract the confidence score from each response

import ollama
import ray
import re  # ✅ Needed to extract confidence score

# Initialize Ray for GPU parallel execution
# NOTE(review): Ray is already initialised in the Step 1 cell. With
# ignore_reinit_error=True a repeated ray.init() presumably leaves the running
# instance untouched, in which case num_gpus=1 would not take effect here —
# confirm against the Ray documentation.
ray.init(ignore_reinit_error=True, num_gpus=1)

@ray.remote
def query_ollama(text, prompt_text, prompt_name, filename):
    """
    Queries Ollama using extracted text and a given prompt.
    Runs queries in parallel using Ray.

    Args:
        text: Document text, sent as the "user" message.
        prompt_text: Prompt body, sent as the "system" message.
        prompt_name: Short identifier of the prompt; stored in the result.
        filename: Name of the source PDF; stored in the result.

    Returns:
        dict: Keys ``filename``, ``prompt``, ``response`` and
        ``confidence_score``. ``confidence_score`` is an ``int`` when the
        response contains a "confidence score: <digits>" statement and the
        string "N/A" otherwise. If the call raises, ``response`` is
        "Error: <message>" and ``confidence_score`` is "N/A".
    """
    try:
        messages = [{"role": "system", "content": prompt_text}, {"role": "user", "content": text}]
        response = ollama.chat(model="llama3.2:3b", messages=messages)

        # Extract response text safely. "or" also covers a content value that is
        # present but None/empty, which would otherwise fail on .strip() or
        # yield an empty response.
        raw_content = response.get("message", {}).get("content", "No response from Ollama")
        response_text = (raw_content or "No response from Ollama").strip()

        # Extract confidence score from response using regex.
        # Optional Markdown emphasis characters (*, _, `) are tolerated around
        # the label and the number, e.g. "**Confidence Score:** 85" or
        # "Confidence score: **85**". Plain "confidence score: 85" matches
        # exactly as before.
        confidence_match = re.search(
            r"confidence score\s*[*_`]*\s*:\s*[*_`]*\s*(\d+)",
            response_text,
            re.IGNORECASE,
        )
        confidence_score = int(confidence_match.group(1)) if confidence_match else "N/A"  # Default to "N/A" if missing

        return {
            "filename": filename,
            "prompt": prompt_name,  # Store only the prompt name here
            "response": response_text,  # Full response from Ollama
            "confidence_score": confidence_score  # Extracted confidence score
        }
    except Exception as e:
        # Report the failure in-band so one bad query does not abort the batch
        return {
            "filename": filename,
            "prompt": prompt_name,
            "response": f"Error: {str(e)}",
            "confidence_score": "N/A"  # Assign "N/A" on failure
        }

# Fan out one Ray task per (document, prompt) pair: documents in the outer
# loop, prompts in the inner loop.
query_tasks = [
    query_ollama.remote(
        extraction["text"],
        prompt_body,
        prompt_key,
        extraction["filename"],
    )
    for extraction in successful_extractions
    for prompt_key, prompt_body in prompts.items()
]

# Wait for every task and collect the result dicts
responses = ray.get(query_tasks)


In [None]:
# Step 5: Categorizing Responses

def categorize_match(response):
    """
    Map a model response to a match category.

    The check is case-insensitive and looks for the phrase "strongly related"
    first, then "partially related". Anything else, including non-string
    input, is classified as "NoMatch".
    """
    if not isinstance(response, str):
        return "NoMatch"

    lowered = response.lower()

    # Ordered from strongest to weakest; the first phrase found wins.
    phrase_to_category = (
        ("strongly related", "StrongMatch"),
        ("partially related", "PartialMatch"),
    )
    for phrase, category in phrase_to_category:
        if phrase in lowered:
            return category

    return "NoMatch"

# Tag every response dict in place with its match category
for entry in responses:
    entry.update(match_status=categorize_match(entry["response"]))


In [None]:
# Step 6: Copy PDFs into category folders and export the results to CSV

import os
import shutil
import pandas as pd

# Define directories for categorized PDFs
categorized_dir = "/home/shared/CategorizedPDFs"

# Ensure directories exist for each category
for category in ["StrongMatch", "PartialMatch", "NoMatch"]:
    os.makedirs(os.path.join(categorized_dir, category), exist_ok=True)

# Assign numbers to prompts using their short names (1-based, in dict order)
prompt_numbers = {prompt_name: i+1 for i, prompt_name in enumerate(prompts.keys())}

# Create a list to store CSV data
csv_data = []

# (filename, match_status) pairs already handled. Every PDF appears once per
# prompt in `responses`, so without this the same file would be re-copied
# into the same category folder for every prompt that gave the same category.
copied_pairs = set()

for res in responses:
    filename = res["filename"]
    match_status = res["match_status"]
    response_text = res.get("response", "No response recorded")  # Avoids KeyError
    prompt_used = res["prompt"]  # Short prompt name
    confidence_score = res.get("confidence_score", "N/A")  # Avoids KeyError if missing

    # Unknown prompt names fall back to "N/A" instead of raising
    prompt_number = prompt_numbers.get(prompt_used, "N/A")

    # Copy PDF into the appropriate folder based on match status (once per pair)
    copy_key = (filename, match_status)
    if copy_key not in copied_pairs:
        copied_pairs.add(copy_key)
        try:
            shutil.copy2(os.path.join(pdf_dir, filename), os.path.join(categorized_dir, match_status))
        except OSError as copy_error:
            # A missing/unreadable source file must not abort the loop and
            # lose the CSV export; report it and carry on.
            print(f"⚠️ Could not copy {filename} to {match_status}: {copy_error}")

    # Append data to CSV list
    csv_data.append({
        "Filename": filename,
        "Prompt Number": prompt_number,
        "Prompt Used": prompt_used,
        "Match Status": match_status,
        "Confidence Score": confidence_score,  # No heuristic, only from Ollama
        "Response Text": response_text
    })

# Convert data to DataFrame
df = pd.DataFrame(csv_data)

# Save to CSV file
csv_output_path = os.path.join(categorized_dir, "results.csv")
df.to_csv(csv_output_path, index=False)

print(f"✅ Results saved to CSV: {csv_output_path}")


✅ Results saved to CSV: /home/shared/CategorizedPDFs/results.csv
