## Document processing



In [None]:
%pip install pytesseract
%pip install pdf2image
%pip install fitz pymupdf

In [18]:
import os
import re
import json
import pytesseract
import fitz  # PyMuPDF
from PIL import Image
from pdf2image import convert_from_path
from typing import Dict, List

class MedicalScanPipeline:
    """
    Pipeline for turning an uploaded medical scan report into AI-generated insights.

    The report text is extracted (PyMuPDF for PDFs with an OCR fallback, OCR for
    images), cleaned, wrapped in a short context envelope, and sent to an external
    AI endpoint once per task.
    """

    # Upper bound (seconds) for one request to the AI endpoint, so a stalled
    # connection cannot hang the pipeline indefinitely.
    _AI_TIMEOUT_SECONDS = 60

    def __init__(self, disease_model: str = None, abnormality_model: str = None):
        """
        Store the OCR callable used for images and image-only PDFs.

        Args:
            disease_model: Not used yet; reserved for the disabled model pipeline below.
            abnormality_model: Not used yet; reserved for the disabled model pipeline below.
        """
        self.ocr_processor = pytesseract.image_to_string
        # self.disease_pipeline = self._load_model_pipeline(disease_model)
        # self.abnormality_pipeline = self._load_model_pipeline(abnormality_model)

    def process_scan(self, file_path: str) -> Dict:
        """
        Run the whole pipeline on one document.

        Args:
            file_path: Path to a PDF or an image file.

        Returns:
            Dict with the keys ``lab_key_points``, ``predicted_disease`` and ``abnormalities``.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        extracted_text = self._extract_text(file_path)
        formatted_text = self._format_text(extracted_text)
        context = self._generate_context(formatted_text)
        return self._extract_key_insights(context)

    def _extract_text(self, file_path: str) -> str:
        """
        Return the stripped raw text of a PDF or image file.

        Paths ending in ``.pdf`` (case-insensitive) go through the PDF extractor;
        everything else is opened as an image and passed to the OCR callable.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File {file_path} not found.")

        if file_path.lower().endswith(".pdf"):
            text = self._extract_text_from_pdf(file_path)
        else:
            # Release the image file handle as soon as OCR has finished.
            with Image.open(file_path) as image:
                text = self.ocr_processor(image)

        return text.strip()

    def _extract_text_from_pdf(self, file_path: str) -> str:
        """
        Return the stripped text of a PDF.

        The embedded text layer is read page by page with PyMuPDF. When that
        yields only whitespace, every page is rendered to an image and OCR'd.
        """
        with fitz.open(file_path) as doc:
            extracted_text = "".join(page.get_text("text") + "\n" for page in doc)

        if not extracted_text.strip():  # No text layer: fall back to OCR
            images = convert_from_path(file_path)
            extracted_text = "\n".join(self.ocr_processor(img) for img in images)

        return extracted_text.strip()

    def _format_text(self, text: str) -> str:
        """Drop every character that is not a word character, whitespace, '.' or ',', then collapse whitespace runs to single spaces."""
        text = re.sub(r'[^\w\s.,]', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def _generate_context(self, text: str) -> str:
        """Wrap the cleaned text between a fixed header line and a fixed footer line."""
        return f"Medical Report Analysis:\n{text}\nEnd of Report."

    # This method used to be defined twice with identical bodies; only one
    # definition is kept because the second silently replaced the first.
    def _extract_key_insights(self, context: str) -> Dict:
        """
        Query the AI endpoint once per task and collect the three answers.

        Returns:
            Dict with the keys ``lab_key_points``, ``predicted_disease`` and ``abnormalities``.
        """
        lab_key_points = self._extract_lab_key_points(context)
        predicted_disease = self._predict_disease(context)
        abnormality_analysis = self._analyze_abnormalities(context)

        return {
            "lab_key_points": lab_key_points,
            "predicted_disease": predicted_disease,
            "abnormalities": abnormality_analysis
        }

    def _extract_lab_key_points(self, context: str) -> List[str]:
        """Ask the AI endpoint for the ``extract_lab_values`` task and return its reply."""
        return self._medical_ai(context, task="extract_lab_values")

    def _predict_disease(self, context: str) -> str:
        """Ask the AI endpoint for the ``predict_disease`` task and return its reply."""
        return self._medical_ai(context, task="predict_disease")

    def _analyze_abnormalities(self, context: str) -> str:
        """Ask the AI endpoint for the ``detect_abnormalities`` task and return its reply."""
        return self._medical_ai(context, task="detect_abnormalities")

    def _medical_ai(self, text: str, task: str) -> str:
        """
        POST ``text`` and ``task`` to the configured AI endpoint.

        Configuration is read from the environment (after loading a ``.env``
        file): ``MEDICAL_AI_ENDPOINT``, ``MEDICAL_AI_MODEL`` and ``MEDICAL_AI_OAK``
        (sent as the bearer token).

        Returns:
            The ``output`` field of the JSON reply on HTTP 200
            (``"No relevant insights found."`` when the field is absent), or
            ``"AI processing unavailable"`` when the endpoint is not configured,
            the request fails, or the status is not 200.
        """
        import os
        import requests
        from dotenv import load_dotenv
        load_dotenv()

        endpoint = os.getenv("MEDICAL_AI_ENDPOINT")
        model = os.getenv("MEDICAL_AI_MODEL")
        oak = os.getenv("MEDICAL_AI_OAK")

        # Credentials are deliberately never echoed to stdout or logs.
        if not endpoint:
            return "AI processing unavailable"

        headers = {"Authorization": f"Bearer {oak}", "Content-Type": "application/json"}
        payload = {"input": text, "task": task, "model": model}

        try:
            response = requests.post(
                endpoint, json=payload, headers=headers, timeout=self._AI_TIMEOUT_SECONDS
            )
        except requests.RequestException:
            # Network failures are reported the same way as non-200 replies.
            return "AI processing unavailable"

        if response.status_code == 200:
            return response.json().get("output", "No relevant insights found.")
        return "AI processing unavailable"


# Usage Example:
if __name__ == "__main__":
    # Smoke test of the extraction and cleaning steps only (no AI endpoint call).
    pipeline = MedicalScanPipeline()
    sample_path = "docs/sample_medical_report_2.pdf"
    print(pipeline._format_text(pipeline._extract_text_from_pdf(sample_path)))
    # To run the full pipeline instead:
    # report = pipeline.process_scan(sample_path)
    # print(json.dumps(report, indent=4))


Report Date 28.04.2019 Physician Dr. M. Jaksch Freiburg Medical Lab Laboratory Report Online Version Remarks Note Our reference values are adjusted to age and gender. Daily internal Quality Control within the required range according to ISO 15189. External Quality Control available on request. nonaccredited parameter This parameter is affected by Biotin intake of 5 mg RDI 0.03mg This investigation has been performed in a collaborating accredited laboratory Germany. Page 1 of 3 Patient Name Diabetes Profile sample report Gender Female Date of Birth 01.01.1973 Nationality Your ID Test Request Code 1278 Sample ID Patient IDNo 380032 Sampling Date Time27.04.2019 1709 Receipt Date Time 27.04.2019 1709 This report has been printed through ImedOnlineReportingSystem and therefore does not carry a signature. Insurance Freiburg Medical Laboratory Middle East L.L.C. P.O. Box 3068, Dubai UAE, Tel 04 396 2227, Fax 04 396 2228 Email infofmldubai.com, Website www.fmldubai.com Techn. Validation by Dr.

In [1]:
class Example:
    """Minimal demo contrasting a static method with a class method."""

    class_variable = "Shared"

    @classmethod
    def class_method(cls):
        """Return a message that embeds the class-level ``class_variable``."""
        return f"Class method called, class variable: {cls.class_variable}"

    @staticmethod
    def static_method():
        """Return a fixed message; receives neither the instance nor the class."""
        return "Static method called"

# Both methods are called on the class itself; no instance is needed.
print(Example.static_method(), Example.class_method(), sep="\n")


Static method called
Class method called, class variable: Shared


In [None]:
import os
import re
import json
import pytesseract
import fitz  # PyMuPDF
# import torch
from PIL import Image
from pdf2image import convert_from_path
# from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
from typing import Dict, List

class MedicalScanPipeline:
    """
    Pipeline for turning an uploaded medical scan report into structured insights.

    The report text is extracted (PyMuPDF for PDFs with an OCR fallback, OCR for
    images), cleaned and wrapped in a short context envelope. Lab key points are
    picked out with a keyword filter; disease prediction and abnormality analysis
    are delegated to an external AI endpoint.
    """

    # Upper bound (seconds) for one request to the AI endpoint, so a stalled
    # connection cannot hang the pipeline indefinitely.
    _AI_TIMEOUT_SECONDS = 60

    # "bert-base-uncased" or "distilbert-base-uncased" for faster inference
    def __init__(self, disease_model: str = None, abnormality_model: str = None):
        """
        Store the OCR callable used for images and image-only PDFs.

        Both arguments default to ``None`` because nothing reads them while the
        model pipelines below stay disabled; this lets ``MedicalScanPipeline()``
        be called without arguments.

        Args:
            disease_model: Not used yet; reserved for the disabled model pipeline.
            abnormality_model: Not used yet; reserved for the disabled model pipeline.
        """
        self.ocr_processor = pytesseract.image_to_string
        # self.disease_pipeline = self._load_model_pipeline(disease_model)
        # self.abnormality_pipeline = self._load_model_pipeline(abnormality_model)

    # @staticmethod
    # def _load_model_pipeline(model_name: str):
    #     """Loads a transformer model for classification tasks."""
    #     tokenizer = AutoTokenizer.from_pretrained(model_name)
    #     model = AutoModelForSequenceClassification.from_pretrained(model_name)
    #     return pipeline("text-classification", model=model, tokenizer=tokenizer)

    def process_scan(self, file_path: str) -> Dict:
        """
        Run the whole pipeline on one document.

        Args:
            file_path: Path to a PDF or an image file.

        Returns:
            Dict with the keys ``lab_key_points``, ``predicted_disease`` and ``abnormalities``.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        extracted_text = self._extract_text(file_path)
        formatted_text = self._format_text(extracted_text)
        context = self._generate_context(formatted_text)
        return self._extract_key_insights(context)

    def _extract_text(self, file_path: str) -> str:
        """
        Return the stripped raw text of a PDF or image file.

        Paths ending in ``.pdf`` (case-insensitive) go through the PDF extractor;
        everything else is opened as an image and passed to the OCR callable.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File {file_path} not found.")

        if file_path.lower().endswith(".pdf"):
            text = self._extract_text_from_pdf(file_path)
        else:
            # Release the image file handle as soon as OCR has finished.
            with Image.open(file_path) as image:
                text = self.ocr_processor(image)

        return text.strip()

    def _extract_text_from_pdf(self, file_path: str) -> str:
        """
        Return the stripped text of a PDF.

        The embedded text layer is read page by page with PyMuPDF. When that
        yields only whitespace, every page is rendered to an image and OCR'd.
        """
        with fitz.open(file_path) as doc:
            extracted_text = "".join(page.get_text("text") + "\n" for page in doc)

        if not extracted_text.strip():  # No text layer: fall back to OCR
            images = convert_from_path(file_path)
            extracted_text = "\n".join(self.ocr_processor(img) for img in images)

        return extracted_text.strip()

    def _format_text(self, text: str) -> str:
        """Drop every character that is not a word character, whitespace, '.' or ',', then collapse whitespace runs to single spaces."""
        text = re.sub(r'[^\w\s.,]', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def _generate_context(self, text: str) -> str:
        """Wrap the cleaned text between a fixed header line and a fixed footer line."""
        return f"Medical Report Analysis:\n{text}\nEnd of Report."

    def _extract_key_insights(self, context: str) -> Dict:
        """
        Collect lab key points, the disease prediction and the abnormality analysis.

        Returns:
            Dict with the keys ``lab_key_points``, ``predicted_disease`` and ``abnormalities``.
        """
        lab_key_points = self._extract_lab_key_points(context)
        predicted_disease = self._predict_disease(context)
        abnormality_analysis = self._analyze_abnormalities(context)

        return {
            "lab_key_points": lab_key_points,
            "predicted_disease": predicted_disease,
            "abnormalities": abnormality_analysis
        }

    def _extract_lab_key_points(self, context: str) -> List[str]:
        """Return the '.'-separated fragments of ``context`` that mention 'value' or 'level' (case-insensitive)."""
        return [
            sentence
            for sentence in context.split('.')
            if 'value' in sentence.lower() or 'level' in sentence.lower()
        ]

    def _predict_disease(self, context: str) -> str:
        """Ask the AI endpoint for the ``predict_disease`` task and return its reply."""
        # The task name distinguishes this request from the abnormality request;
        # without it both calls would send identical payloads.
        return self._medical_ai(context, task="predict_disease")

    def _analyze_abnormalities(self, context: str) -> str:
        """Ask the AI endpoint for the ``detect_abnormalities`` task and return its reply."""
        return self._medical_ai(context, task="detect_abnormalities")

    def _medical_ai(self, text: str, task: str = None) -> str:
        """
        POST ``text`` to the AI endpoint configured in the environment.

        ``MEDICAL_AI_ENDPOINT`` is the URL and ``MEDICAL_AI_KEY`` is sent as the
        bearer token.

        Args:
            text: The context to analyse.
            task: Optional task name; added to the payload as ``task`` when given.

        Returns:
            The ``output`` field of the JSON reply on HTTP 200
            (``"No relevant insights found."`` when the field is absent), or
            ``"AI processing unavailable"`` when the endpoint is not configured,
            the request fails, or the status is not 200.
        """
        import os

        endpoint = os.getenv("MEDICAL_AI_ENDPOINT")
        api_key = os.getenv("MEDICAL_AI_KEY")
        if not endpoint:
            # Nothing to call; report it the same way as an unreachable service.
            return "AI processing unavailable"

        import requests

        headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
        payload = {"input": text, "model": "gpt-4-medical"}
        if task is not None:
            payload["task"] = task

        try:
            response = requests.post(
                endpoint, json=payload, headers=headers, timeout=self._AI_TIMEOUT_SECONDS
            )
        except requests.RequestException:
            return "AI processing unavailable"

        if response.status_code == 200:
            return response.json().get("output", "No relevant insights found.")
        return "AI processing unavailable"

# Usage Example:
if __name__ == "__main__":
    # The constructor in this cell declares both model names as parameters, so
    # pass them explicitly; calling it with no arguments raises TypeError.
    pipeline = MedicalScanPipeline(
        disease_model="bert-base-uncased",
        abnormality_model="distilbert-base-uncased",
    )
    report = pipeline.process_scan("docs/sample_medical_report_2.pdf")
    print(json.dumps(report, indent=4))


In [None]:

class LabReportAgent:
    """Thin wrapper that requests the ``extract_lab_values`` task from ``MedicalAI``."""

    @staticmethod
    def extract(context: str) -> List[str]:
        """Forward ``context`` to ``MedicalAI.request`` and return its reply unchanged."""
        lab_values = MedicalAI.request(context, task="extract_lab_values")
        return lab_values

class DiseasePredictionAgent:
    """Thin wrapper that requests the ``predict_disease`` task from ``MedicalAI``."""

    @staticmethod
    def predict(context: str) -> str:
        """Forward ``context`` to ``MedicalAI.request`` and return its reply unchanged."""
        prediction = MedicalAI.request(context, task="predict_disease")
        return prediction

class AbnormalityDetectionAgent:
    """Thin wrapper that requests the ``detect_abnormalities`` task from ``MedicalAI``."""

    @staticmethod
    def analyze(context: str) -> str:
        """Forward ``context`` to ``MedicalAI.request`` and return its reply unchanged."""
        findings = MedicalAI.request(context, task="detect_abnormalities")
        return findings

class MedicalAI:
    """Client for the external medical-insights HTTP endpoint."""

    # Upper bound (seconds) for one request, so a stalled connection cannot
    # hang the caller indefinitely.
    _TIMEOUT_SECONDS = 60

    @staticmethod
    def request(text: str, task: str) -> str:
        """
        POST ``text`` and ``task`` to the endpoint configured in the environment.

        ``MEDICAL_AI_ENDPOINT`` is the URL and ``MEDICAL_AI_KEY`` is sent as the
        bearer token.

        Args:
            text: The context to analyse.
            task: Task name forwarded in the payload (e.g. ``"predict_disease"``).

        Returns:
            The ``output`` field of the JSON reply on HTTP 200
            (``"No relevant insights found."`` when the field is absent), or
            ``"AI processing unavailable"`` when the endpoint is not configured,
            the request fails, or the status is not 200.
        """
        import os

        endpoint = os.getenv("MEDICAL_AI_ENDPOINT")
        api_key = os.getenv("MEDICAL_AI_KEY")
        if not endpoint:
            # Nothing to call; report it the same way as an unreachable service.
            return "AI processing unavailable"

        import requests

        headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
        payload = {"input": text, "task": task, "model": "gpt-4-medical"}

        try:
            response = requests.post(
                endpoint, json=payload, headers=headers, timeout=MedicalAI._TIMEOUT_SECONDS
            )
        except requests.RequestException:
            return "AI processing unavailable"

        if response.status_code == 200:
            return response.json().get("output", "No relevant insights found.")
        return "AI processing unavailable"

# Usage Example:
if __name__ == "__main__":
    pipeline = MedicalScanPipeline()
    # Attach one agent instance per analysis task.
    # NOTE(review): none of the MedicalScanPipeline definitions visible in this
    # notebook read these attributes - confirm they are actually consumed.
    setattr(pipeline, "lab_agent", LabReportAgent())
    setattr(pipeline, "disease_agent", DiseasePredictionAgent())
    setattr(pipeline, "abnormality_agent", AbnormalityDetectionAgent())

    report = pipeline.process_scan("sample_medical_report.pdf")
    print(json.dumps(report, indent=4))

## Python API

In [None]:
import os
import re
import json
import pytesseract
import fitz  # PyMuPDF
import torch
from PIL import Image
from pdf2image import convert_from_path
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
from typing import Dict, List
import importlib.util

class MedicalScanPipeline:
    """
    Pipeline for turning an uploaded medical scan report into AI-generated insights.

    The report text is extracted (PyMuPDF for PDFs with an OCR fallback, OCR for
    images), cleaned, wrapped in a short context envelope, and sent to an OpenAI
    chat model once per task.
    """

    def __init__(self, disease_model: str = "bert-base-uncased", abnormality_model: str = "distilbert-base-uncased"):
        """
        Set up OCR, two text-classification pipelines and the AI client.

        Args:
            disease_model: Model name passed to ``_load_model_pipeline``.
            abnormality_model: Model name passed to ``_load_model_pipeline``.

        NOTE(review): no method visible in this class reads ``disease_pipeline``
        or ``abnormality_pipeline`` - confirm they are needed before paying the
        model-loading cost.
        """
        self.ocr_processor = pytesseract.image_to_string
        self.disease_pipeline = self._load_model_pipeline(disease_model)
        self.abnormality_pipeline = self._load_model_pipeline(abnormality_model)
        self.ai_client = self._initialize_ai_client()

    @staticmethod
    def _load_model_pipeline(model_name: str):
        """Build a ``text-classification`` pipeline from the tokenizer and sequence-classification model named ``model_name``."""
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
        return pipeline("text-classification", model=model, tokenizer=tokenizer)

    def _initialize_ai_client(self):
        """
        Create the OpenAI client, or return ``None`` when it cannot be created.

        ``None`` is returned when ``OPENAI_API_KEY`` is unset or the ``openai``
        package is not installed; ``_medical_ai`` then reports the service as
        unavailable instead of the constructor raising.
        """
        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            return None
        try:
            # import_module registers the package in sys.modules, unlike a
            # manual module_from_spec/exec_module, which runs it unregistered.
            ai_module = importlib.import_module("openai")
        except ImportError:
            return None
        return ai_module.OpenAI(api_key=api_key)

    def process_scan(self, file_path: str) -> Dict:
        """
        Run the whole pipeline on one document.

        Args:
            file_path: Path to a PDF or an image file.

        Returns:
            Dict with the keys ``lab_key_points``, ``predicted_disease`` and ``abnormalities``.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        extracted_text = self._extract_text(file_path)
        formatted_text = self._format_text(extracted_text)
        context = self._generate_context(formatted_text)
        return self._extract_key_insights(context)

    def _extract_text(self, file_path: str) -> str:
        """
        Return the stripped raw text of a PDF or image file.

        Paths ending in ``.pdf`` (case-insensitive) go through the PDF extractor;
        everything else is opened as an image and passed to the OCR callable.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File {file_path} not found.")

        if file_path.lower().endswith(".pdf"):
            text = self._extract_text_from_pdf(file_path)
        else:
            # Release the image file handle as soon as OCR has finished.
            with Image.open(file_path) as image:
                text = self.ocr_processor(image)

        return text.strip()

    def _extract_text_from_pdf(self, file_path: str) -> str:
        """
        Return the stripped text of a PDF.

        The embedded text layer is read page by page with PyMuPDF. When that
        yields only whitespace, every page is rendered to an image and OCR'd.
        """
        with fitz.open(file_path) as doc:
            extracted_text = "".join(page.get_text("text") + "\n" for page in doc)

        if not extracted_text.strip():  # No text layer: fall back to OCR
            images = convert_from_path(file_path)
            extracted_text = "\n".join(self.ocr_processor(img) for img in images)

        return extracted_text.strip()

    def _format_text(self, text: str) -> str:
        """Drop every character that is not a word character, whitespace, '.' or ',', then collapse whitespace runs to single spaces."""
        text = re.sub(r'[^\w\s.,]', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def _generate_context(self, text: str) -> str:
        """Wrap the cleaned text between a fixed header line and a fixed footer line."""
        return f"Medical Report Analysis:\n{text}\nEnd of Report."

    def _extract_key_insights(self, context: str) -> Dict:
        """
        Query the AI model once per task and collect the three answers.

        Returns:
            Dict with the keys ``lab_key_points``, ``predicted_disease`` and ``abnormalities``.
        """
        lab_key_points = self._extract_lab_key_points(context)
        predicted_disease = self._predict_disease(context)
        abnormality_analysis = self._analyze_abnormalities(context)

        return {
            "lab_key_points": lab_key_points,
            "predicted_disease": predicted_disease,
            "abnormalities": abnormality_analysis
        }

    def _extract_lab_key_points(self, context: str) -> List[str]:
        """Ask the AI model for the ``extract_lab_values`` task and return its reply."""
        return self._medical_ai(context, task="extract_lab_values")

    def _predict_disease(self, context: str) -> str:
        """Ask the AI model for the ``predict_disease`` task and return its reply."""
        return self._medical_ai(context, task="predict_disease")

    def _analyze_abnormalities(self, context: str) -> str:
        """Ask the AI model for the ``detect_abnormalities`` task and return its reply."""
        return self._medical_ai(context, task="detect_abnormalities")

    def _medical_ai(self, text: str, task: str) -> str:
        """
        Send ``task`` and ``text`` to the ``gpt-4o-mini`` chat model.

        Returns:
            The stripped content of the first choice, or
            ``"AI processing unavailable"`` when no client was created or the
            reply carries no text content.
        """
        if not self.ai_client:
            return "AI processing unavailable"
        response = self.ai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are an AI specialized in medical report analysis."},
                {"role": "user", "content": f"Task: {task}\nText: {text}"}
            ]
        )
        content = response.choices[0].message.content
        if content is None:
            # The message content may be absent; avoid calling .strip() on None.
            return "AI processing unavailable"
        return content.strip()

# Usage Example:
if __name__ == "__main__":
    # Do not name this variable `pipeline`: that would rebind the
    # `transformers.pipeline` factory imported in this cell, which
    # MedicalScanPipeline._load_model_pipeline calls on every construction.
    scan_pipeline = MedicalScanPipeline()
    report = scan_pipeline.process_scan("sample_medical_report.pdf")
    print(json.dumps(report, indent=4))


## curl

In [None]:
import os
import re
import json
import pytesseract
import fitz  # PyMuPDF
import torch
from PIL import Image
from pdf2image import convert_from_path
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
from typing import Dict, List
import requests

class MedicalScanPipeline:
    """
    Pipeline for turning an uploaded medical scan report into AI-generated insights.

    The report text is extracted (PyMuPDF for PDFs with an OCR fallback, OCR for
    images), cleaned, wrapped in a short context envelope, and sent to the OpenAI
    chat-completions HTTP API once per task.
    """

    # Upper bound (seconds) for one HTTP request to the chat-completions API,
    # so a stalled connection cannot hang the pipeline indefinitely.
    _AI_TIMEOUT_SECONDS = 60

    def __init__(self, disease_model: str = "bert-base-uncased", abnormality_model: str = "distilbert-base-uncased"):
        """
        Set up OCR and two text-classification pipelines.

        Args:
            disease_model: Model name passed to ``_load_model_pipeline``.
            abnormality_model: Model name passed to ``_load_model_pipeline``.

        NOTE(review): no method visible in this class reads ``disease_pipeline``
        or ``abnormality_pipeline`` - confirm they are needed before paying the
        model-loading cost.
        """
        self.ocr_processor = pytesseract.image_to_string
        self.disease_pipeline = self._load_model_pipeline(disease_model)
        self.abnormality_pipeline = self._load_model_pipeline(abnormality_model)

    @staticmethod
    def _load_model_pipeline(model_name: str):
        """Build a ``text-classification`` pipeline from the tokenizer and sequence-classification model named ``model_name``."""
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
        return pipeline("text-classification", model=model, tokenizer=tokenizer)

    def process_scan(self, file_path: str) -> Dict:
        """
        Run the whole pipeline on one document.

        Args:
            file_path: Path to a PDF or an image file.

        Returns:
            Dict with the keys ``lab_key_points``, ``predicted_disease`` and ``abnormalities``.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        extracted_text = self._extract_text(file_path)
        formatted_text = self._format_text(extracted_text)
        context = self._generate_context(formatted_text)
        return self._extract_key_insights(context)

    def _extract_text(self, file_path: str) -> str:
        """
        Return the stripped raw text of a PDF or image file.

        Paths ending in ``.pdf`` (case-insensitive) go through the PDF extractor;
        everything else is opened as an image and passed to the OCR callable.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File {file_path} not found.")

        if file_path.lower().endswith(".pdf"):
            text = self._extract_text_from_pdf(file_path)
        else:
            # Release the image file handle as soon as OCR has finished.
            with Image.open(file_path) as image:
                text = self.ocr_processor(image)

        return text.strip()

    def _extract_text_from_pdf(self, file_path: str) -> str:
        """
        Return the stripped text of a PDF.

        The embedded text layer is read page by page with PyMuPDF. When that
        yields only whitespace, every page is rendered to an image and OCR'd.
        """
        with fitz.open(file_path) as doc:
            extracted_text = "".join(page.get_text("text") + "\n" for page in doc)

        if not extracted_text.strip():  # No text layer: fall back to OCR
            images = convert_from_path(file_path)
            extracted_text = "\n".join(self.ocr_processor(img) for img in images)

        return extracted_text.strip()

    def _format_text(self, text: str) -> str:
        """Drop every character that is not a word character, whitespace, '.' or ',', then collapse whitespace runs to single spaces."""
        text = re.sub(r'[^\w\s.,]', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def _generate_context(self, text: str) -> str:
        """Wrap the cleaned text between a fixed header line and a fixed footer line."""
        return f"Medical Report Analysis:\n{text}\nEnd of Report."

    def _extract_key_insights(self, context: str) -> Dict:
        """
        Query the AI model once per task and collect the three answers.

        Returns:
            Dict with the keys ``lab_key_points``, ``predicted_disease`` and ``abnormalities``.
        """
        lab_key_points = self._extract_lab_key_points(context)
        predicted_disease = self._predict_disease(context)
        abnormality_analysis = self._analyze_abnormalities(context)

        return {
            "lab_key_points": lab_key_points,
            "predicted_disease": predicted_disease,
            "abnormalities": abnormality_analysis
        }

    def _extract_lab_key_points(self, context: str) -> List[str]:
        """Ask the AI model for the ``extract_lab_values`` task and return its reply."""
        return self._medical_ai(context, task="extract_lab_values")

    def _predict_disease(self, context: str) -> str:
        """Ask the AI model for the ``predict_disease`` task and return its reply."""
        return self._medical_ai(context, task="predict_disease")

    def _analyze_abnormalities(self, context: str) -> str:
        """Ask the AI model for the ``detect_abnormalities`` task and return its reply."""
        return self._medical_ai(context, task="detect_abnormalities")

    def _medical_ai(self, text: str, task: str) -> str:
        """
        POST ``task`` and ``text`` to the OpenAI chat-completions endpoint.

        The bearer token is read from ``OPENAI_API_KEY``.

        Returns:
            The ``content`` of the first choice's message, or
            ``"AI processing unavailable"`` when the key is unset, the request
            fails, the reply is not JSON, or it carries no choices.
        """
        api_url = "https://api.openai.com/v1/chat/completions"
        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            # Without a key the API can only reject the call; skip the round trip.
            return "AI processing unavailable"

        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }
        data = {
            "model": "gpt-4o-mini",
            "messages": [
                {"role": "system", "content": "You are an AI specialized in medical report analysis."},
                {"role": "user", "content": f"Task: {task}\nText: {text}"}
            ]
        }
        try:
            response = requests.post(
                api_url, json=data, headers=headers, timeout=self._AI_TIMEOUT_SECONDS
            )
            body = response.json()
        except (requests.RequestException, ValueError):
            # Covers network failures/timeouts and non-JSON response bodies.
            return "AI processing unavailable"

        # `or [{}]` also covers an empty "choices" list, which would otherwise
        # raise IndexError on the [0] lookup.
        choices = body.get("choices") or [{}]
        return choices[0].get("message", {}).get("content", "AI processing unavailable")

# Usage Example:
if __name__ == "__main__":
    # Do not name this variable `pipeline`: that would rebind the
    # `transformers.pipeline` factory imported in this cell, which
    # MedicalScanPipeline._load_model_pipeline calls on every construction.
    scan_pipeline = MedicalScanPipeline()
    report = scan_pipeline.process_scan("sample_medical_report.pdf")
    print(json.dumps(report, indent=4))
