# SOAPNote AI
## Doctor Dictation to Structured Clinical Notes
Convert audio dictations into structured SOAP notes using Whisper and LangChain.

In [None]:
# Imports
import os
from typing import Optional
from faster_whisper import WhisperModel
from pydantic import BaseModel, Field
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser

In [None]:
# Configuration
# Make sure OPENAI_API_KEY is set in the environment, or uncomment the line below.
# os.environ["OPENAI_API_KEY"] = "sk-..."

# Settings consumed by run_soap_pipeline when it calls transcribe_audio.
MODEL_SIZE = "tiny" # "tiny" or "base" for speed on CPU/Mac
# Sample dictation, given relative to the notebook's working directory.
AUDIO_PATH = "../data/sample_dictation.mp3"
DEVICE = "cpu" # or "cuda" if available
COMPUTE_TYPE = "int8" # or "float16"

In [None]:
# 1. Transcription Logic
def transcribe_audio(audio_path, model_size: str = "base", device: str = "cpu", compute_type: str = "int8") -> str:
    """Transcribe an audio input to plain text with a faster-whisper model.

    Args:
        audio_path: Audio input forwarded unchanged to
            ``WhisperModel.transcribe`` (this notebook passes a file path).
        model_size: Model identifier handed to ``WhisperModel``.
        device: Device string handed to ``WhisperModel`` (e.g. ``"cpu"``).
        compute_type: Compute type handed to ``WhisperModel``
            (e.g. ``"int8"``).

    Returns:
        The text of every transcribed segment, separated by single spaces,
        with no leading or trailing whitespace. Empty string when no segment
        produced any text.
    """
    print(f"Loading Whisper model: {model_size}...")
    model = WhisperModel(model_size, device=device, compute_type=compute_type)

    print(f"Transcribing {audio_path}...")
    # transcribe() returns a (segments, info) pair; only the segments are used.
    segments, _ = model.transcribe(audio_path, beam_size=5)

    # A segment's text may carry its own leading/trailing whitespace, so strip
    # each piece before joining; otherwise the separator doubles up. Pieces
    # that are empty after stripping are dropped for the same reason.
    pieces = (segment.text.strip() for segment in segments)
    return " ".join(piece for piece in pieces if piece)

# Test Transcription (Uncomment to run independently)
# text = transcribe_audio(AUDIO_PATH, MODEL_SIZE, DEVICE, COMPUTE_TYPE)
# print("Transcript:", text)

In [None]:
# 2. Define SOAP Schema
# Schema for a structured SOAP note: four required string sections.
# structure_soap_note hands this model to JsonOutputParser, so the field names
# and descriptions below feed the format instructions placed in the prompt.
# NOTE: documented with comments rather than a class docstring on purpose:
# pydantic copies a model's docstring into its JSON schema, which would alter
# those format instructions.
class SOAPNote(BaseModel):
    # S: what the patient reports.
    Subjective: str = Field(
        ...,
        description="Patient's subjective report of symptoms, history, and complaints.",
    )
    # O: what the clinician measures or observes.
    Objective: str = Field(
        ...,
        description="Objective findings, vital signs, physical exam results, labs.",
    )
    # A: the clinician's conclusion.
    Assessment: str = Field(
        ...,
        description="Diagnosis or differential diagnosis based on findings.",
    )
    # P: what happens next.
    Plan: str = Field(
        ...,
        description="Treatment plan, medications, follow-up, and further testing.",
    )

# 3. Structuring Logic
def structure_soap_note(transcript, model_name="gpt-3.5-turbo"):
    """Ask an OpenAI chat model to turn a dictation transcript into a SOAP note.

    Builds a prompt -> chat model -> JSON parser chain, where the parser is
    configured with the SOAPNote schema, and invokes it once.

    Args:
        transcript: Dictation text inserted into the user message.
        model_name: Model name passed to ``ChatOpenAI``.

    Returns:
        Whatever the chain yields, i.e. the output of ``JsonOutputParser``
        (run_soap_pipeline iterates it with ``.items()``).
    """
    # Deterministic sampling (temperature 0) for repeatable extraction.
    chat_model = ChatOpenAI(model=model_name, temperature=0)

    # The parser both supplies the format instructions and parses the reply.
    json_parser = JsonOutputParser(pydantic_object=SOAPNote)

    # Adjacent literals concatenate into a single system message.
    system_message = (
        "You are a helpful medical assistant. Your task is to extract a structured SOAP note from a doctor's dictation. \n"
        "Ensure the output matches the following JSON schema: {format_instructions}\n"
        "Disclaimer: For clinical documentation assistance only. Not a medical decision system."
    )
    template = ChatPromptTemplate.from_messages(
        [
            ("system", system_message),
            ("user", "Dictation: {transcript}"),
        ]
    )

    pipeline = template | chat_model | json_parser

    # Both template variables are filled at invocation time.
    chain_inputs = {
        "transcript": transcript,
        "format_instructions": json_parser.get_format_instructions(),
    }
    return pipeline.invoke(chain_inputs)

In [None]:
# 4. Full Pipeline Execution
def run_soap_pipeline(audio_path):
    """Transcribe a dictation and print it as a structured SOAP note.

    Calls transcribe_audio with the notebook's MODEL_SIZE, DEVICE and
    COMPUTE_TYPE settings and prints the transcript, then calls
    structure_soap_note and prints the result both raw and section by
    section. If structuring (or printing its result) raises, the error is
    reported and the raw transcript is printed as a stand-in "Subjective"
    section instead. Returns nothing.
    """
    # Stage 1: speech to text.
    print("--- Starting Transcription ---")
    dictation_text = transcribe_audio(audio_path, MODEL_SIZE, DEVICE, COMPUTE_TYPE)
    print("\n[Transcript]:", dictation_text, sep="\n")

    # Stage 2: text to SOAP sections. This needs OPENAI_API_KEY; without it
    # the call fails and the fallback branch below runs.
    print("\n--- Structuring SOAP Note ---")
    try:
        sections = structure_soap_note(dictation_text)
        print("\n[SOAP Note JSON]:", sections, sep="\n")

        print("\n--- Formatted Output ---")
        for heading, content in sections.items():
            print(f"{heading}:\n{content}\n")
    except Exception as err:
        # Best-effort demo fallback: report the failure and show the raw
        # transcript instead of a structured note.
        print(
            f"Structuring failed (likely missing API Key): {err}",
            "Using mock output for demo if API fails:",
            f"Subjective: {dictation_text}",
            sep="\n",
        )
        
# Run the whole pipeline on the sample dictation configured in AUDIO_PATH.
run_soap_pipeline(AUDIO_PATH)