In [57]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import json
import fitz     
import httpx

In [None]:
# File paths - each report PDF is named "<report>.pdf"; the dict maps the short
# report name to its file so every report can be processed in one loop.
pdf_files = {
    report: f"{report}.pdf"
    for report in ("signalment_physical", "cbc", "chem", "cpli", "aus")
}

# CSV table that is read (and later rewritten) by this notebook
csv_file = "table.csv"

In [59]:
# Function to extract the text from file
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF as a single string.

    Parameters
    ----------
    pdf_path : str or path-like
        Location of the PDF file to open with PyMuPDF (`fitz`).

    Returns
    -------
    str
        The "text"-mode extraction of each page, in page order, with a
        newline character inserted between consecutive pages.
    """
    # Open the document as a context manager so the underlying file handle is
    # released as soon as the text has been read, even if a page fails.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text("text") for page in doc)

# Extract the text of every configured PDF, keyed by its short report name
pdf_texts = dict(zip(pdf_files.keys(), map(extract_text_from_pdf, pdf_files.values())))

# Load the CSV file.
# Only the first two columns are read, as the other two may have values which
# will be overwritten later.
df = pd.read_csv(csv_file, usecols=[0, 1])
# One item has extra spaces, so surrounding whitespace is stripped from every
# object-dtype (string) column; other columns pass through untouched.
df = df.apply(lambda col: col if col.dtype != "object" else col.str.strip())

In [60]:
# Inspect the raw text extracted from each PDF (dict keyed by report name).
# NOTE(review): this displays the full text of every report; consider showing
# only a short slice per report before sharing the notebook.
pdf_texts

{'signalment_physical': 'was admitted to the Texas A&M University Veterinary Medical Teaching Hospital on \nFebruary 19, 2015 . \nPresenting complaint: Diarrhea and vomiting \nDiagnosis: \n·\nKetoacidosis and hyperglycemia secondary to diabetes mellitus--ketoacidosis is\ncurrently resolved\n·\nPancreatitis\nHistory: , a 13 year old male neutered Bichon Frise, presented to the Texas A&M ER \nservice on February 19, 2015 for vomiting and diarrhea. According to ,  had been very \nlethargic for the week prior to presentation. He was not wanting to eat and was drinking \na lot more water than usual. The day of presentation was the first day had any diarrhea \nor vomiting. He is kept confined to the hallway in the house because he has a history of \ngetting into the garbage.\nPage 1\n\nPage 2\n is up to date on vaccinations and takes Iverhart and Advantage for \nheartworm and flea and tick prevention. He has been treated here previously for \ndehydration, but has no other history of trauma, 

In [61]:
# I need a compatible GPU >.<

# Models in consideration:
# mistral (best but heavy), mistral:7b-instruct-q4_0 (4-bit quantized)
# tinyllama (testing)

def extract_with_ollama(prompt, url = "http://localhost:11434/api/generate", model = "mistral"):
    """Send a prompt to an Ollama generate endpoint and return the generated text.

    Parameters
    ----------
    prompt : str
        Prompt forwarded to the model.
    url : str
        Address of the generate endpoint; defaults to a server on localhost.
    model : str
        Name of the model to run. Defaults to "mistral", the model this
        function previously had hard-coded.

    Returns
    -------
    str
        Concatenation of the "response" field of every JSON line in the reply.

    Raises
    ------
    httpx.HTTPStatusError
        If the server answers with a 4xx/5xx status.
    json.JSONDecodeError
        If a non-empty line of the reply is not valid JSON.
    """
    payload = {
        "model": model,
        "prompt": prompt
    }

    # `json=` lets httpx serialise the body and set the JSON Content-Type
    # header itself; sending a pre-encoded string via `data=` is deprecated.
    response = httpx.post(url, json=payload, timeout=300)
    # Surface server-side failures here instead of silently returning "".
    response.raise_for_status()

    # The reply is parsed as newline-delimited JSON: every non-empty line is
    # one object whose "response" field holds a fragment of the output.
    fragments = []
    for line in response.text.strip().split('\n'):
        if line:
            fragments.append(json.loads(line).get('response', ''))
    return ''.join(fragments)

In [62]:
def _load_json_object(raw):
    """Parse the JSON object contained in `raw`, ignoring text around the outermost braces."""
    start, end = raw.find("{"), raw.rfind("}")
    # Models often wrap the JSON in code fences or a sentence of prose; keep
    # only the outermost {...} span. With no braces at all, parse the raw text
    # so the usual json.JSONDecodeError is raised.
    candidate = raw[start:end + 1] if 0 <= start < end else raw
    return json.loads(candidate)


def create_df(text, items, pdf_name):
    """Ask the model to extract the requested items from one report and tabulate them.

    Parameters
    ----------
    text : str
        Report text inserted into the prompt.
    items : str
        Names of the items to extract, inserted into the prompt as given.
    pdf_name : str
        Value written to the `filename` column of every returned row.

    Returns
    -------
    pandas.DataFrame
        One row per key of the model's JSON reply, with columns
        `filename`, `items`, `results`, `details`.

    Raises
    ------
    json.JSONDecodeError
        If no parseable JSON object can be found in the model's reply.
    """
    prompt = f"""
        Extract the **value (with units, if applicable)** and the **exact relevant text** for the specified items from the veterinary medical report. 
        Return the output in **JSON format** with no additional text or explanations.

        ### Instructions:
        1. **Items in `items`**: Extract both **value** and **exact text** for the item.
        2. **If an item is not found**: Return `"N/A"` for both `"results"` and `"details"`.

        ### Output Format (JSON):
        {{
            "item_name": {{
                "results": "value with units (if applicable)",
                "details": "exact relevant text (if applicable, or N/A)"
            }}
        }}

        ### Input Variables:
        - **items**: {items}
        - **text**: {text}
    """

    str_data = extract_with_ollama(prompt)
    data = _load_json_object(str_data)

    rows = []
    for key, value in data.items():
        if not isinstance(value, dict):
            # The model answered with a bare value instead of the requested
            # {"results": ..., "details": ...} object; keep it as the result.
            value = {"results": value}
        # A missing field falls back to the same "N/A" marker the prompt asks
        # the model to use for items it cannot find.
        rows.append((key, value.get("results", "N/A"), value.get("details", "N/A")))

    extracted = pd.DataFrame(rows, columns=['items', 'results', 'details'])
    extracted["filename"] = pdf_name
    return extracted[['filename', 'items', 'results', 'details']]

In [63]:
# Run the extraction once per PDF. The per-file frames are collected in a list
# and concatenated a single time, rather than growing final_df inside the loop
# (which re-copies every previously collected row on each iteration).
result_frames = []

for pdf_name, text in pdf_texts.items():

    # Items requested for this file, joined into one string for the prompt
    items = ", ".join(df.loc[(df["filename"] == pdf_name), "items"].astype(str))
    results_df = create_df(text, items, pdf_name)
    result_frames.append(results_df)

    print(f"{pdf_name} has completed processing")

# pd.concat raises on an empty list, so fall back to an empty frame when no PDF
# was processed.
final_df = pd.concat(result_frames, ignore_index=True) if result_frames else pd.DataFrame()

# Independent copies so each export can blank different rows without touching
# final_df or each other.
csv_1 = final_df.copy()
csv_2 = final_df.copy()

signalment_physical has completed processing
cbc has completed processing
chem has completed processing
cpli has completed processing
aus has completed processing


In [64]:
# Save the updated CSV with required details.
# `.loc` slices are label-based and include both endpoints, so each pair below
# blanks `details` for the rows labelled first..last inclusive.
for first_row, last_row in ((0, 4), (11, 53)):
    csv_1.loc[first_row:last_row, 'details'] = np.nan
csv_1.to_csv("table.csv", index=False)

print("Required extraction complete using Ollama.")

Required extraction complete using Ollama.


In [69]:
# Save the updated csv with additional details (Ranges).
# `.loc` slices are label-based and include both endpoints, so each pair below
# blanks `details` for the rows labelled first..last inclusive.
for first_row, last_row in ((0, 4), (11, 17), (53, 54)):
    csv_2.loc[first_row:last_row, 'details'] = np.nan
csv_2.to_csv("add_table.csv", index=False)

print("Additional extraction complete using Ollama.")

Additional extraction complete using Ollama.


In [72]:
# Show the combined extraction results (the per-PDF frames concatenated above)
final_df

Unnamed: 0,filename,items,results,details
0,signalment_physical,age,13,
1,signalment_physical,breed,Bichon Frise,
2,signalment_physical,gender,Male,
3,signalment_physical,neuter_status,Neutered,
4,signalment_physical,vomit_nausea,Yes,Presented with vomiting and diarrhea
5,signalment_physical,lethargy_weakness,Yes,Was very lethargic for a week prior to present...
6,signalment_physical,appetite_loss,Yes,Not wanting to eat for a week prior to present...
7,signalment_physical,diarrhea_melena,Yes,Had diarrhea on the day of presentation
8,signalment_physical,abdominal_pain,Yes,Abdomen was painful when palpated in the crani...
9,signalment_physical,weight_loss,,


#### Resources used:
 - https://pymupdf.readthedocs.io/en/latest/recipes-text.html#how-to-extract-all-document-text
 - https://ollama.com/library