## Configure Ollama for inference

In [None]:
import os

# Base URL of the Ollama server used for inference throughout the notebook.
OLLAMA_ENDPOINT = "http://sim01:41527"

# Export the same URL through the OLLAMA_HOST environment variable
# (presumably so Ollama client tooling picks it up -- TODO confirm).
os.environ.update(OLLAMA_HOST=OLLAMA_ENDPOINT)


In [None]:
from langchain_community.chat_models import ChatOllama

# Model tag to request from the Ollama server, and the context-window size
# passed through as `num_ctx`.
local_llm = "gemma2:27b"
num_ctx = 25000

# Chat model bound to the configured Ollama endpoint; temperature=0 requests
# the least random sampling.
llm = ChatOllama(
    model=local_llm,
    base_url=OLLAMA_ENDPOINT,
    temperature=0,
    num_ctx=num_ctx,
)

### Prompt your local LLM

In [None]:
from langchain_core.messages import AIMessage

# Smoke test: send a two-turn (role, content) conversation to the model and
# pretty-print the reply.
system_prompt = "You are a helpful assistant that translates English to French. Translate the user sentence. Don't respond in json"
human_prompt = "I love programming. Tell me if you also love programming"

messages = [("system", system_prompt), ("human", human_prompt)]

ai_msg = llm.invoke(messages)
ai_msg.pretty_print()

## Helper functions and starting point

### Go through the patient files

In [None]:
import pandas as pd

# Open the Excel workbook once; every patient sheet below is parsed from this
# handle instead of re-opening the file for each sheet.
file_path = '/nvme/h/gkosta/data_p184/hackathon.xlsx'
excel_data = pd.ExcelFile(file_path)

# Sheet layout relied on by the extraction below (0-based, no header row):
#   row 0, column 1 -> the patient's age
#   rows 2 onward   -> one medication entry per row, up to the first empty
#                      cell in the drug-name column (column 0)
# Maps each extracted field to the sheet column it is read from.
PATIENT_FIELD_COLUMNS = {
    'Drug_Name': 0,
    'Dosage': 1,
    'Frequency': 2,
    'Route': 3,
    'Allergies': 7,
    'Illnesses': 9,
}
# Full column order of the resulting table (before 'Patient_ID' becomes the index).
PATIENT_COLUMNS = ['Patient_ID', 'Age', *PATIENT_FIELD_COLUMNS]

patient_data = []

# Iterate over each sheet in the workbook; only sheets named "PATIENT ..." are used
for sheet_name in excel_data.sheet_names:
    if not sheet_name.startswith("PATIENT"):
        continue

    # Parse from the already-open workbook (same result as pd.read_excel on the
    # path, without opening the file again).
    patient_df = excel_data.parse(sheet_name, header=None)

    # The patient id is the last whitespace-separated token of the sheet name
    patient_id = sheet_name.split()[-1]
    patient_age = patient_df.at[0, 1]

    # One record per medication row; stop at the first empty drug-name cell
    for row in range(2, len(patient_df)):
        if pd.isna(patient_df.at[row, 0]):
            break

        patient_info = {'Patient_ID': patient_id, 'Age': patient_age}
        patient_info.update(
            (field, patient_df.at[row, column])
            for field, column in PATIENT_FIELD_COLUMNS.items()
        )
        patient_data.append(patient_info)

# Build the table in one go. Passing the columns explicitly keeps the schema
# (and makes set_index work) even when no patient rows were found.
all_patients_df = (
    pd.DataFrame(patient_data, columns=PATIENT_COLUMNS)
    .set_index('Patient_ID')
)

all_patients_df.head(5)

### Simple getter function to access one patient file

In [None]:
def get_patient(id=str|int):
    if type(id)==str:
        return all_patients_df[all_patients_df.index==id]
    elif type(id)==int:
        return all_patients_df[all_patients_df.index==str(id)]
    else:
        raise TypeError("expecting int or str of patient id")

### Starting your RAG journey

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from tqdm import tqdm
# Folder scanned for documents; only entries whose name ends in '.pdf' are kept.
directory='medicine_files'
files = [f for f in os.listdir(directory) if f.endswith('.pdf')]

# Starts empty and is not filled by the code visible here -- presumably meant
# to collect the chunks produced from each PDF (TODO confirm).
all_splits = []

for file_name in tqdm(files):
    # NOTE: this rebinds the module-level `file_path`, which earlier held the
    # path of the Excel workbook.
    file_path = os.path.join(directory, file_name)
    # print(f"Processing {file_path}...")  # Optional: for tracking progress

    # Create a loader for this PDF (the loader is only constructed here; no
    # load call is made in the visible code).
    loader = PyPDFLoader(file_path)

    ... ### FIXME: not implemented yet -- search for RAG and LangChain, maybe