In [None]:
import pandas as pd

import sys
sys.path.append("../..")


import torch
import random
import pickle
import time
import numpy as np
import pandas as pd
import jsonschema

from typing import Union
from pydantic import BaseModel, Field
from langchain_ollama import OllamaLLM


from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate

from typing import Union
from pydantic import BaseModel, Field

# Set seeds so every random number generator available to this notebook starts
# from the same state on each run.
seed = 1234
random.seed(seed)
# NumPy's global RNG is the fallback for pandas sampling calls that are not
# given an explicit random_state, so seed it alongside the others.
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7fda18f63cd0>

In [3]:
# load descriptions
# Reads the assay-description CSV into `descriptions_df`; later cells read its
# 'assay_description' column. The path is relative to the working directory.
descriptions_df = pd.read_csv("../../data/llama/unique_assay_descriptions.csv")

In [36]:
# Steps to run Ollama locally using podman/docker and CUDA GPUs
# (run these in a shell, not in the notebook)

# podman pull docker.io/ollama/ollama
# podman run -d --name ollama_cuda --privileged --gpus all -v ollama_data:/root/.ollama -p 11434:11434 docker.io/ollama/ollama
# podman exec -it ollama_cuda bash
# ollama pull llama3.3
# exit

In [37]:
# Client for the llama3.3 model served by Ollama at localhost:11434.
model = OllamaLLM(model="llama3.3", base_url="http://localhost:11434")
# Smoke test: send a throwaway prompt to check that the model answers.
response = model.invoke("testing")

print(response)

It looks like you're testing our conversation! That's perfectly fine. How can I assist you today? Is there something specific you'd like to chat about or ask?


In [38]:
# Schema of the fields extracted from one assay description; it is passed to
# JsonOutputParser as `pydantic_object`. Every field accepts a string or a bool.
# NOTE(review): documented with comments rather than a class docstring on
# purpose — presumably a docstring would end up in the generated JSON schema
# and therefore in the prompt; confirm before converting.
class Answers(BaseModel):
    mtb_strain: Union[str, bool] = Field(
        description="strain of Mycobacterium tuberculosis")
    resistant_to: Union[str, bool] = Field(
        description="drugs to which there is resistance.")
    mutant: Union[str, bool] = Field(description="if the strain is a mutant.")
    mutant_type: Union[str, bool] = Field(description="type of mutation.")
    checkerboard: Union[str, bool] = Field(description="checkerboard assay.")
    checkerboard_drug: Union[str, bool] = Field(
        description="drug in checkerboard assay.")

In [None]:
# JSON output parser configured with the Answers schema; it also supplies the
# format instructions inserted into the prompt.
parser = JsonOutputParser(pydantic_object=Answers)

In [40]:
# Read the prompt template text; it becomes the PromptTemplate `template`.
# NOTE(review): this path starts at "../data" while the descriptions CSV is read
# from "../../data" — confirm both resolve from the notebook's working directory.
with open("../data/llama/prompt_template.txt", "r", encoding="utf-8") as f:
    response_template = f.read()

In [41]:
# Build the prompt: the assay description is supplied at call time, while the
# parser's format instructions are filled in once, up front.
format_instructions = parser.get_format_instructions()
prompt = PromptTemplate(
    input_variables=["assay_description"],
    partial_variables={"format_instructions": format_instructions},
    template=response_template,
)

In [42]:
# LangChain chain
# Pipeline: fill the prompt, send it to the model, then parse the reply.
chain = prompt | model | parser

In [43]:
def process_with_retries(description, retries=3, delay=2):
    """Run the extraction chain on one description, retrying on failure.

    Args:
        description: Assay description passed to ``chain.invoke``.
        retries: Maximum number of attempts.
        delay: Seconds to wait between two consecutive attempts.

    Returns:
        The chain's output from the first attempt that succeeds. If every
        attempt raises, a placeholder dict with the six answer fields all set
        to ``'error'`` so the row can be flagged by the caller.
    """
    error_fields = ('mtb_strain', 'resistant_to', 'mutant', 'mutant_type',
                    'checkerboard', 'checkerboard_drug')
    for attempt in range(retries):
        try:
            return chain.invoke(description)
        except Exception as e:
            print(f"Error processing description on attempt {attempt + 1}: {e}")
            # Only pause when another attempt follows; sleeping after the last
            # failure would just delay returning the error placeholder.
            if attempt < retries - 1:
                time.sleep(delay)
    # Flag as error if all retries fail
    return {field: 'error' for field in error_fields}

In [44]:
# Test run: extract fields for a small, reproducible sample of descriptions.

test_df = descriptions_df.dropna(subset=['assay_description']).sample(n=10, random_state=42)

# Dictionary to store results, keyed by description text
processed_descriptions = {}

# Run through each sampled description. The progress total is taken from the
# sampled frame so it stays correct if the sample size above is changed.
total = len(test_df)
for position, description in enumerate(test_df['assay_description'], start=1):
    print(f'\rProcessing {position}/{total}', end='')
    processed_descriptions[description] = process_with_retries(description)

def get_output_dict(description):
    """Return the extracted fields for one description.

    A missing description (NaN/None) maps to a dict holding NaN for exactly the
    six answer fields, so it expands to the same columns as a processed row.
    Any other description is looked up in ``processed_descriptions``.

    Raises:
        KeyError: if the description has no entry in ``processed_descriptions``.
    """
    if pd.isna(description):
        return dict.fromkeys(
            ['mtb_strain', 'resistant_to', 'mutant', 'mutant_type',
             'checkerboard', 'checkerboard_drug'],
            np.nan,
        )
    return processed_descriptions[description]
    
    
# Apply structured outputs back to the test dataframe
answer_columns = ['mtb_strain', 'resistant_to', 'mutant', 'mutant_type', 'checkerboard', 'checkerboard_drug']
output_df = test_df['assay_description'].apply(get_output_dict).apply(pd.Series)
# Assigning a DataFrame to a list of columns pairs the columns by position, not
# by name. Select and order the answers by name first so that reordered, extra
# or missing keys in a model reply cannot shift values into the wrong column
# (missing fields become NaN).
test_df[answer_columns] = output_df.reindex(columns=answer_columns)

Processing 10/10

In [None]:
# Results for the full dataset, keyed by description text
processed_descriptions = {}

# Only real, distinct descriptions are sent to the model: missing values are
# handled separately when the results are mapped back, and a repeated
# description would only overwrite its own dictionary entry.
pending = descriptions_df['assay_description'].dropna().unique()
total = len(pending)
for position, description in enumerate(pending, start=1):
    print(f'\rProcessing {position}/{total}', end='')
    processed_descriptions[description] = process_with_retries(description)

In [None]:
def get_output_dict(description):
    """Return the extracted fields for one description.

    A missing description (NaN/None) maps to a dict holding NaN for exactly the
    six answer fields, so it expands to the same columns as a processed row.
    Any other description is looked up in ``processed_descriptions``.

    Raises:
        KeyError: if the description has no entry in ``processed_descriptions``.
    """
    if pd.isna(description):
        return dict.fromkeys(
            ['mtb_strain', 'resistant_to', 'mutant', 'mutant_type',
             'checkerboard', 'checkerboard_drug'],
            np.nan,
        )
    return processed_descriptions[description]


In [None]:
# Map the structured outputs back onto the full dataframe
answer_columns = ['mtb_strain', 'resistant_to', 'mutant', 'mutant_type', 'checkerboard', 'checkerboard_drug']
output_df = descriptions_df['assay_description'].apply(get_output_dict).apply(pd.Series)
# Assigning a DataFrame to a list of columns pairs the columns by position, not
# by name. Select and order the answers by name first so that reordered, extra
# or missing keys in a model reply cannot shift values into the wrong column
# (missing fields become NaN).
descriptions_df[answer_columns] = output_df.reindex(columns=answer_columns)