In [1]:
import os
import dspy
import random
import pandas as pd 
import ast
from dotenv import load_dotenv
load_dotenv()

def get_data(test_size=0.2):
    data = pd.read_csv('./input_data/water_qa.csv')

    data = data.dropna()
    data = data.drop_duplicates()
    data['question'] = data['question'].apply(lambda x: x.lower())
    data['answer'] = data['answer'].apply(lambda x: x.lower())
    
    data = data.sample(frac=1, random_state=42).reset_index(drop=True)
    test_size = int(test_size * len(data))
    test_data = data[:test_size]
    train_data = data[test_size:]
    return train_data, test_data

train_data, test_data = get_data(test_size=0.3)

print(train_data.shape) 
print(test_data.shape)

  from .autonotebook import tqdm as notebook_tqdm


(112, 2)
(48, 2)


In [2]:
train_set = [dspy.Example(question=q, answer=a).with_inputs("question") for q, a in zip(train_data['question'], train_data['answer'])]
test_set = [dspy.Example(question=q, answer=a).with_inputs("question") for q, a in zip(test_data['question'], test_data['answer'])]

In [3]:
print(train_set[0])

Example({'question': 'which category of contamination includes naturally occurring boron in volcanic aquifers?', 'answer': 'natural contaminants'}) (input_keys={'question'})


In [4]:
class ReferenceAnswer(dspy.Signature):
    """ Choose the water contamination source from the following options: Agricultural Runoff,Chemical Spillage Natural Contaminants, Industrial Waste, Heavy Metals, Microbial Contamination, Sewage Leakage"""
    question = dspy.InputField()
    answer = dspy.OutputField(desc="Output the correct answer; 1-2 words")
    
class PredictModel(dspy.Module):
    def __init__(self):
        self.predict = dspy.ChainOfThought(ReferenceAnswer)
        
    def forward(self, question):
        return self.predict(question=question)

In [5]:
turbo = dspy.OpenAI(api_key=os.getenv('OPENAI_API_KEY'), model="gpt-4o-mini")
mix = dspy.GROQ(api_key=os.getenv('GROQ_API_KEY'), model="mixtral-8x7b-32768")

In [6]:
dspy.settings.configure(lm=turbo, trace=[])
predict = PredictModel()


In [7]:
from dspy.evaluate import Evaluate
from dspy.evaluate.metrics import answer_exact_match

evaluate_prg = Evaluate(
    devset=test_set,
    metric=answer_exact_match,
    num_threads=4,
    display_progress=True,
    display_table=10,
    return_outputs=True
)

In [8]:
from rich import print as rprint

evl = evaluate_prg(predict)
rprint(eval)

  0%|          | 0/48 [00:00<?, ?it/s]

Average Metric: 31 / 48  (64.6): 100%|██████████| 48/48 [00:00<00:00, 85.93it/s]


Unnamed: 0,question,example_answer,rationale,pred_answer,answer_exact_match
0,which contaminant is most likely if water has a high concentration of rotavirus?,sewage leakage,identify the source of contamination. Rotavirus is a viral pathogen commonly associated with fecal contamination. This suggests that the water is likely contaminated by human...,Sewage Leakage,✔️ [True]
1,what is the primary concern when semiconductor manufacturing facilities are near water sources?,industrial waste,identify the potential risks associated with semiconductor manufacturing. These facilities often use various chemicals and materials that can be hazardous. If there is a leak...,Chemical Spillage,False
2,which contaminant is most likely if water has a high concentration of entamoeba histolytica?,microbial contamination,identify the source of contamination. Entamoeba histolytica is a protozoan parasite that causes amoebic dysentery and is typically transmitted through contaminated water. This contamination often...,Sewage Leakage,False
3,which contaminant is most likely if water contains high levels of trihalomethanes?,chemical spillage,determine the source of the trihalomethanes in the water. Trihalomethanes are typically formed when chlorine used for disinfection reacts with organic matter in the water....,Sewage Leakage,False
4,what type of contamination is linked to the presence of per- and polyfluoroalkyl substances (pfas) in firefighting foam?,chemical spillage,"identify the source of contamination associated with PFAS. PFAS are often found in firefighting foams, which are used in various industrial and emergency response situations....",Chemical Spillage,✔️ [True]
5,which type of contamination is associated with high levels of zinc in water?,industrial waste,identify the source of zinc contamination in water. Zinc is commonly found in industrial processes and can be released into water bodies through various means....,Heavy Metals,False
6,which category of contamination includes naturally occurring manganese in anaerobic aquifers?,natural contaminants,"identify the source of contamination. We know that manganese can occur naturally in certain geological formations, particularly in anaerobic conditions. This suggests that the contamination...",Natural Contaminants,✔️ [True]
7,which contaminant is most likely if water has a high concentration of fecal indicator bacteria?,sewage leakage,"identify the source of contamination. Fecal indicator bacteria are typically associated with human and animal waste. Therefore, if water has a high concentration of these...",Sewage Leakage,✔️ [True]
8,what type of contamination is linked to the presence of legionella pneumophila in cooling towers?,microbial contamination,"identify the source of contamination associated with legionella pneumophila. This bacterium is commonly found in water systems, particularly in man-made environments like cooling towers. It...",Microbial Contamination,✔️ [True]
9,what type of contamination is linked to the presence of bisphenol a (bpa) in water?,industrial waste,identify the source of bisphenol A (BPA) contamination in water. BPA is commonly found in plastics and can leach into water sources from various industrial...,Industrial Waste,✔️ [True]


## We got about 65% Accuracy on the Test Set using Few Shot

## Now optmizing using Boostrap Random Search by dspy to improve accuracy

In [21]:
from dspy.teleprompt import MIPRO
teleprompter = MIPRO(
    metric=answer_exact_match,
    num_candidates=3,
    verbose=True
)

eval_kwargs = dict(
    num_threads=16, 
    display_progress=True, 
    display_table=0
)

compiled_teleprompter = teleprompter.compile(
    predict,
    trainset=train_set,
    num_trials=1,
    max_bootstrapped_demos=3,
    max_labeled_demos=5,
    eval_kwargs=eval_kwargs
)


Please be advised that based on the parameters you have set, the maximum number of LM calls is projected as follows:

- Task Model: 112 examples in dev set * 1 trials * # of LM calls in your program = (112 * # of LM calls in your program) task model calls
- Prompt Model: # data summarizer calls (max 10) + 3 * 1 lm calls in program = 13 prompt model calls

Estimated Cost Calculation:

Total Cost = (Number of calls to task model * (Avg Input Token Length per Call * Task Model Price per Input Token + Avg Output Token Length per Call * Task Model Price per Output Token) 
            + (Number of calls to prompt model * (Avg Input Token Length per Call * Task Prompt Price per Input Token + Avg Output Token Length per Call * Prompt Model Price per Output Token).

For a preliminary estimate of potential costs, we recommend you perform your own calculations based on the task
and prompt models you intend to use. If the projected costs exceed your budget or expectations, you may consider:

- Re

  3%|▎         | 3/112 [00:05<03:21,  1.85s/it]


Bootstrapped 3 full traces after 4 examples in round 0.
Creating basic bootstrap: 2/2


  5%|▌         | 6/112 [00:08<02:31,  1.43s/it]


Bootstrapped 3 full traces after 7 examples in round 0.
Model (<dsp.modules.gpt3.GPT3 object at 0x000001F34FDECD40>) History:



You are an instruction optimizer for large language models. I will give you a ``signature`` of fields (inputs and outputs) in English. Specifically, I will give you some ``observations`` I have made about the dataset and task, along with some ``examples`` of the expected inputs and outputs. I will also provide you with the current ``basic instruction`` that is being used for this task.

Your task is to propose a new improved instruction and prefix for the output field that will lead a good language model to perform the task well. Don't be afraid to be creative.

---

Follow the following format.

Observations: Observations about the dataset and task

Examples: Example(s) of the task

Basic Instruction: The initial instructions before optimization

Proposed Instruction: The improved instructions for the language model

Proposed Prefix For Output Field: The str

[I 2024-08-11 18:44:18,965] A new study created in memory with name: no-name-7171a5bf-31a4-43a4-b8d8-a3f118d3fe98


Starting trial #0
Evaling the following program:
Predictor 0
i: Proposed Instruction: Analyze the provided water contamination indicator and determine the most likely source of contamination based on the context of known contaminants. Consider the specific chemicals or substances mentioned and their typical associations with different contamination sources. Then, succinctly explain your thought process leading to the conclusion, identifying the primary source of contamination.
p: Source of Contamination:




Average Metric: 8 / 100  (8.0): 100%|██████████| 100/100 [00:13<00:00,  7.66it/s]


0st split score: 8.0
curr average score: 8.0


Average Metric: 0 / 12  (0.0): 100%|██████████| 12/12 [00:02<00:00,  4.68it/s]


1st split score: 0.0
curr average score: 7.142857142857143
Fully evaled score: 7.142857142857143
Model (<dsp.modules.gpt3.GPT3 object at 0x000001F34FDECD40>) History:



Proposed Instruction: Analyze the provided water contamination indicator and determine the most likely source of contamination based on the context of known contaminants. Consider the specific chemicals or substances mentioned and their typical associations with different contamination sources. Then, succinctly explain your thought process leading to the conclusion, identifying the primary source of contamination.

---

Follow the following format.

Question: ${question}
Reasoning: Let's think step by step in order to ${produce the answer}. We ...
Source of Contamination: Output the correct answer; 1-2 words

---

Question: what type of contamination is linked to the presence of tetrachloroethylene in groundwater?
Reasoning: Let's think step by step in order to analyze the potential sources of tetrachloroethylene. Tetr

[I 2024-08-11 18:44:34,935] Trial 0 finished with value: 7.142857142857143 and parameters: {'2144597026400_predictor_instruction': 1, '2144597026400_predictor_demos': 0}. Best is trial 0 with value: 7.142857142857143.


Returning predict = Predict(StringSignature(question -> rationale, answer
    instructions='Choose the water contamination source from the following options: Agricultural Runoff,Chemical Spillage Natural Contaminants, Industrial Waste, Heavy Metals, Microbial Contamination, Sewage Leakage'
    question = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Question:', 'desc': '${question}'})
    rationale = Field(annotation=str required=True json_schema_extra={'prefix': "Reasoning: Let's think step by step in order to", 'desc': '${produce the answer}. We ...', '__dspy_field_type': 'output'})
    answer = Field(annotation=str required=True json_schema_extra={'desc': 'Output the correct answer; 1-2 words', '__dspy_field_type': 'output', 'prefix': 'Answer:'})
)) from continue_program


In [22]:
evaluate_prg = Evaluate(
    devset=test_set,
    metric=answer_exact_match,
    num_threads=4,
    display_progress=True,
    display_table=10,
    return_outputs=True
)

eval_compiled = evaluate_prg(compiled_teleprompter)
print(eval_compiled)

Average Metric: 3 / 48  (6.2): 100%|██████████| 48/48 [00:23<00:00,  2.08it/s] 


Unnamed: 0,question,example_answer,rationale,pred_answer,answer_exact_match
0,which contaminant is most likely if water has a high concentration of rotavirus?,sewage leakage,"determine the source of contamination associated with rotavirus. Rotavirus is primarily transmitted through the fecal-oral route, often through contaminated water or food. High concentrations of...",Sewage,False
1,what is the primary concern when semiconductor manufacturing facilities are near water sources?,industrial waste,"analyze the potential contaminants associated with semiconductor manufacturing. These facilities often use a variety of chemicals, including solvents, acids, and heavy metals, which can leach...",Industrial runoff,False
2,which contaminant is most likely if water has a high concentration of entamoeba histolytica?,microbial contamination,determine the source of contamination. Entamoeba histolytica is a protozoan parasite that causes amoebic dysentery and is typically associated with fecal contamination of water sources....,Fecal matter,False
3,which contaminant is most likely if water contains high levels of trihalomethanes?,chemical spillage,"determine the source of contamination. Trihalomethanes (THMs) are a group of chemical compounds that are typically formed when chlorine, used as a disinfectant in drinking...",Chlorination,False
4,what type of contamination is linked to the presence of per- and polyfluoroalkyl substances (pfas) in firefighting foam?,chemical spillage,determine the source of contamination associated with PFAS. PFAS are a group of man-made chemicals that are often used in various industrial applications and consumer...,Firefighting foam,False
5,which type of contamination is associated with high levels of zinc in water?,industrial waste,determine the likely source of zinc contamination in water. Zinc is a metal that can enter water systems through various pathways. Common sources of zinc...,Industrial runoff,False
6,which category of contamination includes naturally occurring manganese in anaerobic aquifers?,natural contaminants,"analyze the context of manganese and its typical associations. Manganese is a trace metal that can be found in groundwater, particularly in anaerobic conditions where...",Natural geology,False
7,which contaminant is most likely if water has a high concentration of fecal indicator bacteria?,sewage leakage,"determine the source of contamination. Fecal indicator bacteria, such as E. coli, are typically associated with the presence of fecal matter from warm-blooded animals. High...",Sewage,False
8,what type of contamination is linked to the presence of legionella pneumophila in cooling towers?,microbial contamination,"determine the source of contamination associated with legionella pneumophila. Legionella pneumophila is a bacterium that thrives in warm water environments, particularly in man-made water systems....",Cooling towers,False
9,what type of contamination is linked to the presence of bisphenol a (bpa) in water?,industrial waste,"determine the source of bisphenol A (BPA) contamination in water. BPA is a chemical commonly used in the production of plastics and resins, particularly polycarbonate...",Plastic waste,False


(6.25, [(Example({'question': 'which contaminant is most likely if water has a high concentration of rotavirus?', 'answer': 'sewage leakage'}) (input_keys={'question'}), Prediction(
    rationale='determine the source of contamination associated with rotavirus. Rotavirus is primarily transmitted through the fecal-oral route, often through contaminated water or food. High concentrations of rotavirus in water typically indicate that the water has been contaminated by human or animal waste. This suggests a lack of proper sanitation or treatment of wastewater. Therefore, the most likely source of contamination in this case would be sewage or fecal matter.',
    answer='Sewage'
), False), (Example({'question': 'what is the primary concern when semiconductor manufacturing facilities are near water sources?', 'answer': 'industrial waste'}) (input_keys={'question'}), Prediction(
    rationale='analyze the potential contaminants associated with semiconductor manufacturing. These facilities ofte

## So MIPRO Failed....

In [23]:
class PredictModelSuggest(dspy.Module):
    def __init__(self):
        self.predict = dspy.ChainOfThought(ReferenceAnswer)
        
    def forward(self, question):
        output = self.predict(question=question)
        dspy.Suggest(
            output.answer.lower() in ["agricultural runoff", "chemical spillage", "natural contaminants", "industrial waste", "heavy metals", "microbial contamination", "sewage leakage"],
            "Answer can only be one of the following: Agricultural Runoff, Chemical Spillage, Natural Contaminants, Industrial Waste, Heavy Metals, Microbial Contamination, Sewage Leakage"
        )
        return dspy.Prediction(answer=output.answer)
    
predict_suggest = PredictModelSuggest().activate_assertions()
res = predict_suggest("Which category of contamination includes naturally occurring vanadium in certain groundwater sources?")


In [24]:
print(res.answer) ### Actual Answer: Natural Contaminants

Natural Contaminants
