# RAG Evaluations

In [3]:
import os
import dspy

In [4]:
# Move up one directory so the relative paths used below ("./data", "./results",
# "chroma.db") and the `src` package resolve from the new working directory.
# Guarded so that re-running this cell does not climb one level higher each time.
# NOTE(review): assumes a `src/` directory exists only in the target directory — confirm.
if not os.path.isdir('src'):
    os.chdir('../')

In [11]:
from src.chromadb_rm import ChromadbRM

In [15]:
# Signature for the answer-generation step: takes `context` and `question`
# as inputs and declares a single `answer` output. It is wrapped in
# dspy.ChainOfThought by the RAG module defined below.
# NOTE(review): the class docstring and the `desc` strings are presumably
# read by DSPy when it builds the prompt — confirm before rewording them.
class GenerateAnswer(dspy.Signature):
    """Answer questions given the context"""

    # Input: the retrieved passages passed in by the caller.
    context = dspy.InputField(desc="may contain relevant facts")
    # Input: the question to answer.
    question = dspy.InputField()
    # Output: the generated answer; the description asks for 1-5 words.
    answer = dspy.OutputField(desc="Short factual answer to the question. 1 - 5 words long.")

class RAG(dspy.Module):
    """Two-step pipeline: retrieve passages for a question, then answer from them."""

    def __init__(self, num_passages=5):
        """Create the retriever (configured with k=num_passages) and the answer generator."""
        super().__init__()
        self.retrieve = dspy.Retrieve(k=num_passages)
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)

    def forward(self, question):
        """Answer ``question`` and return a Prediction carrying both the
        passages that were used (``context``) and the generated ``answer``."""
        passages = self.retrieve(question).passages
        generated = self.generate_answer(context=passages, question=question)
        return dspy.Prediction(context=passages, answer=generated.answer)

In [16]:
def setup(
    openai_model='gpt-3.5-turbo',
    collection_name="test",
    persist_directory="chroma.db",
    local_embed_model="sentence-transformers/paraphrase-MiniLM-L6-v2",
    num_passages=5,
):
    """
    Set up the DSPy language model and retrieval model, then build a RAG program.

    Calling ``setup()`` with no arguments reproduces the original hard-coded
    configuration.

    Args:
        openai_model: Model name passed to ``dspy.OpenAI``.
        collection_name: Collection name passed to ``ChromadbRM``.
        persist_directory: Persist directory passed to ``ChromadbRM``.
        local_embed_model: Local embedding model name passed to ``ChromadbRM``.
        num_passages: Forwarded to ``RAG(num_passages=...)``.

    Returns:
        A new, uncompiled ``RAG`` instance.

    Raises:
        KeyError: If the ``OPENAI_API_KEY`` environment variable is not set.

    Side effects:
        Calls ``dspy.settings.configure`` with the language model and the
        retrieval model created here.
    """
    turbo = dspy.OpenAI(model=openai_model)

    chroma_rm = ChromadbRM(
        collection_name=collection_name,
        persist_directory=persist_directory,
        local_embed_model=local_embed_model,
        openai_api_key=os.environ["OPENAI_API_KEY"],
    )

    dspy.settings.configure(lm=turbo, rm=chroma_rm)

    return RAG(num_passages=num_passages)

In [17]:
# Configure DSPy (language model + retriever) and build the uncompiled RAG
# program that the first evaluation below runs.
rag = setup()

Collection Count: 3789


  return torch._C._cuda_getDeviceCount() > 0


In [48]:
# Read question, ground_truths from ./data/processed/synthetic_dataset.csv
import pandas as pd

# Load the synthetic QA set and keep only the two columns used downstream.
df = pd.read_csv("./data/processed/synthetic_dataset.csv").loc[:, ['question', 'ground_truths']]

In [49]:
df.head()

Unnamed: 0,question,ground_truths
0,"Who directed the play ""How to Curse"" in 2007?",['Josie Rourke']
1,"Who directed the film ""Donkey Punch""?",['Olly Blackburn.']
2,Who was Du Fu's paternal grandfather?,['Du Shenyan.']
3,How many children did Du Fu have by 757?,['Five.']
4,Where did Du Fu spend most of the next five ye...,['Sichuan province.']


In [76]:
from sklearn.model_selection import train_test_split

In [77]:
# Split the data into train (80%) and test (20%).
# A fixed random_state makes the split reproducible, so the evaluation numbers
# below can be compared across kernel restarts and re-runs.
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [79]:
# Persist both splits (without the index column) next to the source dataset.
for split_frame, split_path in (
    (train, "./data/processed/train_synthetic.csv"),
    (test, "./data/processed/test_synthetic.csv"),
):
    split_frame.to_csv(split_path, index=False)

In [51]:
import tqdm

# Run the RAG program on every test question and record, per question,
# the retrieved contexts, the generated answer and the reference answers.
eval_results_rows = []

for question, ground_truths in zip(test['question'], test['ground_truths']):
    response = rag(question)
    eval_results_rows.append(
        {
            'question': question,
            'contexts': response.context,
            'answer': response.answer,
            'ground_truths': ground_truths,
        }
    )

# One row per test question.
df_eval_results = pd.DataFrame(eval_results_rows)


In [81]:
df_eval_results

Unnamed: 0,question,contexts,answer,ground_truths_x,ground_truths_y
0,"Who directed the play ""How to Curse"" in 2007?","[in 2006, boulter starred alongside whishaw in...",Josie Rourke,[Josie Rourke],['Josie Rourke']
1,"Who directed the film ""Donkey Punch""?",[. turan agreed that mendes'choice of collabor...,Oliver Blackburn.,[Olly Blackburn.],['Olly Blackburn.']
2,What is Du Fu known for writing extensively ab...,[criticism of du fu's works has focused on his...,Lushi.,"[Domestic life, calligraphy, paintings, animals.]","['Domestic life, calligraphy, paintings, anima..."
3,"Which chart did ""Kiss You"" debut on in the Uni...",[. it peaked at number 13 in its third and fou...,Billboard,[United States Billboard Hot 100.],['United States Billboard Hot 100.']
4,Who held the Vevo record for the most views in...,[the music video garnered 10 @. @ 4 million vi...,Justin Bieber.,[Justin Bieber],['Justin Bieber']
...,...,...,...,...,...
82,Who ordered No. 202 Squadron RAF to Gibraltar?,[. the raf dispatched their next squadron to g...,No. 202 Squadron RAF was ordered to Gibraltar.,[Admiralty],['Admiralty']
83,Who was the senior tunnel guide with the Royal...,[work in gibraltar began immediately under com...,Pete Jackson.,[Sergeant Major Pete Jackson.],['Sergeant Major Pete Jackson.']
84,Who was the Roman Emperor after Nerva?,[nerva ( latin : marcus cocceius nerva caesar ...,Trajan,[Trajan],['Trajan']
85,Who was proclaimed emperor after the assassina...,"[on 18 september, 96, domitian was assassinate...",Marcus Cocceius Nerva,[Marcus Cocceius Nerva.],['Marcus Cocceius Nerva.']


In [68]:
import ast

# Convert the string-encoded ground_truths (e.g. "['Josie Rourke']") to lists.
# Only strings are parsed, so re-running this cell is a no-op instead of
# raising when the values have already been converted.
df_eval_results['ground_truths'] = df_eval_results['ground_truths'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

In [82]:
# Save the df_eval_results DataFrame to a csv file
import time
EXP_NAME = "SIMPLE_RAG"
TIMESTAMP = time.strftime("%Y%m%d-%H%M%S")
# The file name carries the experiment name and the time this cell ran.
inference_csv = f'./results/inference_{EXP_NAME}_{TIMESTAMP}.csv'
df_eval_results.to_csv(inference_csv, index=False)

Now that we have answers for all the questions, we can evaluate the RAG model.

In [87]:
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    answer_similarity,
    context_relevancy
)
from datasets import Dataset
from ragas import evaluate

# ragas consumes a Hugging Face Dataset built from the inference results.
ds = Dataset.from_pandas(df_eval_results)

# Metrics to score, in the order they are evaluated.
ragas_metrics = [
    context_precision,
    faithfulness,
    answer_relevancy,
    context_recall,
    answer_similarity,
    context_relevancy,
]

result = evaluate(ds, metrics=ragas_metrics)

evaluating with [context_precision]


100%|██████████| 6/6 [00:25<00:00,  4.17s/it]
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


evaluating with [faithfulness]


100%|██████████| 6/6 [00:32<00:00,  5.49s/it]
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


evaluating with [answer_relevancy]


100%|██████████| 6/6 [00:52<00:00,  8.70s/it]
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


evaluating with [context_recall]


100%|██████████| 6/6 [00:14<00:00,  2.46s/it]
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


evaluating with [answer_similarity]


100%|██████████| 6/6 [00:06<00:00,  1.13s/it]
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


evaluating with [context_relevancy]


100%|██████████| 6/6 [00:16<00:00,  2.69s/it]
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [88]:
result

{'context_precision': 0.5991, 'faithfulness': 0.7126, 'answer_relevancy': 0.8544, 'context_recall': 0.7835, 'answer_similarity': 0.9124, 'context_relevancy': 0.1115}

In [89]:
# Write the per-question scores to a CSV named after the experiment and its timestamp.
evaluation_csv = f'./results/evaluation_{EXP_NAME}_{TIMESTAMP}.csv'
result.to_pandas().to_csv(evaluation_csv, index=False)

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [90]:
result.to_pandas()

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


Unnamed: 0,question,contexts,answer,ground_truths,context_precision,faithfulness,answer_relevancy,context_recall,answer_similarity,context_relevancy
0,"Who directed the play ""How to Curse"" in 2007?","[in 2006, boulter starred alongside whishaw in...",Josie Rourke,[Josie Rourke],1.00,1.0,0.981121,1.0,0.999998,0.111111
1,"Who directed the film ""Donkey Punch""?",[. turan agreed that mendes'choice of collabor...,Oliver Blackburn.,[Olly Blackburn.],0.00,0.0,0.707491,0.0,0.950973,0.000000
2,What is Du Fu known for writing extensively ab...,[criticism of du fu's works has focused on his...,Lushi.,"[Domestic life, calligraphy, paintings, animals.]",0.20,1.0,0.961328,0.0,0.803045,0.100000
3,"Which chart did ""Kiss You"" debut on in the Uni...",[. it peaked at number 13 in its third and fou...,Billboard,[United States Billboard Hot 100.],0.00,0.0,0.799624,0.0,0.877977,0.000000
4,Who held the Vevo record for the most views in...,[the music video garnered 10 @. @ 4 million vi...,Justin Bieber.,[Justin Bieber],1.00,1.0,0.947051,1.0,0.957363,0.000000
...,...,...,...,...,...,...,...,...,...,...
82,Who ordered No. 202 Squadron RAF to Gibraltar?,[. the raf dispatched their next squadron to g...,No. 202 Squadron RAF was ordered to Gibraltar.,[Admiralty],1.00,1.0,0.939381,1.0,0.775491,0.062500
83,Who was the senior tunnel guide with the Royal...,[work in gibraltar began immediately under com...,Pete Jackson.,[Sergeant Major Pete Jackson.],0.20,0.0,0.945070,1.0,0.902460,0.071429
84,Who was the Roman Emperor after Nerva?,[nerva ( latin : marcus cocceius nerva caesar ...,Trajan,[Trajan],0.25,1.0,0.941410,1.0,1.000000,0.076923
85,Who was proclaimed emperor after the assassina...,"[on 18 september, 96, domitian was assassinate...",Marcus Cocceius Nerva,[Marcus Cocceius Nerva.],1.00,1.0,0.980902,1.0,0.986040,0.105263


In [91]:
# Logging to wandb

import wandb

# Hyperparameters and run metadata stored with this run.
run_config = {
    "number_of_questions": len(ds),
    "comments": "Simple QA RAG model with no teleprompter",
    "model": "RAG",
    "dataset": "Synthetic",
    "num_passages": 5,
    "openai_model": "gpt-3.5-turbo",
    "chroma_collection_name": "test",
    "chroma_persist_directory": "chroma.db",
    "chroma_local_embed_model": "sentence-transformers/paraphrase-MiniLM-L6-v2",
}

# Open a run in the "wikitext-rag-eval" project, log the ragas result, then close the run.
wandb.init(project="wikitext-rag-eval", config=run_config)
wandb.log(result)
wandb.finish()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mprasadshreyas[0m. Use [1m`wandb login --relogin`[0m to force relogin




VBox(children=(Label(value='0.020 MB of 0.020 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
answer_relevancy,▁
answer_similarity,▁
context_precision,▁
context_recall,▁
context_relevancy,▁
faithfulness,▁

0,1
answer_relevancy,0.85442
answer_similarity,0.91238
context_precision,0.59906
context_recall,0.78352
context_relevancy,0.11154
faithfulness,0.71264


----

Now, let's compile the RAG program using a DSPy teleprompter (`BootstrapFewShot`).

In [101]:
# Renumber the training rows 0..n-1. Reassigning instead of using inplace=True
# avoids mutating a frame that an earlier cell already used.
train = train.reset_index(drop=True)

In [105]:
# Keep only the first ten training rows.
train = train.iloc[:10]

In [106]:
train

Unnamed: 0,question,ground_truths
0,What instruments did Thomas Newman mainly use ...,['Percussion instruments.']
1,Who led the Praetorian Guard in the siege of t...,['Casperius Aelianus.']
2,Where is San Lorenzo Colossal Head 2 currently...,['Mexico City.']
3,Who praised Coleman and the new side of the Do...,"[""The Mirror's Jon Cooper.""]"
4,Q: What was the North Korean strategy during t...,['A: Double envelopment of flanks.']
5,What were the effects of Typhoon Kujira in Japan?,['Agricultural damage and fatalities.']
6,Q: What was the name of the village located at...,['A: Agok.']
7,Q1: When were the sisters Ise and Hyūga transf...,['A1: 1 May 1944.']
8,Who was Nero's mother?,['Agrippina.']
9,What was the peak wind gust on Chichi-jima dur...,['200 km/h (124 mph)']


In [126]:
import ast

# Build DSPy examples from the first five training rows: the question is the
# only input field, and the first ground-truth string is the expected answer.
trainset = [
    dspy.Example(
        question=train['question'].iloc[row_pos],
        answer=ast.literal_eval(train['ground_truths'].iloc[row_pos])[0],
    ).with_inputs('question')
    for row_pos in range(5)
]

In [127]:
trainset

[Example({'question': 'What instruments did Thomas Newman mainly use to create the score for American Beauty?', 'answer': 'Percussion instruments.'}) (input_keys={'question'}),
 Example({'question': 'Who led the Praetorian Guard in the siege of the Imperial Palace?', 'answer': 'Casperius Aelianus.'}) (input_keys={'question'}),
 Example({'question': 'Where is San Lorenzo Colossal Head 2 currently located?', 'answer': 'Mexico City.'}) (input_keys={'question'}),
 Example({'question': 'Who praised Coleman and the new side of the Doctor in "The Snowmen"?', 'answer': "The Mirror's Jon Cooper."}) (input_keys={'question'}),
 Example({'question': 'Q: What was the North Korean strategy during the Korean War?', 'answer': 'A: Double envelopment of flanks.'}) (input_keys={'question'})]

In [128]:
from dspy.teleprompt import BootstrapFewShot

def validate_context_and_answer(example, pred, trace=None):
    """Metric used while bootstrapping demonstrations.

    Combines two DSPy checks with ``and``: that the predicted answer matches
    the example's answer, and that the retrieved context contains that answer.
    ``trace`` is accepted but not used.
    """
    exact_match = dspy.evaluate.answer_exact_match(example, pred)
    passage_match = dspy.evaluate.answer_passage_match(example, pred)
    return exact_match and passage_match


# Basic teleprompter that will compile the RAG program with the metric above.
teleprompter = BootstrapFewShot(metric=validate_context_and_answer)

# Compile a fresh RAG instance against the training examples.
compiled_rag = teleprompter.compile(RAG(), trainset=trainset)

100%|██████████| 5/5 [00:06<00:00,  1.35s/it]

Bootstrapped 3 full traces after 5 examples in round 0.





In [161]:
import ast
def get_evals(dataset, rag):
    """Run ``rag`` on every question in ``dataset`` and collect the results.

    Args:
        dataset: DataFrame with a 'question' column and a 'ground_truths'
            column. Each 'ground_truths' value may be a list or the string
            form of a list (e.g. "['Trajan']").
        rag: Callable that takes a question string and returns an object
            exposing ``context`` and ``answer`` attributes.

    Returns:
        DataFrame with columns 'question', 'contexts', 'answer' and
        'ground_truths' (as Python lists), one row per input row in input
        order. An empty ``dataset`` yields an empty frame with those columns.
    """
    columns = ['question', 'contexts', 'answer', 'ground_truths']
    eval_results_rows = []

    for _, row in dataset.iterrows():
        question = row['question']
        response = rag(question)

        # Parse only string-encoded lists; values that are already lists
        # are passed through instead of crashing ast.literal_eval.
        ground_truths = row['ground_truths']
        if isinstance(ground_truths, str):
            ground_truths = ast.literal_eval(ground_truths)

        eval_results_rows.append(
            {
                'question': question,
                'contexts': response.context,
                'answer': response.answer,
                'ground_truths': ground_truths,
            }
        )

    # Passing the columns explicitly keeps the schema stable even with zero rows.
    return pd.DataFrame(eval_results_rows, columns=columns)


In [None]:

# Re-run inference on the test split, this time with the compiled RAG program.
df_eval_results = get_evals(test, compiled_rag)


In [135]:
# Save the df_eval_results DataFrame to a csv file
import time
EXP_NAME = "COMPILED_RAG"
TIMESTAMP = time.strftime("%Y%m%d-%H%M%S")
# The file name carries the experiment name and the time this cell ran.
inference_csv = f'./results/inference_{EXP_NAME}_{TIMESTAMP}.csv'
df_eval_results.to_csv(inference_csv, index=False)

Now that we have answers for all the questions, we can evaluate the compiled RAG model.

In [136]:
# ragas consumes a Hugging Face Dataset built from the inference results.
ds = Dataset.from_pandas(df_eval_results)

# Metrics to score, in the order they are evaluated.
ragas_metrics = [
    context_precision,
    faithfulness,
    answer_relevancy,
    context_recall,
    answer_similarity,
    context_relevancy,
]

result = evaluate(ds, metrics=ragas_metrics)

evaluating with [context_precision]


100%|██████████| 6/6 [00:22<00:00,  3.79s/it]
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


evaluating with [faithfulness]


100%|██████████| 6/6 [00:30<00:00,  5.15s/it]
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


evaluating with [answer_relevancy]


100%|██████████| 6/6 [00:52<00:00,  8.80s/it]
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


evaluating with [context_recall]


100%|██████████| 6/6 [00:13<00:00,  2.27s/it]
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


evaluating with [answer_similarity]


100%|██████████| 6/6 [00:03<00:00,  1.58it/s]
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


evaluating with [context_relevancy]


100%|██████████| 6/6 [00:13<00:00,  2.23s/it]
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [137]:
result

{'context_precision': 0.5905, 'faithfulness': 0.7500, 'answer_relevancy': 0.8759, 'context_recall': 0.7694, 'answer_similarity': 0.9061, 'context_relevancy': 0.1104}

In [138]:
# Write the per-question scores to a CSV named after the experiment and its timestamp.
evaluation_csv = f'./results/evaluation_{EXP_NAME}_{TIMESTAMP}.csv'
result.to_pandas().to_csv(evaluation_csv, index=False)

In [139]:
result.to_pandas()

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


Unnamed: 0,question,contexts,answer,ground_truths,context_precision,faithfulness,answer_relevancy,context_recall,answer_similarity,context_relevancy
0,What type of teeth do temnospondyls have on th...,"[unlike semiaquatic temnospondyls, terrestrial...",Teeth.,[Tusks.],0.416667,1.0,0.878008,0.750000,0.855351,0.210526
1,Who were the Principal Architects for Palestin...,[= = = architects and sculptors = = = as well ...,Sir John James Burnet.,[Sir John James Burnet and Thomas Smith Tait.],1.000000,1.0,0.975334,1.000000,0.946708,0.090909
2,What is the title of Brock Lesnar's autobiogra...,"[in 2009, lesnar signed an endorsement deal wi...",Death Clutch.,[Death Clutch],1.000000,1.0,1.000000,1.000000,0.989006,0.071429
3,Where is the replica of San Lorenzo Head 8 loc...,"[san francisco, california. a replica of san l...","West Valley City, Utah.",[Utah Cultural Celebration Center.],0.866667,1.0,0.723326,1.000000,0.853857,0.000000
4,What was the main flaw in the design of the Fu...,"[the progress of fuso's construction, while th...",Distribution of midships gun turrets.,[Midships gun turrets.],1.000000,1.0,0.961188,1.000000,0.958398,0.100000
...,...,...,...,...,...,...,...,...,...,...
81,Q: How many Marines and Navy SEALs were part o...,[. the marines were to be released from 2nd di...,60.,[A: 51 Marines and 9 Navy SEALs.],0.500000,1.0,0.820366,0.333333,0.770896,0.000000
82,What is the estimated weight of the La Cobata ...,[the la cobata head is more or less rounded an...,40 tons.,[40 tons.],0.700000,1.0,0.974329,1.000000,1.000000,0.107143
83,What is the name of the group of temnospondyls...,"[temnospondyls, like all amphibians, reproduce...",Stereospondyli.,[Stereospondyli.],0.500000,1.0,0.951946,1.000000,1.000000,0.055556
84,Who promoted Brad Stevens to a full-time assis...,[. within 24 hours of the interviews stevens w...,Todd Lickliter.,[Todd Lickliter.],0.500000,1.0,0.841781,1.000000,1.000000,0.055556


In [140]:
# Logging to wandb

# Hyperparameters and run metadata stored with this run.
run_config = {
    "number_of_questions": len(ds),
    "comments": "Compiled QA RAG model with teleprompter",
    "model": "RAG",
    "dataset": "Synthetic",
    "num_passages": 5,
    "openai_model": "gpt-3.5-turbo",
    "chroma_collection_name": "test",
    "chroma_persist_directory": "chroma.db",
    "chroma_local_embed_model": "sentence-transformers/paraphrase-MiniLM-L6-v2",
}

# Open a run in the "wikitext-rag-eval" project, log the ragas result, then close the run.
wandb.init(project="wikitext-rag-eval", config=run_config)
wandb.log(result)
wandb.finish()

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011114155201034414, max=1.0…



VBox(children=(Label(value='0.020 MB of 0.020 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
answer_relevancy,▁
answer_similarity,▁
context_precision,▁
context_recall,▁
context_relevancy,▁
faithfulness,▁

0,1
answer_relevancy,0.8759
answer_similarity,0.90609
context_precision,0.59052
context_recall,0.76938
context_relevancy,0.11039
faithfulness,0.75


-------

No Retrieval
---

In [158]:
# Signature for the no-retrieval baseline: a single `question` input and a
# single `answer` output, with no context field.
# NOTE(review): the class docstring and the `desc` string are presumably read
# by DSPy when it builds the prompt — confirm before rewording them.
class BasicQA(dspy.Signature):
    """Answer questions with short factoid answers."""

    # Input: the question to answer.
    question = dspy.InputField()
    # Output: the generated answer; the description hints at 1-5 words.
    answer = dspy.OutputField(desc="often between 1 and 5 words")

In [159]:
# Define the predictor: a plain dspy.Predict over BasicQA, called below with
# only the question (no retrieved context is passed in).
generate_answer = dspy.Predict(BasicQA)

In [166]:
# Ask the retrieval-free predictor every test question and record its answer
# together with the reference answers (there is no 'contexts' column here).
eval_results_rows = []

for question, ground_truths in zip(test['question'], test['ground_truths']):
    response = generate_answer(question=question)
    eval_results_rows.append(
        {
            'question': question,
            'answer': response.answer,
            'ground_truths': ground_truths,
        }
    )

# One row per test question.
df_eval_results = pd.DataFrame(eval_results_rows)

# Turn the string-encoded ground_truths back into Python lists.
df_eval_results['ground_truths'] = df_eval_results['ground_truths'].apply(ast.literal_eval)

In [170]:
# Only answer_similarity is scored for this run; the frame has no 'contexts' column.
ds = Dataset.from_pandas(df_eval_results)

baseline_metrics = [answer_similarity]
result = evaluate(ds, metrics=baseline_metrics)

evaluating with [answer_similarity]


100%|██████████| 6/6 [00:03<00:00,  1.59it/s]
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [171]:
result

{'answer_similarity': 0.8535}

In [172]:
# save the result
# Use a dedicated experiment name and a fresh timestamp. Without this, EXP_NAME
# and TIMESTAMP still hold the "COMPILED_RAG" values set earlier, and this
# write would overwrite that run's evaluation CSV.
EXP_NAME = "NO_RAG"
TIMESTAMP = time.strftime("%Y%m%d-%H%M%S")
result.to_pandas().to_csv('./results/evaluation_' + EXP_NAME + '_' + TIMESTAMP + '.csv', index=False)

In [173]:
result.to_pandas()

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


Unnamed: 0,question,answer,ground_truths,answer_similarity
0,What type of teeth do temnospondyls have on th...,Conical teeth,[Tusks.],0.815168
1,Who were the Principal Architects for Palestin...,Sir Ronald Storrs and Sir William Fisher,[Sir John James Burnet and Thomas Smith Tait.],0.844589
2,What is the title of Brock Lesnar's autobiogra...,Death Clutch,[Death Clutch],1.000000
3,Where is the replica of San Lorenzo Head 8 loc...,"Museo Nacional de Antropología, Mexico City",[Utah Cultural Celebration Center.],0.809204
4,What was the main flaw in the design of the Fu...,Weak armor,[Midships gun turrets.],0.792137
...,...,...,...,...
81,Q: How many Marines and Navy SEALs were part o...,A: 30,[A: 51 Marines and 9 Navy SEALs.],0.803497
82,What is the estimated weight of the La Cobata ...,Approximately 20 tons.,[40 tons.],0.878407
83,What is the name of the group of temnospondyls...,Metoposaurids,[Stereospondyli.],0.824162
84,Who promoted Brad Stevens to a full-time assis...,Doc Rivers,[Todd Lickliter.],0.790983


In [174]:
# Logging to wandb

# start a new wandb run to track this script
wandb.init(
    # set the wandb project where this run will be logged
    project="wikitext-rag-eval",

    # track hyperparameters and run metadata
    # This run uses dspy.Predict(BasicQA) with no retrieval step, so the config
    # no longer labels it as the RAG model with 5 passages and Chroma settings.
    # The same keys are kept so runs stay comparable in the project.
    config={
        "number_of_questions": len(ds),
        "comments": "No RAG model",
        "model": "BasicQA",
        "dataset": "Synthetic",
        "num_passages": 0,
        "openai_model": "gpt-3.5-turbo",
        "chroma_collection_name": None,
        "chroma_persist_directory": None,
        "chroma_local_embed_model": None,
    }
)

wandb.log(result)

wandb.finish()



VBox(children=(Label(value='0.020 MB of 0.020 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
answer_similarity,▁

0,1
answer_similarity,0.85353
