In [7]:
import pandas as pd
import yaml
import pandas as pd
from pyprojroot import here
from collections import Counter
import os
import plotly.express as px

**Load the Excel file and check for NaN values**

In [8]:
# Read the project configuration; it supplies the directory and file name of the
# evaluation spreadsheet.
# NOTE(review): yaml.FullLoader is more permissive than yaml.safe_load — acceptable for a
# trusted config file, but worth revisiting if the config can ever come from outside.
with open(here("configs/config.yml")) as config_file:
    cfg = yaml.load(config_file, Loader=yaml.FullLoader)

# Load the evaluation sheet (one row per question) from the configured location.
eval_file_path = os.path.join(here(cfg["eval_questions_dir"]), cfg["eval_file_name"])
questions_df = pd.read_excel(eval_file_path)

# Empty summary frame; its two columns are filled in by later cells.
final_df = pd.DataFrame(columns=["best_scorer", "num_top_appearance"])

# Sanity check: how many token-MMR scores are missing, and on which rows.
missing_token_mmr = questions_df["langchain_token_mmr_score"].isna()
print(missing_token_mmr.sum())
print(questions_df[missing_token_mmr].index)

0
Index([], dtype='int64')


In [9]:
# List the columns of the evaluation sheet; the cells below select the
# `*_score`, `*_result` and `*_inference_time` columns by name.
questions_df.columns

Index(['source', 'question', 'correct_answer', 'langchain_token_mmr_result',
       'langchain_token_mmr_inference_time',
       'langchain_recursive_similarity_result',
       'langchain_recursive_similarity_inference_time',
       'langchain_recursive_mmr_result',
       'langchain_recursive_mmr_inference_time',
       'llama_index_sentence_retrieval_result',
       'llama_index_sentence_retrieval_inference_time',
       'llama_index_auto_merging_retrieval_result',
       'llama_index_auto_merging_retrieval_inference_time',
       'langchain_recursive_similarity_score', 'langchain_recursive_mmr_score',
       'llama_index_sentence_retrieval_score',
       'llama_index_auto_merging_retrieval_score', 'lowest_score',
       'highest_score', 'langchain_token_mmr_score',
       'langchain_token_mmr_result_score', 'langchain_token_mmr_score_score',
       '1) langchain_token_mmr_score'],
      dtype='object')

----------------------------------------------------

**Compute and print total scores (scores are out of 40):**

In [12]:
# Overall score of each technique: the sum of its per-question score column.
total_by_column = {
    score_column: sum(questions_df[score_column])
    for score_column in (
        "langchain_token_mmr_score",
        "langchain_recursive_similarity_score",
        "langchain_recursive_mmr_score",
        "llama_index_sentence_retrieval_score",
        "llama_index_auto_merging_retrieval_score",
    )
}
langchain_token_mmr_total_score = total_by_column["langchain_token_mmr_score"]
langchain_similarity_total_score = total_by_column["langchain_recursive_similarity_score"]
langchain_recursive_mmr_total_score = total_by_column["langchain_recursive_mmr_score"]
llama_index_sentence_retrieval_score = total_by_column["llama_index_sentence_retrieval_score"]
llama_index_auto_merging_retrieval_total_score = total_by_column["llama_index_auto_merging_retrieval_score"]

# One report line per technique, totals rounded to two decimals for display only.
report_lines = [
    "Total scores:",
    f"    langchain - token_mmr_total_score: {round(langchain_token_mmr_total_score, 2)}, ",
    f"    langchain - recursive_similarity_total_score: {round(langchain_similarity_total_score, 2)},",
    f"    langchain - recursive_mmr_total_score: {round(langchain_recursive_mmr_total_score, 2)},",
    f"    llama_index - sentence_retrieval_score: {round(llama_index_sentence_retrieval_score, 2)},",
    f"    llama_index - auto_merging_retrieval_total_score: {round(llama_index_auto_merging_retrieval_total_score, 2)}",
]
print("\n".join(report_lines))

# Unrounded totals, in the technique order used when the summary frame is filled.
scorer_list = [
    langchain_token_mmr_total_score,
    langchain_similarity_total_score,
    langchain_recursive_mmr_total_score,
    llama_index_sentence_retrieval_score,
    llama_index_auto_merging_retrieval_total_score,
]

Total scores:
    langchain - token_mmr_total_score: 36.1, 
    langchain - recursive_similarity_total_score: 38.4,
    langchain - recursive_mmr_total_score: 35.95,
    llama_index - sentence_retrieval_score: 36.6,
    llama_index - auto_merging_retrieval_total_score: 32.3


In [15]:
# Store the five total scores in the summary frame. Rows follow the order of
# `scorer_list`: token_mmr, recursive_similarity, recursive_mmr,
# sentence_retrieval, auto_merging_retrieval.
final_df["best_scorer"] = scorer_list

------------------------------------------

In [16]:
# Work on the five score columns only (one column per technique).
score_columns = [
    'langchain_token_mmr_score',
    'langchain_recursive_similarity_score',
    'langchain_recursive_mmr_score',
    'llama_index_sentence_retrieval_score',
    'llama_index_auto_merging_retrieval_score',
]
score_df = questions_df[score_columns]


def _columns_equal_to(question_scores, target):
    """Return the names of the columns in one row whose score equals `target`."""
    return question_scores[question_scores == target].index.tolist()


# For every question, collect all techniques that share the highest score (ties included),
# then tally how often each technique shows up.
max_cols = score_df.apply(lambda scores: _columns_equal_to(scores, scores.max()), axis=1)
max_cols_list = max_cols.to_list()
max_cols_count = Counter()
for top_columns in max_cols_list:
    max_cols_count.update(top_columns)

# Same tally for the techniques that share the lowest score on each question.
min_cols = score_df.apply(lambda scores: _columns_equal_to(scores, scores.min()), axis=1)
min_cols_list = min_cols.to_list()
min_cols_count = Counter()
for bottom_columns in min_cols_list:
    min_cols_count.update(bottom_columns)

**Count the number of times that each technique was among the highest scorers:**

In [17]:
# Ties are counted for every tied technique, so these counts can add up to
# more than the number of questions.
max_cols_count

Counter({'langchain_recursive_similarity_score': 36,
         'llama_index_sentence_retrieval_score': 34,
         'langchain_recursive_mmr_score': 33,
         'langchain_token_mmr_score': 33,
         'llama_index_auto_merging_retrieval_score': 23})

**Count the number of times that each technique was among the lowest scorers:**

In [18]:
# Ties are counted for every tied technique, so these counts can add up to
# more than the number of questions.
min_cols_count

Counter({'llama_index_auto_merging_retrieval_score': 32,
         'langchain_token_mmr_score': 21,
         'llama_index_sentence_retrieval_score': 21,
         'langchain_recursive_mmr_score': 20,
         'langchain_recursive_similarity_score': 19})

In [19]:
# Label the summary rows with the technique names. The order has to match the order of
# `scorer_list`, which was used to fill the "best_scorer" column.
final_df.index = [
    'langchain_token_mmr',
    'langchain_recursive_similarity',
    'langchain_recursive_mmr',
    'llama_index_sentence_retrieval',
    'llama_index_auto_merging_retrieval',
]

# Number of questions on which each technique was among the top scorers. The Counter keys are
# the score-column names, i.e. the technique name plus "_score". Assigning the whole column at
# once replaces the row-by-row `iterrows` loop and yields an integer column.
final_df["num_top_appearance"] = [
    int(max_cols_count[f"{method}_score"]) for method in final_df.index
]

----------------------------------------------------

**Count the number of times each method received a score of zero**

In [20]:
def count_number_of_zero_scores(df, column_name):
    """Count the rows of `df` whose value in `column_name` equals zero.

    Args:
        df: DataFrame that contains the column to inspect.
        column_name: Name of the column to compare against 0.

    Returns:
        The number of matching rows as a plain ``int``. Missing values never
        compare equal to 0, so they are not counted.
    """
    is_zero = df[column_name] == 0
    return int(is_zero.sum())

# Count, per technique, the questions that received a score of exactly zero.
zero_count_by_column = {
    score_column: count_number_of_zero_scores(questions_df, score_column)
    for score_column in (
        "langchain_token_mmr_score",
        "langchain_recursive_similarity_score",
        "langchain_recursive_mmr_score",
        "llama_index_sentence_retrieval_score",
        "llama_index_auto_merging_retrieval_score",
    )
}
langchain_token_mmr_total_zero_score = zero_count_by_column["langchain_token_mmr_score"]
langchain_recursive_similarity_total_zero_score = zero_count_by_column["langchain_recursive_similarity_score"]
langchain_recursive_mmr_total_zero_score = zero_count_by_column["langchain_recursive_mmr_score"]
llama_index_sentence_retrieval_zero_score = zero_count_by_column["llama_index_sentence_retrieval_score"]
llama_index_auto_merging_retrieval_total_zero_score = zero_count_by_column["llama_index_auto_merging_retrieval_score"]

# One report line per technique.
zero_report_lines = [
    "Total zero scores:",
    f"    langchain_token_mmr: {langchain_token_mmr_total_zero_score},",
    f"    langchain_recursive_similarity: {langchain_recursive_similarity_total_zero_score},",
    f"    langchain_recursive_mmr: {langchain_recursive_mmr_total_zero_score},",
    f"    llama_index_sentence_retrieval: {llama_index_sentence_retrieval_zero_score},",
    f"    llama_index_auto_merging_retrieval: {llama_index_auto_merging_retrieval_total_zero_score}",
]
print("\n".join(zero_report_lines))

Total zero scores:
    langchain_token_mmr: 2,
    langchain_recursive_similarity: 1,
    langchain_recursive_mmr: 2,
    llama_index_sentence_retrieval: 1,
    llama_index_auto_merging_retrieval: 3


------------------------------------

### Analysis based on each document

**Score of each technique on each document**

In [21]:
def _sum_score_by_source(score_column):
    """Return a one-column frame: `score_column` of `questions_df` summed per `source`."""
    return questions_df.pivot_table(
        index='source', columns=None, values=score_column, aggfunc='sum'
    )


# Per-document total score of every technique.
langchain_token_mmr_df = _sum_score_by_source('langchain_token_mmr_score')
langchain_recursive_similarity_df = _sum_score_by_source('langchain_recursive_similarity_score')
langchain_recursive_mmr_df = _sum_score_by_source('langchain_recursive_mmr_score')
llama_index_sentence_df = _sum_score_by_source('llama_index_sentence_retrieval_score')
llama_index_auto_merging_df = _sum_score_by_source('llama_index_auto_merging_retrieval_score')

# Put the five per-technique frames side by side: one row per document, one column per technique.
per_technique_frames = [
    langchain_token_mmr_df,
    langchain_recursive_similarity_df,
    langchain_recursive_mmr_df,
    llama_index_sentence_df,
    llama_index_auto_merging_df,
]
result_df = pd.concat(per_technique_frames, axis=1)

# Set the column labels explicitly, in the same order as the frames above.
result_df.columns = [
    'langchain_token_mmr_score',
    'langchain_recursive_similarity_score',
    'langchain_recursive_mmr_score',
    'llama_index_sentence_retrieval_score',
    'llama_index_auto_merging_retrieval_score',
]

# Show the combined table.
display(result_df)

Unnamed: 0_level_0,langchain_token_mmr_score,langchain_recursive_similarity_score,langchain_recursive_mmr_score,llama_index_sentence_retrieval_score,llama_index_auto_merging_retrieval_score
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
SegmentAnything paper,13.3,13.6,12.2,12.8,10.8
product scpecification,3.0,4.0,3.0,4.5,4.0
stories,4.5,5.0,4.75,4.5,3.3
technical support,5.0,5.0,5.0,4.3,5.0
vision transformer paper,10.3,10.8,11.0,10.5,9.2


**Average score per document (normalized)**

In [22]:
print("Normalized average score per document:")

# Take the document labels from the score table itself instead of a hand-typed list, so the
# labels cannot drift out of order or out of sync with the data.
documents = result_df.index.tolist()

# Number of questions asked about each document, looked up by label (not by position).
question_counts = questions_df.groupby("source")["question"].count()
num_questions = [int(question_counts[doc]) for doc in documents]

# Average, across the techniques, of the total score obtained on each document.
mean_doc_score = result_df.mean(axis=1).values.tolist()

# Dividing by the question count puts documents with different numbers of questions
# on the same per-question scale.
result = [round(x / y, 2) for x, y in zip(mean_doc_score, num_questions)]
result_dict = dict(zip(documents, result))
result_dict

Normalized average score per document:


{'SegmentAnything paper': 0.9,
 'product scpecification': 0.74,
 'stories': 0.88,
 'technical support': 0.97,
 'vision transformer paper': 0.94}

-----------------------------------------------------

### Inference time analysis

**Total inference time to answer 40 questions**

In [23]:
# Total inference time per technique: each timing column summed over all questions.
inference_time_columns = [
    "langchain_token_mmr_inference_time",
    "langchain_recursive_mmr_inference_time",
    "langchain_recursive_similarity_inference_time",
    "llama_index_sentence_retrieval_inference_time",
    "llama_index_auto_merging_retrieval_inference_time",
]
questions_df.loc[:, inference_time_columns].sum()

langchain_token_mmr_inference_time                    68.97
langchain_recursive_mmr_inference_time                62.03
langchain_recursive_similarity_inference_time         66.92
llama_index_sentence_retrieval_inference_time        100.95
llama_index_auto_merging_retrieval_inference_time     52.66
dtype: float64

**Average inference time (AVG time per question)**

In [24]:
# Average inference time per question for each technique: the mean of each timing column.
inference_time_columns = [
    "langchain_token_mmr_inference_time",
    "langchain_recursive_mmr_inference_time",
    "langchain_recursive_similarity_inference_time",
    "llama_index_sentence_retrieval_inference_time",
    "llama_index_auto_merging_retrieval_inference_time",
]
questions_df.loc[:, inference_time_columns].mean()

langchain_token_mmr_inference_time                   1.72425
langchain_recursive_mmr_inference_time               1.55075
langchain_recursive_similarity_inference_time        1.67300
llama_index_sentence_retrieval_inference_time        2.52375
llama_index_auto_merging_retrieval_inference_time    1.31650
dtype: float64

* Fastest: llama_index - auto_merging_retrieval
* Slowest: llama_index - sentence_retrieval

-------------------------

**Check the zero scored questions**

In [25]:
print("llama_index: auto_merging_retrieval")

# Select the questions on which this technique scored exactly zero, keeping only the
# columns needed to compare the reference answer with the generated one.
scored_zero = questions_df["llama_index_auto_merging_retrieval_score"] == 0
tmp_df = questions_df.loc[
    scored_zero,
    [
        "source",
        "question",
        "correct_answer",
        "llama_index_auto_merging_retrieval_result",
        "llama_index_auto_merging_retrieval_score",
    ],
]
display(tmp_df)

# Print every selected question with its reference and generated answer in full.
for question, correct_answer, given_answer in zip(
    tmp_df["question"],
    tmp_df["correct_answer"],
    tmp_df["llama_index_auto_merging_retrieval_result"],
):
    print(
        "Question:",
        question,
        "=====================",
        "Correct answer:",
        correct_answer,
        "",
        "Given answer:",
        given_answer,
        "------------------",
        sep="\n",
    )

llama_index: auto_merging_retrieval


Unnamed: 0,source,question,correct_answer,llama_index_auto_merging_retrieval_result,llama_index_auto_merging_retrieval_score
0,stories,"Who are Amarok, Fred, and Lily?",Amarok the Lone Wolf:\n\nAmarok is a lone wolf...,There is no information provided in the given ...,0.0
13,product scpecification,How much does CubeTriangle Kappa Portable Spea...,2000,The CubeTriangle Kappa Portable Speaker costs ...,0.0
30,SegmentAnything paper,How was the data engine used to collect the SA...,The data engine was used to collect the SA-1B ...,The data engine was used to collect the SA-1B ...,0.0


Question:
Who are Amarok, Fred, and Lily?
Correct answer:
Amarok the Lone Wolf:

Amarok is a lone wolf living in the Alaskan mountains.
Separated from his pack during a fierce winter storm, he embarks on a challenging journey to reunite with his family.
The story highlights Amarok's resilience, the beauty of the Alaskan wilderness, and the emotional reunion with his pack.
Fred the Red Fish:

Fred is a small red fish living in a vibrant coral reef.
His adventurous spirit leads him to discover a treasure map, prompting him to set off on a daring journey with his friend Delphi.
The narrative explores the wonders and challenges of the ocean, emphasizing Fred's growth and realization that home is where love resides.
Lily the Bee:

Lily is a young bee living in Blossom Valley with a deep love for flowers and her family.
Caught in a storm, Lily is carried far from home, leading her on an adventurous journey back.
Throughout her odyssey, Lily encounters various creatures, overcomes obstacles, 

In [26]:
print("llama_index: sentence_retrieval")

# Select the questions on which this technique scored exactly zero, keeping only the
# columns needed to compare the reference answer with the generated one.
scored_zero = questions_df["llama_index_sentence_retrieval_score"] == 0
tmp_df = questions_df.loc[
    scored_zero,
    [
        "source",
        "question",
        "correct_answer",
        "llama_index_sentence_retrieval_result",
        "llama_index_sentence_retrieval_score",
    ],
]
display(tmp_df)

# Print every selected question with its reference and generated answer in full.
for question, correct_answer, given_answer in zip(
    tmp_df["question"],
    tmp_df["correct_answer"],
    tmp_df["llama_index_sentence_retrieval_result"],
):
    print(
        "Question:",
        question,
        "==========================================",
        "Correct answer:",
        correct_answer,
        "",
        "Given answer:",
        given_answer,
        "------------------------------------------------------",
        sep="\n",
    )

llama_index: sentence_retrieval


Unnamed: 0,source,question,correct_answer,llama_index_sentence_retrieval_result,llama_index_sentence_retrieval_score
30,SegmentAnything paper,How was the data engine used to collect the SA...,The data engine was used to collect the SA-1B ...,The context does not provide information about...,0.0


Question:
How was the data engine used to collect the SA-1B mask dataset, and what were the three stages of the data collection process?
Correct answer:
The data engine was used to collect the SA-1B mask dataset, which involved three stages of data collection. In the first stage, a team of professional annotators labeled masks by clicking foreground/background object points using a browser-based interactive segmentation tool powered by SAM. In the second stage, annotators were presented with images pre-filled with automatically detected confident masks and asked to annotate any additional unannotated objects. In the third stage, annotation was fully automatic using the ambiguity-aware model, which prompted the model with a regular grid of points and predicted a set of masks that may correspond to valid objects. The resulting dataset contained 1.1 billion high-quality masks.

Given answer:
The context does not provide information about how the data engine was used to collect the SA-1B m

In [27]:
print("langchain: token_mmr")

# Select the questions on which this technique scored exactly zero, keeping only the
# columns needed to compare the reference answer with the generated one.
scored_zero = questions_df["langchain_token_mmr_score"] == 0
tmp_df = questions_df.loc[
    scored_zero,
    [
        "source",
        "question",
        "correct_answer",
        "langchain_token_mmr_result",
        "langchain_token_mmr_score",
    ],
]
display(tmp_df)

# Print every selected question with its reference and generated answer in full.
for question, correct_answer, given_answer in zip(
    tmp_df["question"],
    tmp_df["correct_answer"],
    tmp_df["langchain_token_mmr_result"],
):
    print(
        "Question:",
        question,
        "==========================================",
        "Correct answer:",
        correct_answer,
        "",
        "Given answer:",
        given_answer,
        "------------------------------------------------------",
        sep="\n",
    )

langchain: token_mmr


Unnamed: 0,source,question,correct_answer,langchain_token_mmr_result,langchain_token_mmr_score
13,product scpecification,How much does CubeTriangle Kappa Portable Spea...,2000,The CubeTriangle Kappa Portable Speaker costs ...,0.0
14,product scpecification,Is there any skateboard with 20 mph top speed ...,Yes the CubeTriangle Chi Electric Skateboard h...,"I'm sorry, but I couldn't find any information...",0.0


Question:
How much does CubeTriangle Kappa Portable Speaker cost?
Correct answer:
2000

Given answer:
The CubeTriangle Kappa Portable Speaker costs $1630.
------------------------------------------------------
Question:
Is there any skateboard with 20 mph top speed with a 15-mile range on a single charge?
Correct answer:
Yes the CubeTriangle Chi Electric Skateboard has the specifications that you are looing for. Here are the features of this product:
20 mph top speed with a 15-mile range on a single charge
Regenerative braking and shock-absorbing wheels for a smooth ride
Wireless remote control for effortless speed adjustments
Durable, waterproof deck with integrated LED safety lights
Companion app for route tracking and battery management
Price: $780

Given answer:
I'm sorry, but I couldn't find any information about a skateboard with a 20 mph top speed and a 15-mile range on a single charge in the retrieved content.
------------------------------------------------------


In [28]:
print("langchain: recursive_mmr")

# Select the questions on which this technique scored exactly zero, keeping only the
# columns needed to compare the reference answer with the generated one.
scored_zero = questions_df["langchain_recursive_mmr_score"] == 0
tmp_df = questions_df.loc[
    scored_zero,
    [
        "source",
        "question",
        "correct_answer",
        "langchain_recursive_mmr_result",
        "langchain_recursive_mmr_score",
    ],
]
display(tmp_df)

# Print every selected question with its reference and generated answer in full.
for question, correct_answer, given_answer in zip(
    tmp_df["question"],
    tmp_df["correct_answer"],
    tmp_df["langchain_recursive_mmr_result"],
):
    print(
        "Question:",
        question,
        "==========================================",
        "Correct answer:",
        correct_answer,
        "",
        "Given answer:",
        given_answer,
        "------------------------------------------------------",
        sep="\n",
    )

langchain: recursive_mmr


Unnamed: 0,source,question,correct_answer,langchain_recursive_mmr_result,langchain_recursive_mmr_score
13,product scpecification,How much does CubeTriangle Kappa Portable Spea...,2000,The CubeTriangle Kappa Portable Speaker is pri...,0.0
14,product scpecification,Is there any skateboard with 20 mph top speed ...,Yes the CubeTriangle Chi Electric Skateboard h...,"Based on the information from the vectorDB, th...",0.0


Question:
How much does CubeTriangle Kappa Portable Speaker cost?
Correct answer:
2000

Given answer:
The CubeTriangle Kappa Portable Speaker is priced at $730.
------------------------------------------------------
Question:
Is there any skateboard with 20 mph top speed with a 15-mile range on a single charge?
Correct answer:
Yes the CubeTriangle Chi Electric Skateboard has the specifications that you are looing for. Here are the features of this product:
20 mph top speed with a 15-mile range on a single charge
Regenerative braking and shock-absorbing wheels for a smooth ride
Wireless remote control for effortless speed adjustments
Durable, waterproof deck with integrated LED safety lights
Companion app for route tracking and battery management
Price: $780

Given answer:
Based on the information from the vectorDB, there is no specific skateboard mentioned with a 20 mph top speed and a 15-mile range on a single charge. However, there is a CubeTriangle Theta -2 Electric Scooter Pro ment

In [29]:
print("langchain: recursive_similarity")

# Select the questions on which this technique scored exactly zero, keeping only the
# columns needed to compare the reference answer with the generated one.
scored_zero = questions_df["langchain_recursive_similarity_score"] == 0
tmp_df = questions_df.loc[
    scored_zero,
    [
        "source",
        "question",
        "correct_answer",
        "langchain_recursive_similarity_result",
        "langchain_recursive_similarity_score",
    ],
]
display(tmp_df)

# Print every selected question with its reference and generated answer in full.
for question, correct_answer, given_answer in zip(
    tmp_df["question"],
    tmp_df["correct_answer"],
    tmp_df["langchain_recursive_similarity_result"],
):
    print(
        "Question:",
        question,
        "==========================================",
        "Correct answer:",
        correct_answer,
        "",
        "Given answer:",
        given_answer,
        "------------------------------------------------------",
        sep="\n",
    )

langchain: recursive_similarity


Unnamed: 0,source,question,correct_answer,langchain_recursive_similarity_result,langchain_recursive_similarity_score
13,product scpecification,How much does CubeTriangle Kappa Portable Spea...,2000,The CubeTriangle Kappa Portable Speaker is pri...,0.0


Question:
How much does CubeTriangle Kappa Portable Speaker cost?
Correct answer:
2000

Given answer:
The CubeTriangle Kappa Portable Speaker is priced at $430.
------------------------------------------------------


In [30]:
# Move the technique names out of the index into a regular 'method' column; the plot cell
# uses 'method' for the x axis, hover label and colour.
# NOTE(review): this reassigns final_df, so re-running the cell on the already-reset frame
# would presumably add a second 'method' column — run it once per kernel session.
final_df = final_df.reset_index().rename(columns={'index': 'method'})

In [31]:
# Make sure the top-appearance counts are numeric (float) before they are passed
# as the marker `size` of the scatter plot.
final_df['num_top_appearance'] = final_df['num_top_appearance'].astype(float)

In [32]:
# Bubble chart of the summary frame: one marker per technique, placed at its total score
# and sized by how often the technique was among the top scorers.
color_sequence = px.colors.qualitative.Plotly

# Create the scatter plot
fig = px.scatter(
    final_df,
    x='method',
    y='best_scorer',
    size='num_top_appearance',
    hover_name='method',
    color='method',  # Assign colors based on the 'method' column
    color_discrete_sequence=color_sequence,  # Use the defined color sequence
    title='<b>Best RAG scorer</b>'  # Bold tag is now properly closed
)

# Customize the layout
fig.update_layout(
    title_font=dict(family='Arial, sans-serif', size=24, color='black'),
    title_x=0.5,  # Center the title
    paper_bgcolor='white',  # Background color for the outer area
    plot_bgcolor='white',  # Background color for the plot area
    xaxis=dict(
        title='<b>Methods</b>',
        title_font=dict(size=24),
        tickangle=45,  # Rotate labels
        tickfont=dict(family='Arial, sans-serif', size=18, color='black'),
        showgrid=True  # Show gridlines
    ),
    yaxis=dict(
        title='<b>Scores</b>',
        title_font=dict(family='Arial, sans-serif', size=18, color='black'),
        tickfont=dict(family='Arial, sans-serif', size=16, color='black'),
        showgrid=True,  # Show gridlines
        gridcolor='lightgrey'  # Gridline color
    ),
    legend=dict(
        title='Methods',
        font=dict(family='Arial, sans-serif', size=16, color='black',),
        bgcolor='lightgrey',  # Background color for the legend
        bordercolor='black',  # Border color for the legend
        borderwidth=1
    ),
    margin=dict(l=60, r=60, t=60, b=60),  # Adjust margins to fit labels
    width=1800,  # Width of the entire plot image
    height=600  # Height of the entire plot image
)

# Customize marker appearance
fig.update_traces(
    marker=dict(
        line=dict(width=3, color='darkgrey')  # Border line for markers
    ),
    selector=dict(mode='markers')
)

# Show the plot
fig.show()