In [25]:
import pandas as pd
import yaml
import pandas as pd
from pyprojroot import here
from collections import Counter
import os
import plotly.express as px

In [26]:
# Parse the project configuration file.
with open(here("configs/config.yml")) as config_file:
    cfg = yaml.load(config_file, Loader=yaml.FullLoader)

# Read the evaluation spreadsheet from the directory and file name given in the config.
eval_file_path = os.path.join(here(cfg["eval_questions_dir"]), cfg["eval_file_name"])
questions_df = pd.read_excel(eval_file_path)

# Empty summary frame; both columns are filled in by later cells.
final_df = pd.DataFrame(columns=["best_scorer", "num_top_appearance"])

**Check for NaN values**

In [27]:
# Report how many rows have no langchain_token_mmr score, then the index of those rows.
missing_token_mmr_score = questions_df["langchain_token_mmr_score"].isna()
print(missing_token_mmr_score.sum())
print(questions_df[missing_token_mmr_score].index)

0
Index([], dtype='int64')


In [28]:
questions_df.columns

Index(['source', 'question', 'correct_answer', 'langchain_token_mmr_result',
       'langchain_token_mmr_inference_time',
       'langchain_recursive_similarity_result',
       'langchain_recursive_similarity_inference_time',
       'langchain_recursive_mmr_result',
       'langchain_recursive_mmr_inference_time',
       'llama_index_sentence_retrieval_result',
       'llama_index_sentence_retrieval_inference_time',
       'llama_index_auto_merging_retrieval_result',
       'llama_index_auto_merging_retrieval_inference_time',
       'langchain_recursive_similarity_score', 'langchain_recursive_mmr_score',
       'llama_index_sentence_retrieval_score',
       'llama_index_auto_merging_retrieval_score', 'lowest_score',
       'highest_score', 'langchain_token_mmr_score',
       'langchain_token_mmr_result_score', 'langchain_token_mmr_score_score'],
      dtype='object')

**Compute and print total scores (out of 40):**

In [29]:
# Add up each technique's per-question scores into a single total.
langchain_token_mmr_total_score = sum(
    questions_df["langchain_token_mmr_score"])
langchain_similarity_total_score = sum(
    questions_df["langchain_recursive_similarity_score"])
langchain_recursive_mmr_total_score = sum(
    questions_df["langchain_recursive_mmr_score"])
llama_index_sentence_retrieval_score = sum(
    questions_df["llama_index_sentence_retrieval_score"])
llama_index_auto_merging_retrieval_total_score = sum(
    questions_df["llama_index_auto_merging_retrieval_score"])

# Unrounded totals, kept in a fixed technique order for the summary table.
scorer_list = [
    langchain_token_mmr_total_score,
    langchain_similarity_total_score,
    langchain_recursive_mmr_total_score,
    llama_index_sentence_retrieval_score,
    llama_index_auto_merging_retrieval_total_score,
]

# Print the totals rounded to two decimals, one technique per line.
print(
    "Total scores:\n"
    f"    langchain_token_mmr_total_score: {round(langchain_token_mmr_total_score, 2)}, \n"
    f"    langchain_similarity_total_score: {round(langchain_similarity_total_score, 2)},\n"
    f"    langchain_recursive_mmr_total_score: {round(langchain_recursive_mmr_total_score, 2)},\n"
    f"    llama_index_sentence_retrieval_score: {round(llama_index_sentence_retrieval_score, 2)},\n"
    f"    llama_index_auto_merging_retrieval_total_score: {round(llama_index_auto_merging_retrieval_total_score, 2)}"
)

Total scores:
    langchain_token_mmr_total_score: 32.6, 
    langchain_similarity_total_score: 35.9,
    langchain_recursive_mmr_total_score: 32.0,
    llama_index_sentence_retrieval_score: 34.9,
    llama_index_auto_merging_retrieval_total_score: 29.8


* Best overall performance: langchain_recursive_similarity
* Worst overall performance: llama_index_auto_merging_retrieval

In [30]:
# Store the five total scores, in scorer_list order, as the "best_scorer" column.
final_df["best_scorer"] = scorer_list

In [7]:
scorer_list

[32.59999999999999,
 35.89999999999997,
 32.000000000000014,
 34.89999999999998,
 29.8]

------------------------------------------

In [118]:
#  langchain_token_mmr: 32.6, 
#     langchain_recursive_similarity: 35.9,
#     langchain_recursive_mmr: 32.0,
#     llama_index_sentence_retrieval: 34.9,
#     llama_index_auto_merging_retrieval: 29.8

In [None]:
# langchain_token_mmr_total_score: 32.6, 
# langchain_similarity_total_score: 35.9,
# langchain_mmr_total_score: 32.0,
# llama_index_sentence_retrieval_score: 34.9,
# llama_index_auto_merging_retrieval_total_score: 29.8

In [31]:
# Per-question scores, one column per technique.
technique_score_columns = [
    'langchain_token_mmr_score',
    'langchain_recursive_similarity_score',
    'langchain_recursive_mmr_score',
    'llama_index_sentence_retrieval_score',
    'llama_index_auto_merging_retrieval_score',
]
score_df = questions_df[technique_score_columns]

# For every question (row), list the column(s) that share the highest score.
max_cols = score_df.apply(
    lambda scores: scores[scores.eq(scores.max())].index.tolist(),
    axis=1,
)
max_cols_list = max_cols.tolist()

# Same idea for the column(s) that share the lowest score.
min_cols = score_df.apply(
    lambda scores: scores[scores.eq(scores.min())].index.tolist(),
    axis=1,
)
min_cols_list = min_cols.tolist()

**Count the number of times that each technique was among the highest scorers:**

In [122]:
# max_cols

In [32]:
# Tally, per score column, how many questions had it among the top scorers.
max_cols_count = Counter()
for top_columns in max_cols_list:
    max_cols_count.update(top_columns)

# The counts alone, in the Counter's first-seen key order.
top_appearance_list = list(max_cols_count.values())

In [33]:
max_cols_count

Counter({'llama_index_sentence_retrieval_score': 35,
         'langchain_recursive_similarity_score': 32,
         'langchain_token_mmr_score': 23,
         'langchain_recursive_mmr_score': 9,
         'llama_index_auto_merging_retrieval_score': 8})

In [34]:
final_df

Unnamed: 0,best_scorer,num_top_appearance
0,32.6,
1,35.9,
2,32.0,
3,34.9,
4,29.8,


In [38]:
# Label the rows with the technique names, listed in the same order in which scorer_list was built.
final_df.index = ['langchain_token_mmr', 'langchain_recursive_similarity', 'langchain_recursive_mmr', 'llama_index_sentence_retrieval', 'llama_index_auto_merging_retrieval']

In [39]:
# Look up each technique's top-scorer count under its "<technique>_score" key.
# A single whole-column assignment replaces the iterrows() loop, which never
# used the row it yielded and wrote into the frame it was iterating over.
# Counter returns 0 for a technique that never appeared among the top scorers.
final_df["num_top_appearance"] = [
    int(max_cols_count[f"{method}_score"]) for method in final_df.index
]

In [40]:
final_df

Unnamed: 0,best_scorer,num_top_appearance
langchain_token_mmr,32.6,23
langchain_recursive_similarity,35.9,32
langchain_recursive_mmr,32.0,9
llama_index_sentence_retrieval,34.9,35
llama_index_auto_merging_retrieval,29.8,8


**Count the number of times that each technique was among the lowest scorers:**

In [41]:
# Tally, per score column, how many questions had it among the lowest scorers.
min_cols_count = Counter()
for bottom_columns in min_cols_list:
    min_cols_count.update(bottom_columns)
min_cols_count

Counter({'llama_index_auto_merging_retrieval_score': 30,
         'langchain_recursive_mmr_score': 21,
         'langchain_token_mmr_score': 11,
         'langchain_recursive_similarity_score': 6,
         'llama_index_sentence_retrieval_score': 6})

---------------------------------------

In [42]:
def count_number_of_zero_scores(df, column_name):
    """Return how many rows of ``df`` hold a value equal to 0 in ``column_name``.

    Args:
        df: DataFrame containing the column to inspect.
        column_name: Label of the column to check.

    Returns:
        int: Number of rows whose value in that column equals 0.
    """
    is_zero = df[column_name] == 0
    return int(is_zero.sum())

# Number of questions on which each technique scored exactly 0.
langchain_token_mmr_total_zero_score = count_number_of_zero_scores(
    questions_df, "langchain_token_mmr_score")
langchain_recursive_similarity_total_zero_score = count_number_of_zero_scores(
    questions_df, "langchain_recursive_similarity_score")
langchain_recursive_mmr_total_zero_score = count_number_of_zero_scores(
    questions_df, "langchain_recursive_mmr_score")
llama_index_sentence_retrieval_zero_score = count_number_of_zero_scores(
    questions_df, "llama_index_sentence_retrieval_score")
llama_index_auto_merging_retrieval_total_zero_score = count_number_of_zero_scores(
    questions_df, "llama_index_auto_merging_retrieval_score")

# Assemble the report line by line, then print it in one call.
zero_score_report = "\n".join([
    "Total zero scores:",
    f"    langchain_token_mmr: {langchain_token_mmr_total_zero_score},",
    f"    langchain_recursive_similarity: {langchain_recursive_similarity_total_zero_score},",
    f"    langchain_recursive_mmr: {langchain_recursive_mmr_total_zero_score},",
    f"    llama_index_sentence_retrieval: {llama_index_sentence_retrieval_zero_score},",
    f"    llama_index_auto_merging_retrieval: {llama_index_auto_merging_retrieval_total_zero_score}",
])
print(zero_score_report)

Total zero scores:
    langchain_token_mmr: 2,
    langchain_recursive_similarity: 0,
    langchain_recursive_mmr: 0,
    llama_index_sentence_retrieval: 0,
    llama_index_auto_merging_retrieval: 1


------------------------------------

**Analysis based on each document**

In [43]:
# Score columns to total per document, in the order they appear in the final table.
document_score_columns = [
    'langchain_token_mmr_score',
    'langchain_recursive_similarity_score',
    'langchain_recursive_mmr_score',
    'llama_index_sentence_retrieval_score',
    'llama_index_auto_merging_retrieval_score'
    ]

# One single-column table per technique: scores summed within each 'source'.
per_technique_tables = [
    questions_df.pivot_table(index='source', columns=None, values=score_column, aggfunc='sum')
    for score_column in document_score_columns
]
(
    langchain_token_mmr_df,
    langchain_recursive_similarity_df,
    langchain_recursive_mmr_df,
    llama_index_sentence_df,
    llama_index_auto_merging_df,
) = per_technique_tables

# Put the per-technique tables side by side in one frame.
result_df = pd.concat(per_technique_tables, axis=1)

# Name the columns explicitly after the score columns they were built from.
result_df.columns = document_score_columns

# Show the combined table.
display(result_df)

Unnamed: 0_level_0,langchain_token_mmr_score,langchain_recursive_similarity_score,langchain_recursive_mmr_score,llama_index_sentence_retrieval_score,llama_index_auto_merging_retrieval_score
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
SegmentAnything paper,11.6,12.5,10.4,12.1,10.1
product scpecification,3.0,4.1,3.9,4.2,4.0
stories,4.1,4.5,4.0,4.5,3.3
technical support,4.8,4.9,4.9,4.1,4.6
vision transformer paper,9.1,9.9,8.8,10.0,7.8


**Average score per document (normalized)**

In [44]:
print("Normalized average score per document:")

# Number of questions recorded for each source document.
num_questions = questions_df.groupby("source")["question"].count().tolist()

# Per document: the mean of its total scores across the technique columns.
mean_doc_score = result_df.mean(axis=1).tolist()

# Divide each document's mean total by its question count. The two lists are
# paired by position, so this assumes both are in the same 'source' order.
result = []
for doc_score, question_count in zip(mean_doc_score, num_questions):
    result.append(round(doc_score / question_count, 2))
result

Normalized average score per document:


[0.81, 0.77, 0.82, 0.93, 0.83]

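* SegmentAnything paper: (2 column scientific paper) 
    - Best scorer: langchain_recursive_similarity (12.5)
    - Worst scorer: llama_index_auto_merging_retrieval (10.1)
    - Normalized average score: 0.81
* product scpecification:
    - Best scorer: llama_index_sentence_retrieval (4.2)
    - Worst scorer: langchain_token_mmr (3.0)
    - Normalized average score: 0.77
* stories:
    - Best scorer: langchain_recursive_similarity and llama_index_sentence_retrieval (tied at 4.5)
    - Worst scorer: llama_index_auto_merging_retrieval (3.3)
    - Normalized average score: 0.82
* technical support:
    - Best scorer: langchain_recursive_similarity and langchain_recursive_mmr (tied at 4.9)
    - Worst scorer: llama_index_sentence_retrieval (4.1)
    - Normalized average score: 0.93
* vision transformer paper: (1 column scientific paper)
    - Best scorer: llama_index_sentence_retrieval (10.0)
    - Worst scorer: llama_index_auto_merging_retrieval (7.8)
    - Normalized average score: 0.83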
-----------------------------

**Total inference time**

In [45]:
# Inference-time columns, one per technique.
inference_time_columns = [
    "langchain_token_mmr_inference_time",
    "langchain_recursive_mmr_inference_time",
    "langchain_recursive_similarity_inference_time",
    "llama_index_sentence_retrieval_inference_time",
    "llama_index_auto_merging_retrieval_inference_time"
    ]

# Total inference time of each technique, summed over all questions.
questions_df[inference_time_columns].sum()

langchain_token_mmr_inference_time                    68.97
langchain_recursive_mmr_inference_time                62.03
langchain_recursive_similarity_inference_time         66.92
llama_index_sentence_retrieval_inference_time        100.95
llama_index_auto_merging_retrieval_inference_time     52.66
dtype: float64

**Average inference time**

In [46]:
# Inference-time columns, one per technique.
inference_time_columns = [
    "langchain_token_mmr_inference_time",
    "langchain_recursive_mmr_inference_time",
    "langchain_recursive_similarity_inference_time",
    "llama_index_sentence_retrieval_inference_time",
    "llama_index_auto_merging_retrieval_inference_time"
    ]

# Mean inference time of each technique across all questions.
questions_df[inference_time_columns].mean()

langchain_token_mmr_inference_time                   1.72425
langchain_recursive_mmr_inference_time               1.55075
langchain_recursive_similarity_inference_time        1.67300
llama_index_sentence_retrieval_inference_time        2.52375
llama_index_auto_merging_retrieval_inference_time    1.31650
dtype: float64

* Fastest: llama_index_auto_merging_retrieval_inference_time
* Slowest: llama_index_sentence_retrieval_inference_time

-------------------------

**Check the zero scored questions**

In [47]:
# (heading to print, score column to filter on) for each technique, in display order.
zero_score_sections = [
    ("llama_index_auto_merging_retrieval:", "llama_index_auto_merging_retrieval_score"),
    ("llama_index_sentence_retrieval:", "llama_index_sentence_retrieval_score"),
    ("langchain_token_mmr:", "langchain_token_mmr_score"),
    ("langchain_recursive_mmr:", "langchain_recursive_mmr_score"),
    ("langchain_recursive_similarity:", "langchain_recursive_similarity_score"),
]

# Under each heading, show the questions on which that technique scored exactly 0.
for heading, score_column in zero_score_sections:
    print(heading)
    display(questions_df[questions_df[score_column] == 0])

llama_index_auto_merging_retrieval:


Unnamed: 0,source,question,correct_answer,langchain_token_mmr_result,langchain_token_mmr_inference_time,langchain_recursive_similarity_result,langchain_recursive_similarity_inference_time,langchain_recursive_mmr_result,langchain_recursive_mmr_inference_time,llama_index_sentence_retrieval_result,...,llama_index_auto_merging_retrieval_inference_time,langchain_recursive_similarity_score,langchain_recursive_mmr_score,llama_index_sentence_retrieval_score,llama_index_auto_merging_retrieval_score,lowest_score,highest_score,langchain_token_mmr_score,langchain_token_mmr_result_score,langchain_token_mmr_score_score
13,product scpecification,How much does CubeTriangle Kappa Portable Spea...,2000,The CubeTriangle Kappa Portable Speaker costs ...,0.43,The CubeTriangle Kappa Portable Speaker is pri...,0.36,The CubeTriangle Kappa Portable Speaker is pri...,0.46,"Based on the given context information, the pr...",...,0.82,0.2,0.3,0.2,0.0,langchain_token_mmr,langchain_token_mmr,0.0,0.6,


llama_index_sentence_retrieval:


Unnamed: 0,source,question,correct_answer,langchain_token_mmr_result,langchain_token_mmr_inference_time,langchain_recursive_similarity_result,langchain_recursive_similarity_inference_time,langchain_recursive_mmr_result,langchain_recursive_mmr_inference_time,llama_index_sentence_retrieval_result,...,llama_index_auto_merging_retrieval_inference_time,langchain_recursive_similarity_score,langchain_recursive_mmr_score,llama_index_sentence_retrieval_score,llama_index_auto_merging_retrieval_score,lowest_score,highest_score,langchain_token_mmr_score,langchain_token_mmr_result_score,langchain_token_mmr_score_score


langchain_token_mmr:


Unnamed: 0,source,question,correct_answer,langchain_token_mmr_result,langchain_token_mmr_inference_time,langchain_recursive_similarity_result,langchain_recursive_similarity_inference_time,langchain_recursive_mmr_result,langchain_recursive_mmr_inference_time,llama_index_sentence_retrieval_result,...,llama_index_auto_merging_retrieval_inference_time,langchain_recursive_similarity_score,langchain_recursive_mmr_score,llama_index_sentence_retrieval_score,llama_index_auto_merging_retrieval_score,lowest_score,highest_score,langchain_token_mmr_score,langchain_token_mmr_result_score,langchain_token_mmr_score_score
13,product scpecification,How much does CubeTriangle Kappa Portable Spea...,2000,The CubeTriangle Kappa Portable Speaker costs ...,0.43,The CubeTriangle Kappa Portable Speaker is pri...,0.36,The CubeTriangle Kappa Portable Speaker is pri...,0.46,"Based on the given context information, the pr...",...,0.82,0.2,0.3,0.2,0.0,langchain_token_mmr,langchain_token_mmr,0.0,0.6,
14,product scpecification,Is there any skateboard with 20 mph top speed ...,Yes the CubeTriangle Chi Electric Skateboard h...,"I'm sorry, but I couldn't find any information...",0.69,"Yes, the CubeTriangle Chi Electric Skateboard ...",0.42,"Based on the information from the vectorDB, th...",0.98,"Yes, there is a skateboard with a 20 mph top s...",...,0.96,0.9,0.6,1.0,1.0,langchain_token_mmr,langchain_token_mmr,0.0,0.2,


langchain_recursive_mmr:


Unnamed: 0,source,question,correct_answer,langchain_token_mmr_result,langchain_token_mmr_inference_time,langchain_recursive_similarity_result,langchain_recursive_similarity_inference_time,langchain_recursive_mmr_result,langchain_recursive_mmr_inference_time,llama_index_sentence_retrieval_result,...,llama_index_auto_merging_retrieval_inference_time,langchain_recursive_similarity_score,langchain_recursive_mmr_score,llama_index_sentence_retrieval_score,llama_index_auto_merging_retrieval_score,lowest_score,highest_score,langchain_token_mmr_score,langchain_token_mmr_result_score,langchain_token_mmr_score_score


langchain_recursive_similarity:


Unnamed: 0,source,question,correct_answer,langchain_token_mmr_result,langchain_token_mmr_inference_time,langchain_recursive_similarity_result,langchain_recursive_similarity_inference_time,langchain_recursive_mmr_result,langchain_recursive_mmr_inference_time,llama_index_sentence_retrieval_result,...,llama_index_auto_merging_retrieval_inference_time,langchain_recursive_similarity_score,langchain_recursive_mmr_score,llama_index_sentence_retrieval_score,llama_index_auto_merging_retrieval_score,lowest_score,highest_score,langchain_token_mmr_score,langchain_token_mmr_result_score,langchain_token_mmr_score_score


In [54]:
# for i in [0, 38, 13, 30, 13]:
#     print(questions_df[questions_df.index==i]["question"].values[0])

In [49]:
# Move the technique names out of the index into a regular "method" column.
# NOTE(review): this cell is not idempotent. Running it a second time would
# reset the new integer index as well and rename that column to "method" too.
final_df = final_df.reset_index().rename(columns={'index': 'method'})

In [52]:
# Cast the top-appearance counts to float; this column is passed as the marker size of the scatter plot.
final_df['num_top_appearance'] = final_df['num_top_appearance'].astype(float)


In [53]:
# Qualitative palette used to give every method its own color.
color_sequence = px.colors.qualitative.Plotly

# Scatter plot with one marker per method: height is the total score
# ("best_scorer") and marker size is the number of top-scorer appearances.
fig = px.scatter(
    final_df,
    x='method',
    y='best_scorer',
    size='num_top_appearance',
    hover_name='method',
    color='method',  # One color per value of the 'method' column
    color_discrete_sequence=color_sequence,  # Use the defined color sequence
    title='<b>Best RAG scorer</b>'  # Bold tag is now properly closed
)

# Customize the layout
fig.update_layout(
    title_font=dict(family='Arial, sans-serif', size=24, color='black'),
    title_x=0.5,  # Center the title
    paper_bgcolor='white',  # Background color for the outer area
    plot_bgcolor='white',  # Background color for the plot area
    xaxis=dict(
        title='<b>Methods</b>',
        title_font=dict(size=24),
        tickangle=45,  # Rotate labels
        tickfont=dict(family='Arial, sans-serif', size=18, color='black'),
        showgrid=True  # Show gridlines
    ),
    yaxis=dict(
        title='<b>Scores</b>',
        title_font=dict(family='Arial, sans-serif', size=18, color='black'),
        tickfont=dict(family='Arial, sans-serif', size=16, color='black'),
        showgrid=True,  # Show gridlines
        gridcolor='lightgrey'  # Gridline color
    ),
    legend=dict(
        title='Methods',
        font=dict(family='Arial, sans-serif', size=16, color='black'),
        bgcolor='lightgrey',  # Background color for the legend
        bordercolor='black',  # Border color for the legend
        borderwidth=1
    ),
    margin=dict(l=60, r=60, t=60, b=60),  # Adjust margins to fit labels
    width=1800,  # Width of the entire plot image
    height=600  # Height of the entire plot image
)

# Outline the markers; the selector restricts the update to marker-mode traces.
fig.update_traces(
    marker=dict(
        line=dict(width=3, color='darkgrey')  # Border line for markers
    ),
    selector=dict(mode='markers')
)

# Show the plot
fig.show()