In [31]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import os

In [32]:
def import_data(num_shards, results_dir='results'):
    """Load the per-shard benchmark result CSVs into a list of DataFrames.

    Reads ``<results_dir>/S1_results.csv`` through
    ``<results_dir>/S<num_shards>_results.csv``, in that order.

    Parameters
    ----------
    num_shards : int
        Number of result files to read; files are numbered starting at 1.
    results_dir : str, optional
        Directory that contains the CSV files. Defaults to ``'results'``,
        the location the notebook has always read from.

    Returns
    -------
    list of pandas.DataFrame
        One frame per file, ordered by file number (index 0 holds ``S1``).
        Empty when ``num_shards`` is 0.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` when an expected file is missing.
    """
    return [
        pd.read_csv(os.path.join(results_dir, f'S{shard}_results.csv'))
        for shard in range(1, num_shards + 1)
    ]

In [33]:
# Count the per-shard result files in the results folder.
# Only names of the form 'S<number>_results.csv' are counted, because those are
# the files import_data() reads; any other CSV in the folder would otherwise
# inflate the count and make import_data() look for a file that does not exist.
num_shards = sum(
    1
    for name in os.listdir('results')
    if os.path.isfile(os.path.join('results', name))
    and name.startswith('S')
    and name.endswith('_results.csv')
    and name[1:-len('_results.csv')].isdigit()
)

# Import the dataframes (one per shard file, ordered S1, S2, ...)
dfs = import_data(num_shards)
print(f"Dataframes imported successfully ({num_shards} shards)")

Dataframes imported successfully (6 shards)


In [34]:
# The goal is to visualize the impact of the number of shards on the query time

# First, we need to create a dataframe with the query time for each shard
# We will use the column 'avg_runtime_no_outliers' of each dataframe
# We will use the columns 'query' and 'query_number' to identify the query

# The dataframe will look like this:
# query, query_number, avg_runtime_no_outliers1, avg_runtime_no_outliers2, ..., avg_runtime_no_outliersN

runtime_col = 'avg_runtime_no_outliers'

# One output column per entry in dfs, suffixed with its 1-based position
cols_to_plot = [runtime_col + str(shard) for shard in range(1, len(dfs) + 1)]

if dfs:
    # The identifying columns are taken once, from the first frame, instead of
    # being overwritten on every iteration.
    df = dfs[0][['query', 'query_number']].copy()
    for col, shard_df in zip(cols_to_plot, dfs):
        # Align runtimes on query_number rather than on row position, so a
        # result file whose rows are in a different order cannot silently pair
        # a runtime with the wrong query. Duplicate query numbers in a file
        # raise here; a query missing from a file yields NaN.
        runtimes = shard_df.set_index('query_number')[runtime_col]
        df[col] = df['query_number'].map(runtimes)
else:
    # No result files were loaded: keep the original empty-frame behavior
    df = pd.DataFrame()
df

Unnamed: 0,query,query_number,avg_runtime_no_outliers1,avg_runtime_no_outliers2,avg_runtime_no_outliers3,avg_runtime_no_outliers4,avg_runtime_no_outliers5,avg_runtime_no_outliers6
0,[{'$group': {'_id': {'$toLower': '$institution...,1,3646.5,2929.25,2762.5,2300.375,2310.5,2227.0
1,[{'$match': {'institution.city_name': re.compi...,2,1199.125,1010.25,1000.0,783.0,778.875,755.125
2,[{'$match': {'investigators.email_id': {'$ne':...,3,2866.625,1427.0,1270.75,976.125,831.75,626.375
3,[{'$match': {'institution.state_name': 'Califo...,4,1342.0,1125.25,1113.625,871.0,869.875,843.75
4,"[{'$group': {'_id': '$institution', 'count': {...",5,5607.5,4475.625,4218.625,3525.25,3521.875,3412.0
5,"[{'$unwind': {'path': '$programs', 'includeArr...",6,3196.375,2574.375,2459.125,1993.25,2010.25,1939.5
6,[{'$match': {'investigators.email_id': re.comp...,7,2951.125,1504.625,1372.375,1023.625,847.375,644.25
7,[{'$match': {'investigators.email_id': 'jeremy...,8,3.0,3.625,3.375,3.0,3.125,3.0


In [35]:
# Draw the per-query runtimes as grouped bars, one bar per column in cols_to_plot
# NOTE: this rcParams entry configures matplotlib only; it does not size the plotly figure below
plt.rcParams['figure.figsize'] = [20, 15]
fig = px.bar(
    df,
    x="query_number",
    y=cols_to_plot,
    barmode='group',
    title='Query time in ms in function of the number of shards',
)

# Overlay the mean runtime of each series as a dashed horizontal line,
# drawn in the same color as that series' bars
for trace, col in zip(fig.data, cols_to_plot):
    fig.add_hline(y=df[col].mean(), line_dash="dash", line_color=trace.marker.color)

# Title the legend and shorten each entry from 'avg_runtime_no_outliersN' to '# N'
fig.update_layout(legend_title_text='Number of shards')
for trace in fig.data:
    trace.update(name=trace.name.replace("avg_runtime_no_outliers", "# "))

# Axis titles
fig.update_xaxes(title_text="Query number")
fig.update_yaxes(title_text="Query time (ms)")

fig.show()

In [36]:
# Save the figure built above as a static PNG inside the results folder,
# rendered with the kaleido engine
png_path = r"results/benchmark_viz.png"
fig.write_image(png_path, engine="kaleido")