In [444]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import os

# Definitions

In [445]:
def import_data(num_shards):
    """Load the per-shard benchmark result files.

    Reads ``results/S<k>_results.csv`` for k = 1 .. ``num_shards``; the paths
    are relative to the current working directory.

    Args:
        num_shards: Number of shard result files to read.

    Returns:
        A list with one DataFrame per shard, ordered by shard number
        (empty when ``num_shards`` is 0).
    """
    return [
        pd.read_csv(f'results/S{shard}_results.csv')
        for shard in range(1, num_shards + 1)
    ]

In [446]:
def save_figure(fig, filename):
    """Export ``fig`` as SVG and PNG, once per colour theme.

    Four files are written under ``results/images/`` (relative to the current
    working directory): ``<filename>_dark.svg``, ``<filename>_dark.png``,
    ``<filename>_white.svg`` and ``<filename>_white.png``. The output folder
    is created when it does not exist yet.

    Note: the figure is modified in place. After the call its layout is
    2000x1200 with zero margins and the ``plotly_white`` template.

    Args:
        fig: Object exposing ``update_layout`` and ``write_image`` (a Plotly
            figure in this notebook).
        filename: Base name of the output files, without theme suffix or
            extension.
    """
    base_path = r'results/images/'
    # Make sure the target folder exists so the first export on a fresh
    # checkout does not fail on a missing directory.
    os.makedirs(base_path, exist_ok=True)

    # Dark first, white last: same export order as before, and the figure is
    # left with the white template afterwards.
    for theme in ("dark", "white"):
        fig.update_layout(template="plotly_" + theme, autosize=False, width=2000, height=1200, margin=dict(l=0, r=0, b=0, t=0))
        for extension in ("svg", "png"):
            fig.write_image(base_path + filename + "_" + theme + "." + extension, engine="kaleido")
    

# Import metrics

In [447]:
# Count the shard result files (S<k>_results.csv) in the results folder.
# Only files that follow this naming scheme are counted, because import_data
# reads exactly those names: an unrelated CSV stored in the same folder would
# otherwise inflate the count and make the import fail on a missing file.
shard_suffix = '_results.csv'
num_shards = len([
    name for name in os.listdir('results')
    if os.path.isfile(os.path.join('results', name))
    and name.startswith('S')
    and name.endswith(shard_suffix)
    and name[1:-len(shard_suffix)].isdigit()
])

# Import the dataframes (one per shard configuration)
dfs = import_data(num_shards)
print(f"Dataframes imported successfully ({num_shards} shards)")

Dataframes imported successfully (6 shards)


# Process dataframe

In [448]:
# Goal: visualize the impact of the number of shards on the query time.
#
# Build one wide dataframe that holds, for every query, the runtime taken
# from each shard results file:
#   - 'query' and 'query_number' identify the query
#   - 'avg_runtime_no_outliers<k>' is the 'avg_runtime_no_outliers' column
#     of the k-th results file
#
# Resulting layout:
# query, query_number, avg_runtime_no_outliers1, ..., avg_runtime_no_outliersN

df = pd.DataFrame()
cols_to_plot = []
for shard, shard_df in enumerate(dfs, start=1):
    runtime_col = f'avg_runtime_no_outliers{shard}'
    # The identifying columns are rewritten on every pass, so they end up
    # holding the values of the last results file.
    df['query'] = shard_df['query']
    df['query_number'] = shard_df['query_number']
    df[runtime_col] = shard_df['avg_runtime_no_outliers']
    cols_to_plot.append(runtime_col)

# Plot shards / query

In [449]:
# Grouped bar chart: one group per query number, one bar per runtime column.

# NOTE(review): rcParams is a Matplotlib setting; presumably it has no effect
# on the Plotly figure built below - confirm before removing it.
plt.rcParams['figure.figsize'] = [10, 15]
fig = px.bar(
    df,
    x="query_number",
    y=cols_to_plot,
    barmode='group',
    title='Query time in ms in function of the number of shards',
)

for bar_trace, column in zip(fig.data, cols_to_plot):
    # Dashed horizontal line at the mean of this column, drawn in the same
    # colour as the matching bars
    fig.add_hline(y=df[column].mean(), line_dash="dash", line_color=bar_trace.marker.color)

    # Rounded value printed above each bar
    bar_trace.text = round(df[column])
    bar_trace.textposition = 'outside'
    bar_trace.textfont.size = 8

# Legend: title plus short entries ('# 1', '# 2', ...)
fig.update_layout(legend_title_text='Number of shards')
fig.for_each_trace(lambda t: t.update(name=t.name.replace("avg_runtime_no_outliers", "# ")))

# Axis titles
fig.update_yaxes(title_text="Query time (ms)")
fig.update_xaxes(title_text="Query number")

fig.show()

In [450]:
# Export the per-query bar chart (dark and white themes, SVG and PNG)
save_figure(fig, filename="benchmark_sharding_for_each_query")

# Plot query / # of shards

## Melt and pivot dataframe

In [451]:
# Reshape from "one row per query" to "one row per shard column".
# NOTE: this cell overwrites `df`, so it is meant to run exactly once after
# the cell that builds the wide per-query dataframe.
runtime_prefix = 'avg_runtime_no_outliers'

# Long format: one row per (query, runtime column) pair
df = df.melt(id_vars=['query', 'query_number'], var_name='shard', value_name='query_time')

# Wide again: one row per value of 'shard', one column per query number
df = df.pivot(index='shard', columns='query_number', values='query_time').reset_index(level=0)
df = df.rename_axis(None, axis=1)

# pivot orders the shard labels as strings, which would put '...10' before
# '...2' as soon as there are ten or more shards. Sort on the numeric suffix
# so the rows (and the x axis of the plot) run 1, 2, ..., N.
df = df.sort_values(
    'shard',
    key=lambda labels: labels.str.replace(runtime_prefix, '', regex=False).astype(int),
).reset_index(drop=True)

# Replace the column-name labels with short ones: '# 1', '# 2', ...
# (regex=False: the prefix is a literal string, not a pattern)
df['shard'] = df['shard'].str.replace(runtime_prefix, '# ', regex=False)

In [452]:
# Sanity check: display the reshaped dataframe
# (one row per 'shard' label, one column per query number)
df

Unnamed: 0,shard,1,2,3,4,5,6,7,8
0,# 1,3646.5,1199.125,2866.625,1342.0,5607.5,3196.375,2951.125,3.0
1,# 2,2929.25,1010.25,1427.0,1125.25,4475.625,2574.375,1504.625,3.625
2,# 3,2762.5,1000.0,1270.75,1113.625,4218.625,2459.125,1372.375,3.375
3,# 4,2300.375,783.0,976.125,871.0,3525.25,1993.25,1023.625,3.0
4,# 5,2310.5,778.875,831.75,869.875,3521.875,2010.25,847.375,3.125
5,# 6,2227.0,755.125,626.375,843.75,3412.0,1939.5,644.25,3.0


## Plot

In [453]:
# Line chart: the x axis carries the 'shard' labels, the y axis the query
# time, with one line (and colour) per query-number column.

# NOTE(review): rcParams is a Matplotlib setting; presumably it has no effect
# on the Plotly figure built below - confirm before removing it.
plt.rcParams['figure.figsize'] = [10, 15]

# Every column after the first ('shard') is plotted as its own line
query_columns = df.columns[1:]
fig = px.line(df, x="shard", y=query_columns, title='Query time in ms in function of the number of shards')

# Legend title (the entries themselves are the column names)
fig.update_layout(legend_title_text='Query #')

# Axis titles
fig.update_xaxes(title_text="# shards")
fig.update_yaxes(title_text="Query time (ms)")

fig.show()

In [454]:
# Export the per-shard line chart (dark and white themes, SVG and PNG)
save_figure(fig, filename="benchmark_query_for_each_shard")