# Visualizing the Benchmarks
- make sense of the data
- get insights
- detect patterns

In [7]:
import pandas as pd
import plotly.express as px

## Understanding the data gathered
- INPUTS : `./prompts/`
    - device
    - prompts (1, 8, 32, 128)
    - length of prompt (`prompt_length`, input tokens)
    - complexity of prompt
    - output length expected (concise, detail)
- OUTPUTS :
    - output tokens
    - time (milliseconds)
    - load time
    - prompt evaluation time
    - output evaluation time
    - answers `./answers/`
    - logs `./logs/`
- Derived Outputs

In [8]:
def process(device, data_dir="./data"):
    """Load one device's benchmark results and add the derived columns used by the plots.

    Parameters
    ----------
    device : str
        Device label; the file read is ``{data_dir}/{device}_outputs.csv``.
    data_dir : str or path-like, optional
        Directory containing the ``*_outputs.csv`` files. Defaults to
        ``"./data"``, the location this notebook originally hard-coded.

    Returns
    -------
    pandas.DataFrame
        The CSV contents with these changes:

        * ``input_tokens_per_prompt`` and ``output_tokens_per_prompt`` added.
        * ``prompts`` cast to ``str`` (done after the per-prompt divisions,
          which need it numeric); the plots use it as a ``color=`` grouping.
        * ``load_time``, ``prompt_eval_time``, ``output_eval_time`` and
          ``total_time`` divided by 1000 (the notebook describes the raw
          times as milliseconds, so these become seconds).
        * Per-token timing columns added from the rescaled times.

    Notes
    -----
    The CSV must provide the columns ``prompts``, ``input_tokens``,
    ``output_tokens`` and the four timing columns listed above. Rows with a
    zero token count are not special-cased, so their per-token ratios follow
    pandas' division rules.
    """
    df = pd.read_csv(f"{data_dir}/{device}_outputs.csv")

    # Per-prompt token counts must be computed while `prompts` is still numeric.
    df["input_tokens_per_prompt"] = df["input_tokens"] / df["prompts"]
    df["output_tokens_per_prompt"] = df["output_tokens"] / df["prompts"]
    df["prompts"] = df["prompts"].astype(str)

    # Rescale every timing column by the same factor before deriving ratios.
    for time_column in ("load_time", "prompt_eval_time", "output_eval_time", "total_time"):
        df[time_column] = df[time_column] / 1000

    # Time spent per token, using the rescaled timings.
    df["load_time_per_input_token"] = df["load_time"] / df["input_tokens"]
    df["prompt_eval_time_per_input_token"] = df["prompt_eval_time"] / df["input_tokens"]
    df["output_eval_time_per_output_token"] = df["output_eval_time"] / df["output_tokens"]
    df["total_time_per_input_token"] = df["total_time"] / df["input_tokens"]
    df["total_time_per_output_token"] = df["total_time"] / df["output_tokens"]

    return df

# Device labels; process() reads ./data/<label>_outputs.csv for whichever one is passed.
devices = ["M3", "RTX4060", "Ultra9-185H"]

# The single-device sections below all work on this one run (index 1 -> "RTX4060").
selected_device = devices[1]
data_df = process(selected_device)

## Analyzing Batch Size effects
- on load time
- on total time

In [9]:
# Every figure in this cell is plotted against the row position in data_df.
row_positions = range(len(data_df))

# Keyword arguments for each scatter, in display order.
# Experiment: remove `color` from a figure below and run again to compare.
batch_size_views = [
    {"y": "load_time", "marginal_y": "histogram"},
    {"y": "load_time_per_input_token", "color": "prompts", "size": "input_tokens", "marginal_y": "box"},
    {"y": "total_time_per_output_token", "color": "prompts", "marginal_y": "violin"},
    {"y": "total_time", "color": "prompts", "marginal_y": "violin"},
]

for view in batch_size_views:
    fig = px.scatter(data_df, x=row_positions, **view)
    fig.show()

**Caveat:** note how the total time behaves for a batch size of 128.

## Analyze effect of Prompt Length
- time per output token
- on number of output tokens per prompt

`Output length is a categorical field`

In [10]:
# Each entry: (column on the y axis, column used for colour grouping, marginal plot type).
prompt_length_views = (
    ("total_time_per_output_token", "prompt_length", "violin"),
    ("output_tokens_per_prompt", "prompt_length", "violin"),
    ("output_tokens_per_prompt", "output_length", "box"),
)

row_positions = range(len(data_df))
for y_column, colour_column, marginal_kind in prompt_length_views:
    fig = px.scatter(
        data_df,
        x=row_positions,
        y=y_column,
        color=colour_column,
        marginal_y=marginal_kind,
    )
    fig.show()

## Analyzing the most important thing
- time w.r.t inputs
- time w.r.t outputs

In [11]:
# The four timing columns that are overlaid in the token-count figures.
timing_columns = ("total_time", "load_time", "prompt_eval_time", "output_eval_time")

# Total time per row; bubble size is the output token count, colour is the expected output length.
fig = px.scatter(
    data_df,
    x=range(len(data_df)),
    y="total_time",
    size="output_tokens",
    color="output_length",
    marginal_y="violin",
)
fig.show()

# All timings against output tokens first, then against input tokens, each with a LOWESS trendline.
# NOTE(review): plotly's trendline support depends on statsmodels being installed - confirm the environment.
for token_axis in ("output_tokens", "input_tokens"):
    fig = px.scatter(data_df, x=token_axis, y=list(timing_columns), trendline="lowess")
    fig.show()

# Output evaluation time per output token against the number of output tokens.
fig = px.scatter(
    data_df,
    x="output_tokens",
    y="output_eval_time_per_output_token",
    trendline="lowess",
    marginal_y="histogram",
)
fig.show()

## Analyzing effects of Prompt complexity
- time
- output tokens

In [12]:
# Each entry pairs a per-token time column (y axis) with the token count used for bubble size.
complexity_views = (
    ("total_time_per_input_token", "input_tokens"),
    ("total_time_per_output_token", "output_tokens"),
)

row_positions = range(len(data_df))
for per_token_time, bubble_size in complexity_views:
    fig = px.scatter(
        data_df,
        x=row_positions,
        y=per_token_time,
        size=bubble_size,
        color="prompt_complexity",
        marginal_y="violin",
    )
    fig.show()
# Observation: prompt complexity has no effect on latency; ignore the outliers.

## The single most Important Graph
- devices
- time

In [16]:
# Load every device's run through the same preprocessing.
rtx_df = process("RTX4060")
u9_df = process("Ultra9-185H")
m3_df = process("M3")

# Stack the runs and group by the "device" column.
# NOTE(review): this assumes each CSV already contains a "device" column - process() does not add one.
all_df = pd.concat([rtx_df, u9_df, m3_df], ignore_index=True)
device_df = all_df.groupby("device")  # a GroupBy object, despite the name

# One row per device: mean and variance of the load and per-token timings.
merged = device_df.agg(
    load_time_mean=("load_time", "mean"),
    load_time_var=("load_time", "var"),
    load_time_per_input_token_mean=("load_time_per_input_token", "mean"),
    prompt_eval_time_per_token_mean=("prompt_eval_time_per_input_token", "mean"),
    prompt_eval_time_per_token_var=("prompt_eval_time_per_input_token", "var"),
    output_eval_time_per_token_mean=("output_eval_time_per_output_token", "mean"),
    output_eval_time_per_token_var=("output_eval_time_per_output_token", "var"),
)

# Each inner list is drawn as one bar chart, in this order.
bar_chart_columns = [
    ["load_time_mean", "load_time_var"],
    ["prompt_eval_time_per_token_mean", "prompt_eval_time_per_token_var"],
    ["output_eval_time_per_token_mean", "output_eval_time_per_token_var"],
    ["load_time_per_input_token_mean", "prompt_eval_time_per_token_mean", "output_eval_time_per_token_mean"],
]

for stat_columns in bar_chart_columns:
    fig = px.bar(merged, y=stat_columns)
    fig.show()