# Why do we need these bench marks?
- to predict expected performance
- to judge the condition of our system

In [1]:
from tqdm import tqdm
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def process(d):
    df = pd.read_csv(f"./data/{d}_outputs.csv")

    df["input_tokens_per_prompt"] = df["input_tokens"] / df["prompts"]
    df["output_tokens_per_prompt"] = df["output_tokens"] / df["prompts"]
    df["prompts"] = df["prompts"].astype(str)

    df["load_time"] = df["load_time"]/1000
    df["prompt_eval_time"] = df["prompt_eval_time"]/1000
    df["output_eval_time"] = df["output_eval_time"]/1000
    df["total_time"] = df["total_time"]/1000

    df["load_time_per_input_token"] = df["load_time"] / df["input_tokens"]
    df["prompt_eval_time_per_input_token"] = df["prompt_eval_time"] / df["input_tokens"]
    df["output_eval_time_per_output_token"] = df["output_eval_time"] / df["output_tokens"]
    df["total_time_per_input_token"] = df["total_time"] / df["input_tokens"]
    df["total_time_per_output_token"] = df["total_time"] / df["output_tokens"]

    return df

rtx_df = process("RTX4060")
u9_df = process("Ultra9-185H")
m3_df = process("M3")

all_df = pd.concat([rtx_df, u9_df, m3_df], ignore_index=True)

device, all_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1272 entries, 0 to 1271
Data columns (total 18 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   prompts                            1272 non-null   object 
 1   input_tokens                       1272 non-null   int64  
 2   output_tokens                      1272 non-null   int64  
 3   total_time                         1272 non-null   float64
 4   load_time                          1272 non-null   float64
 5   prompt_eval_time                   1272 non-null   float64
 6   output_eval_time                   1272 non-null   float64
 7   prompt_complexity                  1272 non-null   object 
 8   prompt_length                      1272 non-null   object 
 9   output_length                      1272 non-null   object 
 10  device                             1272 non-null   object 
 11  input_tokens_per_prompt            1272 non-null   float

(device(type='cuda'), None)

In [2]:
columns = ["input_tokens", "output_tokens", "device", "total_time"]

inputs_df = all_df[columns]
devices = inputs_df["device"].unique()

m = inputs_df[["input_tokens", "output_tokens"]].mean(numeric_only=True)
s = inputs_df[["input_tokens", "output_tokens"]].std(numeric_only=True)

inputs_df [["input_tokens", "output_tokens"]] =  (inputs_df[["input_tokens", "output_tokens"]] - m)/s
inputs_df = pd.get_dummies(inputs_df)
outputs_df = inputs_df.pop("total_time")


inputs_arr = np.array(inputs_df, dtype = np.float32)
inputs = torch.from_numpy(inputs_arr).to(device)

outputs_arr = np.array(outputs_df, dtype = np.float32).reshape(-1,1)
outputs = torch.from_numpy(outputs_arr).to(device)
inputs_df.columns, inputs.shape, outputs.shape

(Index(['input_tokens', 'output_tokens', 'device_M3', 'device_RTX4060',
        'device_Ultra9-185H'],
       dtype='object'),
 torch.Size([1272, 5]),
 torch.Size([1272, 1]))

## Simple 3 layer feed-forward Neural Network

In [3]:
network = nn.Sequential(
    nn.Linear(inputs.shape[-1], 10),
    nn.ReLU(),
    nn.Linear(10, 5),
    nn.ReLU(),
    nn.Linear(5, 1),
    nn.ReLU()
).to(device)

lossfn = nn.MSELoss()
opt = torch.optim.Adam(network.parameters(), lr=0.001)

In [4]:
epochs = 10000
for i in tqdm(range(epochs)):
    opt.zero_grad()

    preds = network(inputs)
    loss = lossfn(preds, outputs)
    loss.backward()
    opt.step()

print(loss)

100%|██████████| 10000/10000 [00:20<00:00, 496.67it/s]


tensor(88.5694, device='cuda:0', grad_fn=<MseLossBackward0>)


## Just enter expected Input and Output tokens to predict the time it would take, without actually sitting there

In [5]:
input_tokens = 92
output_tokens = 441

input_tokens = (input_tokens - m["input_tokens"])/ s["input_tokens"]
output_tokens = (output_tokens  - m["output_tokens"])/ s["output_tokens"]

input_ = torch.tensor([[input_tokens, output_tokens, 1, 0, 0],[input_tokens, output_tokens, 0, 1, 0],[input_tokens, output_tokens, 0, 0, 1]], dtype= torch.float32).to(device)
output_ = network(input_)

print("Prediction Time:")
print(f"M3 : {output_[0,0].item():.2f}s")
print(f"RTX4060 : {output_[1,0].item():.2f}s")
print(f"Ultra9 : {output_[2,0].item():.2f}s")

Prediction Time:
M3 : 27.96s
RTX4060 : 0.00s
Ultra9 : 56.76s
