# Preview Evaluation - vLLM inference parameters

We compare vLLM sampling parameters swept over the following values:
- "temperature": 0.6, 0.7, 0.8, 0.9
- "top_p": 0.7, 0.8, 0.9
- "repetition_penalty": 1.0, 1.05, 1.1, 1.2, 1.25
- "frequency_penalty": 0.0, 0.05, 0.1, 0.2, 0.25
- "presence_penalty": 0.0, 0.05, 0.1, 0.2, 0.25


In [1]:
import polars as pl
import seaborn as sns
import matplotlib.pyplot as plt
from utils.metrics.calculate_metric import calculate_agg_metric
import os
import numpy as np
import json

  from .autonotebook import tqdm as notebook_tqdm


INFO 11-01 15:20:02 [__init__.py:220] No platform detected, vLLM is running on UnspecifiedPlatform


In [5]:

# Experiment run names, grouped by the sampling parameter each group sweeps.
# Every group follows a fixed prefix plus a zero-based numeric suffix, so the
# lists are generated from that pattern instead of being spelled out.
exp_names = {
    "temperature": [f"vllm_temp{i}" for i in range(4)],
    "top_p": [f"vllm_top_p_{i}" for i in range(3)],
    "repetition_penalty": [f"vllm_rep_pen_{i}" for i in range(5)],
    "frequency_penalty": [f"vllm_freq_pen_{i}" for i in range(5)],
    "presence_penalty": [f"vllm_pres_pen_{i}" for i in range(5)],
}

## Calculate Metrics and save them

In [6]:


# Reference file passed to calculate_agg_metric as `reference_path`.
questions_path = "../../questions_2500_42_dev.feather"

# Compute ROUGE-L for every experiment. An existing result file in the current
# directory acts as a cache, so re-running this cell skips finished experiments.
for exp_list in exp_names.values():
    for exp in exp_list:
        saving_path = f"result_{exp}.feather"
        # Sorted so that the generation file that gets scored does not depend
        # on the arbitrary order in which os.listdir returns entries.
        for file in sorted(os.listdir(f"../{exp}/generations")):
            # Re-checked before every call: presumably calculate_agg_metric
            # writes `saving_path` (TODO confirm), in which case the remaining
            # generation files of this experiment are skipped.
            if os.path.exists(saving_path):
                break
            calculate_agg_metric(
                metrics=["rouge_l"],
                generation_path=f"../{exp}/generations/{file}",
                reference_path=questions_path,
                saving_path=saving_path,
            )






## Load Data

In [7]:
# Reverse lookup: experiment name -> swept parameter (key of `exp_names`).
category_by_exp = {exp: cat for cat, exps in exp_names.items() for exp in exps}

result_prefix, result_suffix = "result_", ".feather"

# Load every `result_<exp>.feather` file in the current directory and tag its
# rows with the experiment name and the parameter category it belongs to.
frames = []
for f in sorted(os.listdir(".")):
    if not (f.startswith(result_prefix) and f.endswith(result_suffix)):
        continue
    processed_name = f[len(result_prefix):-len(result_suffix)]

    # Skip result files that are not listed in `exp_names`; otherwise they
    # would be labelled with whatever category the previous file happened to
    # have (or fail outright if they came first).
    category = category_by_exp.get(processed_name)
    if category is None:
        continue

    frames.append(
        pl.read_ipc(f, memory_map=False).with_columns(
            pl.lit(processed_name).alias("exp_name"),
            pl.lit(category).alias("category"),
        )
    )

results = pl.concat(frames)

# Print the average of the "mean" column per experiment, highest first.
with pl.Config(
    tbl_rows=-1,             # Show all rows (-1 means all)
    tbl_cols=-1,             # Show all columns (-1 means all)
    fmt_str_lengths=100    # Set string column width
):
    print(results.group_by("exp_name").agg(pl.col("mean").mean()).sort("mean", descending=True))

shape: (22, 2)
┌─────────────────┬──────────┐
│ exp_name        ┆ mean     │
│ ---             ┆ ---      │
│ str             ┆ f64      │
╞═════════════════╪══════════╡
│ vllm_rep_pen_1  ┆ 0.074415 │
│ vllm_pres_pen_4 ┆ 0.07417  │
│ vllm_freq_pen_3 ┆ 0.07417  │
│ vllm_freq_pen_4 ┆ 0.07417  │
│ vllm_pres_pen_0 ┆ 0.073589 │
│ vllm_freq_pen_0 ┆ 0.073589 │
│ vllm_rep_pen_0  ┆ 0.073589 │
│ vllm_temp1      ┆ 0.073589 │
│ vllm_pres_pen_3 ┆ 0.073357 │
│ vllm_temp0      ┆ 0.072876 │
│ vllm_top_p_0    ┆ 0.072711 │
│ vllm_rep_pen_2  ┆ 0.072575 │
│ vllm_freq_pen_2 ┆ 0.072334 │
│ vllm_freq_pen_1 ┆ 0.072334 │
│ vllm_pres_pen_1 ┆ 0.072334 │
│ vllm_pres_pen_2 ┆ 0.072334 │
│ vllm_top_p_2    ┆ 0.07151  │
│ vllm_top_p_1    ┆ 0.070655 │
│ vllm_rep_pen_3  ┆ 0.067121 │
│ vllm_rep_pen_4  ┆ 0.065706 │
│ vllm_temp2      ┆ 0.064159 │
│ vllm_temp3      ┆ 0.061324 │
└─────────────────┴──────────┘


In [17]:
# Per experiment, count the rows whose "mean" value is strictly positive,
# ordered from the fewest such rows to the most.
(
    results
    .filter(pl.col("mean") > 0)
    .group_by("exp_name")
    .agg(pl.col("mean").count())
    .sort("mean")
)

exp_name,mean
str,u32
"""vllm_temp3""",341
"""vllm_rep_pen_4""",364
"""vllm_rep_pen_3""",380
"""vllm_rep_pen_2""",396
"""vllm_temp2""",398
…,…
"""vllm_pres_pen_2""",423
"""vllm_freq_pen_2""",423
"""vllm_pres_pen_1""",423
"""vllm_freq_pen_1""",423


In [12]:
# Temperature runs only: number of rows with a strictly positive "mean" per
# experiment, ascending.
(
    results
    .filter((pl.col("mean") > 0) & (pl.col("category") == "temperature"))
    .group_by("exp_name")
    .agg(pl.col("mean").count())
    .sort("mean")
)

exp_name,mean
str,u32
"""vllm_temp3""",341
"""vllm_temp2""",398
"""vllm_temp1""",417
"""vllm_temp0""",421


In [13]:
# top_p runs only: number of rows with a strictly positive "mean" per
# experiment, ascending.
(
    results
    .filter((pl.col("mean") > 0) & (pl.col("category") == "top_p"))
    .group_by("exp_name")
    .agg(pl.col("mean").count())
    .sort("mean")
)

exp_name,mean
str,u32
"""vllm_top_p_0""",411
"""vllm_top_p_1""",420
"""vllm_top_p_2""",428


In [14]:
# repetition_penalty runs only: number of rows with a strictly positive "mean"
# per experiment, ascending.
(
    results
    .filter((pl.col("mean") > 0) & (pl.col("category") == "repetition_penalty"))
    .group_by("exp_name")
    .agg(pl.col("mean").count())
    .sort("mean")
)


exp_name,mean
str,u32
"""vllm_rep_pen_4""",364
"""vllm_rep_pen_3""",380
"""vllm_rep_pen_2""",396
"""vllm_rep_pen_1""",413
"""vllm_rep_pen_0""",417


In [15]:
# frequency_penalty runs only: number of rows with a strictly positive "mean"
# per experiment, ascending.
(
    results
    .filter((pl.col("mean") > 0) & (pl.col("category") == "frequency_penalty"))
    .group_by("exp_name")
    .agg(pl.col("mean").count())
    .sort("mean")
)


exp_name,mean
str,u32
"""vllm_freq_pen_0""",417
"""vllm_freq_pen_4""",419
"""vllm_freq_pen_3""",419
"""vllm_freq_pen_2""",423
"""vllm_freq_pen_1""",423


In [16]:
# presence_penalty runs only: number of rows with a strictly positive "mean"
# per experiment, ascending.
(
    results
    .filter((pl.col("mean") > 0) & (pl.col("category") == "presence_penalty"))
    .group_by("exp_name")
    .agg(pl.col("mean").count())
    .sort("mean")
)


exp_name,mean
str,u32
"""vllm_pres_pen_0""",417
"""vllm_pres_pen_4""",419
"""vllm_pres_pen_3""",420
"""vllm_pres_pen_1""",423
"""vllm_pres_pen_2""",423


In [7]:
# `idx` values that have exactly 3 rows with a strictly positive "mean".
# Uses agg(pl.len()) instead of the deprecated GroupBy.count(); the alias keeps
# the resulting column name "count" unchanged.
(
    results
    .filter(pl.col("mean") > 0)
    .group_by("idx")
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
    .filter(pl.col("count") == 3)
)

  results.filter(pl.col("mean") > 0).group_by("idx").count().sort("count", descending=True).filter(pl.col("count")==3)


idx,count
i64,u32
923,3
399,3
2367,3
9,3
664,3
…,…
902,3
780,3
384,3
1700,3
