# RQ1

How much do embedding size and number of embeddings (dataset size in terms of number of rows) impact indexing and retrieval?

In [21]:
%load_ext autoreload
%autoreload 2

from results_processor import get_experiments_data, compute_total_energy_per_run, get_variation_runs_data, compute_kruskal_wallis

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
small_embeddings = [
    "gte_small_arguana_milvus",
    "gte_small_nfcorpus_milvus",
    "gte_small_cqadupstack_webmasters_milvus",
]

medium_embeddings = [
    "gte_base_arguana_milvus",
    "gte_base_nfcorpus_milvus",
    "gte_base_cqadupstack_webmasters_milvus",
]

large_embeddings = [
    "gte_large_arguana_milvus",
    "gte_large_nfcorpus_milvus",
    "gte_large_cqadupstack_webmasters_milvus",
]

small_datasets = [
    "gte_small_nfcorpus_milvus",
    "gte_base_nfcorpus_milvus",
    "gte_large_nfcorpus_milvus",
]

medium_datasets = [
    "gte_small_arguana_milvus",
    "gte_base_arguana_milvus",
    "gte_large_arguana_milvus",
]

large_datasets = [
    "gte_small_cqadupstack_webmasters_milvus",
    "gte_base_cqadupstack_webmasters_milvus",
    "gte_large_cqadupstack_webmasters_milvus",
]

dataset_no_rows = [3956, 10080, 17911]

### 1. Indexing 

In [3]:
indexing_experiments_data = get_experiments_data(
    "rq1_indexing",
    [
        "Total DRAM Power [W]",
        "IA Cores Power [W]",
    ],
)
indexing_runs_data = compute_total_energy_per_run(indexing_experiments_data)


#### 1.1. Indexing - varying the embedding size

In [4]:
index_embedding_variation_runs = get_variation_runs_data(
    indexing_runs_data,
    small_embeddings,
    medium_embeddings,
    large_embeddings,
    "model",
    ["gte-small", "gte-medium", "gte-large"],
)

In [5]:
index_embedding_variation_runs.groupby("model").aggregate(
    lambda x: str(round(x.mean(), 2)) + " ± " + str(round(x.std(), 2))
)

Unnamed: 0_level_0,CPU Cores Energy [J],DRAM Energy [J],duration [s]
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
gte-large,1243.29 ± 711.74,56.82 ± 32.85,74.87 ± 42.94
gte-medium,1202.28 ± 699.04,54.71 ± 31.88,72.2 ± 41.79
gte-small,1193.89 ± 693.78,54.78 ± 32.18,71.68 ± 41.24


In [6]:
compute_kruskal_wallis(
    index_embedding_variation_runs,
    "model",
    ["gte-small", "gte-medium", "gte-large"],
    "duration [s]",
)

KruskalResult(statistic=np.float64(4.269401709401734), pvalue=np.float64(0.11827996798125272))

In [7]:
compute_kruskal_wallis(
    index_embedding_variation_runs,
    "model",
    ["gte-small", "gte-medium", "gte-large"],
    "CPU Cores Energy [J]",
)

KruskalResult(statistic=np.float64(1.769084249084301), pvalue=np.float64(0.4129031882394544))

#### 1.2. Indexing - varying the dataset size

In [8]:
indexing_dataset_variation_runs = get_variation_runs_data(
    indexing_runs_data,
    small_datasets,
    medium_datasets,
    large_datasets,
    "dataset",
    ["N", "A", "C"],
)

In [9]:
indexing_dataset_variation_runs.groupby("dataset").aggregate(
    lambda x: str(round(x.mean(), 2)) + " ± " + str(round(x.std(), 2))
)

Unnamed: 0_level_0,CPU Cores Energy [J],DRAM Energy [J],duration [s]
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,1057.17 ± 64.33,48.3 ± 4.42,63.46 ± 1.93
C,2121.37 ± 87.0,97.05 ± 5.66,127.5 ± 3.11
N,460.93 ± 49.97,20.96 ± 2.9,27.78 ± 1.28


In [10]:
compute_kruskal_wallis(
    indexing_dataset_variation_runs,
    "dataset",
    ["N", "A", "C"],
    "duration [s]",
)

KruskalResult(statistic=np.float64(79.12087912087912), pvalue=np.float64(6.593551417550709e-18))

In [11]:
compute_kruskal_wallis(
    indexing_dataset_variation_runs,
    "dataset",
    ["N", "A", "C"],
    "CPU Cores Energy [J]",
)

KruskalResult(statistic=np.float64(79.12087912087912), pvalue=np.float64(6.593551417550709e-18))

### 2. Querying

#### 2.1. Querying - varying the embedding size

In [12]:
querying_experiments_data = get_experiments_data(
    "rq1_querying",
    [
        "Total DRAM Power [W]",
        "IA Cores Power [W]",
    ],
)

querying_runs_data = compute_total_energy_per_run(querying_experiments_data)

In [13]:
querying_embedding_variation_runs = get_variation_runs_data(
    querying_runs_data,
    small_embeddings,
    medium_embeddings,
    large_embeddings,
    "model",
    ["gte-small", "gte-medium", "gte-large"],
)

In [14]:
querying_embedding_variation_runs.groupby("model").aggregate(
    lambda x: str(round(x.mean(), 2)) + " ± " + str(round(x.std(), 2))
)

Unnamed: 0_level_0,CPU Cores Energy [J],DRAM Energy [J],duration [s]
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
gte-large,145.6 ± 13.44,5.8 ± 0.48,11.75 ± 1.32
gte-medium,138.51 ± 16.31,5.66 ± 0.9,10.61 ± 1.28
gte-small,112.85 ± 11.96,4.59 ± 0.37,9.02 ± 1.02


In [15]:
compute_kruskal_wallis(
    querying_embedding_variation_runs,
    "model",
    ["gte-small", "gte-medium", "gte-large"],
    "duration [s]",
)

KruskalResult(statistic=np.float64(44.50666666666666), pvalue=np.float64(2.1652105243405554e-10))

In [16]:
compute_kruskal_wallis(
    querying_embedding_variation_runs,
    "model",
    ["gte-small", "gte-medium", "gte-large"],
    "model",
)

KruskalResult(statistic=np.float64(89.0), pvalue=np.float64(4.7194952715261225e-20))

#### 2.2. Querying - varying the dataset size

In [17]:
querying_dataset_varation_runs = get_variation_runs_data(
    querying_runs_data,
    small_datasets,
    medium_datasets,
    large_datasets,
    "dataset",
    ["N", "A", "C"],
)

In [18]:
querying_dataset_varation_runs.groupby("dataset").aggregate(
    lambda x: str(round(x.mean(), 2)) + " ± " + str(round(x.std(), 2))
)

Unnamed: 0_level_0,CPU Cores Energy [J],DRAM Energy [J],duration [s]
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,128.1 ± 20.63,5.35 ± 1.05,9.6 ± 1.03
C,123.26 ± 16.61,5.09 ± 0.67,9.68 ± 1.15
N,145.6 ± 14.71,5.61 ± 0.62,12.1 ± 1.33


In [19]:
compute_kruskal_wallis(
    querying_dataset_varation_runs,
    "dataset",
    ["N", "A", "C"],
    "duration [s]",
)

KruskalResult(statistic=np.float64(38.41142857142853), pvalue=np.float64(4.561044029837794e-09))

In [20]:
compute_kruskal_wallis(
    querying_dataset_varation_runs,
    "dataset",
    ["N", "A", "C"],
    "CPU Cores Energy [J]",
)

KruskalResult(statistic=np.float64(22.440244200244194), pvalue=np.float64(1.3401792381726827e-05))