# RQ1

How much do the embedding size and the number of embeddings (i.e., the dataset size in rows) impact the energy consumption and duration of indexing and retrieval (querying)?

In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
from results_parser import get_experiments_data, compute_total_energy_per_run

In [2]:
# Experiment names grouped by embedding model (one entry per dataset, always
# in the order arguana, nfcorpus, cqadupstack_webmasters).
small_embeddings = [
    "gte_small_arguana_milvus",
    "gte_small_nfcorpus_milvus",
    "gte_small_cqadupstack_webmasters_milvus",
]
medium_embeddings = [
    "gte_base_arguana_milvus",
    "gte_base_nfcorpus_milvus",
    "gte_base_cqadupstack_webmasters_milvus",
]
large_embeddings = [
    "gte_large_arguana_milvus",
    "gte_large_nfcorpus_milvus",
    "gte_large_cqadupstack_webmasters_milvus",
]

# The same nine experiments regrouped by dataset: transposing the three lists
# above yields one column per dataset, each ordered small -> base -> large.
# Column order follows the lists above (arguana, nfcorpus, cqadupstack), which
# is why the targets are unpacked as medium, small, large.
medium_datasets, small_datasets, large_datasets = (
    list(same_dataset)
    for same_dataset in zip(small_embeddings, medium_embeddings, large_embeddings)
)

### 1. Indexing 

In [3]:
# Load the "rq1_indexing" experiment results, then derive the per-run data.
# Later cells index `indexing_runs_data` by experiment name and read its
# "measurements_per_run" and "durations_per_run" entries.
# NOTE(review): both helpers live in results_parser; presumably the second one
# totals the energy of each run — confirm the exact output format there.
indexing_experiments_data = get_experiments_data("rq1_indexing")
indexing_runs_data = compute_total_energy_per_run(indexing_experiments_data)

#### 1.1. Indexing - varying the embedding size

In [4]:
def get_variation_runs_data(
    runs_data, small, medium, large, independent_variable, variable_values
):
    """Stack the per-run measurements of three experiment groups into one frame.

    Args:
        runs_data: Mapping from experiment name to a mapping holding a
            ``"measurements_per_run"`` DataFrame and a ``"durations_per_run"``
            sequence with one duration per row of that frame.
        small: Experiment names forming the first group.
        medium: Experiment names forming the second group.
        large: Experiment names forming the third group.
        independent_variable: Name of the column that receives the group label.
        variable_values: Labels for the ``small``, ``medium`` and ``large``
            groups, in that order.

    Returns:
        A new DataFrame containing every row of the selected experiments plus
        two extra columns: ``independent_variable`` (the group label) and
        ``"duration"``. The row index of the source frames is kept as is.

    Raises:
        ValueError: If a group is empty, or if the number of durations does
            not match the number of measurement rows of a group.
    """
    runs_list = []

    for index, group in enumerate([small, medium, large]):
        # The names are walked twice below, so materialise them once in case a
        # one-shot iterable (e.g. a generator) was passed.
        experiment_names = list(group)

        runs = pd.concat(
            [runs_data[name]["measurements_per_run"] for name in experiment_names]
        )
        # Flatten each experiment's durations on its own before joining them.
        # Building a single 2-D array first only works when every experiment
        # has the same number of runs; ragged input would fail or stay nested.
        durations = np.concatenate(
            [
                np.ravel(runs_data[name]["durations_per_run"])
                for name in experiment_names
            ]
        )
        runs.loc[:, independent_variable] = variable_values[index]
        runs.loc[:, "duration"] = durations

        runs_list.append(runs)

    return pd.concat(runs_list)

In [5]:
# Indexing runs grouped by embedding model; each group spans all three
# datasets. The gte_base experiments carry the label "gte-medium".
index_embeeding_variation_runs = get_variation_runs_data(
    runs_data=indexing_runs_data,
    small=small_embeddings,
    medium=medium_embeddings,
    large=large_embeddings,
    independent_variable="embedding_model",
    variable_values=["gte-small", "gte-medium", "gte-large"],
)

In [6]:
# Per-model mean of every column; each model group pools the runs of all
# three datasets.
index_embeeding_variation_runs.groupby("embedding_model").mean()

Unnamed: 0_level_0,CPU Package Energy [J],IA Cores Energy [J],DRAM Energy [J],GPU Energy [J],duration
embedding_model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
gte-large,1593.636277,1476.930287,91.624827,21.8307,74.867835
gte-medium,1548.443717,1427.741667,88.32588,21.116927,72.196045
gte-small,1528.07523,1416.51032,87.978683,20.710953,71.679724


In [7]:
# Per-model standard deviation. Each group mixes three datasets of very
# different sizes, so this spread is much larger than the per-dataset spread
# in section 1.2 and largely reflects dataset differences rather than
# run-to-run noise.
index_embeeding_variation_runs.groupby("embedding_model").std()

Unnamed: 0_level_0,CPU Package Energy [J],IA Cores Energy [J],DRAM Energy [J],GPU Energy [J],duration
embedding_model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
gte-large,914.227656,845.346862,52.692268,12.465734,42.944809
gte-medium,894.956228,828.676659,51.221369,12.147936,41.792178
gte-small,880.796452,820.785662,51.120953,11.925125,41.237786


#### 1.2. Indexing - varying the dataset size

In [8]:
# Indexing runs grouped by dataset; each group spans all three embedding
# models. Labels follow the small -> medium -> large group order.
indexing_dataset_varation_runs = get_variation_runs_data(
    runs_data=indexing_runs_data,
    small=small_datasets,
    medium=medium_datasets,
    large=large_datasets,
    independent_variable="dataset",
    variable_values=["nfcorpus", "arguana", "cqadupstack_webmasters"],
)

In [9]:
# Per-dataset mean of every column; each dataset group pools the runs of all
# three embedding models.
indexing_dataset_varation_runs.groupby("dataset").mean()

Unnamed: 0_level_0,CPU Package Energy [J],IA Cores Energy [J],DRAM Energy [J],GPU Energy [J],duration
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
arguana,1359.995787,1255.122583,77.813597,18.548523,63.462206
cqadupstack_webmasters,2717.588217,2518.36086,156.230023,37.027683,127.502182
nfcorpus,592.57122,547.69883,33.88577,8.082373,27.779217


In [10]:
# Per-dataset standard deviation over the pooled runs of the three embedding
# models.
indexing_dataset_varation_runs.groupby("dataset").std()

Unnamed: 0_level_0,CPU Package Energy [J],IA Cores Energy [J],DRAM Energy [J],GPU Energy [J],duration
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
arguana,78.570334,65.646076,4.771157,0.586564,1.926374
cqadupstack_webmasters,100.251222,90.982672,6.440106,0.956987,3.10913
nfcorpus,53.946803,50.789573,3.222781,0.386302,1.281596


### 2. Querying

#### 2.1. Querying - varying the embedding size

In [11]:
# Load the "rq1_querying" experiment results, then derive the per-run data
# used by both querying subsections (2.1 and 2.2).
# NOTE(review): both helpers live in results_parser; presumably the second one
# totals the energy of each run — confirm the exact output format there.
querying_experiments_data = get_experiments_data("rq1_querying")

querying_runs_data = compute_total_energy_per_run(querying_experiments_data)

In [12]:
# Querying runs grouped by embedding model; each group spans all three
# datasets. The gte_base experiments carry the label "gte-medium".
querying_embeeding_variation_runs = get_variation_runs_data(
    runs_data=querying_runs_data,
    small=small_embeddings,
    medium=medium_embeddings,
    large=large_embeddings,
    independent_variable="embedding_model",
    variable_values=["gte-small", "gte-medium", "gte-large"],
)

In [13]:
# Per-model mean of every column for the querying runs; each model group pools
# the runs of all three datasets.
querying_embeeding_variation_runs.groupby("embedding_model").mean()

Unnamed: 0_level_0,CPU Package Energy [J],IA Cores Energy [J],DRAM Energy [J],GPU Energy [J],duration
embedding_model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
gte-large,205.900217,187.060967,11.58073,3.305253,11.749643
gte-medium,196.00078,175.438853,10.854717,3.00248,10.613021
gte-small,157.821897,144.611433,9.01982,2.53154,9.016228


In [14]:
# Per-model standard deviation for the querying runs, computed over the pooled
# runs of all three datasets.
querying_embeeding_variation_runs.groupby("embedding_model").std()

Unnamed: 0_level_0,CPU Package Energy [J],IA Cores Energy [J],DRAM Energy [J],GPU Energy [J],duration
embedding_model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
gte-large,19.767506,17.324038,1.014807,0.365159,1.315556
gte-medium,23.740822,18.5303,1.104326,0.375154,1.279919
gte-small,16.843271,15.393932,0.830029,0.292839,1.022468


#### 2.2. Querying - varying the dataset size

In [15]:
# Querying runs grouped by dataset; each group spans all three embedding
# models. Labels follow the small -> medium -> large group order.
querying_dataset_varation_runs = get_variation_runs_data(
    runs_data=querying_runs_data,
    small=small_datasets,
    medium=medium_datasets,
    large=large_datasets,
    independent_variable="dataset",
    variable_values=["nfcorpus", "arguana", "cqadupstack_webmasters"],
)

In [16]:
# Per-dataset mean of every column for the querying runs; each dataset group
# pools the runs of all three embedding models.
# NOTE(review): nfcorpus, the "small" dataset group, shows the highest means
# here, unlike in the indexing results — worth confirming the cause.
querying_dataset_varation_runs.groupby("dataset").mean()

Unnamed: 0_level_0,CPU Package Energy [J],IA Cores Energy [J],DRAM Energy [J],GPU Energy [J],duration
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
arguana,178.213323,161.187507,10.024537,2.713793,9.603548
cqadupstack_webmasters,174.95704,157.10222,9.82508,2.712987,9.678519
nfcorpus,206.55253,188.821527,11.60565,3.412493,12.096826


In [17]:
# Per-dataset standard deviation for the querying runs, computed over the
# pooled runs of the three embedding models.
querying_dataset_varation_runs.groupby("dataset").std()

Unnamed: 0_level_0,CPU Package Energy [J],IA Cores Energy [J],DRAM Energy [J],GPU Energy [J],duration
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
arguana,27.532373,22.16011,1.311363,0.30654,1.031931
cqadupstack_webmasters,27.369127,20.137289,1.181364,0.320375,1.149003
nfcorpus,20.967892,19.066076,1.207567,0.375068,1.329728
