# RQ1

How much do the embedding size and the number of embeddings (i.e., the dataset size in number of rows) impact the energy consumption and duration of indexing and retrieval?

In [1]:
%load_ext autoreload
%autoreload 2

from results_processor import get_experiments_data, compute_total_energy_per_run, get_variation_runs_data, compute_kruskal_wallis

In [2]:
# Experiment identifiers follow the pattern "<model>_<dataset>_milvus".
#
# The *_embeddings lists each fix one embedding model and span the datasets
# (in the order arguana, nfcorpus, cqadupstack_webmasters).
small_embeddings, medium_embeddings, large_embeddings = (
    [
        f"{model}_{dataset}_milvus"
        for dataset in ("arguana", "nfcorpus", "cqadupstack_webmasters")
    ]
    for model in ("gte_small", "gte_base", "gte_large")
)

# The *_datasets lists each fix one dataset and span the embedding models
# (in the order gte_small, gte_base, gte_large).
small_datasets, medium_datasets, large_datasets = (
    [
        f"{model}_{dataset}_milvus"
        for model in ("gte_small", "gte_base", "gte_large")
    ]
    for dataset in ("nfcorpus", "arguana", "cqadupstack_webmasters")
)

### 1. Indexing 

In [3]:
# Load the "rq1_indexing" experiment, requesting the DRAM and IA-cores power
# columns by name.
# NOTE(review): the shape of the returned data is defined in results_processor — confirm there.
indexing_experiments_data = get_experiments_data(
    "rq1_indexing",
    [
        "Total DRAM Power [W]",
        "IA Cores Power [W]",
    ],
)

# Derive per-run totals from the loaded measurements. The grouped tables further
# down display "IA Cores Energy [J]", "DRAM Energy [J]" and "duration [s]".
# NOTE(review): presumably power is integrated over each run's duration — confirm in results_processor.
indexing_runs_data = compute_total_energy_per_run(indexing_experiments_data)

#### 1.1. Indexing - varying the embedding size

In [4]:
# Build the embedding-size comparison for indexing: the three experiment lists
# are passed in small/medium/large order, followed by the name of the grouping
# column ("embedding_model") and one label per list.
# NOTE(review): the "gte-medium" label stands for the gte_base_* experiments.
# NOTE(review): the variable name is spelled "embeeding"; renaming it requires
# updating every cell that uses it.
index_embeeding_variation_runs = get_variation_runs_data(
    indexing_runs_data,
    small_embeddings,
    medium_embeddings,
    large_embeddings,
    "embedding_model",
    ["gte-small", "gte-medium", "gte-large"],
)

In [5]:
# Mean energy and duration per embedding model. Rows come out sorted
# alphabetically by label (large, medium, small), not by model size.
index_embeeding_variation_runs.groupby("embedding_model").mean()

Unnamed: 0_level_0,IA Cores Energy [J],DRAM Energy [J],duration [s]
embedding_model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
gte-large,1476.930287,91.624827,74.867835
gte-medium,1427.741667,88.32588,72.196045
gte-small,1416.51032,87.978683,71.679724


In [6]:
# Standard deviation per embedding model. Each group pools the runs of all three
# datasets, so the spread here is much larger than in the per-dataset breakdown
# of section 1.2.
index_embeeding_variation_runs.groupby("embedding_model").std()

Unnamed: 0_level_0,IA Cores Energy [J],DRAM Energy [J],duration [s]
embedding_model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
gte-large,845.346862,52.692268,42.944809
gte-medium,828.676659,51.221369,41.792178
gte-small,820.785662,51.120953,41.237786


In [7]:
# Kruskal-Wallis test: does indexing duration differ between the three
# embedding-model groups?
compute_kruskal_wallis(
    index_embeeding_variation_runs,
    "embedding_model",
    ["gte-small", "gte-medium", "gte-large"],
    "duration [s]",
)

KruskalResult(statistic=np.float64(4.269401709401734), pvalue=np.float64(0.11827996798125272))

In [8]:
# Kruskal-Wallis test: does IA-cores energy during indexing differ between the
# three embedding-model groups?
compute_kruskal_wallis(
    index_embeeding_variation_runs,
    "embedding_model",
    ["gte-small", "gte-medium", "gte-large"],
    "IA Cores Energy [J]",
)

KruskalResult(statistic=np.float64(2.1558974358974865), pvalue=np.float64(0.3402928468080614))

#### 1.2. Indexing - varying the dataset size

In [9]:
# Build the dataset-size comparison for indexing: the three experiment lists are
# passed in small/medium/large order, followed by the name of the grouping
# column ("dataset") and one label per list (nfcorpus, arguana,
# cqadupstack_webmasters).
indexing_dataset_variation_runs = get_variation_runs_data(
    indexing_runs_data,
    small_datasets,
    medium_datasets,
    large_datasets,
    "dataset",
    ["nfcorpus", "arguana", "cqadupstack_webmasters"],
)

In [10]:
# Mean energy and duration per dataset. Rows come out sorted alphabetically by
# label, not by dataset size.
indexing_dataset_variation_runs.groupby("dataset").mean()

Unnamed: 0_level_0,IA Cores Energy [J],DRAM Energy [J],duration [s]
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
arguana,1255.122583,77.813597,63.462206
cqadupstack_webmasters,2518.36086,156.230023,127.502182
nfcorpus,547.69883,33.88577,27.779217


In [11]:
# Standard deviation per dataset; each group pools the runs of all three
# embedding models.
indexing_dataset_variation_runs.groupby("dataset").std()

Unnamed: 0_level_0,IA Cores Energy [J],DRAM Energy [J],duration [s]
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
arguana,65.646076,4.771157,1.926374
cqadupstack_webmasters,90.982672,6.440106,3.10913
nfcorpus,50.789573,3.222781,1.281596


In [12]:
# Kruskal-Wallis test: does indexing duration differ between the three dataset
# groups?
compute_kruskal_wallis(
    indexing_dataset_variation_runs,
    "dataset",
    ["nfcorpus", "arguana", "cqadupstack_webmasters"],
    "duration [s]",
)

KruskalResult(statistic=np.float64(79.12087912087912), pvalue=np.float64(6.593551417550709e-18))

In [13]:
# Kruskal-Wallis test: does IA-cores energy during indexing differ between the
# three dataset groups?
# NOTE(review): the statistic is identical to the one obtained for duration,
# which is consistent with the three groups being completely separated in rank
# for both metrics — confirm against the raw runs.
compute_kruskal_wallis(
    indexing_dataset_variation_runs,
    "dataset",
    ["nfcorpus", "arguana", "cqadupstack_webmasters"],
    "IA Cores Energy [J]",
)

KruskalResult(statistic=np.float64(79.12087912087912), pvalue=np.float64(6.593551417550709e-18))

### 2. Querying

#### 2.1. Querying - varying the embedding size

In [14]:
# Load the "rq1_querying" experiment, requesting the DRAM and IA-cores power
# columns by name.
# NOTE(review): the shape of the returned data is defined in results_processor — confirm there.
querying_experiments_data = get_experiments_data(
    "rq1_querying",
    [
        "Total DRAM Power [W]",
        "IA Cores Power [W]",
    ],
)

# Derive per-run totals from the loaded measurements. The grouped tables further
# down display "IA Cores Energy [J]", "DRAM Energy [J]" and "duration [s]".
# NOTE(review): presumably power is integrated over each run's duration — confirm in results_processor.
querying_runs_data = compute_total_energy_per_run(querying_experiments_data)

In [15]:
# Build the embedding-size comparison for querying: the three experiment lists
# are passed in small/medium/large order, followed by the name of the grouping
# column ("embedding_model") and one label per list.
# NOTE(review): the "gte-medium" label stands for the gte_base_* experiments.
# NOTE(review): the variable name is spelled "embeeding"; renaming it requires
# updating every cell that uses it.
querying_embeeding_variation_runs = get_variation_runs_data(
    querying_runs_data,
    small_embeddings,
    medium_embeddings,
    large_embeddings,
    "embedding_model",
    ["gte-small", "gte-medium", "gte-large"],
)

In [16]:
# Mean energy and duration per embedding model. Rows come out sorted
# alphabetically by label (large, medium, small), not by model size.
querying_embeeding_variation_runs.groupby("embedding_model").mean()

Unnamed: 0_level_0,IA Cores Energy [J],DRAM Energy [J],duration [s]
embedding_model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
gte-large,187.060967,11.58073,11.749643
gte-medium,175.438853,10.854717,10.613021
gte-small,144.611433,9.01982,9.016228


In [17]:
# Standard deviation per embedding model; each group pools the runs of all three
# datasets.
querying_embeeding_variation_runs.groupby("embedding_model").std()

Unnamed: 0_level_0,IA Cores Energy [J],DRAM Energy [J],duration [s]
embedding_model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
gte-large,17.324038,1.014807,1.315556
gte-medium,18.5303,1.104326,1.279919
gte-small,15.393932,0.830029,1.022468


In [18]:
# Kruskal-Wallis test: does querying duration differ between the three
# embedding-model groups?
compute_kruskal_wallis(
    querying_embeeding_variation_runs,
    "embedding_model",
    ["gte-small", "gte-medium", "gte-large"],
    "duration [s]",
)

KruskalResult(statistic=np.float64(44.50666666666666), pvalue=np.float64(2.1652105243405554e-10))

In [19]:
# Kruskal-Wallis test: does IA-cores energy during querying differ between the
# three embedding-model groups?
compute_kruskal_wallis(
    querying_embeeding_variation_runs,
    "embedding_model",
    ["gte-small", "gte-medium", "gte-large"],
    "IA Cores Energy [J]",
)

KruskalResult(statistic=np.float64(46.33435897435896), pvalue=np.float64(8.6820401011881e-11))

#### 2.2. Querying - varying the dataset size

In [20]:
# Build the dataset-size comparison for querying: the three experiment lists are
# passed in small/medium/large order, followed by the name of the grouping
# column ("dataset") and one label per list (nfcorpus, arguana,
# cqadupstack_webmasters).
# NOTE(review): the variable name is spelled "varation"; renaming it requires
# updating every cell that uses it.
querying_dataset_varation_runs = get_variation_runs_data(
    querying_runs_data,
    small_datasets,
    medium_datasets,
    large_datasets,
    "dataset",
    ["nfcorpus", "arguana", "cqadupstack_webmasters"],
)

In [21]:
# Mean energy and duration per dataset. Rows come out sorted alphabetically by
# label, not by dataset size.
# NOTE(review): nfcorpus, which is listed in small_datasets, has the highest
# means here; querying cost presumably depends on the query set as well as on
# the corpus size — confirm against the experiment setup.
querying_dataset_varation_runs.groupby("dataset").mean()

Unnamed: 0_level_0,IA Cores Energy [J],DRAM Energy [J],duration [s]
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
arguana,161.187507,10.024537,9.603548
cqadupstack_webmasters,157.10222,9.82508,9.678519
nfcorpus,188.821527,11.60565,12.096826


In [22]:
# Standard deviation per dataset; each group pools the runs of all three
# embedding models.
querying_dataset_varation_runs.groupby("dataset").std()

Unnamed: 0_level_0,IA Cores Energy [J],DRAM Energy [J],duration [s]
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
arguana,22.16011,1.311363,1.031931
cqadupstack_webmasters,20.137289,1.181364,1.149003
nfcorpus,19.066076,1.207567,1.329728


In [23]:
# Kruskal-Wallis test: does querying duration differ between the three dataset
# groups?
compute_kruskal_wallis(
    querying_dataset_varation_runs,
    "dataset",
    ["nfcorpus", "arguana", "cqadupstack_webmasters"],
    "duration [s]",
)

KruskalResult(statistic=np.float64(38.41142857142853), pvalue=np.float64(4.561044029837794e-09))

In [24]:
# Kruskal-Wallis test: does IA-cores energy during querying differ between the
# three dataset groups?
compute_kruskal_wallis(
    querying_dataset_varation_runs,
    "dataset",
    ["nfcorpus", "arguana", "cqadupstack_webmasters"],
    "IA Cores Energy [J]",
)

KruskalResult(statistic=np.float64(27.424273504273458), pvalue=np.float64(1.1089058470793492e-06))