In [1]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
import os

In [2]:
# Reference dataset: name of the W&B artifact the other datasets are compared against.
ref_dataset = "ct24:latest"

# Keys under which the reference artifact's dev / dev-test / test CSVs are
# stored once the frames are loaded.
ref_dev_dataset, ref_dev_test_dataset, ref_test_dataset = (
    "ct24-dev",
    "ct24-dev-test",
    "ct24-test",
)

# Additional W&B artifacts whose training splits are compared to the reference.
dataset_1, dataset_2, dataset_3 = (
    "ct24_synthetic_only:latest",
    "ct24_synth_0_10k:latest",
    "general_claim_filtered:latest",
)

In [3]:
import wandb

# Download each artifact from the W&B project and remember, per artifact name,
# the local directory returned by the download call.
api = wandb.Api()

project = "redstag/thesis"
artifact_names = [ref_dataset, dataset_1, dataset_2, dataset_3]

artifacts = {
    name: api.artifact(f"{project}/{name}").download()
    for name in artifact_names
}

[34m[1mwandb[0m:   4 of 4 files downloaded.  
[34m[1mwandb[0m:   4 of 4 files downloaded.  
[34m[1mwandb[0m:   6 of 6 files downloaded.  
[34m[1mwandb[0m:   4 of 4 files downloaded.  


In [4]:
# Training split of every downloaded artifact, keyed by artifact name.
dfs = {
    name: pd.read_csv(os.path.join(dir_path, "train.csv"))
    for name, dir_path in artifacts.items()
}

# Add the CT24 dev, dev-test and test splits, all read from the reference
# artifact's directory and stored under their own keys.
ref_dir = artifacts[ref_dataset]
for split_key, csv_name in (
    (ref_dev_dataset, "dev.csv"),
    (ref_dev_test_dataset, "dev-test.csv"),
    (ref_test_dataset, "test.csv"),
):
    dfs[split_key] = pd.read_csv(os.path.join(ref_dir, csv_name))

In [5]:
# Sentence-embedding model used below both to encode the texts and to score
# similarities between the resulting embeddings.
embedding_model_name = "all-mpnet-base-v2"
model = SentenceTransformer(embedding_model_name)

In [6]:
# Encode the "Text" column of every loaded frame; results use the same keys as `dfs`.
embeddings = {
    name: model.encode(frame["Text"].tolist(), show_progress_bar=True)
    for name, frame in dfs.items()
}

Batches:   0%|          | 0/704 [00:00<?, ?it/s]

Batches:   0%|          | 0/704 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/1549 [00:00<?, ?it/s]

Batches:   0%|          | 0/33 [00:00<?, ?it/s]

Batches:   0%|          | 0/10 [00:00<?, ?it/s]

Batches:   0%|          | 0/11 [00:00<?, ?it/s]

In [19]:
# Datasets compared against the reference training split (`ref_dataset`).
all_datasets = [ref_dev_dataset, ref_dev_test_dataset, ref_test_dataset, dataset_1, dataset_2, dataset_3]
# Restrict both sides of every comparison to rows with this class_label;
# set to a falsy value (e.g. None) to compare all rows.
filter_label = "Yes"


def _select_embeddings(name):
    """Return the embeddings of dataset `name` used for the comparison.

    When `filter_label` is set, only the rows whose "class_label" equals it
    are kept; otherwise the full embedding array is returned.
    """
    if not filter_label:
        return embeddings[name]
    keep = (dfs[name]["class_label"] == filter_label).values
    return embeddings[name][keep]


# The reference side is the same for every comparison, so select it once
# instead of rebuilding the mask and slice on each loop iteration.
ref_embeddings = _select_embeddings(ref_dataset)

# Similarity scores between the reference embeddings and each dataset's
# embeddings, as returned by `model.similarity`, keyed by dataset name.
sims = {
    ds: model.similarity(ref_embeddings, _select_embeddings(ds))
    for ds in all_datasets
}

In [20]:
# Report min / max / mean of the similarity scores for each compared dataset.
for ds, s in sims.items():
    report_lines = (
        f"Similarities {ds}:",
        f"min: {s.min()}",
        f"max: {s.max()}",
        f"mean: {s.mean()}",
        "-----------------",
    )
    print("\n".join(report_lines))

Similarities ct24-dev:
min: -0.2065959870815277
max: 1.0000001192092896
mean: 0.1746121048927307
-----------------
Similarities ct24-dev-test:
min: -0.22296826541423798
max: 0.8025941848754883
mean: 0.15725839138031006
-----------------
Similarities ct24-test:
min: -0.25562357902526855
max: 0.7282082438468933
mean: 0.1478351205587387
-----------------
Similarities ct24_synthetic_only:latest:
min: -0.2824961245059967
max: 0.9844942092895508
mean: 0.1499294489622116
-----------------
Similarities ct24_synth_0_10k:latest:
min: -0.272604376077652
max: 0.9122903347015381
mean: 0.08540401607751846
-----------------
Similarities general_claim_filtered:latest:
min: -0.2737082540988922
max: 0.9999999403953552
mean: 0.07583823055028915
-----------------
