In [None]:

import glob
import random
import numpy as np
import pandas as pd
from torch.utils.data import Dataset


class MultiViewValueQADataset(Dataset):
    """Dataset that yields two random "views" of QA pairs drawn from the same group.

    All CSV files found directly under ``dataset_path`` are concatenated and
    grouped by their ``model`` and ``lang`` columns. Each group is stored under
    the key ``"<model>$<lang>"`` as a list of ``"Q: ... A: ..."`` strings.

    The dataset length is the total number of QA strings. An index only decides
    *which group* is used (so groups are hit proportionally to their size); the
    QA strings that make up the two views are re-sampled randomly on every access.

    Attributes:
        all_qa_samples: dict mapping group key -> list of QA strings.
        value_idx_map: dict mapping group position (int) -> group key.
        qa_per_view: maximum number of QA strings joined into one view.
        end_value_idx: cumulative group sizes; entry ``i`` is the exclusive end
            index of group ``i`` in the flat index space.
    """

    def __init__(self, dataset_path, qa_per_view=1):
        """Load every CSV under ``dataset_path`` and build the group index.

        Args:
            dataset_path: directory containing the ``*.csv*`` files.
            qa_per_view: number of QA strings sampled per view (capped at the
                group size when a group is smaller).
        """
        self.all_qa_samples = self.read_samples(dataset_path)
        self.value_idx_map = dict(enumerate(self.all_qa_samples))
        self.qa_per_view = qa_per_view

        group_sizes = [len(qa_list) for qa_list in self.all_qa_samples.values()]
        # Explicit integer dtype so an empty dataset does not yield a float array.
        self.end_value_idx = np.cumsum(group_sizes, dtype=np.int64)

    def read_samples(self, path):
        """Read all CSV files under ``path`` and group QA strings by (model, lang).

        Args:
            path: directory that is searched (non-recursively) for ``*.csv*`` files.

        Returns:
            dict mapping ``"<model>$<lang>"`` to the list of formatted QA strings.

        Raises:
            ValueError: if no matching file exists under ``path``.
        """
        csv_paths = sorted(glob.glob(f'{path}/*.csv*'))
        if not csv_paths:
            raise ValueError(f"No CSV files found under '{path}'")

        df = pd.concat([pd.read_csv(csv_path) for csv_path in csv_paths], ignore_index=True)
        # A QA pair needs both sides; rows missing either one are unusable.
        df = df.dropna(subset=['question', 'answer']).reset_index(drop=True)

        # Cast to str first: pandas may parse purely numeric columns as int/float,
        # which have no string methods.
        question = df['question'].astype(str).str.strip()
        answer = df['answer'].astype(str).str.strip()
        df['qa'] = 'Q: ' + question + ' A: ' + answer

        samples = {
            f'{model}${lang}': group['qa'].tolist()
            for (model, lang), group in df.groupby(['model', 'lang'])
        }
        return samples

    def __len__(self):
        """Return the total number of QA strings across all groups."""
        if len(self.end_value_idx) == 0:
            return 0
        return int(self.end_value_idx[-1])

    def __getitem__(self, idx):
        """Return two independently sampled views from the group that owns ``idx``.

        Args:
            idx: integer position in ``[-len(self), len(self))``.

        Returns:
            dict with keys ``'query'`` and ``'document'`` (newline-joined QA
            strings) and ``'dataset_name'`` (the group key).

        Raises:
            IndexError: if ``idx`` is out of range. Raising here is also what
                lets plain ``for item in dataset`` iteration terminate.
        """
        total = len(self)
        position = idx + total if idx < 0 else idx
        if not 0 <= position < total:
            raise IndexError(f"index {idx} is out of range for dataset of size {total}")

        # end_value_idx holds exclusive end offsets, so the owning group is the
        # first one whose end offset is strictly greater than the position.
        value_idx = int(np.searchsorted(self.end_value_idx, position, side='right'))
        value_key = self.value_idx_map[value_idx]
        qa_data = self.all_qa_samples[value_key]

        view_size = min(len(qa_data), self.qa_per_view)
        view_1 = '\n'.join(random.sample(qa_data, view_size))
        view_2 = '\n'.join(random.sample(qa_data, view_size))

        return {'query': view_1, 'document': view_2, 'dataset_name': value_key}
    

In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
import torch

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

from sentence_transformers import SentenceTransformer


# Visualize, for each (dataset split, checkpoint) pair, a 2-D t-SNE projection of
# the embeddings of randomly sampled QA views, colored by their (model, lang) group.
import inspect

# Fixed seed so the random view sampling and the t-SNE layout are repeatable.
SEED = 0
random.seed(SEED)

# Marker area in points^2, passed to matplotlib through seaborn's `s=` keyword.
# NOTE(review): the previous code passed `size=0.5`, which seaborn interprets as a
# data variable (adding a stray "0.5" legend entry), not as a marker size. 18 is
# meant to approximate the previous look — adjust to taste.
MARKER_AREA = 18

# scikit-learn 1.5 renamed TSNE's `n_iter` argument to `max_iter` and later
# removed the old name; use whichever the installed version accepts.
TSNE_ITER_ARG = 'max_iter' if 'max_iter' in inspect.signature(TSNE).parameters else 'n_iter'

model = None

for n_samples, dataset_path in [
    (10000, '/share/value-embedding/datasets/20240314_gen_qa_pairs_translated_to_english/train'),
    (5000, '/share/value-embedding/datasets/20240314_gen_qa_pairs_translated_to_english/oos'),
]:
    for qa_per_view, title, model_path in  [
            # (1, "single QA per view all-mpnet-base-v2 (Mar29)", "/share/value-embedding/checkpoints/Mar29_19-24-52_eez116_[translated]-[all-mpnet-base-v2]-[infonce]/140340"),
            # (5, "5 QA per view reward-model-deberta-v3-base (Apr07)", "/share/value-embedding/checkpoints/Apr07_17-33-46_eez116_arxiv-v1-1ep-[reward-model-deberta-v3-base]-[infonce-5_qa_per_view]/18712"),

            # (1, "single QA per view nomic (context=128 tokens)", "/home/dchenbs/workspace/contrastors/src/contrastors/ckpts/nomic-embed-text-v1-len128-bs128-4gpu/sbert"),
            # (5, "5 QA per view nomic (context=512 tokens)", "/home/dchenbs/workspace/contrastors/src/contrastors/ckpts/nomic-embed-text-v1-len512-bs128-4gpu/sbert"),

            (100, "nomic (context=2048 tokens)",  "/home/dchenbs/workspace/contrastors/src/contrastors/ckpts/nomic-embed-text-v1-len2048-bs128-4gpu/sbert"),
            (100, "nomic (context=4096 tokens)","/home/dchenbs/workspace/contrastors/src/contrastors/ckpts/nomic-embed-text-v1-len4096-bs128-4gpu/sbert")
        ]:

        print(f"Using {qa_per_view} QA per view model {model_path}")

        # Drop the reference to the previous checkpoint *before* clearing the
        # cache; otherwise its weights are still alive and nothing is released,
        # leaving two models on the GPU while the next one loads.
        model = None
        torch.cuda.empty_cache()

        dataset = MultiViewValueQADataset(dataset_path, qa_per_view=qa_per_view)
        model = SentenceTransformer(model_path, trust_remote_code=True).to('cuda')

        # Gather samples: draw n_samples random items and bucket their 'query'
        # view by group name.
        samples = {value: [] for value in dataset.value_idx_map.values()}
        for _ in range(n_samples):
            sample = dataset[random.randint(0, len(dataset)-1)]
            samples[sample['dataset_name']].append(sample['query'])

        # Encoding: groups that received no sample are skipped, since an empty
        # batch cannot be concatenated with the 2-D embeddings of other groups.
        embeddings = {}
        for value, texts in samples.items():
            if not texts:
                print(f"Skip {value}: no samples drawn")
                continue
            print(f"Extract embeddings for {len(texts)} samples in {value}")
            embeddings[value] = model.encode(texts, convert_to_tensor=True, show_progress_bar=True, batch_size=8)

        if not embeddings:
            print(f"No samples to embed for {dataset_path}; skipping plot")
            continue

        # t-SNE projection of all embeddings stacked in group order.
        tsne = TSNE(n_components=2, perplexity=30, random_state=SEED, **{TSNE_ITER_ARG: 300})
        X = np.concatenate([v.cpu().numpy() for v in embeddings.values()])
        X_embedded = tsne.fit_transform(X)

        # One label per row of X, in the same group order used to build X.
        group_labels = np.repeat(list(embeddings.keys()), [v.shape[0] for v in embeddings.values()])

        fig, ax = plt.subplots(figsize=(15, 15))
        sns.scatterplot(x=X_embedded[:,0], y=X_embedded[:,1], hue=group_labels, palette='tab20', s=MARKER_AREA, alpha=0.5, ax=ax)
        ax.set_title(title+'\n'+model_path)

        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), ncol=1)
        plt.show()
        # Avoid accumulating open figures across loop iterations.
        plt.close(fig)
