In [11]:
import json
from pathlib import Path

# Directory holding one JSON result file per evaluated task.
path = Path("output_dir_our/no_model_name_available/no_revision_available")

# Sorted so files are accumulated in a reproducible order; raw glob order depends
# on the filesystem, and float summation is order-sensitive in the last digits.
json_files = sorted(path.glob("*.json"))

scores = []

for file in json_files:
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(file, encoding="utf-8") as f:
        data = json.load(f)
    # Files without a "scores" entry are skipped rather than treated as errors.
    if "scores" in data:
        scores.append(data["scores"]['test'][0]['main_score'])

# Average main score over all result files. With no usable files the mean is
# undefined, so report NaN instead of raising ZeroDivisionError.
mean_score = sum(scores) / len(scores) if scores else float("nan")
mean_score

0.29353802216121283

In [25]:
import torch
import torch.nn.functional as F
import numpy
from transformers import AutoTokenizer, AutoModel
import os

from scipy.stats import spearmanr
from datasets import load_dataset

# Identifier of the baseline checkpoint; the tokenizer and the baseline encoder share it.
_BASE_CHECKPOINT = "princeton-nlp/unsup-simcse-roberta-base"
# Directory of the fine-tuned checkpoint compared against the baseline.
_OUR_CHECKPOINT = "/home/decycle/ML/simcse_finetune/test_trainer_v4/checkpoint-50400"

tokenizer = AutoTokenizer.from_pretrained(_BASE_CHECKPOINT)

base_model = AutoModel.from_pretrained(_BASE_CHECKPOINT)
base_model = base_model.to("cuda")

# Alternative checkpoint, kept for reference:
# our_model = AutoModel.from_pretrained(
#     "/home/decycle/ML/simcse_finetune/test_trainer/checkpoint-64800"
# ).to("cuda")

our_model = AutoModel.from_pretrained(_OUR_CHECKPOINT).to("cuda")

In [26]:
from tqdm import trange

class SentenceTransformer():
    """Minimal ``encode``-style adapter around an encoder that exposes ``pooler_output``.

    Sentences are tokenized with the notebook-level ``tokenizer``.
    """

    def __init__(self, model):
        # The wrapped encoder; it is called with ``input_ids`` and ``attention_mask``.
        self.model = model

    def encode(self, sentences, max_length=512, batch_size=32, convert_to_tensor=True, **kwargs):
        """Embed ``sentences`` batch by batch and return one pooled vector per sentence.

        Args:
            sentences: A sequence of strings; a single string is treated as a
                one-element batch.
            max_length: Truncation length handed to the tokenizer.
            batch_size: Number of sentences tokenized and encoded per forward pass.
            convert_to_tensor: If true, return a ``torch.Tensor`` on the model's
                device; otherwise return a NumPy array on the CPU.
            **kwargs: Accepted and ignored, so callers may pass extra options.

        Returns:
            A ``(len(sentences), hidden)`` float32 tensor or array of
            ``pooler_output`` vectors.
        """
        if isinstance(sentences, str):
            sentences = [sentences]
        sentences = list(sentences)

        # Follow the model instead of assuming "cuda", so a CPU model also works.
        device = next(self.model.parameters()).device

        if not sentences:
            # Nothing to encode: return an empty matrix with the right width.
            # NOTE(review): assumes the model config exposes `hidden_size`; 768 is the
            # width the previous implementation hard-coded.
            hidden = getattr(getattr(self.model, "config", None), "hidden_size", 768)
            embeddings = torch.zeros(0, hidden, device=device)
            return embeddings if convert_to_tensor else embeddings.cpu().numpy()

        pooled_batches = []
        with torch.no_grad():
            for i in trange(0, len(sentences), batch_size):
                # Tokenize per batch: padding only reaches the longest sentence of
                # the batch, and only one batch of token ids lives on the device.
                inputs = tokenizer(
                    sentences[i : i + batch_size],
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=max_length,
                ).to(device)
                outputs = self.model(
                    input_ids=inputs["input_ids"],
                    attention_mask=inputs["attention_mask"],
                )
                pooled_batches.append(outputs.pooler_output)

        # Concatenating avoids hard-coding the embedding width; `.float()` keeps the
        # float32 result dtype of the previously preallocated buffer.
        embeddings = torch.cat(pooled_batches).float()
        if convert_to_tensor:
            return embeddings
        return embeddings.cpu().numpy()

In [27]:
# Wrap both encoders in the notebook's SentenceTransformer adapter so each exposes `encode(...)`.
base_sentence_model = SentenceTransformer(base_model)
our_sentence_model = SentenceTransformer(our_model)

In [28]:
# import mteb

# retrieval_task_list = [
#     "LEMBSummScreenFDRetrieval",
#     "LEMBQMSumRetrieval",
#     "LEMBWikimQARetrieval",
#     "LEMBNarrativeQARetrieval",
# ]
# output_dict = {}
# tasks = mteb.get_tasks(tasks=retrieval_task_list)
# evaluation = mteb.MTEB(tasks=tasks)
# results = evaluation.run(
#     model=base_sentence_model,
#     output_folder="evaluation/base_model",
#     overwrite_results=True,
#     batch_size=32,
#     verbosity=0,
# )
# for key, value in results.items():
#     split = "test" if "test" in value else "validation"
#     output_dict[key] = {
#         "ndcg@1": value[split]["ndcg_at_1"],
#         "ndcg@10": value[split]["ndcg_at_10"],
#     }
# print(output_dict)

In [29]:
# Load the "train" split of the Quora question/answer dataset; later cells read its
# "question" and "answer" columns.
dataset = load_dataset("toughdata/quora-question-answer-dataset", split="train")

In [30]:
# Switch both encoders to eval mode before measuring embeddings (the module repr
# printed by this cell lists Dropout(p=0.1) layers). The last expression's value is
# the model itself, which is why the notebook displays our_model's repr.
base_model.eval()
our_model.eval()

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dr

In [31]:
# Smoke test: pairwise cosine similarities of a few hand-written sentences, once per model.


def _unit_rows(vectors):
    """L2-normalise every row so that a dot product equals cosine similarity."""
    return F.normalize(vectors, p=2, dim=1)


test_sentences = [
    "Dogs are really cute.",
    "Dogs are lovely.",
    "I hate dogs.",
    "WIndows is not a good operating system.",
    "i like apples.",
    "one apple a day keeps the doctor away.",
]

base_embeddings = _unit_rows(base_sentence_model.encode(test_sentences))
our_embeddings = _unit_rows(our_sentence_model.encode(test_sentences))

# Rows are unit length, so the Gram matrix holds the cosine similarities.
base_similarity = base_embeddings @ base_embeddings.T
our_similarity = our_embeddings @ our_embeddings.T

print(base_similarity, our_similarity, sep="\n")

100%|██████████| 1/1 [00:00<00:00, 226.08it/s]
100%|██████████| 1/1 [00:00<00:00, 195.22it/s]

tensor([[1.0000, 0.8349, 0.6276, 0.2650, 0.2556, 0.1174],
        [0.8349, 1.0000, 0.6187, 0.2925, 0.2267, 0.1489],
        [0.6276, 0.6187, 1.0000, 0.1884, 0.3024, 0.2215],
        [0.2650, 0.2925, 0.1884, 1.0000, 0.1466, 0.0638],
        [0.2556, 0.2267, 0.3024, 0.1466, 1.0000, 0.6059],
        [0.1174, 0.1489, 0.2215, 0.0638, 0.6059, 1.0000]], device='cuda:0')
tensor([[1.0000, 0.8821, 0.6038, 0.2826, 0.2847, 0.1374],
        [0.8821, 1.0000, 0.5995, 0.3037, 0.2919, 0.1375],
        [0.6038, 0.5995, 1.0000, 0.1856, 0.3621, 0.2265],
        [0.2826, 0.3037, 0.1856, 1.0000, 0.2135, 0.1094],
        [0.2847, 0.2919, 0.3621, 0.2135, 1.0000, 0.4744],
        [0.1374, 0.1375, 0.2265, 0.1094, 0.4744, 1.0000]], device='cuda:0')





In [32]:
# change dropout to 0.05
# NOTE(review): as written this cell modifies nothing. It only prints a parameter whose
# name matches exactly, and the one mutating call (`fill_`) is commented out.
for name, param in our_model.named_parameters():
    # NOTE(review): this literal looks truncated. The module repr shown earlier nests
    # `(self)` with `query`/`key`/`value` under `attention`, so real parameter names
    # presumably continue past ".sel" and this exact match likely never fires (confirm).
    if name == 'encoder.layer.0.attention.sel':
        print(param.data)
        print(param.data.shape)
    # NOTE(review): presumably no parameter name contains "Dropout", since a dropout
    # probability is a module attribute rather than a parameter. Verify before relying
    # on the check below.
    # if "Dropout" in name:
    #     # param.data.fill_(0.05)
    #     print(param.data)

In [33]:
from tqdm import trange

# Pull the two text columns out of the dataset loaded earlier.
# Assumes row i of `questions` pairs with row i of `answer` (TODO confirm).
questions = dataset["question"]
answer = dataset["answer"]

def get_embedding(texts, model, batch_size=128):
    """Return unit-normalised ``pooler_output`` embeddings for ``texts``.

    The model is put in eval mode, texts are tokenized with the notebook-level
    ``tokenizer`` in slices of ``batch_size`` (truncated to 512 tokens), encoded on
    "cuda" without gradient tracking, and the per-batch outputs are concatenated.
    """
    model.eval()
    pooled_chunks = []

    with torch.no_grad():
        for start in trange(0, len(texts), batch_size):
            encoded = tokenizer(
                texts[start : start + batch_size],
                padding=True,
                truncation=True,
                return_tensors="pt",
                max_length=512,
            ).to("cuda")
            pooled_chunks.append(model(**encoded).pooler_output)

    stacked = torch.cat(pooled_chunks)
    return F.normalize(stacked)

# Number of leading dataset rows embedded for the evaluation cells that follow.
test_size = 40960

# Embed the same leading slice of questions and answers with both encoders.
# Answers use batch_size=32 instead of the default 128, presumably because answers
# are longer than questions (verify).
base_questions_embedding = get_embedding(questions[:test_size], base_model)
our_questions_embedding = get_embedding(questions[:test_size], our_model)
base_answer_embedding = get_embedding(answer[:test_size], base_model, batch_size=32)
our_answer_embedding = get_embedding(answer[:test_size], our_model, batch_size=32)

100%|██████████| 320/320 [00:23<00:00, 13.81it/s]
100%|██████████| 320/320 [00:23<00:00, 13.66it/s]
100%|██████████| 1280/1280 [04:00<00:00,  5.33it/s]
100%|██████████| 1280/1280 [04:00<00:00,  5.32it/s]


In [34]:
# from seaborn import heatmap
# from matplotlib import pyplot as plt

# Question-vs-question similarity for each model. get_embedding returns unit-normalised
# rows, so each entry is a cosine similarity, and entry (i, i) compares question i
# with itself.
# NOTE(review): with test_size = 40960 each matrix has 40960 x 40960 entries, roughly
# 6.7 GB if the embeddings are float32 (confirm the device has room for both).
base_similarity_matrix = torch.matmul(
    base_questions_embedding, base_questions_embedding.T
)
our_similarity_matrix = torch.matmul(our_questions_embedding, our_questions_embedding.T)

# fig, ax = plt.subplots(1, 2, figsize=(20, 10))
# heatmap(base_similarity_matrix.cpu().numpy(), ax=ax[0], vmin=0, vmax=1)
# ax[0].set_title("Base Model")
# heatmap(our_similarity_matrix.cpu().numpy(), ax=ax[1], vmin=0, vmax=1)
# ax[1].set_title("Our Model")

In [35]:
import numpy as np


def mean_reciprocal_rank(similarity_matrix):
    """Mean reciprocal rank of the diagonal entry within each row.

    For row ``i`` the columns are ordered by descending similarity and the
    1-based position of column ``i`` in that order is the rank; the function
    returns the average of ``1 / rank`` over all rows.

    Args:
        similarity_matrix: A tensor of scores; it is moved to the CPU and
            converted to NumPy before ranking.
    """
    scores = similarity_matrix.cpu().numpy()
    num_rows = scores.shape[0]

    def _reciprocal_rank(row_idx):
        # Negating the row makes an ascending argsort give a descending order.
        order = np.argsort(-scores[row_idx])
        position = np.flatnonzero(order == row_idx)[0]
        return 1 / (position + 1)

    return np.mean([_reciprocal_rank(row_idx) for row_idx in range(num_rows)])


# NOTE(review): both matrices scored here are question-vs-question similarities, so
# the diagonal entry being ranked is each question compared with itself. If the
# intent is question -> answer retrieval, a question/answer similarity matrix should
# be scored instead (confirm).
print("Base Model MRR:", mean_reciprocal_rank(base_similarity_matrix))
print("Our Model MRR:", mean_reciprocal_rank(our_similarity_matrix))

Base Model MRR: 0.18596852633068217
Our Model MRR: 0.18750335292910994


In [36]:
def get_uniform_loss(embeddings, t=2):
    """Uniformity of a set of embeddings under a Gaussian potential.

    Computes ``log(mean(exp(-t * ||x_i - x_j||^2)))`` over all unordered pairs
    of rows; lower values mean the rows are spread out more evenly.

    Args:
        embeddings: 2-D tensor with one embedding per row (``torch.pdist``
            requires a 2-D input).
        t: Kernel temperature. Defaults to 2, the value this notebook used
            before the constant became a parameter.

    Returns:
        The loss as a Python float. With fewer than two rows there are no
        pairs and the result is NaN.
    """
    # pdist yields one distance per unordered pair; square it for the kernel.
    squared_distances = torch.pdist(embeddings, p=2).pow(2)
    return torch.log(torch.exp(-t * squared_distances).mean()).item()


def get_alignment_loss(embeddings_1, embeddings_2):
    """Mean squared Euclidean distance between row-aligned embedding pairs.

    Row ``i`` of ``embeddings_1`` is compared with row ``i`` of ``embeddings_2``;
    the result is returned as a Python float.
    """
    gap = embeddings_1 - embeddings_2
    squared_gap = torch.norm(gap, p=2, dim=1) ** 2
    return squared_gap.mean().item()

# Uniformity is measured separately on the question and the answer embeddings;
# alignment is measured between row-aligned question/answer embeddings of one model.
base_uniform_loss_question = get_uniform_loss(base_questions_embedding)
base_uniform_loss_answer = get_uniform_loss(base_answer_embedding)
base_alignment_loss = get_alignment_loss(base_questions_embedding, base_answer_embedding)

# Same three measurements for the fine-tuned model.
our_uniform_loss_question = get_uniform_loss(our_questions_embedding)
our_uniform_loss_answer = get_uniform_loss(our_answer_embedding)
our_alignment_loss = get_alignment_loss(our_questions_embedding, our_answer_embedding)

In [37]:
# Sanity check: (number of embedded questions, embedding width).
our_questions_embedding.shape

torch.Size([40960, 768])

In [38]:
# Same shape check as the previous cell (duplicate).
our_questions_embedding.shape

torch.Size([40960, 768])

In [39]:
# Question-vs-answer similarity for our model: entry (i, j) scores question i against answer j.
# NOTE(review): none of the cells visible here read `similarity_matrix`; the ranking
# cell uses the question-vs-question matrices instead (confirm which was intended).
similarity_matrix = torch.matmul(our_questions_embedding, our_answer_embedding.T)

In [40]:
def get_ranking(similarity_matrix, chunk_size=4096):
    """Return, for each row ``i``, the 0-based rank of column ``i`` by descending score.

    The rank is the number of entries in the row that are strictly greater than
    the diagonal entry. This equals the position of column ``i`` in a descending
    sort, but avoids sorting: a full ``argsort`` would allocate an integer index
    matrix as large as the input, which is prohibitive for tens of thousands of
    rows. Entries tied with the diagonal do not push it down, so the result is
    deterministic, whereas a non-stable sort gives no guarantee about the order
    of equal elements.

    Args:
        similarity_matrix: 2-D tensor of scores; entry ``(i, i)`` is the target
            of row ``i``.
        chunk_size: Number of rows compared at once; bounds the size of the
            temporary boolean mask. Must be positive.

    Returns:
        A 1-D ``torch.long`` tensor on the input's device with one rank per
        diagonal entry (``min(rows, cols)`` values).
    """
    diagonal = similarity_matrix.diagonal()
    num_ranked = diagonal.size(0)
    positions = torch.empty(
        num_ranked, dtype=torch.long, device=similarity_matrix.device
    )

    for start in range(0, num_ranked, chunk_size):
        stop = min(start + chunk_size, num_ranked)
        rows = similarity_matrix[start:stop]
        # Column vector, so each row is compared with its own diagonal score.
        targets = diagonal[start:stop].unsqueeze(1)
        positions[start:stop] = (rows > targets).sum(dim=1)

    return positions

def plot_top_k(ranking, name, max_k=200):
    """Plot the fraction of rows whose target is ranked within the top ``k``.

    Args:
        ranking: Tensor of 0-based ranks, one per row (as produced by
            ``get_ranking``).
        name: Legend label for the plotted curve.
        max_k: Number of cut-offs to plot; ``k`` runs over ``range(max_k)``.
            Defaults to 200.

    The curve is drawn on the current matplotlib axes; nothing is returned.
    """
    # Imported locally: the only matplotlib import in this notebook is commented
    # out, so a module-level `plt` does not exist and the call raised NameError.
    from matplotlib import pyplot as plt

    ks = range(max_k)
    # A 0-based rank below k means the target is among the k best-scoring columns.
    hit_rates = [(ranking < k).float().mean().item() for k in ks]

    plt.plot(ks, hit_rates, label=name)

# NOTE(review): these calls rank the question-vs-question matrices, whose diagonal is
# each question compared with itself; the question-vs-answer `similarity_matrix`
# computed in the previous cell is not used here (confirm which was intended).
# The recorded run of this cell ended in a KeyboardInterrupt.
ranking_base = get_ranking(base_similarity_matrix)
ranking_our = get_ranking(our_similarity_matrix)



KeyboardInterrupt: 

In [None]:
# ranking_base


tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
        0, 0, 0, 1], device='cuda:0')

In [None]:
# Report the uniformity and alignment values computed earlier, baseline first.
print("Base Model Uniform Loss (Question):", base_uniform_loss_question)
print("Base Model Uniform Loss (Answer):", base_uniform_loss_answer)
print("Base Model Alignment Loss:", base_alignment_loss)
print("Our Model Uniform Loss (Question):", our_uniform_loss_question)
print("Our Model Uniform Loss (Answer):", our_uniform_loss_answer)
print("Our Model Alignment Loss:", our_alignment_loss)

Base Model Uniform Loss (Question): -3.6299386024475098
Base Model Uniform Loss (Answer): -3.2607717514038086
Base Model Alignment Loss: 1.210917353630066
Our Model Uniform Loss (Question): -3.420525550842285
Our Model Uniform Loss (Answer): -3.3810951709747314
Our Model Alignment Loss: 1.1982558965682983


: 

In [None]:
# dataset = load_dataset("wikipedia", "20220301.en", split="train", trust_remote_code=True, streaming=True)
# dataset_len = 6_458_670

In [None]:
# dataset

In [None]:
def compute_loss(
    model,
    inputs,
    temperature=1,
):
    """Contrastive loss between two forward passes over the same batch.

    The model is switched to training mode and run twice on ``inputs``. Both
    sets of ``pooler_output`` vectors are L2-normalised, their pairwise dot
    products are divided by ``temperature``, and the loss is the mean negative
    log-softmax (taken over each row) of the diagonal entries.

    Args:
        model: Callable as ``model(**inputs, use_cache=False)`` and returning an
            object with a ``pooler_output`` attribute.
        inputs: Mapping of keyword arguments for the model.
        temperature: Divisor applied to the similarity matrix.

    Returns:
        A scalar tensor that keeps its autograd graph.
    """
    model.train()

    def _encode_once():
        pooled = model(**inputs, use_cache=False).pooler_output
        return F.normalize(pooled, p=2, dim=1)

    first_pass = _encode_once()
    second_pass = _encode_once()

    # Row i holds the scores of sample i's first pass against every second pass.
    logits = (first_pass @ second_pass.T) / temperature
    row_log_probs = F.log_softmax(logits, dim=1)

    # The matching pair sits on the diagonal.
    return -row_log_probs.diagonal().mean()

# Tiny hand-written batch to sanity-check compute_loss on the fine-tuned encoder.
# Note: compute_loss calls model.train(), so our_model is left in training mode
# after this cell runs.
input_text = [
    "how are you doing",
    "cat vomits",
    "dog poop"
    ]

tokens = tokenizer(input_text, padding=True, truncation=True, return_tensors="pt", max_length=512).to("cuda")
compute_loss(our_model, tokens)

tensor(0.7606, device='cuda:0', grad_fn=<NegBackward0>)

In [None]:
def compute_loss2(
    model,
    inputs,
    temperature=0.05,
    verbose=True,
):
    """Cross-entropy form of the two-pass contrastive loss.

    The model is switched to training mode and run twice on ``inputs``. Both
    sets of ``pooler_output`` vectors are L2-normalised, their pairwise dot
    products are divided by ``temperature``, and cross-entropy is taken with
    the diagonal (sample ``i`` matches sample ``i``) as the target class.

    Args:
        model: Callable as ``model(**inputs, use_cache=False)`` and returning an
            object with a ``pooler_output`` attribute.
        inputs: Mapping of keyword arguments for the model.
        temperature: Divisor applied to the similarity matrix.
        verbose: If true (the default, matching the earlier behaviour), print
            the scaled similarity matrix, the labels and the loss.

    Returns:
        The scalar loss tensor. Earlier versions computed it but returned
        ``None``, so it could not be used for optimisation.
    """
    model.train()

    output1 = model(**inputs, use_cache=False).pooler_output
    output2 = model(**inputs, use_cache=False).pooler_output

    output1 = F.normalize(output1, p=2, dim=1)
    output2 = F.normalize(output2, p=2, dim=1)

    cos_sim = (output1 @ output2.T) / temperature
    # Row i's positive example is column i.
    labels = torch.arange(cos_sim.size(0), device=cos_sim.device)
    loss = F.cross_entropy(cos_sim, labels)

    if verbose:
        print(cos_sim)
        print(labels)
        print(loss)

    return loss


# Same three-sentence batch, this time run through compute_loss2, which prints the
# temperature-scaled similarity matrix, the target labels and the loss.
# Note: compute_loss2 calls model.train(), so our_model is left in training mode.
input_text = ["how are you doing", "cat vomits", "dog poop"]

tokens = tokenizer(
    input_text, padding=True, truncation=True, return_tensors="pt", max_length=512
).to("cuda")

compute_loss2(our_model, tokens)

tensor([[17.2711,  4.0674,  6.2838],
        [ 2.5451, 17.0781,  5.4657],
        [ 7.0869,  6.2003, 16.0998]], device='cuda:0', grad_fn=<DivBackward0>)
tensor([0, 1, 2], device='cuda:0')
tensor(6.6752e-05, device='cuda:0', grad_fn=<NllLossBackward0>)


In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Re-create the tokenizer from the same checkpoint id used near the top of the notebook.
tokenizer = AutoTokenizer.from_pretrained("princeton-nlp/unsup-simcse-roberta-base")

# NOTE: this rebinds `dataset`, which earlier cells used for the Quora
# question/answer data; cells above must be re-run to get that dataset back.
dataset = load_dataset(
    "abokbot/wikipedia-first-paragraph",
    split="train",
    trust_remote_code=True,
)


def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch with the notebook-level tokenizer.

    Every example is truncated and padded to exactly 32 tokens.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 32,
    }
    return tokenizer(examples["text"], **tokenizer_options)


# Apply tokenize_function to the whole dataset in batches.
# The recorded run of this cell ended in a KeyboardInterrupt.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

KeyboardInterrupt: 

In [None]:
# make sure the input data makes sense
# Decode one tokenized example (row 138) back to text and print it for inspection.
# NOTE(review): relies on `tokenized_datasets`, whose producing cell shows a
# KeyboardInterrupt in its recorded output; confirm it completed before running this.
ids = tokenized_datasets[138]['input_ids']
string = tokenizer.decode(ids)

print(string)