In [1]:
from torch.utils.data import Dataset, DataLoader
import numpy as np
import torch
import random
from sentence_transformers import util
import helper_functions as hp
from torch.optim.lr_scheduler import StepLR
#from imports import Dataset,torch,np,random,DataLoader,util
# DATASET CLASSES PYTORCH 
# import helper_functions as hp

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import random
class CustomDataset(Dataset):
    def __init__(self,dataset,openai_vectors_dict, edit_vectors_dict, neighbourhood_vectors_dict, paraphrase_vectors_dict,device):
        self.dataset=np.array(dataset,dtype=object)
        self.openai_vectors_dict=openai_vectors_dict
        self.edit_vectors_dict=edit_vectors_dict
        self.neighbourhood_vectors_dict=neighbourhood_vectors_dict
        self.paraphrase_vectors_dict=paraphrase_vectors_dict
        self.device = device


    def __len__(self):
        return len(self.dataset)

    def total_indexes(self):
        # print(self.dataset[0][2:])
        return np.unique(self.dataset[:, 3])

    def get_row_indexes(self,target_sample_index):
        return np.where(self.dataset[:, 3] == target_sample_index)[0]

    def get_samples_at_data_index(self,target_sample_index):
        row_indexes = np.where(self.dataset[:, 3] == target_sample_index)[0]
        emb1=[]
        emb2=[]
        label=[]
        row_index=[]
        sent1=[]
        sent2=[]
        for index in row_indexes:
        
          emb1.append(hp.to_tensor(self.dataset[index][0][0]))
          emb2.append(hp.to_tensor(self.dataset[index][1][0]))
          label.append(hp.to_tensor(self.dataset[index][2]))
          print(f"label: {label[index]}")
          row_index.append(hp.to_tensor(self.dataset[index][3]))
          sent1.append(self.dataset[index][4])
          sent2.append(self.dataset[index][5])
        return emb1.to(self.device), emb2.to(self.device), label.to(self.device),row_index, sent1, sent2

    def __getitem__(self, index):
        data_row=self.dataset[index]
        # print("aloha",data_row,data_row[0])
        if(data_row[-2]==0):#open ai paraphrase
            emb1 = hp.to_tensor(self.edit_vectors_dict[data_row[0]]).to(self.device)#, dtype=torch.float)
            emb2 = hp.to_tensor(self.openai_vectors_dict[data_row[0]][data_row[1]]).to(self.device)#, dtype=torch.float)
            label = hp.to_tensor(self.dataset[index][2]).to(self.device)#, dtype=torch.long)
            sample_index=self.dataset[index][3]
            sent1=self.dataset[index][4]
            sent2=self.dataset[index][5]
            pair_type=self.dataset[index][6]
            negative_sample_cntrl=self.dataset[index][7]
            emb1_index=data_row[0]
            emb2_index=data_row[1]

        elif(data_row[-2]==1):#paraphrase
            emb1 = hp.to_tensor(self.edit_vectors_dict[data_row[0]]).to(self.device)#, dtype=torch.float)
            emb2 = hp.to_tensor(self.paraphrase_vectors_dict[data_row[0]]).to(self.device)#, dtype=torch.float)
            label = hp.to_tensor(self.dataset[index][2]).to(self.device)#, dtype=torch.long)
            sample_index=self.dataset[index][3]
            sent1=self.dataset[index][4]
            sent2=self.dataset[index][5]
            pair_type=self.dataset[index][6]#neighbour,openai,paraphrase
            negative_sample_cntrl=self.dataset[index][7]
            emb1_index=data_row[0]#both should be the same
            emb2_index=data_row[1]#both should be the same

        else:#neighbour
            emb1 = hp.to_tensor(self.edit_vectors_dict[data_row[1]]).to(self.device)#, dtype=torch.float)
            emb2 = hp.to_tensor(self.neighbourhood_vectors_dict[data_row[1]][data_row[0]]).to(self.device)#, dtype=torch.float)
            #print(f"emb1.shape: {emb1.shape}, emb2.shape: {emb2.shape}")
            label = hp.to_tensor(self.dataset[index][2]).to(self.device)#, dtype=torch.long)
            sample_index=self.dataset[index][3]
            sent1=self.dataset[index][4]
            sent2=self.dataset[index][5]
            pair_type=self.dataset[index][6]
            negative_sample_cntrl=self.dataset[index][7]
            emb1_index=data_row[0]
            emb2_index=data_row[1]

        return emb1, emb2, label, sample_index, sent1, sent2, pair_type, emb1_index, emb2_index, negative_sample_cntrl


In [3]:
def get_data_loader(dataset_paired,openai_vectors_dict, edit_vectors_dict, neighbourhood_train_vectors_dict, paraphrase_train_vectors_dict,batch_size=8192,shuffle=True,device="cpu"):
  """
    dataset: dataset to be used
    shuffle: dataset shuffle per iteration

  """

  dataset_pt=CustomDataset(dataset_paired,openai_vectors_dict, edit_vectors_dict, neighbourhood_train_vectors_dict, paraphrase_train_vectors_dict,device=device)
  data_loader = DataLoader(dataset_pt, batch_size=batch_size, shuffle=shuffle)
  return data_loader

In [4]:
def create_dataset_pairs(dataset,neightbour_control=0,label_reversal=False):
    """
    The dataloader is designed to create pairwise samples. 
    Since an edit can have multyple paraphrases storing edit vector multiple times is redundant thus
    all vectors are store in dictionary based on the sample index(order that sample appear in the dataset.

    dataset_paired_train/dataset_paired_test=
    [index1,index2,label,row_index,sentence1,sentence2,pair_type,control]
        index1 and index2:
            for edits index1 is always used and index1 == row_index since it is single value per sample
            for paraphrases, open_ai_paraphrases index1,index2 is used, this is not row_index, but index of the
                paraphrase in the list of paraphrases in a sample
            for neighbours index2 is used, this is not row_index, but index of the
                neighbour in the list of neighbours in a sample
            for open_ai_paraphrases index2 is used this is not row_index, but index of the
                open_ai_paraphrase in the list of open_ai_paraphrases in a sample
                
    Inputs:
    dataset: path to dataset
    neightbour_control=0# ignore
    label_reversal=False#ignore
    """
    paraphrase=1
    neightbour=0



    openai_vectors_dict={}
    edit_vectors_dict={}
    neighbourhood_train_vectors_dict={}
    neighbourhood_test_vectors_dict={}
    paraphrase_train_vectors_dict={}
    paraphrase_test_vectors_dict={}

    dataset_paired_train=[]
    dataset_paired_test=[]
    for row_index,row in enumerate(dataset):
    #     # print(row["vector_edited_prompt"][:5],row["edited_prompt"][0])
    #     # print("\n\n")
    #     # print(row["vector_edited_prompt"][0]
        
        edit_vectors_dict[row_index]=row["vector_edited_prompt"][0]
        paraphrase_train_vectors_dict[row_index]=row["vector_edited_prompt_paraphrases_processed"][0]
        paraphrase_test_vectors_dict[row_index]=row["vector_edited_prompt_paraphrases_processed_testing"][0]

        num_elements_to_select = min(3, len(row["openai_usable_paraphrases_embeddings"]))#add 5 max open ai paraphrases
        # return  None, None, None, None, None, None, None, None

        sampled_indices, sampled_elements = zip(*random.sample(list(enumerate(row["openai_usable_paraphrases_embeddings"])), num_elements_to_select))# sample and get indexes
        for index,vector in zip(sampled_indices, sampled_elements):#create postive label with edit vector
            if(row_index not in openai_vectors_dict.keys()):
                openai_vectors_dict[row_index]={}
            openai_vectors_dict[row_index][index]=vector[0]
            # print(vector[:5],row["openai_usable_paraphrases"][index],"openai")
            dataset_paired_train.append([row_index,index,paraphrase,row_index,
                                    row["edited_prompt"][0],row["openai_usable_paraphrases"][index],0,0])


        dataset_paired_train.append([row_index,row_index,paraphrase,row_index,
                                    row["edited_prompt"][0],row["edited_prompt_paraphrases_processed"],1,1])
        # print(row["edited_prompt_paraphrases_processed"])
        dataset_paired_test.append([row_index,row_index,paraphrase,row_index,
                                    row["edited_prompt"][0],row["edited_prompt_paraphrases_processed_testing"],1,0])
        
        if(neightbour_control==0):
            for index,vector in enumerate(row["vectors_neighborhood_prompts_high_sim"]):
                if(row_index not in neighbourhood_train_vectors_dict.keys()):
                    neighbourhood_train_vectors_dict[row_index]={}
                neighbourhood_train_vectors_dict[row_index][index]=vector[0]
                dataset_paired_train.append([index,row_index,neightbour,row_index,
                                        row["edited_prompt"][0],row["neighborhood_prompts_high_sim"][index],2,1])

                # print(vector[:5],row["neighborhood_prompts_high_sim"][index],"high")
            for index,vector in enumerate(row["vectors_neighborhood_prompts_low_sim"]):
                if(row_index not in neighbourhood_test_vectors_dict.keys()):
                    neighbourhood_test_vectors_dict[row_index]={}
                neighbourhood_test_vectors_dict[row_index][index]=vector[0]
                dataset_paired_test.append([index,row_index,neightbour,row_index,
                                        row["edited_prompt"][0],row["neighborhood_prompts_low_sim"][index],2,0])
      
    return openai_vectors_dict, edit_vectors_dict, neighbourhood_train_vectors_dict, neighbourhood_test_vectors_dict, paraphrase_train_vectors_dict, paraphrase_test_vectors_dict, dataset_paired_train, dataset_paired_test


In [5]:
#testing the dataloaders
import json,linecache
def read_dataset_reduced(file_path_read_dataset: str,data_size):
    dataset=[]
    values_list = list(range(1, data_size+1))
    for index,number in enumerate(values_list):

        try:
            data_entry = json.loads(linecache.getline(file_path_read_dataset, number).strip())
            dataset.append(data_entry)
        except Exception as e:
            print(index)
            print(e)
    return dataset
file_path_dataset="counterfact_test_2_lama_merged.jsonl"
num_samples=4999
dataset=read_dataset_reduced(file_path_dataset,data_size=num_samples) 



In [6]:
openai_vectors_dict, edit_vectors_dict, neighbourhood_train_vectors_dict, neighbourhood_test_vectors_dict, paraphrase_train_vectors_dict, paraphrase_test_vectors_dict, dataset_paired_train, dataset_paired_test=create_dataset_pairs(dataset,neightbour_control=0)
input_dim = len( edit_vectors_dict[0])  
print(f"output vector length: {input_dim}")

output vector length: 4096


In [24]:
dataset[0].keys()

dict_keys(['orignal_prompt', 'edited_prompt', 'edited_prompt_paraphrases_processed', 'edited_prompt_paraphrases_unprocessed', 'edited_prompt_paraphrases_processed_testing', 'edited_prompt_paraphrases_unprocessed_testing', 'neighborhood_prompts_high_sim', 'neighborhood_prompts_low_sim', 'vector_edited_prompt', 'vector_edited_prompt_paraphrases_processed', 'vector_edited_prompt_paraphrases_processed_testing', 'vectors_neighborhood_prompts_high_sim', 'vectors_neighborhood_prompts_low_sim', 'openai_usable_paraphrases', 'openai_notused_paraphrases', 'openai_usable_paraphrases_embeddings'])

In [23]:
dataset_paired_train[0][1]

4

In [7]:
train_loader = get_data_loader(dataset_paired_train, openai_vectors_dict, edit_vectors_dict, neighbourhood_train_vectors_dict,paraphrase_train_vectors_dict, batch_size=1, shuffle=True)
test_loader = get_data_loader(dataset_paired_test, openai_vectors_dict, edit_vectors_dict, neighbourhood_test_vectors_dict,paraphrase_test_vectors_dict, batch_size=1, shuffle=True)


In [8]:
for batch in train_loader:
    for value in batch:
        print(value)
    break
    

tensor([[ 0.0027,  0.0107,  0.0083,  ...,  0.0053, -0.0154,  0.0047]])
tensor([[-0.0021,  0.0145, -0.0019,  ...,  0.0054, -0.0233,  0.0023]])
tensor([0.])
tensor([4164])
('GAM-87 Skybolt, created by',)
('AIR-2 Genie, created by',)
tensor([2])
tensor([0])
tensor([4164])
tensor([1])


In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ContrastiveNetwork(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(ContrastiveNetwork, self).__init__()
        self.feature = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU()
        )

    def forward(self, input1, input2):
        output1 = self.feature(input1)
        output2 = self.feature(input2) 
        return output1, output2


In [10]:
class ContrastiveLoss(nn.Module):
    def __init__(self, margin=1.0):
        super(ContrastiveLoss, self).__init__()
        self.margin = margin

    def forward(self, output1, output2, label):
        distance = F.pairwise_distance(output1, output2) 
        loss = label * distance.pow(2) + (1 - label) * F.relu(self.margin - distance).pow(2)
        return loss.mean()

In [11]:
train_loader = get_data_loader(dataset_paired_train, openai_vectors_dict, edit_vectors_dict, neighbourhood_train_vectors_dict,paraphrase_train_vectors_dict, batch_size=16, shuffle=True)
test_loader = get_data_loader(dataset_paired_test, openai_vectors_dict, edit_vectors_dict, neighbourhood_test_vectors_dict,paraphrase_test_vectors_dict, batch_size=16, shuffle=True)



In [12]:
import torch.optim as optim
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

input_dim = 4096
hidden_dim = 1900
model = ContrastiveNetwork(input_dim, hidden_dim).to(device)


criterion = ContrastiveLoss(margin=1.5)
num_epochs = 20
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=1e-3, steps_per_epoch=len(train_loader), epochs=num_epochs)


for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    
    for emb1, emb2, label, _, _, _, _, _, _, _ in train_loader:
        emb1, emb2, label = emb1.to(device), emb2.to(device), label.to(device)
        
        optimizer.zero_grad()
        output1, output2 = model(emb1, emb2)
        loss = criterion(output1, output2, label)
        
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    scheduler.step()
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss:.4f}")

torch.save(model, 'Contrastive_model.pth')


Epoch [1/20], Loss: 1017.5143
Epoch [2/20], Loss: 514.2677
Epoch [3/20], Loss: 374.3015
Epoch [4/20], Loss: 303.1574
Epoch [5/20], Loss: 260.9980
Epoch [6/20], Loss: 232.8871
Epoch [7/20], Loss: 212.6188
Epoch [8/20], Loss: 197.5293
Epoch [9/20], Loss: 185.3619
Epoch [10/20], Loss: 175.6341
Epoch [11/20], Loss: 167.7147
Epoch [12/20], Loss: 160.9575
Epoch [13/20], Loss: 155.2666
Epoch [14/20], Loss: 150.3882
Epoch [15/20], Loss: 146.1731
Epoch [16/20], Loss: 142.4765
Epoch [17/20], Loss: 139.2749
Epoch [18/20], Loss: 136.3055
Epoch [19/20], Loss: 133.7639
Epoch [20/20], Loss: 131.4333


In [13]:
import torch
import json
from collections import defaultdict

threshold_map = defaultdict(list)

def compute_threshold(model, edit_vector, paraphrase_vector):
    
    emb_edit, emb_para = model(edit_vector, paraphrase_vector)
    dist = torch.dist(emb_edit, emb_para).item()
    return dist



threshold_map = defaultdict(list)

model.eval()
with torch.no_grad():
    for emb_edit, emb_para, _, _, _, _, _, _, row_index, _ in train_loader:
        emb_edit = emb_edit.to(device)
        emb_para = emb_para.to(device)


        for i in range(len(row_index)):
            idx = int(row_index[i])
            threshold = compute_threshold(model, emb_edit[i], emb_para[i])


            threshold_map[idx].append(threshold)


final_threshold_map = {str(k): sum(v) / len(v) for k, v in threshold_map.items()}

with open("Contrastive_threshold_map.json", "w") as f:
    json.dump(final_threshold_map, f, indent=4)

print(f"Threshold saved for {len(final_threshold_map)} edit vectors")


Threshold saved for 4999 edit vectors


In [14]:

def predict_label(model, emb1, emb2, label, threshold, generalization, locality, positive_total, negative_total):
    emb_1, emb_2 = model(emb1, emb2)

    distance = torch.dist(emb_1, emb_2).item()

    if label == 1:
        positive_total += 1
        if distance < threshold:
            generalization += 1
            return 1, distance, generalization, locality, positive_total, negative_total
        else:
            return 0, distance, generalization, locality, positive_total, negative_total
        
    if label == 0:
        negative_total += 1
        if distance > threshold:
            locality += 1
            return 0, distance, generalization, locality, positive_total, negative_total
        else:
            return 1, distance, generalization, locality, positive_total, negative_total

with open("Contrastive_threshold_map.json", "r") as f:
    threshold_map = json.load(f)

positive_total = 0
negative_total = 0
correct = 0
generalization_number = 0
locality_number = 0
incorrect_predictions = []

model.eval()
with torch.no_grad():
    for emb1, emb2, label, _, sent1, sent2, _, _, row_index, _ in test_loader:
        emb1 = emb1.to(device)
        emb2 = emb2.to(device)
        row_index = row_index.cpu().numpy()

        for i in range(len(row_index)):
            threshold = threshold_map.get(str(row_index[i]), 1.0)
            single_label = label[i].item()

            predicted_label, distance, generalization_number, locality_number, positive_total, negative_total = predict_label(model, emb1[i], emb2[i], single_label, threshold, generalization_number, locality_number, positive_total, negative_total)

            if predicted_label == label[i]:
                correct += 1
            else:
                incorrect_predictions.append({
                    "row_index": row_index[i],
                    "true_label": int(label[i]),
                    "predicted_label": predicted_label,
                    "edit_sentence": sent1[i], 
                    "paraphrase_sentence": sent2[i],  
                    "similarity": distance,    
                    "threshold": threshold     
                        
                })
print(f"gen_number: {generalization_number} and pos_numebr: {positive_total}")
print(f"loc_number: {locality_number} and loc_numebr: {negative_total}")

generalization = generalization_number / positive_total
locality = locality_number / negative_total
print(f"Generalization: {generalization:.4f}")
print(f"Locality: {locality:.4f}")


with open("Contrastive_incorrect_predictions.json", "w") as f:
    json.dump(incorrect_predictions, f, indent=4, default=str)


print(f"Incorrect predictions saved to 'incorrect_predictions.json'")



gen_number: 4973 and pos_numebr: 4999
loc_number: 19283 and loc_numebr: 24995
Generalization: 0.9948
Locality: 0.7715
Incorrect predictions saved to 'incorrect_predictions.json'
