In [14]:
import json
import os
from collections import Counter, defaultdict

import numpy as np
import torch
from datasets import load_dataset
from sklearn.metrics.pairwise import euclidean_distances
from torch.utils.data import Dataset,DataLoader
from tqdm import tqdm, trange
from transformers import BertTokenizer, BertForSequenceClassification

In [2]:
# Load the "sst2" dataset through the `datasets` library; the cells below read its
# 'train', 'validation' and 'test' splits from the returned object.
dataset = load_dataset("sst2")

Found cached dataset sst2 (/Users/deathscope/.cache/huggingface/datasets/sst2/default/2.0.0/9896208a8d85db057ac50c72282bcb8fe755accc671a57dd8059d4e130961ed5)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
# Turn each split into a pandas DataFrame keyed by example id:
# the dataset's 'idx' column is renamed to 'id' and becomes the index.
train_df, validation_df, test_df = (
    dataset[split_name].to_pandas().rename(columns={'idx': 'id'}).set_index('id')
    for split_name in ('train', 'validation', 'test')
)

In [16]:
# Tunable settings shared by the cells of this notebook.
EPS = 1.0  # passed as `eps` to get_customized_mapping; scales the exponent of the sampling weights
TOP_K = 20  # passed as `top_k` to get_customized_mapping; number of nearest embedding neighbours kept per word
WORD_EMBEDDING_PATH = 'glove.42B.300d.txt'  # space-separated word-embedding text file read by get_customized_mapping
BATCH_SIZE = 64  # batch size of the train/dev/test DataLoaders
MODEL_TYPE = 'bert-base-uncased'  # checkpoint name handed to BertTokenizer.from_pretrained
MAX_LEN = 128  # `max_length` given to the tokenizer (sequences are padded/truncated to it)

In [5]:
def _load_word_embeddings(path):
    """Read a space-separated word-embedding text file into memory.

    Each line is expected to look like ``word v1 v2 ... vN``. If the very first
    line has exactly two fields it is treated as a ``count dim`` header and skipped.

    Returns:
        (embeddings, idx2word, word2idx): a 2-D numpy array with one row per word,
        a numpy array of the words in file order, and a dict mapping each word to
        its row number.
    """
    vectors = []
    idx2word = []
    word2idx = {}

    # Explicit UTF-8: without it decoding depends on the platform's locale encoding.
    with open(path, 'r', encoding='utf-8') as f:
        for line_no, row in enumerate(f):
            fields = row.rstrip().split(' ')
            if line_no == 0 and len(fields) == 2:
                # "count dim" header, not an embedding line.
                continue
            word2idx[fields[0]] = len(idx2word)
            idx2word.append(fields[0])
            vectors.append(list(map(float, fields[1:])))

    return np.asarray(vectors), np.asarray(idx2word), word2idx


def get_customized_mapping(eps,top_k):
    """Map words to a list of substitute candidates plus sampling probabilities.

    The words of the training sentences (global ``train_df.sentence``, split on
    whitespace) are visited from most to least frequent. For every visited word
    that has a vector in ``WORD_EMBEDDING_PATH`` and has not been assigned yet,
    the ``top_k`` vocabulary words closest to it (Euclidean distance over the
    whole embedding matrix) form a candidate list. Every word of that list that
    is still unassigned is mapped to this same candidate list and receives one
    probability per candidate::

        u_i = -(d_i - d_min) / (d_max - d_min)
        p_i = exp(eps * u_i / 2) / sum_j exp(eps * u_j / 2)

    where ``d_i`` is the distance between that word and candidate ``i``.
    If all distances are equal (for instance ``top_k == 1``) the probabilities
    are uniform instead of NaN.

    Side effects: both mappings are dumped as JSON to ``p_dict.txt`` and
    ``sim_word_dict.txt`` in the working directory. A failing write (IOError)
    is reported with ``print`` and does not abort the function.

    Args:
        eps: multiplier applied to the normalised scores before exponentiation.
        top_k: number of nearest neighbours kept in each candidate list.

    Returns:
        (sim_word_dict, p_dict): two ``defaultdict(list)`` objects keyed by word;
        ``sim_word_dict[w]`` is the candidate list and ``p_dict[w]`` the
        probabilities aligned with it.
    """
    corpus = " ".join(train_df.sentence)
    word_freq = [word for word, _ in Counter(corpus.split()).most_common()]

    embeddings, idx2word, word2idx = _load_word_embeddings(WORD_EMBEDDING_PATH)

    assigned = set()  # words that already own a candidate list
    sim_word_dict = defaultdict(list)
    p_dict = defaultdict(list)
    for word in tqdm(word_freq):
        if word not in word2idx or word in assigned:
            continue

        # Indices of the top_k vocabulary words nearest to `word`.
        index_list = euclidean_distances(embeddings[word2idx[word]].reshape(1,-1), embeddings)[0].argsort()[:top_k]
        word_list = [idx2word[j] for j in index_list]
        embedding_list = embeddings[index_list]

        for candidate in word_list:
            if candidate in assigned:
                continue
            assigned.add(candidate)

            # Distances are measured from `candidate` (not from `word`) to the shared list.
            sim_dist_list = euclidean_distances(embeddings[word2idx[candidate]].reshape(1,-1), embedding_list)[0]
            min_dist = min(sim_dist_list)
            dist_range = max(sim_dist_list) - min_dist
            if dist_range > 0:
                scores = [-(d - min_dist) / dist_range for d in sim_dist_list]
            else:
                # Degenerate case: every candidate is equally far, so treat them alike.
                scores = [0.0] * len(sim_dist_list)
            weights = [np.exp(eps * s / 2) for s in scores]
            norm = sum(weights)
            p_dict[candidate] = [w / norm for w in weights]
            sim_word_dict[candidate] = word_list

    # ensure_ascii=False emits raw non-ASCII words, so the files must be opened as UTF-8.
    for out_path, mapping in (("p_dict.txt", p_dict), ("sim_word_dict.txt", sim_word_dict)):
        try:
            with open(out_path, 'w', encoding='utf-8') as json_file:
                json_file.write(json.dumps(mapping, ensure_ascii=False, indent=4))
        except IOError:
            print(f"Error writing {out_path}")

    return sim_word_dict, p_dict

In [6]:
# Build the word -> candidate-substitutes mapping and the matching sampling probabilities.
# Expensive: the recorded run below took about 4.5 hours for 14,816 corpus words.
# The call also writes both mappings to p_dict.txt and sim_word_dict.txt.
sim_word_dict, p_dict = get_customized_mapping(eps = EPS, top_k = TOP_K)

100%|███████████████████████████████████| 14816/14816 [4:31:30<00:00,  1.10s/it]


In [None]:
def generate_new_sents_s1(df,sim_word_dict,p_dict,save_stop_words,type="train"):
    """Rewrite every sentence of ``df`` word by word and save the result as a TSV file.

    Each sentence is split on whitespace. A word is kept when it is an English
    stop word and ``save_stop_words`` is true, or when it has no entry in
    ``sim_word_dict``; a kept word for which ``is_number`` is true is replaced by
    its rounded value plus a random integer in [0, 1000). Every other word is
    replaced by one candidate drawn from ``sim_word_dict[word]`` with the
    probabilities ``p_dict[word]``.

    NOTE(review): this function relies on ``nltk``, ``stopwords``, ``is_number``
    and ``args`` being in scope; none of them is defined or imported in the
    visible notebook cells, so they must be provided before calling it.

    Args:
        df: DataFrame with a ``sentence`` column. It is modified in place.
        sim_word_dict: mapping word -> list of candidate replacements.
        p_dict: mapping word -> probabilities aligned with ``sim_word_dict[word]``.
        save_stop_words: when true, stop words are never replaced.
        type: ``"train"`` writes ``train.tsv``; any other value writes ``test.tsv``.

    Returns:
        The same ``df`` object, with its ``sentence`` column replaced.
    """
    nltk.download('stopwords')
    nltk.download('punkt')
    stop_words = set(stopwords.words('english'))

    # Read sentences by position so the loop does not depend on the index labels of df.
    sentences = df.sentence.tolist()
    new_dataset = []

    for sentence in tqdm(sentences):
        new_record = []
        for word in sentence.split():
            keep_word = (save_stop_words and word in stop_words) or (word not in sim_word_dict)
            if keep_word:
                if is_number(word):
                    try:
                        word = str(round(float(word))+np.random.randint(1000))
                    except (ValueError, OverflowError):
                        # Not convertible to a finite number: leave the token unchanged.
                        pass
                new_record.append(word)
            else:
                new_record.append(np.random.choice(sim_word_dict[word],1,p=p_dict[word])[0])
        new_dataset.append(" ".join(new_record))

    df["sentence"] = new_dataset

    out_dir = f"./privatized_dataset/{args.embedding_type}/{args.mapping_strategy}/eps_{args.eps}_top_{args.top_k}_{args.privatization_strategy}_save_stop_words_{args.save_stop_words}"
    # The target is several levels deep; makedirs also creates the missing parents.
    os.makedirs(out_dir, exist_ok=True)
    file_name = "train.tsv" if type == "train" else "test.tsv"
    df.to_csv(f"{out_dir}/{file_name}", sep="\t", index=False)

    return df

In [17]:
class Bert_dataset(Dataset):
    """Torch dataset that tokenizes the ``sentence`` column of a DataFrame on the fly.

    Each item is the tuple ``(input_ids, attention_mask, token_type_ids, target)``
    where the first three come from the BERT tokenizer (padded/truncated to
    ``MAX_LEN``) and ``target`` is the row's ``label`` as a tensor.
    """

    def __init__(self,df):
        """Keep a reference to ``df`` and load the tokenizer named by ``MODEL_TYPE``."""
        self.df=df
        self.tokenizer = BertTokenizer.from_pretrained(f"{MODEL_TYPE}",do_lower_case=True)

    def __getitem__(self,index):
        """Return the encoded sample stored at *position* ``index`` (0 <= index < len)."""
        # Positional lookup: samplers hand out 0..len-1, which need not match the
        # labels of the DataFrame index.
        sentence = self.df['sentence'].iloc[index]

        encoded_dict = self.tokenizer(
            sentence,                       # sentence to encode
            add_special_tokens=True,        # Add '[CLS]' and '[SEP]'
            max_length=MAX_LEN,
            padding='max_length',           # replaces the deprecated pad_to_max_length=True
            truncation='longest_first',
            return_attention_mask=True,
            return_tensors='pt'
        )

        # return_tensors='pt' yields tensors with a leading batch axis of size 1; drop it.
        input_ids = encoded_dict['input_ids'][0]
        attention_mask = encoded_dict['attention_mask'][0]
        token_type_ids = encoded_dict['token_type_ids'][0]

        # Convert the target to a torch tensor
        target = torch.tensor(self.df['label'].iloc[index])

        return (input_ids,attention_mask,token_type_ids,target)

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df)

In [19]:
# Training batches are reshuffled on every pass over the loader.
train_dataset = Bert_dataset(train_df)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# Evaluation splits keep their row order so batch outputs can be matched back
# to the DataFrame rows and runs are repeatable.
dev_dataset = Bert_dataset(validation_df)
dev_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_dataset = Bert_dataset(test_df)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(f"train_data:{len(train_df)},dev_data:{len(validation_df)},test_data:{len(test_df)}")

train_data:67349,dev_data:872,test_data:1821
