# Imports

In [20]:
import pickle
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

### Configurations

In [2]:
# Sentence-embedding model (tokenizer + encoder) passed to every embedding cell below
embedding_model_name = 'sentence-transformers/all-mpnet-base-v2'
model = SentenceTransformer(embedding_model_name)

# Datasets to use

### IMDB Dataset

In [14]:
# Hub identifier of the IMDB dataset; running this cell rebinds `dataset`.
dataset_name = "stanfordnlp/imdb"
dataset = load_dataset(path=dataset_name)

### GLUE Dataset

In [9]:
# Hub identifier of the GLUE benchmark
dataset_name = "glue"

# GLUE configuration to load. Possible values:
# "ax", "cola", "mnli", "mnli_matched",  "mnli_mismatched", "mrpc", "qnli", "qqp", "rte", "sst2", "stsb", "wnli"
dataset_arg = "sst2"

# Running this cell rebinds `dataset` to the selected GLUE configuration.
dataset = load_dataset(path=dataset_name, name=dataset_arg)

# Load Dataset

In [17]:
# Pull the individual splits out of whichever `dataset` was loaded last.
train_dataset = dataset["train"]
# Not every dataset has a "validation" split, so look it up conditionally
# instead of commenting the line in and out by hand. Resetting to None when the
# split is missing also prevents a stale `val_dataset` from a previously loaded
# dataset being embedded by the validation cells further down.
val_dataset = dataset["validation"] if "validation" in dataset else None
test_dataset = dataset["test"]

# Functions

In [5]:
def get_doc_embedding(document, model):
    """Encode ``document`` with ``model`` and return the result of ``model.encode``.

    Args:
        document: The input handed unchanged to ``model.encode``.
        model: Any object exposing an ``encode`` method.

    Returns:
        Whatever ``model.encode(document)`` returns.
    """
    embedding = model.encode(document)
    return embedding

# Save Embeddings

### IMDB

In [18]:
# Embed the 'text' field of every row in `test_dataset`, one row at a time.
# Expects `test_dataset` to currently hold the IMDB test split.
test_doc_emb = [
    get_doc_embedding(row['text'], model)
    for row in tqdm(test_dataset)
]

# Persist the list of embeddings as a single pickle file.
with open('./../data/embeddings/imdb-test-768.pkl', 'wb') as out_file:
    pickle.dump(test_doc_emb, out_file)

100%|██████████████████████████████████████████████████████████████████████| 25000/25000 [04:29<00:00, 92.83it/s]


In [19]:
# Embed the 'text' field of every row in `train_dataset`, one row at a time.
# Expects `train_dataset` to currently hold the IMDB train split.
train_doc_emb = [
    get_doc_embedding(row['text'], model)
    for row in tqdm(train_dataset)
]

# Persist the list of embeddings as a single pickle file.
with open('./../data/embeddings/imdb-train-768.pkl', 'wb') as out_file:
    pickle.dump(train_doc_emb, out_file)

100%|██████████████████████████████████████████████████████████████████████| 25000/25000 [04:33<00:00, 91.38it/s]


### GLUE SST2

In [11]:
# Embed the 'sentence' field of every row in `test_dataset`, one row at a time.
# Expects `test_dataset` to currently hold the GLUE SST2 test split.
test_doc_emb = [
    get_doc_embedding(row['sentence'], model)
    for row in tqdm(test_dataset)
]

# Persist the list of embeddings as a single pickle file.
with open('./../data/embeddings/sst2-test-768.pkl', 'wb') as out_file:
    pickle.dump(test_doc_emb, out_file)

100%|███████████████████████████████████████████████████████████████████████| 1821/1821 [00:16<00:00, 112.74it/s]


In [13]:
# Embed the 'sentence' field of every row in `train_dataset`, one row at a time.
# Expects `train_dataset` to currently hold the GLUE SST2 train split.
train_doc_emb = [
    get_doc_embedding(row['sentence'], model)
    for row in tqdm(train_dataset)
]

# Persist the list of embeddings as a single pickle file.
with open('./../data/embeddings/sst2-train-768.pkl', 'wb') as out_file:
    pickle.dump(train_doc_emb, out_file)

100%|█████████████████████████████████████████████████████████████████████| 67349/67349 [09:37<00:00, 116.65it/s]


In [12]:
# Embed the 'sentence' field of every row in `val_dataset`, one row at a time.
# Expects `val_dataset` to currently hold the GLUE SST2 validation split.
val_doc_emb = [
    get_doc_embedding(row['sentence'], model)
    for row in tqdm(val_dataset)
]

# Persist the list of embeddings as a single pickle file.
with open('./../data/embeddings/sst2-val-768.pkl', 'wb') as out_file:
    pickle.dump(val_doc_emb, out_file)

100%|█████████████████████████████████████████████████████████████████████████| 872/872 [00:07<00:00, 114.95it/s]


### GLUE COLA

In [6]:
# Embed the 'sentence' field of every row in `test_dataset`, one row at a time.
# Expects `test_dataset` to currently hold the GLUE CoLA test split.
test_doc_emb = [
    get_doc_embedding(row['sentence'], model)
    for row in tqdm(test_dataset)
]

# Persist the list of embeddings as a single pickle file.
with open('./../data/embeddings/cola-test-768.pkl', 'wb') as out_file:
    pickle.dump(test_doc_emb, out_file)

100%|███████████████████████████████████████████████████████████████████████| 1063/1063 [00:10<00:00, 101.85it/s]


In [7]:
# Embed the 'sentence' field of every row in `train_dataset`, one row at a time.
# Expects `train_dataset` to currently hold the GLUE CoLA train split.
train_doc_emb = [
    get_doc_embedding(row['sentence'], model)
    for row in tqdm(train_dataset)
]

# Persist the list of embeddings as a single pickle file.
with open('./../data/embeddings/cola-train-768.pkl', 'wb') as out_file:
    pickle.dump(train_doc_emb, out_file)

100%|███████████████████████████████████████████████████████████████████████| 8551/8551 [01:14<00:00, 115.21it/s]


In [8]:
# Embed the 'sentence' field of every row in `val_dataset`, one row at a time.
# Expects `val_dataset` to currently hold the GLUE CoLA validation split.
val_doc_emb = [
    get_doc_embedding(row['sentence'], model)
    for row in tqdm(val_dataset)
]

# Persist the list of embeddings as a single pickle file.
with open('./../data/embeddings/cola-val-768.pkl', 'wb') as out_file:
    pickle.dump(val_doc_emb, out_file)

100%|███████████████████████████████████████████████████████████████████████| 1043/1043 [00:09<00:00, 114.56it/s]
