# Imports

In [1]:
import pickle
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

### Configurations

In [2]:
# Sentence-embedding model shared by every embedding cell below; its
# `encode` method turns raw text into an embedding.
model = SentenceTransformer(model_name_or_path='sentence-transformers/all-mpnet-base-v2')

# Datasets to use

### IMDB Dataset

In [14]:
# Hub identifier of the IMDB dataset; `dataset` becomes the object the
# "Load Dataset" cell indexes by split name.
dataset_name = "stanfordnlp/imdb"
dataset = load_dataset(path=dataset_name)

### GLUE Dataset

In [9]:
# Hub identifier of the GLUE benchmark.
dataset_name = "glue"

# GLUE configuration (sub-task) to load. Possible values:
# "ax", "cola", "mnli", "mnli_matched",  "mnli_mismatched", "mrpc", "qnli", "qqp", "rte", "sst2", "stsb", "wnli"
dataset_arg = "sst2"

dataset = load_dataset(path=dataset_name, name=dataset_arg)

### SST2 stanfordnlp

In [3]:
# Hub identifier of the standalone SST2 dataset (as opposed to the GLUE
# configuration of the same name loaded in the GLUE cell).
dataset_name = "stanfordnlp/sst2"
dataset = load_dataset(path=dataset_name)

Downloading readme:   0%|          | 0.00/5.27k [00:00<?, ?B/s]

Downloading data: 100%|██████████████████████████████████████████████████████████████████████████| 3.11M/3.11M [00:03<00:00, 980kB/s]
Downloading data: 100%|█████████████████████████████████████████████████████████████████████████| 72.8k/72.8k [00:01<00:00, 39.3kB/s]
Downloading data: 100%|███████████████████████████████████████████████████████████████████████████| 148k/148k [00:01<00:00, 78.4kB/s]


Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

### Rotten Tomatoes

In [4]:
# Hub identifier of the Rotten Tomatoes dataset; replaces whatever
# `dataset` a previously run loader cell left behind.
dataset_name = "rotten_tomatoes"
dataset = load_dataset(path=dataset_name)

Downloading readme:   0%|          | 0.00/7.46k [00:00<?, ?B/s]

Downloading data: 100%|███████████████████████████████████████████████████████████████████████████| 699k/699k [00:00<00:00, 2.14MB/s]
Downloading data: 100%|██████████████████████████████████████████████████████████████████████████| 90.0k/90.0k [00:00<00:00, 367kB/s]
Downloading data: 100%|██████████████████████████████████████████████████████████████████████████| 92.2k/92.2k [00:00<00:00, 396kB/s]


Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

# Load Dataset

In [5]:
# Pull the individual splits out of whichever `dataset` was loaded last;
# the embedding cells below read these three names.
train_dataset = dataset["train"]
# Not every dataset ships a validation split (e.g. stanfordnlp/imdb provides
# train/test/unsupervised only), so fall back to None instead of raising a
# KeyError that would abort this cell before `test_dataset` is assigned.
val_dataset = dataset.get("validation")
test_dataset = dataset["test"]

In [8]:
# Display the test split's summary (feature names and row count).
test_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 1066
})

# Functions

In [7]:
def get_doc_embedding(document, model):
    """Return the embedding of ``document`` as produced by ``model``.

    Args:
        document: The input handed unchanged to ``model.encode`` (the cells
            in this notebook pass a single text string).
        model: Any object exposing an ``encode`` method; this notebook uses
            a ``SentenceTransformer`` instance.

    Returns:
        Whatever ``model.encode(document)`` returns.
    """
    embedding = model.encode(document)
    return embedding

# Save Embeddings

### IMDB

In [18]:
# Embed the 'text' field of every row in the test split, one document per
# encode call.
# NOTE: assumes the IMDB loader and the "Load Dataset" cell were the most
# recent ones run, so `test_dataset` holds IMDB rows.
test_doc_emb = [get_doc_embedding(row['text'], model) for row in tqdm(test_dataset)]

# Pickle the list of embeddings to disk.
with open('./../data/embeddings/imdb-test-768.pkl', 'wb') as out_file:
    pickle.dump(test_doc_emb, out_file)

100%|██████████████████████████████████████████████████████████████████████| 25000/25000 [04:29<00:00, 92.83it/s]


In [19]:
# Embed the 'text' field of every row in the train split, one document per
# encode call.
# NOTE: assumes the IMDB loader and the "Load Dataset" cell were the most
# recent ones run, so `train_dataset` holds IMDB rows.
train_doc_emb = [get_doc_embedding(row['text'], model) for row in tqdm(train_dataset)]

# Pickle the list of embeddings to disk.
with open('./../data/embeddings/imdb-train-768.pkl', 'wb') as out_file:
    pickle.dump(train_doc_emb, out_file)

100%|██████████████████████████████████████████████████████████████████████| 25000/25000 [04:33<00:00, 91.38it/s]


### GLUE SST2

In [11]:
# Embed the 'sentence' field of every row in the test split, one sentence
# per encode call.
# NOTE: assumes an SST2 loader cell and the "Load Dataset" cell were the most
# recent ones run, so `test_dataset` holds SST2 rows.
test_doc_emb = [get_doc_embedding(row['sentence'], model) for row in tqdm(test_dataset)]

# Pickle the list of embeddings to disk.
with open('./../data/embeddings/sst2-test-768.pkl', 'wb') as out_file:
    pickle.dump(test_doc_emb, out_file)

100%|███████████████████████████████████████████████████████████████████████| 1821/1821 [00:16<00:00, 112.74it/s]


In [13]:
# Embed the 'sentence' field of every row in the train split, one sentence
# per encode call.
# NOTE: assumes an SST2 loader cell and the "Load Dataset" cell were the most
# recent ones run, so `train_dataset` holds SST2 rows.
train_doc_emb = [get_doc_embedding(row['sentence'], model) for row in tqdm(train_dataset)]

# Pickle the list of embeddings to disk.
with open('./../data/embeddings/sst2-train-768.pkl', 'wb') as out_file:
    pickle.dump(train_doc_emb, out_file)

100%|█████████████████████████████████████████████████████████████████████| 67349/67349 [09:37<00:00, 116.65it/s]


In [12]:
# Embed the 'sentence' field of every row in the validation split, one
# sentence per encode call.
# NOTE: assumes an SST2 loader cell and the "Load Dataset" cell were the most
# recent ones run, so `val_dataset` holds SST2 rows.
val_doc_emb = [get_doc_embedding(row['sentence'], model) for row in tqdm(val_dataset)]

# Pickle the list of embeddings to disk.
with open('./../data/embeddings/sst2-val-768.pkl', 'wb') as out_file:
    pickle.dump(val_doc_emb, out_file)

100%|█████████████████████████████████████████████████████████████████████████| 872/872 [00:07<00:00, 114.95it/s]


### GLUE COLA

In [6]:
# Embed the 'sentence' field of every row in the test split, one sentence
# per encode call.
# NOTE: assumes the GLUE loader cell was run with dataset_arg = "cola" and
# the "Load Dataset" cell re-run afterwards, so `test_dataset` holds CoLA rows.
test_doc_emb = [get_doc_embedding(row['sentence'], model) for row in tqdm(test_dataset)]

# Pickle the list of embeddings to disk.
with open('./../data/embeddings/cola-test-768.pkl', 'wb') as out_file:
    pickle.dump(test_doc_emb, out_file)

100%|███████████████████████████████████████████████████████████████████████| 1063/1063 [00:10<00:00, 101.85it/s]


In [7]:
# Embed the 'sentence' field of every row in the train split, one sentence
# per encode call.
# NOTE: assumes the GLUE loader cell was run with dataset_arg = "cola" and
# the "Load Dataset" cell re-run afterwards, so `train_dataset` holds CoLA rows.
train_doc_emb = [get_doc_embedding(row['sentence'], model) for row in tqdm(train_dataset)]

# Pickle the list of embeddings to disk.
with open('./../data/embeddings/cola-train-768.pkl', 'wb') as out_file:
    pickle.dump(train_doc_emb, out_file)

100%|███████████████████████████████████████████████████████████████████████| 8551/8551 [01:14<00:00, 115.21it/s]


In [8]:
# Embed the 'sentence' field of every row in the validation split, one
# sentence per encode call.
# NOTE: assumes the GLUE loader cell was run with dataset_arg = "cola" and
# the "Load Dataset" cell re-run afterwards, so `val_dataset` holds CoLA rows.
val_doc_emb = [get_doc_embedding(row['sentence'], model) for row in tqdm(val_dataset)]

# Pickle the list of embeddings to disk.
with open('./../data/embeddings/cola-val-768.pkl', 'wb') as out_file:
    pickle.dump(val_doc_emb, out_file)

100%|███████████████████████████████████████████████████████████████████████| 1043/1043 [00:09<00:00, 114.56it/s]


### Rotten Tomatoes

In [9]:
# Embed the 'text' field of every row in the test split, one document per
# encode call.
# NOTE: assumes the Rotten Tomatoes loader and the "Load Dataset" cell were
# the most recent ones run, so `test_dataset` holds Rotten Tomatoes rows.
test_doc_emb = [get_doc_embedding(row['text'], model) for row in tqdm(test_dataset)]

# Pickle the list of embeddings to disk.
with open('./../data/embeddings/tomato-test-768.pkl', 'wb') as out_file:
    pickle.dump(test_doc_emb, out_file)

100%|████████████████████████████████████████████████████████████████████████████████████████████| 1066/1066 [00:20<00:00, 52.09it/s]


In [11]:
# Embed the 'text' field of every row in the train split, one document per
# encode call.
# NOTE: assumes the Rotten Tomatoes loader and the "Load Dataset" cell were
# the most recent ones run, so `train_dataset` holds Rotten Tomatoes rows.
train_doc_emb = [get_doc_embedding(row['text'], model) for row in tqdm(train_dataset)]

# Pickle the list of embeddings to disk.
with open('./../data/embeddings/tomato-train-768.pkl', 'wb') as out_file:
    pickle.dump(train_doc_emb, out_file)

100%|████████████████████████████████████████████████████████████████████████████████████████████| 8530/8530 [02:05<00:00, 67.95it/s]


In [12]:
# Embed the 'text' field of every row in the validation split, one document
# per encode call.
# NOTE: assumes the Rotten Tomatoes loader and the "Load Dataset" cell were
# the most recent ones run, so `val_dataset` holds Rotten Tomatoes rows.
val_doc_emb = [get_doc_embedding(row['text'], model) for row in tqdm(val_dataset)]

# Pickle the list of embeddings to disk.
with open('./../data/embeddings/tomato-val-768.pkl', 'wb') as out_file:
    pickle.dump(val_doc_emb, out_file)

100%|████████████████████████████████████████████████████████████████████████████████████████████| 1066/1066 [00:14<00:00, 71.94it/s]
