# Embedding the Texts

In this notebook, we precompute the embeddings of the texts using different models. This lets us load the embeddings quickly later on instead of recomputing them in every iteration and every model training run. The embeddings are saved alongside the data.

Note that these embeddings are only usable when the embedding model is kept frozen. In the fine-tuning procedures, precomputed embeddings are not used.

Execution: the code has been executed on a local machine for the main dataset due to Swiss regulations and on Google Colab for public datasets.

In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

## Loading the Data

In [17]:
# Main dataset: read the spreadsheet, drop rows containing any missing value,
# renumber the rows, and discard the leftover exported index column.
data_df = (
    pd.read_excel("../../data/dataset.xlsx")
    .dropna()
    .reset_index(drop=True)
    .drop(columns=["Unnamed: 0"])
)
# Column holding the texts that will be embedded.
data_df_texts = data_df["final_text"]
print(data_df.shape)
data_df.head()

(1983, 50)


Unnamed: 0,participant_id,collective,contrast,goal,goals2,list,metaphor,moral,question,story,...,final_text,overall_sentiment_all,positive_sentiment_all,negative_sentiment_all,neutra_sentiment_all,mixed_sentiment_all,targets,text_length_all,prolific_score,prolific_indicator_all
0,5e1cf0eb65b6d3071f489de9,0.35,1.07,0.43,0.32,6.96,0.94,2.36,0.01,0.46,...,Hello everyone. Thank you. Taking the time to ...,POSITIVE,0.9569,0.0007,0.0417,0.0007,HIGH,771.0,100.0,2
1,55d06fd334e9060012e5781c,0.3,0.67,0.3,0.2,2.83,0.71,0.22,0.01,0.6,...,"Hi, I am Kathy. I'd love to be considered for ...",NEUTRAL,0.1587,0.0055,0.835,0.0009,MED,424.0,99.0,2
2,615586b009f801c3f2d4af8d,0.18,0.74,0.16,0.26,3.4,1.1,1.09,0.01,0.37,...,uh yeah I I think I would be the best candidat...,POSITIVE,0.8051,0.0164,0.1747,0.0039,MED,449.0,100.0,2
3,5847e60f73170700013697c6,0.14,2.14,0.27,0.12,3.05,0.49,0.46,0.0,1.09,...,Hello. Um I've of course a fair amount of expe...,POSITIVE,0.5761,0.1185,0.2484,0.057,HIGH,611.0,100.0,2
4,6086a11397234e7f83e4e793,0.9,4.76,0.86,0.22,7.92,0.56,2.95,0.01,0.19,...,"Okay, so I would like to thank you for giving ...",POSITIVE,0.8515,0.0016,0.1456,0.0013,HIGH,611.0,100.0,2


In [18]:
# Chunked dataset: same cleaning steps as the main dataset
# (drop incomplete rows, renumber, remove the exported index column).
chunked_data_df = (
    pd.read_csv("../../data/chunked_dataset.csv")
    .dropna()
    .reset_index(drop=True)
    .drop(columns=["Unnamed: 0"])
)
# Column holding the per-chunk texts that will be embedded.
chunked_data_df_texts = chunked_data_df["chunk_text"]
print(chunked_data_df.shape)
chunked_data_df.head()

(4988, 51)


Unnamed: 0,participant_id,collective,contrast,goal,goals2,list,metaphor,moral,question,story,...,positive_sentiment_all,negative_sentiment_all,neutra_sentiment_all,mixed_sentiment_all,targets,text_length_all,prolific_score,prolific_indicator_all,chunk_id,chunk_text
0,5e1cf0eb65b6d3071f489de9,0.35,1.07,0.43,0.32,6.96,0.94,2.36,0.01,0.46,...,0.9569,0.0007,0.0417,0.0007,HIGH,771.0,100.0,2,0,Hello everyone. Thank you. Taking the time to ...
1,5e1cf0eb65b6d3071f489de9,0.35,1.07,0.43,0.32,6.96,0.94,2.36,0.01,0.46,...,0.9569,0.0007,0.0417,0.0007,HIGH,771.0,100.0,2,1,"Again, as I talked about those great organizat..."
2,5e1cf0eb65b6d3071f489de9,0.35,1.07,0.43,0.32,6.96,0.94,2.36,0.01,0.46,...,0.9569,0.0007,0.0417,0.0007,HIGH,771.0,100.0,2,2,So I'm pretty experienced at the moment. I'm c...
3,55d06fd334e9060012e5781c,0.3,0.67,0.3,0.2,2.83,0.71,0.22,0.01,0.6,...,0.1587,0.0055,0.835,0.0009,MED,424.0,99.0,2,0,"Hi, I am Kathy. I'd love to be considered for ..."
4,55d06fd334e9060012e5781c,0.3,0.67,0.3,0.2,2.83,0.71,0.22,0.01,0.6,...,0.1587,0.0055,0.835,0.0009,MED,424.0,99.0,2,1,"It's been about five days, but I am reminding ..."


In [19]:
# External dataset: the CSV is read with an explicit ISO-8859-1 encoding,
# then rows with any missing value are dropped and the index is renumbered.
my_personality_df = (
    pd.read_csv(
        "../../external_datasets/my_personality/my_personality.csv",
        encoding="ISO-8859-1",
    )
    .dropna()
    .reset_index(drop=True)
)
# Column holding the status texts that will be embedded.
my_personality_df_texts = my_personality_df["STATUS"]
print(my_personality_df.shape)
my_personality_df.head()

(9916, 20)


Unnamed: 0,#AUTHID,STATUS,sEXT,sNEU,sAGR,sCON,sOPN,cEXT,cNEU,cAGR,cCON,cOPN,DATE,NETWORKSIZE,BETWEENNESS,NBETWEENNESS,DENSITY,BROKERAGE,NBROKERAGE,TRANSITIVITY
0,b7b7764cfa1c523e4e93ab2a79a946c4,likes the sound of thunder.,2.65,3.0,3.15,3.25,4.4,n,y,n,n,y,06/19/09 03:21 PM,180.0,14861.6,93.29,0.03,15661.0,0.49,0.1
1,b7b7764cfa1c523e4e93ab2a79a946c4,is so sleepy it's not even funny that's she ca...,2.65,3.0,3.15,3.25,4.4,n,y,n,n,y,07/02/09 08:41 AM,180.0,14861.6,93.29,0.03,15661.0,0.49,0.1
2,b7b7764cfa1c523e4e93ab2a79a946c4,is sore and wants the knot of muscles at the b...,2.65,3.0,3.15,3.25,4.4,n,y,n,n,y,06/15/09 01:15 PM,180.0,14861.6,93.29,0.03,15661.0,0.49,0.1
3,b7b7764cfa1c523e4e93ab2a79a946c4,likes how the day sounds in this new song.,2.65,3.0,3.15,3.25,4.4,n,y,n,n,y,06/22/09 04:48 AM,180.0,14861.6,93.29,0.03,15661.0,0.49,0.1
4,b7b7764cfa1c523e4e93ab2a79a946c4,is home. <3,2.65,3.0,3.15,3.25,4.4,n,y,n,n,y,07/20/09 02:31 AM,180.0,14861.6,93.29,0.03,15661.0,0.49,0.1


## Embedding the Texts

In [20]:
def tokenize_batch(batch, tokenizer, max_length=512):
    """Tokenize one batch of texts with ``tokenizer``.

    Args:
        batch: The texts of one batch, passed to ``tokenizer`` as its single
            positional argument.
        tokenizer: Callable tokenizer; it is invoked with ``padding=True``,
            ``truncation=True``, the given ``max_length`` and
            ``return_tensors="pt"``.
        max_length: Value forwarded as the tokenizer's ``max_length``.

    Returns:
        Whatever ``tokenizer`` returns for this batch.
    """
    encode_options = {
        "padding": True,
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "pt",
    }
    return tokenizer(batch, **encode_options)


def generate_embeddings(model, dataloader, device):
    """Run ``model`` over ``dataloader`` and collect two embeddings per input.

    For every batch the model's ``last_hidden_state`` (indexed here as
    ``(batch, sequence, hidden)``) is reduced in two ways:

    * CLS embedding: the hidden state at sequence position 0.
    * Mean embedding: the average of the hidden states over the positions
      where ``attention_mask`` is 1. Padding positions are excluded, so a
      text's mean embedding does not depend on how much padding its batch
      happened to need.

    Args:
        model: Callable model accepting ``input_ids`` and ``attention_mask``
            keyword arguments and returning an object with a
            ``last_hidden_state`` tensor. It is switched to eval mode.
        dataloader: Iterable of batches; each batch must provide
            ``"input_ids"`` and ``"attention_mask"`` tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        Tuple ``(cls_embeddings, mean_embeddings)`` of NumPy arrays, each the
        concatenation along axis 0 of the per-batch results, in iteration order.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.eval()
    cls_chunks = []
    mean_chunks = []

    # Inference only: no gradients are needed.
    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            hidden_states = outputs.last_hidden_state

            cls_chunks.append(hidden_states[:, 0, :].cpu().numpy())

            # Mask-weighted mean: zero out padded positions and divide by the
            # number of real tokens instead of the padded sequence length.
            token_mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
            token_counts = token_mask.sum(dim=1).clamp(min=1.0)
            mean_embeddings = (hidden_states * token_mask).sum(dim=1) / token_counts
            mean_chunks.append(mean_embeddings.cpu().numpy())

    if not cls_chunks:
        raise ValueError("dataloader yielded no batches; nothing to embed")

    all_cls_embeddings = np.concatenate(cls_chunks, axis=0)
    all_mean_embeddings = np.concatenate(mean_chunks, axis=0)

    return all_cls_embeddings, all_mean_embeddings

In [21]:
batch_size = 32


def pipe(texts, model, tokenizer, device, saving_path):
    """Embed ``texts`` with ``model`` and save the result to ``saving_path``.

    The texts are batched (module-level ``batch_size`` items per batch, in
    their original order since no shuffling is requested), tokenized with
    ``tokenize_batch`` and passed through ``generate_embeddings``. The two
    resulting arrays are written with ``np.savez`` under the keys
    ``cls_embeddings`` and ``mean_embeddings``.

    Args:
        texts: Iterable of texts (e.g. a pandas Series or a list).
        model: Model forwarded to ``generate_embeddings``.
        tokenizer: Tokenizer forwarded to ``tokenize_batch``.
        device: Device forwarded to ``generate_embeddings``.
        saving_path: Destination passed to ``np.savez``.
    """
    # DataLoader fetches items as texts[0], texts[1], ...; on a pandas Series
    # that is a label lookup and breaks when the index is not exactly 0..n-1.
    # A plain list makes the lookup positional for any input.
    text_items = list(texts)

    dataloader = DataLoader(
        text_items,
        batch_size=batch_size,
        collate_fn=lambda b: tokenize_batch(b, tokenizer),
    )
    cls_embeddings, mean_embeddings = generate_embeddings(
        model,
        dataloader,
        device,
    )
    np.savez(
        saving_path,
        cls_embeddings=cls_embeddings,
        mean_embeddings=mean_embeddings,
    )

### RoBERTa

In [22]:
# Checkpoint used for the RoBERTa embeddings.
model_name = "sentence-transformers/all-roberta-large-v1"
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model = model.to(device)

In [7]:
# Embed the full texts of the main dataset and store them in the RoBERTa .npz file.
pipe(
    texts=data_df_texts,
    model=model,
    tokenizer=tokenizer,
    device=device,
    saving_path="../../data/data_df_roberta_embeddings.npz",
)

100%|██████████| 62/62 [11:59<00:00, 11.60s/it]


In [15]:
# Embed the chunk texts and store them in the chunked RoBERTa .npz file.
pipe(
    texts=chunked_data_df_texts,
    model=model,
    tokenizer=tokenizer,
    device=device,
    saving_path="../../data/chunked_data_df_roberta_embeddings.npz",
)

100%|██████████| 156/156 [24:26<00:00,  9.40s/it]


In [None]:
# Embed the external dataset's status texts and store them next to that dataset.
pipe(
    texts=my_personality_df_texts,
    model=model,
    tokenizer=tokenizer,
    device=device,
    saving_path="../../external_datasets/my_personality/my_personality_df_roberta_embeddings.npz",
)

### Microsoft / DeBERTa

In [23]:
# Checkpoint used for the DeBERTa embeddings; this rebinds model_name,
# tokenizer and model for every cell below.
model_name = "microsoft/deberta-v3-base"
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# use_fast=False is passed explicitly for this checkpoint's tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
model = AutoModel.from_pretrained(model_name)
model = model.to(device)

In [24]:
# Embed the full texts of the main dataset and store them in the DeBERTa .npz file.
pipe(
    texts=data_df_texts,
    model=model,
    tokenizer=tokenizer,
    device=device,
    saving_path="../../data/data_df_deberta_embeddings.npz",
)

100%|██████████| 62/62 [09:22<00:00,  9.07s/it]


In [25]:
# Embed the chunk texts and store them in the chunked DeBERTa .npz file.
pipe(
    texts=chunked_data_df_texts,
    model=model,
    tokenizer=tokenizer,
    device=device,
    saving_path="../../data/chunked_data_df_deberta_embeddings.npz",
)

100%|██████████| 156/156 [19:00<00:00,  7.31s/it]


In [None]:
# Embed the external dataset's status texts and store them next to that dataset.
pipe(
    texts=my_personality_df_texts,
    model=model,
    tokenizer=tokenizer,
    device=device,
    saving_path="../../external_datasets/my_personality/my_personality_df_deberta_embeddings.npz",
)