# Embedding the Texts

In this notebook, we precompute the embeddings of the texts using different models. This lets us quickly load the embeddings later and avoids recomputing them in every iteration and every model training run. The embeddings are saved alongside the data.

Note that these embeddings are only usable when the embedding model is kept frozen. In the fine-tuning procedures, precomputed embeddings are not used.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from transformers import CLIPTokenizer, CLIPModel

## Loading the Data

In [None]:
# Load the main dataset, drop incomplete rows, renumber the index, and
# discard the leftover "Unnamed: 0" column.
data_df = (
    pd.read_excel("../../data/dataset.xlsx")
    .dropna()
    .reset_index(drop=True)
    .drop(columns=["Unnamed: 0"])
)
print(data_df.shape)
data_df.head()

In [None]:
# Load the myPersonality CSV (ISO-8859-1 encoded), drop incomplete rows,
# and renumber the index.
my_personality_df = (
    pd.read_csv(
        "../../external_datasets/my_personality/my_personality.csv",
        encoding="ISO-8859-1",
    )
    .dropna()
    .reset_index(drop=True)
)
print(my_personality_df.shape)
my_personality_df.head()

In [None]:
# Load the social CSV, drop incomplete rows, and renumber the index.
social_df = (
    pd.read_csv("../../external_datasets/social/social.csv")
    .dropna()
    .reset_index(drop=True)
)
print(social_df.shape)
social_df.head()

In [None]:
# Pick the text column of each dataset; these are the inputs to embed.
data_df_texts, my_personality_df_texts, social_df_texts = (
    data_df["final_text"],
    my_personality_df["STATUS"],
    social_df["status_update"],
)

## Embedding the Texts

In [25]:
def tokenize_batch(batch, tokenizer, max_length=512):
    """Tokenize one batch of texts into PyTorch tensors.

    Args:
        batch: The texts of one batch, passed to ``tokenizer`` as-is.
        tokenizer: Callable tokenizer accepting the keyword options below.
        max_length: Value forwarded as the tokenizer's ``max_length``.

    Returns:
        Whatever ``tokenizer`` returns when called with padding and
        truncation enabled and ``return_tensors="pt"``.
    """
    tokenizer_options = {
        "padding": True,
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "pt",
    }
    encoded = tokenizer(batch, **tokenizer_options)
    return encoded


def generate_embeddings(model, dataloader, device):
    """Run an encoder over ``dataloader`` and collect sentence embeddings.

    Args:
        model: Encoder callable as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``last_hidden_state``. It is switched to
            eval mode and run without gradient tracking.
        dataloader: Iterable of batches, each a mapping with ``"input_ids"``
            and ``"attention_mask"`` tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        Tuple ``(cls_embeddings, mean_embeddings)`` of NumPy arrays, each the
        per-batch results concatenated along axis 0. ``cls_embeddings`` is the
        hidden state at position 0; ``mean_embeddings`` is the average of the
        hidden states over the positions where ``attention_mask`` is non-zero.
    """
    model.eval()
    all_cls_embeddings = []
    all_mean_embeddings = []

    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            hidden_states = outputs.last_hidden_state

            cls_embeddings = hidden_states[:, 0, :]
            all_cls_embeddings.append(cls_embeddings.cpu().numpy())

            # Mask-weighted mean: padded positions must not contribute, or a
            # text's embedding would depend on the longest text in its batch.
            # Assumes attention_mask is (batch, seq) with 1 marking real tokens.
            mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1e-9)  # avoid divide-by-zero
            mean_embeddings = (hidden_states * mask).sum(dim=1) / token_counts
            all_mean_embeddings.append(mean_embeddings.cpu().numpy())

    all_cls_embeddings = np.concatenate(all_cls_embeddings, axis=0)
    all_mean_embeddings = np.concatenate(all_mean_embeddings, axis=0)

    return all_cls_embeddings, all_mean_embeddings

In [26]:
batch_size = 32


def pipe(texts, model, tokenizer, device, saving_path):
    """Embed ``texts`` with ``model`` and save the result to ``saving_path``.

    Args:
        texts: Iterable of texts (e.g. a pandas Series or a list).
        model: Encoder passed through to ``generate_embeddings``.
        tokenizer: Tokenizer passed through to ``tokenize_batch``.
        device: Device passed through to ``generate_embeddings``.
        saving_path: Destination given to ``np.savez``; the archive stores the
            arrays under the keys ``cls_embeddings`` and ``mean_embeddings``.
    """
    # DataLoader fetches items as dataset[0], dataset[1], ... On a pandas
    # Series that is a label lookup, which only works when the index happens
    # to be 0..n-1. A plain list makes the lookup positional for any input.
    texts = list(texts)

    dataloader = DataLoader(
        texts,
        batch_size=batch_size,
        collate_fn=lambda b: tokenize_batch(b, tokenizer),
    )
    cls_embeddings, mean_embeddings = generate_embeddings(
        model,
        dataloader,
        device,
    )
    np.savez(
        saving_path,
        cls_embeddings=cls_embeddings,
        mean_embeddings=mean_embeddings,
    )

### RoBERTa

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "sentence-transformers/all-roberta-large-v1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model = model.to(device)

# Embed each dataset and store its embeddings next to the data.
pipe(
    texts=data_df_texts,
    model=model,
    tokenizer=tokenizer,
    device=device,
    saving_path="../../data/data_df_embeddings.npz",
)
pipe(
    texts=my_personality_df_texts,
    model=model,
    tokenizer=tokenizer,
    device=device,
    saving_path="../../data/my_personality_df_embeddings.npz",
)
pipe(
    texts=social_df_texts,
    model=model,
    tokenizer=tokenizer,
    device=device,
    saving_path="../../data/social_df_embeddings.npz",
)