# Embeddings Generation

In this notebook, we generate embeddings of the texts and then train a regression model on those embeddings.


In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import math

import torch
import torch.nn as nn

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from tqdm import tqdm

from transformers import AutoTokenizer, AutoModel
from transformers import LongformerTokenizer, LongformerModel
from transformers import BigBirdTokenizer, BigBirdModel

In [2]:
# NOTE: switch TOKENIZERS_PARALLELISM off through the process environment
# to prevent unexpected behaviors.

import os

os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

In [3]:
# Read the dataset spreadsheet, using its first column as the DataFrame index,
# then display the resulting frame as the cell output.
data_df = pd.read_excel(
    io="../data/dataset.xlsx",
    index_col=0,
)
data_df

Unnamed: 0,participant_id,collective,contrast,goal,goals2,list,metaphor,moral,question,story,...,final_text,overall_sentiment_all,positive_sentiment_all,negative_sentiment_all,neutra_sentiment_all,mixed_sentiment_all,targets,text_length_all,prolific_score,prolific_indicator_all
0,5e1cf0eb65b6d3071f489de9,0.35,1.07,0.43,0.32,6.96,0.94,2.36,0.01,0.46,...,Hello everyone. Thank you. Taking the time to ...,POSITIVE,0.956900,0.000700,0.041700,0.000700,HIGH,771.0,100.0,2
1,55d06fd334e9060012e5781c,0.30,0.67,0.30,0.20,2.83,0.71,0.22,0.01,0.60,...,"Hi, I am Kathy. I'd love to be considered for ...",NEUTRAL,0.158700,0.005500,0.835000,0.000900,MED,424.0,99.0,2
2,615586b009f801c3f2d4af8d,0.18,0.74,0.16,0.26,3.40,1.10,1.09,0.01,0.37,...,uh yeah I I think I would be the best candidat...,POSITIVE,0.805100,0.016400,0.174700,0.003900,MED,449.0,100.0,2
3,5847e60f73170700013697c6,0.14,2.14,0.27,0.12,3.05,0.49,0.46,0.00,1.09,...,Hello. Um I've of course a fair amount of expe...,POSITIVE,0.576100,0.118500,0.248400,0.057000,HIGH,611.0,100.0,2
4,6086a11397234e7f83e4e793,0.90,4.76,0.86,0.22,7.92,0.56,2.95,0.01,0.19,...,"Okay, so I would like to thank you for giving ...",POSITIVE,0.851500,0.001600,0.145600,0.001300,HIGH,611.0,100.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1993,5e43021e5bb25719df134c7c,0.63,3.18,0.73,0.45,6.20,0.65,4.03,0.01,0.26,...,I believe I'm the best person to take up this ...,POSITIVE,0.683128,0.036325,0.183616,0.096930,MED,769.0,100.0,2
1994,54b60d74fdf99b1ce0367ab5,0.51,1.72,1.35,0.30,5.42,0.55,2.54,0.01,1.20,...,Hi there. I definitely think I should be consi...,POSITIVE,0.699336,0.006666,0.228330,0.065668,MED,781.0,96.0,1
1995,60fe95530ccb79107f63773e,0.11,0.12,0.14,0.11,6.20,0.19,0.98,0.01,0.89,...,"as an undergraduate student, I studied at the ...",POSITIVE,0.799731,0.000397,0.199813,0.000059,LOW,545.0,100.0,2
1996,607deec383975f3377a28b9e,0.28,0.40,0.22,0.45,5.69,0.38,0.59,0.01,1.06,...,I've always been a confident person um I will ...,POSITIVE,0.316924,0.257287,0.184135,0.241654,HIGH,517.0,100.0,2


## BigBird

In [6]:
# Compute two fixed-size embeddings per text with BigBird and save them to disk:
#   * "cls": the last-layer hidden state at sequence position 0
#   * "avg": the attention-mask-weighted mean of the last-layer hidden states
BATCH_SIZE = 4
MAX_LENGTH = 2**12  # 4096 tokens; every text is truncated/padded to this length
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"device: {DEVICE}")

# Create the output directory up front so the long embedding loop cannot end
# with a failed save because the directory is missing.
os.makedirs("../embeddings", exist_ok=True)

tokenizer = BigBirdTokenizer.from_pretrained("google/bigbird-roberta-base")
model = BigBirdModel.from_pretrained("google/bigbird-roberta-base")
model.to(DEVICE)
model.eval()  # inference only: make sure dropout is disabled

texts = data_df["final_text"].tolist()

# Fail fast on non-string entries (e.g. NaN from empty spreadsheet cells)
# instead of crashing inside the tokenizer part-way through the run.
invalid_rows = [row for row, text in enumerate(texts) if not isinstance(text, str)]
if invalid_rows:
    raise ValueError(
        f"'final_text' has {len(invalid_rows)} non-string entries "
        f"(first positions: {invalid_rows[:10]})"
    )

num_batches = math.ceil(len(texts) / BATCH_SIZE)

cls_embeddings = []
avg_embeddings = []

for i in tqdm(range(num_batches)):
    batch_texts = texts[i * BATCH_SIZE : (i + 1) * BATCH_SIZE]

    # NOTE(review): padding="max_length" pads every batch to MAX_LENGTH tokens
    # regardless of the actual text length. Dynamic padding would likely be
    # much faster, but it may change BigBird's attention behavior and hence
    # the embeddings -- confirm before changing.
    inputs = tokenizer(
        batch_texts,
        return_tensors="pt",
        max_length=MAX_LENGTH,
        truncation=True,
        padding="max_length",
        add_special_tokens=True,
    )
    inputs = {key: value.to(DEVICE) for key, value in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        last_hidden_state = outputs.last_hidden_state

        # Slicing returns a view that shares storage with the full
        # (batch, MAX_LENGTH, hidden) activation tensor. Clone it so that
        # keeping the embedding does not keep every batch's activations alive.
        batch_cls_embeddings = last_hidden_state[:, 0, :].clone()

        # Masked mean pooling: padding positions (mask == 0) contribute nothing
        # to the sum, and the divisor is the number of non-padding tokens.
        attention_mask = inputs["attention_mask"]
        batch_avg_embeddings = (last_hidden_state * attention_mask.unsqueeze(-1)).sum(dim=1) / attention_mask.sum(dim=1).unsqueeze(-1)

    # Accumulate on the CPU: this frees accelerator memory between batches and
    # makes the saved tensors loadable on machines without CUDA.
    cls_embeddings.append(batch_cls_embeddings.cpu())
    avg_embeddings.append(batch_avg_embeddings.cpu())

cls_embeddings = torch.cat(cls_embeddings, dim=0)
avg_embeddings = torch.cat(avg_embeddings, dim=0)

with open("../embeddings/bigbird_cls_embeddings.pt", "wb") as f:
    torch.save(cls_embeddings, f)

with open("../embeddings/bigbird_avg_embeddings.pt", "wb") as f:
    torch.save(avg_embeddings, f)

device: cpu


100%|██████████| 500/500 [1:41:38<00:00, 12.20s/it]
