### Preparing for Embedding

In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

In [2]:
# Read the raw validation set; the file is semicolon-delimited UTF-8.
# NOTE: the preview shows an "Unnamed: 0" column, i.e. the file carries a
# previously saved index column alongside the real `id` column.
val_df = pd.read_csv(
    'val_data_realvalued(1).csv',
    sep=';',
    encoding='utf-8',
)
val_df.head()

Unnamed: 0,id,Q1,Q2,Q3,Openness,Conscientiousness,Extraversion,Agreeableness,Neuroticism,Humility
0,1,When I was the chairman of a student organisat...,I learned one of the most important lessons in...,Because I was a board member of a student orga...,100.0,1.0,88.0,100.0,13.0,7.0
1,2,I am leader and group-leader in scouts. This m...,"When I failed my first exam, I felt like givin...",For this I would go back to scouts. Last year ...,67.0,9.0,73.0,90.0,61.0,74.0
2,3,I was given a project to lead a commercial exe...,In a retail company i was given a task to rede...,I was asked to lead a team as interim manager....,100.0,94.0,95.0,69.0,61.0,91.0
3,4,I have been presented with many situations tha...,I have recently started volunteering at a cult...,Perhaps the most challenging leadership role I...,32.0,1.0,62.0,100.0,40.0,74.0
4,5,For a long time I did not like presenting pape...,When I send in an article that I had written f...,My grandparents like to spend time with their ...,42.0,9.0,1.0,90.0,28.0,98.0


In [3]:
# Lowercase every column name so later cells can use 'q1', 'openness', etc.
val_df.columns = [column.lower() for column in val_df.columns]

In [4]:
# Join the three answers into one space-separated text per row.
# Missing answers are treated as empty strings so the result is never NaN.
val_df['full_text'] = val_df['q1'].fillna('').str.cat(
    [val_df['q2'].fillna(''), val_df['q3'].fillna('')],
    sep=' ',
)

In [5]:
# Divide the listed trait scores by 100 (the preview shows values up to 100,
# so this presumably maps them onto 0-1).
# NOTE: 'humility' is not listed, so that column keeps its original scale.
# NOTE: the division overwrites the columns, so re-running this cell without
# reloading the data divides by 100 a second time.
trait_map = {
    'openness': 'openness',
    'conscientiousness': 'conscientiousness',
    'extraversion': 'extraversion',
    'agreeableness': 'agreeableness',
    'neuroticism': 'neuroticism',
}
val_df[list(trait_map)] = val_df[list(trait_map)].div(100.0)

In [6]:
# Set aside every row that is missing at least one of the five trait labels
# and write those rows to their own CSV.
missing_label_mask = val_df[
    ['openness', 'conscientiousness', 'extraversion', 'agreeableness', 'neuroticism']
].isna().any(axis=1)
unlabeled_rows = val_df.loc[missing_label_mask]
unlabeled_rows.to_csv('unlabeled_val_rows.csv', index=False)

In [7]:
# Keep only the rows in which all five trait labels are present.
val_df = val_df.dropna(
    subset=[
        'openness',
        'conscientiousness',
        'extraversion',
        'agreeableness',
        'neuroticism',
    ],
    how='any',
)

In [10]:
# Persist the cleaned, preprocessed validation data; the embedding cell
# reads this file back in.
val_df.to_csv(path_or_buf="val_data_cleaned.csv", index=False)
print("✅ Saved cleaned validation data to val_data_cleaned.csv")

✅ Saved cleaned validation data to val_data_cleaned.csv


### Embedding

In [13]:
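# Embedding pipeline (drafted with ChatGPT): DistilBERT [CLS]-position embeddings; texts longer than 512 tokens are embedded in overlapping chunks and averaged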
import torch
import pandas as pd
import numpy as np
from transformers import DistilBertTokenizer, DistilBertModel
from tqdm import tqdm

# Load DistilBERT tokenizer and model
model_name = "distilbert-base-cased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
# Use the GPU when one is available; otherwise fall back to the CPU so the
# cell does not crash on machines without CUDA.
model = DistilBertModel.from_pretrained(model_name).to(
    "cuda" if torch.cuda.is_available() else "cpu"
)
# Put the model in evaluation mode (switches off dropout) for inference.
model.eval()

# Function to embed text (for short comments)
def embed_comment(comment_text):
    """Return the [CLS]-position embedding of a text in a single model pass.

    Parameters
    ----------
    comment_text : str
        Text to embed. Anything beyond 512 tokens is truncated.

    Returns
    -------
    numpy.ndarray
        The last-hidden-state vector at sequence position 0, on the CPU.
    """
    # Send the inputs to whatever device the model lives on rather than
    # assuming "cuda", so the function also works with a CPU-only model.
    inputs = tokenizer(
        comment_text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    ).to(model.device)

    with torch.no_grad():
        outputs = model(**inputs)

    return outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()

# Function for longer texts (split into chunks if necessary)
def embed_long_comment(text, chunk_size=510, stride=128):
    """Embed a long text as the mean of per-window [CLS]-position embeddings.

    The text is tokenized once, cut into windows of at most ``chunk_size``
    tokens, and each window is run through the model separately.

    Parameters
    ----------
    text : str
        Text to embed.
    chunk_size : int, default 510
        Maximum number of content tokens per window. [CLS] and [SEP] are
        added to every window, so 510 yields at most 512 model inputs.
    stride : int, default 128
        Step between consecutive window starts. With ``stride < chunk_size``
        neighbouring windows overlap by ``chunk_size - stride`` tokens.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the windows' position-0 hidden states.

    Raises
    ------
    ValueError
        If ``chunk_size`` or ``stride`` is not positive.
    """
    if chunk_size <= 0 or stride <= 0:
        raise ValueError("chunk_size and stride must be positive")

    # Tokenize WITHOUT special tokens: every window gets its own [CLS]/[SEP]
    # from build_inputs_with_special_tokens below. Letting the tokenizer add
    # them here as well would duplicate [CLS] in the first window and [SEP]
    # in the last one, and could push a window past 512 ids.
    token_ids = tokenizer(text, add_special_tokens=False, truncation=False)["input_ids"]
    device = model.device  # follow the model instead of hard-coding "cuda"
    chunks = []

    # max(..., 1): an empty text still produces one window that holds only
    # the special tokens, so the mean below is never taken over nothing.
    for start in range(0, max(len(token_ids), 1), stride):
        window = tokenizer.build_inputs_with_special_tokens(
            token_ids[start:start + chunk_size]
        )
        chunk_inputs = torch.tensor([window], device=device)

        with torch.no_grad():
            outputs = model(chunk_inputs)

        # Batch size is 1, so [0, 0, :] is the position-0 vector of this window.
        chunks.append(outputs.last_hidden_state[0, 0, :].cpu().numpy())

        # This window already reached the end of the text; any later window
        # would only repeat a suffix of it.
        if start + chunk_size >= len(token_ids):
            break

    return np.mean(chunks, axis=0)

# Function to decide which embedding to use based on text length
def get_embedding(text):
    """Embed ``text``, choosing the strategy from its tokenized length.

    Texts of up to 512 tokens (counted with the tokenizer's default special
    tokens) go through ``embed_comment``; longer texts go through
    ``embed_long_comment``.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=False)
    token_count = encoded["input_ids"].shape[1]
    if token_count > 512:
        return embed_long_comment(text)
    return embed_comment(text)

# Read back the cleaned validation data written by the preprocessing cells.
val_df = pd.read_csv('val_data_cleaned.csv')

# One dict per successfully embedded row: the original columns plus one
# embed_<i> column per embedding dimension.
rows = []

for index, row in tqdm(val_df.iterrows(), total=len(val_df)):
    try:
        # 'full_text' holds the concatenated Q1/Q2/Q3 answers.
        embedding = get_embedding(row['full_text'])

        row_dict = row.to_dict()
        row_dict.update({f"embed_{i}": value for i, value in enumerate(embedding)})
        rows.append(row_dict)
    except Exception as e:
        # Best effort: report the failing row and continue; that row is
        # left out of the output file.
        print(f"Error embedding row {index}: {e}")

# Assemble all rows at once and write them out.
df_with_embeddings = pd.DataFrame(rows)
df_with_embeddings.to_csv("val_data_embeddings.csv", index=False)
print("Embedding complete. Saved to val_data_embeddings.csv")


  0%|          | 0/28 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (543 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 28/28 [00:00<00:00, 36.86it/s]

Embedding complete. Saved to val_data_embeddings.csv



