In [None]:
'''
**************************************************************************************************************
**************************************************************************************************************
>>>>>>>>>> Script for making embeddings with >>distilbert/distilbert-base-cased-distilled-squad<<
The script embeds every comment; comments longer than 512 tokens are split into chunks,
each chunk is embedded, and the comment vector is the average of its chunk embeddings.
Each author's vector is then the average of that author's comment vectors.
Runs on Colab. If you are not on Colab, change the loading method.
On Colab you might need to install transformers and torch first.
If it takes too long, you can embed only 10% (or more) of the file.
See the note in the script: change the variable there to embed a bigger percentage of the file.
Saves the output to author_embeddings.csv in the current working directory. <<<<<<<<<<<<<<<<
**************************************************************************************************************
**************************************************************************************************************
'''


import json
import torch
import numpy as np
import pandas as pd
from transformers import DistilBertTokenizer, DistilBertModel
from tqdm import tqdm

# Checkpoint shared by the tokenizer and the encoder
model_name = "distilbert/distilbert-base-cased-distilled-squad"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

# Load the encoder, convert it to half precision, then move it to the GPU
model = DistilBertModel.from_pretrained(model_name)
model = model.half()
model = model.to("cuda")

# Switch to evaluation mode; the model is only used for inference here
model.eval()

# embedding function (for shorter texts)
def embed_comment(comment_text):
    """Embed a text that fits into a single model pass.

    The text is tokenized with truncation to at most 512 tokens, moved to
    the GPU and run through the module-level ``model`` without gradient
    tracking.

    Args:
        comment_text: The text to embed.

    Returns:
        NumPy array holding the last hidden state at sequence position 0
        (the first token), with size-1 dimensions squeezed away.
    """
    encoded = tokenizer(
        comment_text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )
    encoded = encoded.to("cuda")

    # Inference only, so no gradients are needed
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    first_token_vector = hidden_states[:, 0, :].squeeze()
    return first_token_vector.cpu().numpy()

# embedding function (for longer texts)
def embed_long_comment(text, chunk_size=510, stride=128):
    """Embed a long text by averaging the embeddings of overlapping token windows.

    The text is tokenized once WITHOUT special tokens and cut into windows
    of at most ``chunk_size`` tokens whose start positions lie ``stride``
    tokens apart. Each window is wrapped in the tokenizer's special tokens
    and run through the module-level ``model``; the last hidden state at
    sequence position 0 of every window is collected and the collected
    vectors are averaged.

    Args:
        text: The text to embed.
        chunk_size: Maximum number of content tokens per window, not
            counting the special tokens added around each window. The
            default of 510 plus the two wrapping tokens gives the same
            512-token limit that ``embed_comment`` uses.
        stride: Distance in tokens between the starts of two consecutive
            windows.

    Returns:
        NumPy array with the element-wise mean of the per-window vectors.

    Raises:
        ValueError: If ``chunk_size`` or ``stride`` is not positive.
    """
    if chunk_size <= 0 or stride <= 0:
        raise ValueError("chunk_size and stride must be positive")

    # Special tokens are added per window below. Tokenizing with them here
    # as well would leave a duplicated start token in the first window and
    # a duplicated end token in the last one.
    token_ids = tokenizer(text, add_special_tokens=False, truncation=False)["input_ids"]
    total = len(token_ids)
    chunks = []

    # max(total, 1) still yields one (empty) window for an empty text, so
    # the mean below is never taken over an empty list.
    for start in range(0, max(total, 1), stride):
        window = tokenizer.build_inputs_with_special_tokens(
            token_ids[start:start + chunk_size]
        )
        chunk_inputs = torch.tensor([window]).to("cuda")  # send to GPU

        with torch.no_grad():
            outputs = model(chunk_inputs)

        # Vector at position 0 of this window, moved back to the CPU
        cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()
        chunks.append(cls_embedding)

        # This window already reaches the end of the text; later windows
        # would only repeat its tail.
        if start + chunk_size >= total:
            break

    return np.mean(chunks, axis=0)

def get_embedding(text):
    """Embed ``text``, choosing the strategy from its token count.

    The text is tokenized without truncation to count its token ids. Up to
    512 ids it goes to ``embed_comment``; anything longer goes to
    ``embed_long_comment`` with that function's default settings.

    Args:
        text: The text to embed.

    Returns:
        Whatever the selected embedding function returns.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=False)
    token_count = len(encoded["input_ids"][0])

    if token_count > 512:
        return embed_long_comment(text)
    return embed_comment(text)


from google.colab import drive

# Make Google Drive available under /content/drive
drive.mount('/content/drive')

# The file name is already filled in; the validation set needs a new path here
json_path = "/content/drive/MyDrive/filtered_pandora.json"

# Read the whole JSON document into memory
with open(json_path, mode="r", encoding="utf-8") as json_file:
    data = json.loads(json_file.read())


rows = []

# Fraction of the author list to embed, taken from the front of the list.
# 1.0 embeds every author; lower it (e.g. 0.10) for a quick trial run when
# embedding the whole file takes too long.
SAMPLE_FRACTION = 1.0

authors = data["authors"]
sample_size = int(len(authors) * SAMPLE_FRACTION)
sampled_authors = authors[:sample_size]

for author in tqdm(sampled_authors):
    author_id = author["id"]
    labels = author["labels"]
    comments = author.get("comments", [])

    # One vector per comment; a comment that fails to embed is reported
    # and skipped so a single bad comment does not stop the run.
    embeddings = []
    for comment in comments:
        try:
            vec = get_embedding(comment)
            embeddings.append(vec)
        except Exception as e:
            print(f"Error embedding comment for {author_id}: {e}")

    # Authors without any successfully embedded comment get no row
    if embeddings:
        # The author vector is the mean of that author's comment vectors
        avg_embedding = np.mean(embeddings, axis=0)
        row = {
            "id": author_id,
            **labels
        }
        # One "embed_<i>" column per embedding dimension
        row.update({f"embed_{i}": value for i, value in enumerate(avg_embedding)})
        rows.append(row)


# One row per author: id, label columns, then the embedding columns
df = pd.DataFrame(rows)
df.to_csv("author_embeddings.csv", index=False)
print("Saved to author_embeddings.csv")

In [None]:
'''
**************************************************************************************************************
**************************************************************************************************************
>>>>>>>>>> Script for making embeddings from the validation set with >>distilbert/distilbert-base-cased-distilled-squad<<
The script joins each row's Q1-Q3 answers into one text; texts longer than 512 tokens are split into chunks,
each chunk is embedded, and the final vector is the average of the chunk embeddings.
Runs on Colab. If you are not on Colab, change the loading method.
On Colab you might need to install transformers and torch first.
In case it takes too long you can embed 10% or more of the file. 
Mentioned in the script: change variable to embed bigger percentage of the file.
Saves output in drive. <<<<<<<<<<<<<<<<
**************************************************************************************************************
**************************************************************************************************************
'''



import json
import torch
import numpy as np
import pandas as pd
from transformers import DistilBertTokenizer, DistilBertModel
from tqdm import tqdm

# Load tokenizer and model
model_name = "distilbert/distilbert-base-cased-distilled-squad"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

# Load the encoder, convert it to half precision, then move it to the GPU
model = DistilBertModel.from_pretrained(model_name)
model = model.half()
model = model.to("cuda")

# Switch to evaluation mode; the model is only used for inference here
model.eval()

# Embedding functions
def embed_comment(comment_text):
    """Embed a text that fits into a single model pass.

    The text is tokenized with truncation to at most 512 tokens, moved to
    the GPU and run through the module-level ``model`` without gradient
    tracking.

    Args:
        comment_text: The text to embed.

    Returns:
        NumPy array holding the last hidden state at sequence position 0
        (the first token), with size-1 dimensions squeezed away.
    """
    encoded = tokenizer(
        comment_text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )
    encoded = encoded.to("cuda")

    # Inference only, so no gradients are needed
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    first_token_vector = hidden_states[:, 0, :].squeeze()
    return first_token_vector.cpu().numpy()

def embed_long_comment(text, chunk_size=510, stride=128):
    """Embed a long text by averaging the embeddings of overlapping token windows.

    The text is tokenized once WITHOUT special tokens and cut into windows
    of at most ``chunk_size`` tokens whose start positions lie ``stride``
    tokens apart. Each window is wrapped in the tokenizer's special tokens
    and run through the module-level ``model``; the last hidden state at
    sequence position 0 of every window is collected and the collected
    vectors are averaged.

    Args:
        text: The text to embed.
        chunk_size: Maximum number of content tokens per window, not
            counting the special tokens added around each window. The
            default of 510 plus the two wrapping tokens gives the same
            512-token limit that ``embed_comment`` uses.
        stride: Distance in tokens between the starts of two consecutive
            windows.

    Returns:
        NumPy array with the element-wise mean of the per-window vectors.

    Raises:
        ValueError: If ``chunk_size`` or ``stride`` is not positive.
    """
    if chunk_size <= 0 or stride <= 0:
        raise ValueError("chunk_size and stride must be positive")

    # Special tokens are added per window below. Tokenizing with them here
    # as well would leave a duplicated start token in the first window and
    # a duplicated end token in the last one.
    token_ids = tokenizer(text, add_special_tokens=False, truncation=False)["input_ids"]
    total = len(token_ids)
    chunks = []

    # max(total, 1) still yields one (empty) window for an empty text, so
    # the mean below is never taken over an empty list.
    for start in range(0, max(total, 1), stride):
        window = tokenizer.build_inputs_with_special_tokens(
            token_ids[start:start + chunk_size]
        )
        chunk_inputs = torch.tensor([window]).to("cuda")
        with torch.no_grad():
            outputs = model(chunk_inputs)
        # Vector at position 0 of this window, moved back to the CPU
        cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()
        chunks.append(cls_embedding)
        # This window already reaches the end of the text; later windows
        # would only repeat its tail.
        if start + chunk_size >= total:
            break
    return np.mean(chunks, axis=0)

def get_embedding(text):
    """Embed ``text``, choosing the strategy from its token count.

    The text is tokenized without truncation to count its token ids. Up to
    512 ids it goes to ``embed_comment``; anything longer goes to
    ``embed_long_comment`` with that function's default settings.

    Args:
        text: The text to embed.

    Returns:
        Whatever the selected embedding function returns.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=False)
    token_count = len(encoded["input_ids"][0])

    if token_count > 512:
        return embed_long_comment(text)
    return embed_comment(text)

# Read the validation table from Drive
val_df = pd.read_csv("/content/drive/MyDrive/val_data.csv")

rows = []

for idx, row in tqdm(val_df.iterrows(), total=len(val_df)):
    # Fall back to a positional id when the row carries no "id" value
    author_id = row.get("id", f"val_{idx}")

    # Join the non-missing answers into one text, separated by spaces
    answers = [str(row[q]) for q in ['Q1', 'Q2', 'Q3'] if pd.notna(row[q])]
    full_text = " ".join(answers)

    # A row that fails anywhere below is reported and left out of the output
    try:
        vec = get_embedding(full_text)

        # Column order: id, the five trait scores, then the embedding values
        row_data = {"id": author_id}
        for trait in (
            "Openness",
            "Conscientiousness",
            "Extraversion",
            "Agreeableness",
            "Emotional stability",
        ):
            row_data[trait] = row[trait]
        for i, value in enumerate(vec):
            row_data[f"embed_{i}"] = value

        rows.append(row_data)
    except Exception as e:
        print(f"Error embedding val sample {author_id}: {e}")

# Write one row per successfully embedded validation sample
val_embed_df = pd.DataFrame(rows)
val_embed_df.to_csv("/content/drive/MyDrive/val_embeddings.csv", index=False)
print("Saved to /content/drive/MyDrive/val_embeddings.csv")