## In the code below we perform the following steps

- Read in the item text data (https://chatgpt.com/share/66fa7c2a-101c-800b-88a5-7334934a995d)
- Calculate item embeddings
- Reverse item embeddings if necessary (we don't have reversed items here, but this approach may be suboptimal. In the case of reversed items, we could use a fine-tuned model as in Hommell (2024))
- Compute cosine similarities
- Store results

### 1- Read in the item text data

In [16]:
import pandas as pd
import numpy as np
# Load the item table (the preview below shows the item text and its keying sign).
items_csv_path = './Data/IPIP-FFM-data-Nov/Big5_items.csv'
df_items = pd.read_csv(items_csv_path)
# Show the first rows as a quick check that the file was parsed as expected.
df_items.head()

Unnamed: 0.1,Unnamed: 0,Item,Sign
0,EXT,I am the life of the party.,+
1,EXT,I don't talk a lot.,-
2,EXT,I feel comfortable around people.,+
3,EXT,I keep in the background.,+
4,EXT,I start conversations.,+


### 2- Calculate embeddings (and reverse code if necessary)

In [20]:
# Sentence-transformers model identifiers to compare.
# (Original note: "all multilinguals here" -- NOTE(review): confirm this for each model.)
models = ['nli-distilroberta-base-v2',
          'paraphrase-multilingual-mpnet-base-v2',
          'paraphrase-multilingual-MiniLM-L12-v2',
          'intfloat/multilingual-e5-base',
          'LaBSE',
          'dwulff/mpnet-personality',
          'uaritm/psychology_test']  # consider adding the finetuned model for psicometrista

# Import the necessary libraries and functions
from sentence_transformers import SentenceTransformer, util

# Create an empty data frame, which we will then populate with the different type of embeddings
facet_embeddings_sentences = pd.DataFrame()

# Pull the item texts and keying flags out once; they are the same for every model.
item_texts = df_items['Item'].tolist()
# An item is treated as negatively keyed when its 'Sign' value starts with '-'.
is_reversed = [sign.startswith('-') for sign in df_items['Sign']]

for mod in models:
    model = SentenceTransformer(mod)  # load the model

    # Encode every item exactly once. The sign-adjusted version is derived from
    # the same vector, so the (expensive) model is not run twice per item.
    item_embed = [model.encode(text) for text in item_texts]

    # Negatively keyed items get the negated vector; the others get an
    # independent copy so the two columns never share the same array object.
    item_embed_rev = [
        -embedding if reverse else embedding.copy()
        for embedding, reverse in zip(item_embed, is_reversed)
    ]

    # Store both versions as new columns named after the model.
    df_items[mod + '_embeddings'] = item_embed
    df_items[mod + '_embeddings_rev'] = item_embed_rev



In [21]:
# Display the item table, which now holds one `<model>_embeddings` and one
# `<model>_embeddings_rev` column for each model.
df_items

Unnamed: 0.1,Unnamed: 0,Item,Sign,nli-distilroberta-base-v2_embeddings,nli-distilroberta-base-v2_embeddings_rev,paraphrase-multilingual-mpnet-base-v2_embeddings,paraphrase-multilingual-mpnet-base-v2_embeddings_rev,paraphrase-multilingual-MiniLM-L12-v2_embeddings,paraphrase-multilingual-MiniLM-L12-v2_embeddings_rev,intfloat/multilingual-e5-base_embeddings,intfloat/multilingual-e5-base_embeddings_rev,LaBSE_embeddings,LaBSE_embeddings_rev,dwulff/mpnet-personality_embeddings,dwulff/mpnet-personality_embeddings_rev,uaritm/psychology_test_embeddings,uaritm/psychology_test_embeddings_rev
0,EXT,I am the life of the party.,+,"[0.34674913, -0.55081207, 0.08507617, 0.670100...","[0.34674913, -0.55081207, 0.08507617, 0.670100...","[-0.025607085, 0.070356384, -0.006167733, -0.0...","[-0.025607085, 0.070356384, -0.006167733, -0.0...","[0.023955211, 0.13360831, -0.06792547, -0.0026...","[0.023955211, 0.13360831, -0.06792547, -0.0026...","[-0.00971982, -0.00327074, -0.013250936, 0.012...","[-0.00971982, -0.00327074, -0.013250936, 0.012...","[-0.035022825, -0.013066397, -0.060815495, -0....","[-0.035022825, -0.013066397, -0.060815495, -0....","[-0.00032962722, -0.0177538, 0.0058812276, -0....","[-0.00032962722, -0.0177538, 0.0058812276, -0....","[0.0048945695, 0.07815337, -0.0063293874, -0.0...","[0.0048945695, 0.07815337, -0.0063293874, -0.0..."
1,EXT,I don't talk a lot.,-,"[-0.2694674, 0.3042181, 0.39024746, -0.2851175...","[0.2694674, -0.3042181, -0.39024746, 0.2851175...","[0.14313205, 0.2307171, -0.006458861, -0.00929...","[-0.14313205, -0.2307171, 0.006458861, 0.00929...","[0.56699926, 0.113508046, 0.46969572, -0.24598...","[-0.56699926, -0.113508046, -0.46969572, 0.245...","[-0.010209002, 0.042996567, -0.030689923, 0.00...","[0.010209002, -0.042996567, 0.030689923, -0.00...","[-0.043148335, 0.0033879573, -0.06372101, -0.0...","[0.043148335, -0.0033879573, 0.06372101, 0.047...","[-0.014962567, 0.011399561, -0.011021007, -0.0...","[0.014962567, -0.011399561, 0.011021007, 0.033...","[0.19216964, 0.22088532, -0.006112458, -0.0760...","[-0.19216964, -0.22088532, 0.006112458, 0.0760..."
2,EXT,I feel comfortable around people.,+,"[0.9470231, -0.40753293, -0.21923865, -0.08873...","[0.9470231, -0.40753293, -0.21923865, -0.08873...","[-0.14180103, 0.06115172, -0.006715825, -0.010...","[-0.14180103, 0.06115172, -0.006715825, -0.010...","[0.6212666, -0.0124196885, -0.2881642, 0.15389...","[0.6212666, -0.0124196885, -0.2881642, 0.15389...","[0.03974219, 0.027704086, -0.031843316, 0.0142...","[0.03974219, 0.027704086, -0.031843316, 0.0142...","[-0.056343615, -0.005362853, -0.03972117, -0.0...","[-0.056343615, -0.005362853, -0.03972117, -0.0...","[-0.043757048, -0.0009767405, -0.013096224, 0....","[-0.043757048, -0.0009767405, -0.013096224, 0....","[-0.10007501, -0.0204347, -0.0065149027, -0.07...","[-0.10007501, -0.0204347, -0.0065149027, -0.07..."
3,EXT,I keep in the background.,+,"[-0.48998648, 1.0062209, -0.014314171, 0.83667...","[-0.48998648, 1.0062209, -0.014314171, 0.83667...","[0.083620586, 0.29526168, -0.0072725243, 0.170...","[0.083620586, 0.29526168, -0.0072725243, 0.170...","[0.36545044, 0.095221445, 0.021639008, 0.21680...","[0.36545044, 0.095221445, 0.021639008, 0.21680...","[-0.0058089048, 0.025196763, -0.019330662, 0.0...","[-0.0058089048, 0.025196763, -0.019330662, 0.0...","[-0.061723292, -0.042756796, -0.039166126, -0....","[-0.061723292, -0.042756796, -0.039166126, -0....","[0.017778004, -0.017425474, 0.0014555227, -0.0...","[0.017778004, -0.017425474, 0.0014555227, -0.0...","[0.11955111, 0.2810038, -0.007934175, 0.125276...","[0.11955111, 0.2810038, -0.007934175, 0.125276..."
4,EXT,I start conversations.,+,"[0.13467762, -0.05431028, 0.7089627, 0.7752607...","[0.13467762, -0.05431028, 0.7089627, 0.7752607...","[0.021039529, 0.22547272, -0.0064717033, -0.01...","[0.021039529, 0.22547272, -0.0064717033, -0.01...","[0.60314375, -0.09390887, 0.053446878, 0.31059...","[0.60314375, -0.09390887, 0.053446878, 0.31059...","[0.01423652, 0.011185706, -0.025439234, 0.0250...","[0.01423652, 0.011185706, -0.025439234, 0.0250...","[-0.0127179865, 0.013735699, -0.058875684, -0....","[-0.0127179865, 0.013735699, -0.058875684, -0....","[-0.040492747, 0.048252754, -0.005502875, 0.02...","[-0.040492747, 0.048252754, -0.005502875, 0.02...","[0.061613612, 0.13140441, -0.005125266, -0.059...","[0.061613612, 0.13140441, -0.005125266, -0.059..."
5,EXT,I have little to say.,-,"[-0.28237122, -0.325211, -0.0024817158, -0.282...","[0.28237122, 0.325211, 0.0024817158, 0.2827868...","[0.09287483, 0.21531062, -0.010579463, 0.05504...","[-0.09287483, -0.21531062, 0.010579463, -0.055...","[0.17511716, -0.03364262, 0.3817049, -0.114562...","[-0.17511716, 0.03364262, -0.3817049, 0.114562...","[-0.008714007, 0.044261187, -0.02394497, 0.025...","[0.008714007, -0.044261187, 0.02394497, -0.025...","[-0.02040474, -0.031669028, -0.06714511, -0.06...","[0.02040474, 0.031669028, 0.06714511, 0.063916...","[-0.01009719, -0.046241168, 0.0060269665, 0.01...","[0.01009719, 0.046241168, -0.0060269665, -0.01...","[0.13203913, 0.20213689, -0.011116844, 0.00401...","[-0.13203913, -0.20213689, 0.011116844, -0.004..."
6,EXT,I talk to a lot of different people at parties.,+,"[0.9139548, -0.112622604, 0.23311839, 0.341973...","[0.9139548, -0.112622604, 0.23311839, 0.341973...","[0.0017507094, 0.08755283, -0.0016049526, 0.03...","[0.0017507094, 0.08755283, -0.0016049526, 0.03...","[0.6083912, 0.07585364, 0.025232095, 0.0737055...","[0.6083912, 0.07585364, 0.025232095, 0.0737055...","[-0.0023374858, 0.03862828, -0.019937005, -0.0...","[-0.0023374858, 0.03862828, -0.019937005, -0.0...","[-0.04884428, 0.039444968, -0.03791988, -0.055...","[-0.04884428, 0.039444968, -0.03791988, -0.055...","[-0.05440376, -0.011923728, 0.021957237, 0.015...","[-0.05440376, -0.011923728, 0.021957237, 0.015...","[0.04691301, 0.052348092, -0.0019334087, -0.03...","[0.04691301, 0.052348092, -0.0019334087, -0.03..."
7,EXT,I don't like to draw attention to myself.,-,"[0.34498295, -0.4662026, 0.5028135, -0.1587715...","[-0.34498295, 0.4662026, -0.5028135, 0.1587715...","[0.15571819, 0.19519342, -0.00848222, 0.083171...","[-0.15571819, -0.19519342, 0.00848222, -0.0831...","[0.22649868, 0.18607202, 0.5182486, 0.16191068...","[-0.22649868, -0.18607202, -0.5182486, -0.1619...","[-0.021817671, 0.0342165, -0.0146842, -0.02585...","[0.021817671, -0.0342165, 0.0146842, 0.0258509...","[-0.01762144, -0.005854436, -0.049474314, -0.0...","[0.01762144, 0.005854436, 0.049474314, 0.06645...","[0.01553657, 0.04730013, -0.010323127, -0.0227...","[-0.01553657, -0.04730013, 0.010323127, 0.0227...","[0.12065433, 0.23835629, -0.009204242, 0.02534...","[-0.12065433, -0.23835629, 0.009204242, -0.025..."
8,EXT,I don't mind being the center of attention.,+,"[0.46359622, 0.6359473, 0.3135889, -0.17184615...","[0.46359622, 0.6359473, 0.3135889, -0.17184615...","[0.08497341, -0.11349363, -0.010437649, 0.0959...","[0.08497341, -0.11349363, -0.010437649, 0.0959...","[0.4349065, 0.11094593, 0.11703996, -0.1851378...","[0.4349065, 0.11094593, 0.11703996, -0.1851378...","[-0.0009637341, 0.011016686, -0.033334136, 0.0...","[-0.0009637341, 0.011016686, -0.033334136, 0.0...","[0.023669587, 0.03059789, -0.060036898, -0.064...","[0.023669587, 0.03059789, -0.060036898, -0.064...","[-0.01574864, 0.00084833574, 0.017250763, -0.0...","[-0.01574864, 0.00084833574, 0.017250763, -0.0...","[0.12016583, -0.111295775, -0.010428333, 0.022...","[0.12016583, -0.111295775, -0.010428333, 0.022..."
9,EXT,I am quiet around strangers.,-,"[-0.009440552, -0.1906338, 1.0183346, 0.568942...","[0.009440552, 0.1906338, -1.0183346, -0.568942...","[0.08563566, 0.32203728, -0.004853804, 0.04419...","[-0.08563566, -0.32203728, 0.004853804, -0.044...","[0.31892598, 0.10273829, 0.13866132, -0.035665...","[-0.31892598, -0.10273829, -0.13866132, 0.0356...","[0.008977587, 0.022081407, -0.024167202, -0.00...","[-0.008977587, -0.022081407, 0.024167202, 0.00...","[-0.054400723, 0.0070285574, -0.003954021, -0....","[0.054400723, -0.0070285574, 0.003954021, 0.06...","[-0.02631356, -0.020563636, 0.00824927, 0.0059...","[0.02631356, 0.020563636, -0.00824927, -0.0059...","[0.09766805, 0.27118123, -0.0044350955, -0.041...","[-0.09766805, -0.27118123, 0.0044350955, 0.041..."


### 3- Compute cosine similarities and store the data

In [23]:
# To avoid overly long names for the output datasets, we use a list of short
# labels when saving the cosine matrices. The labels are paired with `models`
# by position, so they must be listed in the same order.
model_short = ['distilroberta', 'mpnet', 'miniLM', 'e5', 'labse', 'wulff', 'psych']

# Fail early if the two lists drift apart, instead of mislabelling or skipping outputs.
if len(model_short) != len(models):
    raise ValueError(
        'model_short has ' + str(len(model_short)) + ' entries but models has '
        + str(len(models)) + '; they must match one-to-one.'
    )

# One label per row of df_items, so the labels always match the matrix size
# (even if two rows happen to share the same item text).
item_labels = df_items['Item'].tolist()

# Loop over the models and compute one item-by-item cosine similarity matrix each.
for model_name, short_name in zip(models, model_short):
    # Stack the per-item vectors into a single 2-D array (items x dimensions)
    # so the similarity function receives a regular numeric matrix.
    # Only the unreversed `_embeddings` column is used here; the
    # `_embeddings_rev` column is not used in this cell.
    embeddings = np.stack(df_items[model_name + '_embeddings'].to_numpy())
    cosine_similarities_item = util.pytorch_cos_sim(embeddings, embeddings).numpy()

    # Fill the diagonal with exactly 1. This is done to avoid EFA functions
    # reading the cosine matrix as a covariance matrix.
    np.fill_diagonal(cosine_similarities_item, 1)

    # Store the results. Row labels are not written (index=False); the item
    # texts appear only as column headers.
    pd.DataFrame(cosine_similarities_item, columns=item_labels).to_csv(
        './Data/IPIP-FFM-data-Nov/cos_matrices/matrix_concatenated_item_' + short_name + '.csv',
        index=False,
    )