## In the code below we perform the following steps

- Read in the item text data (https://chatgpt.com/share/66fa7c2a-101c-800b-88a5-7334934a995d)
- Calculate item embeddings
- Reverse item embeddings if necessary (we don't have reversed items here, but this approach may be suboptimal. In case of reversed items, we could use a fine-tuned model as in Hommell (2024))
- Compute cosine similarities
- Store results

### 1- Read in the item text data

In [None]:
import pandas as pd
import numpy as np
# Load the item table (item text, factor, sign, ...) from disk and preview its first rows.
items_csv = './Data/dass_21_items_text.csv'
df_items = pd.read_csv(items_csv)
df_items.head()

Unnamed: 0,Number,Factor,Item,Item_simp,Sign
0,1,Depression,I couldn't seem to experience any positive fee...,Couldn't seem to experience any positive feeli...,+
1,2,Depression,I found it difficult to work up the initiative...,Found it difficult to work up the initiative t...,+
2,3,Depression,I felt that I had nothing to look forward to.,Felt that I had nothing to look forward to,+
3,4,Depression,I felt down-hearted and blue.,Felt down-hearted and blue,+
4,5,Depression,I was unable to become enthusiastic about anyt...,Unable to become enthusiastic about anything,+


### 2- Calculate embeddings (and reverse code if necessary)

In [5]:
# Names of the sentence-embedding models to run; each entry is passed to
# SentenceTransformer() and also used as the prefix of the embedding columns.
# TODO: consider adding the finetuned model for psicometrista
models = [
    'nli-distilroberta-base-v2',
    'paraphrase-multilingual-mpnet-base-v2',
    'paraphrase-multilingual-MiniLM-L12-v2',
    'intfloat/multilingual-e5-base',
    'LaBSE',
]

# Import the necessary libraries and functions
from sentence_transformers import SentenceTransformer, util

# Empty data frame kept for the different types of embeddings; nothing in this
# cell populates it.
facet_embeddings_sentences = pd.DataFrame()

# For every model, embed each simplified item text and add two columns to df_items:
#   '<model>_embeddings'      - the raw item-level embedding
#   '<model>_embeddings_rev'  - the same embedding, multiplied by -1 when the
#                               item's Sign starts with '-' (negatively keyed)
for mod in models:
    model = SentenceTransformer(mod)  # load the model by name
    item_embed = []  # item-level embeddings
    item_embed_rev = []  # item-level embeddings accounting for sign
    for text, sign in zip(df_items['Item_simp'], df_items['Sign']):
        # Encode each item once and reuse the vector for both columns;
        # encoding is by far the most expensive step of this loop.
        embedding = model.encode(text)
        item_embed.append(embedding)
        if sign.startswith('-'):
            # negatively keyed item: reverse the embedding
            item_embed_rev.append(embedding * -1)
        else:
            # Copy so the two columns never share one array object.
            item_embed_rev.append(embedding.copy())
    # Store both lists as columns named after the model that produced them.
    df_items[mod + '_embeddings'] = item_embed
    df_items[mod + '_embeddings_rev'] = item_embed_rev

In [6]:
# Display the item table, which now also holds the '<model>_embeddings' and
# '<model>_embeddings_rev' columns added for every model.
df_items

Unnamed: 0,Number,Factor,Item,Item_simp,Sign,nli-distilroberta-base-v2_embeddings,nli-distilroberta-base-v2_embeddings_rev,paraphrase-multilingual-mpnet-base-v2_embeddings,paraphrase-multilingual-mpnet-base-v2_embeddings_rev,paraphrase-multilingual-MiniLM-L12-v2_embeddings,paraphrase-multilingual-MiniLM-L12-v2_embeddings_rev,intfloat/multilingual-e5-base_embeddings,intfloat/multilingual-e5-base_embeddings_rev,LaBSE_embeddings,LaBSE_embeddings_rev
0,1,Depression,I couldn't seem to experience any positive fee...,Couldn't seem to experience any positive feeli...,+,"[-0.3158005, -0.3829513, 0.48537698, -0.625396...","[-0.3158005, -0.3829513, 0.48537698, -0.625396...","[-0.02480041, 0.19487093, -0.01487695, -0.0014...","[-0.02480041, 0.19487093, -0.01487695, -0.0014...","[-0.048265327, -0.16294217, 0.46215764, 0.2090...","[-0.048265327, -0.16294217, 0.46215764, 0.2090...","[-0.0122590475, 0.037404567, -0.013672275, -0....","[-0.0122590475, 0.037404567, -0.013672275, -0....","[-0.04020344, 0.048688333, -0.064000644, -0.04...","[-0.04020344, 0.048688333, -0.064000644, -0.04..."
1,2,Depression,I found it difficult to work up the initiative...,Found it difficult to work up the initiative t...,+,"[-0.18894802, -0.64916545, 0.7094273, 0.659779...","[-0.18894802, -0.64916545, 0.7094273, 0.659779...","[0.07984255, 0.18823034, -0.0055116713, -0.052...","[0.07984255, 0.18823034, -0.0055116713, -0.052...","[0.2019996, 0.033766832, 0.08607383, 0.1883425...","[0.2019996, 0.033766832, 0.08607383, 0.1883425...","[-0.016377548, 0.03569251, -0.013332962, -0.00...","[-0.016377548, 0.03569251, -0.013332962, -0.00...","[0.013373521, -0.033622183, 0.026094794, -0.06...","[0.013373521, -0.033622183, 0.026094794, -0.06..."
2,3,Depression,I felt that I had nothing to look forward to.,Felt that I had nothing to look forward to,+,"[-0.6624554, -0.611977, 0.05714376, -0.4496236...","[-0.6624554, -0.611977, 0.05714376, -0.4496236...","[-0.0048325206, 0.22798342, -0.01276125, -0.04...","[-0.0048325206, 0.22798342, -0.01276125, -0.04...","[0.07067317, 0.13515294, 0.30388242, 0.1631882...","[0.07067317, 0.13515294, 0.30388242, 0.1631882...","[-0.00169078, 0.053613465, -0.008373259, -0.02...","[-0.00169078, 0.053613465, -0.008373259, -0.02...","[0.0055553135, 0.007142253, -0.052212015, -0.0...","[0.0055553135, 0.007142253, -0.052212015, -0.0..."
3,4,Depression,I felt down-hearted and blue.,Felt down-hearted and blue,+,"[-0.5374446, 0.36082685, -0.059091702, -0.3411...","[-0.5374446, 0.36082685, -0.059091702, -0.3411...","[-0.04216304, -0.00061915664, -0.014039477, -0...","[-0.04216304, -0.00061915664, -0.014039477, -0...","[0.4459436, 0.17380352, 0.20326674, 0.39640555...","[0.4459436, 0.17380352, 0.20326674, 0.39640555...","[0.008798687, 0.035582453, -0.027884167, 0.017...","[0.008798687, 0.035582453, -0.027884167, 0.017...","[-0.056859806, 0.029409863, 0.033236753, -0.05...","[-0.056859806, 0.029409863, 0.033236753, -0.05..."
4,5,Depression,I was unable to become enthusiastic about anyt...,Unable to become enthusiastic about anything,+,"[0.81100816, -1.0644746, 0.3099635, -0.791149,...","[0.81100816, -1.0644746, 0.3099635, -0.791149,...","[-0.19479819, 0.18335259, -0.008176405, -0.018...","[-0.19479819, 0.18335259, -0.008176405, -0.018...","[0.61561036, 0.14149758, 0.14338212, 0.2752302...","[0.61561036, 0.14149758, 0.14338212, 0.2752302...","[-0.0034387005, 0.0323207, -0.013190221, 0.005...","[-0.0034387005, 0.0323207, -0.013190221, 0.005...","[-0.013530439, -0.025689524, -0.046821017, -0....","[-0.013530439, -0.025689524, -0.046821017, -0...."
5,6,Depression,I felt I wasn't worth much as a person.,Felt I wasn't worth much as a person,+,"[-0.24069123, -0.18620928, -0.31876132, -0.552...","[-0.24069123, -0.18620928, -0.31876132, -0.552...","[0.03748162, 0.15523651, -0.01225272, 0.124276...","[0.03748162, 0.15523651, -0.01225272, 0.124276...","[0.29745805, 0.4353412, 0.044720616, 0.0962538...","[0.29745805, 0.4353412, 0.044720616, 0.0962538...","[0.0014887145, 0.02753759, -0.014364549, -0.01...","[0.0014887145, 0.02753759, -0.014364549, -0.01...","[-0.030117314, 0.016044328, -0.061733473, -0.0...","[-0.030117314, 0.016044328, -0.061733473, -0.0..."
6,7,Depression,I felt that life was meaningless.,Felt that life was meaningless,+,"[-0.27179113, 0.4042701, -0.2054088, 0.1676077...","[-0.27179113, 0.4042701, -0.2054088, 0.1676077...","[0.08654165, 0.20552057, -0.016810328, -0.0242...","[0.08654165, 0.20552057, -0.016810328, -0.0242...","[0.19524333, 0.1642003, 0.16103373, 0.34806836...","[0.19524333, 0.1642003, 0.16103373, 0.34806836...","[0.0118107, 0.030455621, -0.0027262208, -0.012...","[0.0118107, 0.030455621, -0.0027262208, -0.012...","[-0.032684788, -0.04294749, 0.0043068174, -0.0...","[-0.032684788, -0.04294749, 0.0043068174, -0.0..."
7,8,Anxiety,I was aware of dryness of my mouth.,Aware of dryness of my mouth,+,"[0.08629536, -0.5110814, -0.4306075, -0.171556...","[0.08629536, -0.5110814, -0.4306075, -0.171556...","[-0.047529362, -0.17959344, -0.006625055, -0.0...","[-0.047529362, -0.17959344, -0.006625055, -0.0...","[-0.044910856, -0.25808942, 0.29242262, -0.004...","[-0.044910856, -0.25808942, 0.29242262, -0.004...","[0.012569076, 0.046004616, -0.01557028, 0.0022...","[0.012569076, 0.046004616, -0.01557028, 0.0022...","[-0.03967858, -0.03191511, 0.050942965, -0.080...","[-0.03967858, -0.03191511, 0.050942965, -0.080..."
8,9,Anxiety,"I experienced breathing difficulty (e.g., exce...","Experienced breathing difficulty (e.g., excess...",+,"[0.29889363, 0.15076897, 0.65475935, -0.037349...","[0.29889363, 0.15076897, 0.65475935, -0.037349...","[-0.19410641, -0.13367753, -0.005985759, -0.01...","[-0.19410641, -0.13367753, -0.005985759, -0.01...","[0.16454053, 0.10429108, -0.012673147, 0.41033...","[0.16454053, 0.10429108, -0.012673147, 0.41033...","[0.019833004, 0.046239898, -0.011120523, 0.017...","[0.019833004, 0.046239898, -0.011120523, 0.017...","[-0.032675963, 0.030175647, -0.030280458, -0.0...","[-0.032675963, 0.030175647, -0.030280458, -0.0..."
9,10,Anxiety,"I experienced trembling (e.g., in the hands).","Experienced trembling (e.g., in the hands)",+,"[0.33459106, -0.18119283, 0.19334221, -0.01259...","[0.33459106, -0.18119283, 0.19334221, -0.01259...","[-0.095299855, -0.008029469, -0.0070467223, -0...","[-0.095299855, -0.008029469, -0.0070467223, -0...","[-0.016462142, -0.08589119, 0.2594016, 0.53160...","[-0.016462142, -0.08589119, 0.2594016, 0.53160...","[0.030165693, 0.05655457, -0.017961523, 0.0352...","[0.030165693, 0.05655457, -0.017961523, 0.0352...","[0.00049121527, -0.023566762, 0.0293058, -0.04...","[0.00049121527, -0.023566762, 0.0293058, -0.04..."


### 3- Compute cosine similarities and store the data

In [7]:
# Short aliases for the models, used in the file names of the saved cosine
# matrices so that the output datasets do not get overly long names.
# Keep them meaningful and in the same order as the model list in the cell above.
model_short = [
    'distilroberta',
    'mpnet',
    'miniLM',
    'e5',
    'labse',
]

# Below, we loop over the different models used for the study, compute the
# item-by-item cosine similarity matrix for each one, and save it as a CSV file.

# Every model needs a short name for its output file; fail loudly instead of
# silently skipping models if the two lists are out of step.
if len(model_short) < len(models):
    raise ValueError('model_short must provide one short name per entry in models')

# One label per row of df_items, so the labels always match the matrix size
# (de-duplicated labels would no longer fit if two items shared the same text).
item_labels = df_items['Item_simp'].tolist()

for model_name, short_name in zip(models, model_short):
    # Stack the per-item vectors into a single 2-D array (items x dimensions)
    # rather than handing a Series of separate arrays to the similarity function.
    item_matrix = np.stack(df_items[model_name + '_embeddings'].to_list())

    # cosine similarity between every pair of items
    cosine_similarities_item = util.pytorch_cos_sim(item_matrix, item_matrix).numpy()

    # We don't have reversed items, so the '_embeddings_rev' columns are not used here.

    # Fill the diagonal with 1. This is done to avoid EFA functions reading the
    # cosine matrix as a covariance matrix.
    np.fill_diagonal(cosine_similarities_item, 1)

    # Store the results; the row index is not written to the file.
    pd.DataFrame(
        cosine_similarities_item,
        columns=item_labels,
        index=item_labels,
    ).to_csv(f'./Data/cos_matrices/matrix_concatenated_item_{short_name}.csv', index=False)
